diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..ae66414 --- /dev/null +++ b/.gitattributes @@ -0,0 +1,37 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text +2b-pt-sfp.sbs filter=lfs diff=lfs merge=lfs -text +tokenizer.spm filter=lfs diff=lfs merge=lfs -text diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index b796814..2052a82 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -17,12 +17,10 @@ jobs: fail-fast: false matrix: # When adding another, also add to copybara's github_check_runs. 
- os: ['ubuntu-latest', 'macos-latest', 'windows-latest', 'ubuntu-20.04'] + os: ['ubuntu-latest', 'macos-latest', 'windows-latest'] build_type: ['Release'] preset: ['make', 'windows'] exclude: - - os: ubuntu-20.04 - preset: windows - os: ubuntu-latest preset: windows - os: macos-latest @@ -62,44 +60,6 @@ jobs: ${{ github.workspace }}/build/gemma ${{ github.workspace }}/build/libgemma.a - - if: matrix.os == 'ubuntu-20.04' - name: Upload build artifacts to Kaggle - uses: pculliton/push-kaggle-dataset@v1.0.0 - env: - KAGGLE_USERNAME: ${{ secrets.KAGGLE_USERNAME }} - KAGGLE_KEY: ${{ secrets.KAGGLE_KEY }} - with: - id: "phillipculliton/gemma-build-artifacts" - files: | - build/gemma - build/_deps/sentencepiece-build/src/libsentencepiece.so.0 - - - if: matrix.os == 'ubuntu-20.04' - name: Create code for new test notebook version - run: | - cat > runner.py << EOF - import subprocess - subprocess.run(["cp", "/kaggle/input/gemma-build-artifacts/gemma", "/kaggle/working"]) - subprocess.run(["chmod", "700", "/kaggle/working/gemma"]) - subprocess.run(["cp", "/kaggle/input/gemma-build-artifacts/_deps/sentencepiece-build/src/libsentencepiece.so.0", "/kaggle/working"]) - output = subprocess.run(["/kaggle/working/gemma", "--tokenizer", "/kaggle/input/gemma/gemmacpp/2b-it-sfp/4/tokenizer.spm", "--compressed_weights", "/kaggle/input/gemma/gemmacpp/2b-it-sfp/4/2b-it-sfp.sbs", "--model", "2b-it", "--verbosity", "0", "--max_generated_tokens", "128"], stdout=subprocess.PIPE, input='Write an email to the moon.', encoding='ascii').stdout - assert("write an email to the moon." not in output.lower()); - assert("moon" in output.lower()); - EOF - - - if: matrix.os == 'ubuntu-20.04' - name: Run kaggle test notebook - uses: pculliton/kaggle-action@v1.0.28 - with: - username: ${{ secrets.KAGGLE_USERNAME }} - key: ${{ secrets.KAGGLE_KEY }} - title: GemmaCPP-CI-2 - code_file: runner.py - dataset_sources: "phillipculliton/gemma-build-artifacts" - model_sources: "google/gemma/gemmaCpp/2b-it-sfp/4" - enable_gpu: False - kernel_type: script - bazel: runs-on: ubuntu-latest steps: @@ -116,4 +76,4 @@ jobs: with: path: ~/.cache/bazel key: bazel-${{ runner.os }} - - run: bazel build --cxxopt=-std=c++20 //:all + - run: bazel build --cxxopt=-std=c++20 //:gemma --jobs=10 --show_progress_rate_limit=1 diff --git a/.gitignore b/.gitignore index d4264cb..1c13032 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,25 @@ +# Build directories .cache/ bazel-*/ build-*/ +build/ + +# Python cache python/*/__pycache__ + +# Model files +*.sbs +*.spm +*.data +*.bin +*.weights + +# IDE and editor files +.vscode/ +.idea/ +*.swp +*~ + +# Local development +.env +.env.local \ No newline at end of file diff --git a/.vscode/c_cpp_properties.json b/.vscode/c_cpp_properties.json new file mode 100644 index 0000000..64d3f90 --- /dev/null +++ b/.vscode/c_cpp_properties.json @@ -0,0 +1,15 @@ +{ + "configurations": [ + { + "name": "Linux", + "includePath": [ + "${workspaceFolder}/**" + ], + "defines": [], + "cStandard": "c17", + "cppStandard": "c++17", + "intelliSenseMode": "linux-clang-x64" + } + ], + "version": 4 +} \ No newline at end of file diff --git a/BUILD.bazel b/BUILD.bazel index 8c32631..2628bc3 100644 --- a/BUILD.bazel +++ b/BUILD.bazel @@ -19,7 +19,10 @@ license( # Dual-licensed Apache 2 and 3-clause BSD. 
licenses(["notice"]) -exports_files(["LICENSE"]) +exports_files([ + "LICENSE", + ".github/workflows/build.yml", +]) cc_library( name = "basics", @@ -29,6 +32,16 @@ cc_library( ], ) +cc_library( + name = "args", + hdrs = ["util/args.h"], + deps = [ + ":basics", + "//io", # Path + "@highway//:hwy", + ], +) + # Split from :threading to break a circular dependency with :allocator. cc_library( name = "topology", @@ -59,6 +72,7 @@ cc_library( hdrs = ["util/threading.h"], deps = [ ":allocator", + ":args", ":basics", ":topology", # Placeholder for container detection, do not remove @@ -68,14 +82,28 @@ cc_library( ], ) +cc_library( + name = "threading_context", + srcs = ["util/threading_context.cc"], + hdrs = ["util/threading_context.h"], + deps = [ + ":allocator", + ":args", + ":basics", + ":threading", + ":topology", + "@highway//:hwy", + "@highway//:profiler", + ], +) + cc_test( name = "threading_test", srcs = ["util/threading_test.cc"], deps = [ - ":allocator", ":basics", - ":threading", - "@googletest//:gtest_main", + ":threading_context", + "@googletest//:gtest_main", # buildcleaner: keep "@highway//:auto_tune", "@highway//:hwy", "@highway//:hwy_test_util", @@ -97,6 +125,124 @@ cc_library( ], ) +cc_library( + name = "configs", + srcs = ["gemma/configs.cc"], + hdrs = ["gemma/configs.h"], + deps = [ + ":basics", + "//compression:types", + "//io", + "//io:fields", + "@highway//:hwy", # base.h + ], +) + +cc_test( + name = "configs_test", + srcs = ["gemma/configs_test.cc"], + deps = [ + ":configs", + "@googletest//:gtest_main", # buildcleaner: keep + "//compression:types", + "//io:fields", + ], +) + +cc_library( + name = "tensor_info", + srcs = ["gemma/tensor_info.cc"], + hdrs = ["gemma/tensor_info.h"], + deps = [ + ":basics", + ":configs", + "//compression:types", + ], +) + +cc_library( + name = "mat", + srcs = ["util/mat.cc"], + hdrs = ["util/mat.h"], + deps = [ + ":allocator", + ":basics", + ":tensor_info", + ":threading_context", + "//compression:types", + "//io:fields", + "@highway//:hwy", + "@highway//:profiler", + ], +) + +cc_library( + name = "tokenizer", + srcs = ["gemma/tokenizer.cc"], + hdrs = ["gemma/tokenizer.h"], + deps = [ + ":configs", + "@highway//:hwy", + "@highway//:profiler", + "@com_google_sentencepiece//:sentencepiece_processor", + ], +) + +cc_library( + name = "model_store", + srcs = ["gemma/model_store.cc"], + hdrs = ["gemma/model_store.h"], + deps = [ + ":allocator", + ":basics", + ":configs", + ":mat", + ":tensor_info", + ":threading_context", + ":tokenizer", + "//compression:types", + "//io", + "//io:blob_store", + "//io:fields", + "@highway//:hwy", + "@highway//:thread_pool", + ], +) + +cc_library( + name = "weights", + srcs = ["gemma/weights.cc"], + hdrs = ["gemma/weights.h"], + deps = [ + ":configs", + ":gemma_args", + ":mat", + ":matmul", + ":model_store", + ":tensor_info", + ":threading_context", + "//compression:compress", + "//io:blob_store", + "@highway//:hwy", + "@highway//:profiler", + "@highway//:thread_pool", + ], +) + +cc_test( + name = "tensor_info_test", + srcs = ["gemma/tensor_info_test.cc"], + deps = [ + ":configs", + ":mat", + ":tensor_info", + ":weights", + "@googletest//:gtest_main", # buildcleaner: keep + "//compression:compress", + "@highway//:hwy", # aligned_allocator.h + ], +) + # For building all tests in one command, so we can test several. 
test_suite( name = "ops_tests", @@ -104,34 +250,75 @@ test_suite( ) cc_library( - name = "ops", + name = "matmul", + srcs = ["ops/matmul.cc"], + hdrs = ["ops/matmul.h"], + textual_hdrs = ["ops/matmul-inl.h"], + deps = [ + ":allocator", + ":basics", + ":mat", + ":threading_context", + "//compression:compress", + "@highway//:bit_set", + "@highway//:hwy", + "@highway//:nanobenchmark", + "@highway//:profiler", + ], +) + +cc_library( + name = "matmul_static", srcs = [ - "ops/matmul.cc", + # single-file build time is ~30sec for msan, hence shard. + "ops/matmul_static_bf16.cc", + "ops/matmul_static_f32.cc", + "ops/matmul_static_nuq.cc", + "ops/matmul_static_sfp.cc", ], hdrs = [ - "ops/matmul.h", - "ops/ops.h", + "ops/matmul_static.h", ], + textual_hdrs = [ + "ops/matmul_static-inl.h", + "ops/matmul-inl.h", + ], + deps = [ + ":allocator", + ":basics", + ":mat", + ":matmul", + ":threading_context", + "//compression:compress", + "//compression:types", + "@highway//:hwy", + "@highway//:profiler", + "@highway//:timer", + ], +) + +cc_library( + name = "ops", + hdrs = ["ops/ops.h"], textual_hdrs = [ "ops/dot-inl.h", "ops/sum-inl.h", "ops/fp_arith-inl.h", - "ops/matmul-inl.h", "ops/matvec-inl.h", "ops/ops-inl.h", ], deps = [ ":allocator", ":basics", - ":threading", - ":topology", + ":mat", + ":matmul", + ":matmul_static", + ":threading_context", "//compression:compress", "@highway//:algo", - "@highway//:bit_set", "@highway//:hwy", "@highway//:math", "@highway//:matvec", - "@highway//:nanobenchmark", "@highway//:profiler", "@highway//:thread_pool", "@highway//hwy/contrib/sort:vqsort", @@ -143,15 +330,15 @@ cc_test( size = "small", timeout = "long", srcs = ["ops/dot_test.cc"], + linkstatic = True, local_defines = ["HWY_IS_TEST"], # for test_suite. tags = ["ops_tests"], deps = [ ":allocator", - ":app", ":ops", ":test_util", - ":threading", + ":threading_context", "@googletest//:gtest_main", # buildcleaner: keep "//compression:compress", "//compression:test_util", @@ -167,20 +354,23 @@ cc_test( cc_test( name = "ops_test", size = "small", - timeout = "eternal", + timeout = "long", srcs = ["ops/ops_test.cc"], + linkstatic = True, local_defines = ["HWY_IS_TEST"], # for test_suite. tags = ["ops_tests"], deps = [ ":allocator", - ":app", - ":common", + ":basics", + ":configs", + ":gemma_lib", + ":mat", ":ops", ":test_util", - ":threading", + ":threading_context", "@googletest//:gtest_main", # buildcleaner: keep - "//compression:compress", + "//compression:types", "@highway//:hwy", "@highway//:hwy_test_util", "@highway//:nanobenchmark", #buildcleaner: keep @@ -192,11 +382,14 @@ cc_test( size = "small", timeout = "long", srcs = ["ops/gemma_matvec_test.cc"], + linkstatic = True, local_defines = ["HWY_IS_TEST"], # for test_suite. tags = ["ops_tests"], deps = [ + ":mat", ":ops", + ":threading_context", "@googletest//:gtest_main", # buildcleaner: keep "//compression:compress", "@highway//:hwy", @@ -210,18 +403,23 @@ cc_test( size = "small", timeout = "long", srcs = ["ops/matmul_test.cc"], + linkstatic = True, local_defines = ["HWY_IS_TEST"], # for test_suite. 
tags = ["ops_tests"], deps = [ - ":allocator", ":basics", + ":mat", + ":matmul", + ":matmul_static", ":ops", - ":threading", + ":threading_context", "@googletest//:gtest_main", # buildcleaner: keep "//compression:compress", + "//compression:test_util", "@highway//:hwy", "@highway//:hwy_test_util", + "@highway//:nanobenchmark", "@highway//:thread_pool", ], ) @@ -231,6 +429,7 @@ cc_test( size = "small", timeout = "long", srcs = ["ops/bench_matmul.cc"], + linkstatic = True, local_defines = ["HWY_IS_TEST"], tags = [ "manual", @@ -238,12 +437,12 @@ cc_test( "ops_tests", # for test_suite. ], deps = [ - ":allocator", ":basics", - ":ops", - ":threading", + ":matmul", + ":threading_context", "@googletest//:gtest_main", # buildcleaner: keep "//compression:compress", + "//compression:test_util", "@highway//:hwy", "@highway//:hwy_test_util", "@highway//:nanobenchmark", @@ -252,101 +451,47 @@ cc_test( ], ) -cc_library( - name = "common", - srcs = [ - "gemma/common.cc", - "gemma/configs.cc", - "gemma/tensor_index.cc", - ], - hdrs = [ - "gemma/common.h", - "gemma/configs.h", - "gemma/tensor_index.h", - ], - deps = [ - "//compression:fields", - "//compression:sfp", - "@highway//:hwy", # base.h - "@highway//:thread_pool", - ], -) - -cc_test( - name = "configs_test", - srcs = ["gemma/configs_test.cc"], - deps = [ - ":common", - "@googletest//:gtest_main", - "@highway//:hwy", - ], -) - -cc_test( - name = "tensor_index_test", - srcs = ["gemma/tensor_index_test.cc"], - deps = [ - ":basics", - ":common", - ":weights", - "@googletest//:gtest_main", - "//compression:compress", - "@highway//:hwy", - ], -) - -cc_library( - name = "weights", - srcs = ["gemma/weights.cc"], - hdrs = ["gemma/weights.h"], - deps = [ - ":common", - "//compression:blob_store", - "//compression:compress", - "//compression:io", - "@highway//:hwy", - "@highway//:profiler", - "@highway//:stats", - "@highway//:thread_pool", - ], -) - -cc_library( - name = "tokenizer", - srcs = ["gemma/tokenizer.cc"], - hdrs = ["gemma/tokenizer.h"], - deps = [ - ":common", - "//compression:io", - "//compression:sfp", - "@highway//:hwy", - "@highway//:profiler", - "@com_google_sentencepiece//:sentencepiece_processor", - ], -) - cc_library( name = "kv_cache", srcs = ["gemma/kv_cache.cc"], hdrs = ["gemma/kv_cache.h"], deps = [ - ":common", + ":basics", + ":configs", + ":gemma_args", + ":mat", "@highway//:hwy", ], ) +cc_library( + name = "gemma_args", + hdrs = ["gemma/gemma_args.h"], + deps = [ + ":args", + ":basics", + ":mat", + ":matmul", + "//io", + "@highway//:hwy", + "@highway//:profiler", + ], +) + cc_library( name = "gemma_lib", srcs = [ + "gemma/attention.cc", "gemma/gemma.cc", - "gemma/instantiations/bf16.cc", - "gemma/instantiations/f32.cc", - "gemma/instantiations/nuq.cc", - "gemma/instantiations/sfp.cc", + "gemma/griffin.cc", + "gemma/vit.cc", ], hdrs = [ "gemma/activations.h", + "gemma/attention.h", "gemma/gemma.h", + "gemma/griffin.h", + "gemma/vit.h", ], exec_properties = { # Avoid linker OOMs when building with sanitizer instrumentation. 
@@ -354,23 +499,26 @@ cc_library( }, textual_hdrs = [ "gemma/gemma-inl.h", - # Placeholder for internal file2, do not remove, ], deps = [ ":allocator", ":basics", - ":common", - ":ops", - ":tokenizer", + ":configs", + ":gemma_args", ":kv_cache", - ":weights", + ":mat", + ":matmul", + ":model_store", + ":ops", ":threading", + ":threading_context", + ":weights", "//compression:compress", - "//compression:io", - "//compression:sfp", + "//compression:types", + "//io:blob_store", + "//io", "//paligemma:image", "@highway//:hwy", - "@highway//:bit_set", "@highway//:nanobenchmark", # timer "@highway//:profiler", "@highway//:thread_pool", @@ -382,35 +530,9 @@ cc_library( srcs = ["evals/cross_entropy.cc"], hdrs = ["evals/cross_entropy.h"], deps = [ - ":common", ":gemma_lib", ":ops", - "@highway//:hwy", - ], -) - -cc_library( - name = "args", - hdrs = ["util/args.h"], - deps = [ - ":basics", - "//compression:io", - "@highway//:hwy", - ], -) - -cc_library( - name = "app", - hdrs = ["util/app.h"], - deps = [ - ":args", - ":basics", - ":common", - ":gemma_lib", - ":ops", - ":threading", - "//compression:io", - "//compression:sfp", + "//compression:types", "@highway//:hwy", ], ) @@ -420,20 +542,49 @@ cc_library( srcs = ["evals/benchmark_helper.cc"], hdrs = ["evals/benchmark_helper.h"], deps = [ - ":app", - ":args", - ":common", + ":configs", ":cross_entropy", + ":gemma_args", ":gemma_lib", - ":kv_cache", + ":matmul", ":ops", - ":threading", - # Placeholder for internal dep, do not remove., + ":threading_context", + ":tokenizer", "@google_benchmark//:benchmark", "//compression:compress", "@highway//:hwy", "@highway//:nanobenchmark", - "@highway//:topology", + "@highway//:profiler", + ], +) + +cc_library( + name = "gemma_shared_lib", + srcs = [ + "gemma/bindings/c_api.cc", + "gemma/bindings/context.cc", + ], + hdrs = [ + "gemma/bindings/c_api.h", + "gemma/bindings/context.h", + ], + exec_properties = { + # Avoid linker OOMs when building with sanitizer instrumentation. 
+ "mem": "28g", + }, + deps = [ + ":benchmark_helper", + ":gemma_args", + ":gemma_lib", + ":kv_cache", + ":matmul", + ":threading", + ":threading_context", + ":tokenizer", + "//paligemma:image", + "@highway//:hwy", + "@highway//:profiler", + "@highway//:timer", ], ) @@ -449,9 +600,10 @@ cc_test( ], deps = [ ":benchmark_helper", - ":common", + ":configs", ":gemma_lib", - "@googletest//:gtest_main", + "@googletest//:gtest_main", # buildcleaner: keep + "//io", "@highway//:hwy", "@highway//:hwy_test_util", ], @@ -460,6 +612,7 @@ cc_test( cc_test( name = "gemma_batch_bench", srcs = ["evals/gemma_batch_bench.cc"], + linkstatic = True, # Requires model files tags = [ "local", @@ -468,12 +621,12 @@ cc_test( ], deps = [ ":benchmark_helper", - ":common", ":gemma_lib", - ":tokenizer", - "@googletest//:gtest_main", + "@googletest//:gtest_main", # buildcleaner: keep "@highway//:hwy", "@highway//:hwy_test_util", + "@highway//:nanobenchmark", + "@highway//:profiler", ], ) @@ -481,15 +634,13 @@ cc_binary( name = "gemma", srcs = ["gemma/run.cc"], deps = [ - ":app", ":args", ":benchmark_helper", - ":common", + ":gemma_args", ":gemma_lib", - ":ops", - ":threading", - # Placeholder for internal dep, do not remove., - "//compression:sfp", + ":matmul", + ":tokenizer", + "//compression:types", "//paligemma:image", "@highway//:hwy", "@highway//:profiler", @@ -502,22 +653,15 @@ cc_binary( deps = [ ":args", ":benchmark_helper", - ":common", ":cross_entropy", ":gemma_lib", - "//compression:io", + "//io", "@highway//:hwy", "@highway//:nanobenchmark", "@nlohmann_json//:json", ], ) -cc_library( - name = "benchmark_prompts", - hdrs = ["evals/prompts.h"], - deps = ["@highway//:hwy"], -) - cc_binary( name = "benchmarks", srcs = [ @@ -526,7 +670,6 @@ cc_binary( ], deps = [ ":benchmark_helper", - ":benchmark_prompts", "@google_benchmark//:benchmark", "@highway//:hwy", # base.h ], @@ -534,14 +677,12 @@ cc_binary( cc_binary( name = "debug_prompt", - srcs = [ - "evals/debug_prompt.cc", - ], + srcs = ["evals/debug_prompt.cc"], deps = [ ":args", ":benchmark_helper", ":gemma_lib", - "//compression:io", + "//io", "@highway//:hwy", "@nlohmann_json//:json", ], @@ -554,158 +695,9 @@ cc_binary( ":args", ":benchmark_helper", ":gemma_lib", - "//compression:io", + "//io", "@highway//:hwy", "@highway//:profiler", - "@highway//:thread_pool", "@nlohmann_json//:json", ], ) - -cc_library( - name = "prompt", - hdrs = ["backprop/prompt.h"], - deps = [], -) - -cc_library( - name = "sampler", - hdrs = ["backprop/sampler.h"], - deps = [ - ":prompt", - ], -) - -cc_library( - name = "backprop", - srcs = [ - "backprop/backward.cc", - "backprop/forward.cc", - ], - hdrs = [ - "backprop/activations.h", - "backprop/backward.h", - "backprop/forward.h", - ], - textual_hdrs = [ - "backprop/backward-inl.h", - "backprop/forward-inl.h", - ], - deps = [ - ":allocator", - ":common", - ":ops", - ":prompt", - ":weights", - "//compression:compress", - "@highway//:dot", - "@highway//:hwy", # base.h - "@highway//:thread_pool", - ], -) - -cc_library( - name = "backprop_scalar", - hdrs = [ - "backprop/activations.h", - "backprop/backward_scalar.h", - "backprop/common_scalar.h", - "backprop/forward_scalar.h", - ], - deps = [ - ":common", - ":prompt", - ":weights", - "//compression:compress", - "@highway//:hwy", - ], -) - -cc_test( - name = "backward_scalar_test", - size = "large", - srcs = [ - "backprop/backward_scalar_test.cc", - "backprop/test_util.h", - ], - deps = [ - ":backprop_scalar", - ":common", - ":prompt", - ":sampler", - ":weights", - 
"@googletest//:gtest_main", - "//compression:compress", - "@highway//:thread_pool", - ], -) - -cc_test( - name = "backward_test", - size = "large", - srcs = [ - "backprop/backward_test.cc", - "backprop/test_util.h", - ], - exec_properties = { - # Avoid linker OOMs when building with sanitizer instrumentation. - "mem": "28g", - }, - deps = [ - ":allocator", - ":backprop", - ":backprop_scalar", - ":common", - ":ops", - ":prompt", - ":sampler", - ":threading", - ":weights", - "@googletest//:gtest_main", - "//compression:compress", - "@highway//:hwy", - "@highway//:hwy_test_util", - "@highway//:thread_pool", - ], -) - -cc_library( - name = "optimizer", - srcs = ["backprop/optimizer.cc"], - hdrs = ["backprop/optimizer.h"], - deps = [ - ":allocator", - ":common", - ":weights", - "//compression:compress", - "@highway//:hwy", - "@highway//:thread_pool", - ], -) - -cc_test( - name = "optimize_test", - srcs = [ - "backprop/optimize_test.cc", - ], - exec_properties = { - # Avoid linker OOMs when building with sanitizer instrumentation. - "mem": "28g", - }, - deps = [ - ":allocator", - ":backprop", - ":basics", - ":common", - ":gemma_lib", - ":ops", - ":optimizer", - ":prompt", - ":sampler", - ":threading", - ":weights", - "@googletest//:gtest_main", - "//compression:sfp", - "@highway//:thread_pool", - ], -) diff --git a/CMakeLists.txt b/CMakeLists.txt index 1737c2d..ef2f2c8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -22,7 +22,7 @@ set(CMAKE_CXX_STANDARD 17) set(CMAKE_CXX_STANDARD_REQUIRED ON) set(CMAKE_EXPORT_COMPILE_COMMANDS ON) -FetchContent_Declare(highway GIT_REPOSITORY https://github.com/google/highway.git GIT_TAG c5bebf84ad01edec97e336f5c97ca4e0df6b4d06 EXCLUDE_FROM_ALL) +FetchContent_Declare(highway GIT_REPOSITORY https://github.com/google/highway.git GIT_TAG 9414b48aeec251b69e6cadbfa42bebb5ddae1c34 EXCLUDE_FROM_ALL) FetchContent_MakeAvailable(highway) ## Note: absl needs to be installed by sentencepiece. 
This will only happen if @@ -39,58 +39,54 @@ set(BENCHMARK_ENABLE_GTEST_TESTS OFF) FetchContent_Declare(benchmark GIT_REPOSITORY https://github.com/google/benchmark.git GIT_TAG v1.8.2 EXCLUDE_FROM_ALL) FetchContent_MakeAvailable(benchmark) +# Base source files set(SOURCES - compression/blob_store.cc - compression/blob_store.h + compression/compress-inl.h compression/compress.cc compression/compress.h - compression/compress-inl.h - compression/fields.cc - compression/fields.h - compression/io_win.cc - compression/io.cc - compression/io.h compression/nuq-inl.h compression/sfp-inl.h - compression/shared.h + compression/types.h compression/test_util-inl.h - backprop/activations.h - backprop/backward.cc - backprop/backward.h - backprop/backward-inl.h - backprop/backward_scalar.h - backprop/common_scalar.h - backprop/forward.cc - backprop/forward.h - backprop/forward-inl.h - backprop/forward_scalar.h - backprop/optimizer.cc - backprop/optimizer.h evals/benchmark_helper.cc evals/benchmark_helper.h evals/cross_entropy.cc evals/cross_entropy.h gemma/activations.h - gemma/common.cc - gemma/common.h + gemma/attention.cc + gemma/attention.h gemma/configs.cc gemma/configs.h + gemma/gemma_args.h gemma/gemma-inl.h gemma/gemma.cc gemma/gemma.h - gemma/instantiations/bf16.cc - gemma/instantiations/f32.cc - gemma/instantiations/nuq.cc - gemma/instantiations/sfp.cc + gemma/griffin.cc + gemma/griffin.h gemma/kv_cache.cc gemma/kv_cache.h - gemma/tensor_index.cc - gemma/tensor_index.h + gemma/model_store.cc + gemma/model_store.h + gemma/tensor_info.cc + gemma/tensor_info.h gemma/tokenizer.cc gemma/tokenizer.h + gemma/vit.cc + gemma/vit.h gemma/weights.cc gemma/weights.h + io/blob_store.cc + io/blob_store.h + io/fields.cc + io/fields.h + io/io_win.cc + io/io.cc + io/io.h ops/dot-inl.h + ops/matmul_static_bf16.cc + ops/matmul_static_f32.cc + ops/matmul_static_nuq.cc + ops/matmul_static_sfp.cc ops/matmul-inl.h ops/matmul.cc ops/matmul.h @@ -102,15 +98,28 @@ set(SOURCES paligemma/image.h util/allocator.cc util/allocator.h - util/app.h - util/args.h util/basics.h + util/mat.cc + util/mat.h util/test_util.h + util/threading_context.cc + util/threading_context.h util/threading.cc util/threading.h util/topology.cc util/topology.h +) + +# Add C API sources only when building DLL +if(BUILD_GEMMA_DLL) + list(APPEND SOURCES + gemma/bindings/context.h + gemma/bindings/context.cc + gemma/bindings/c_api.h + gemma/bindings/c_api.cc ) + message(STATUS "Including C API files for DLL build") +endif() if(NOT CMAKE_BUILD_TYPE) set(CMAKE_BUILD_TYPE "Release") @@ -131,6 +140,33 @@ target_compile_definitions(libgemma PRIVATE $<$:_CRT_SECURE target_compile_options(libgemma PRIVATE $<$:-Wno-deprecated-declarations>) install(TARGETS libgemma DESTINATION lib) +# Shared library target for C# interop +if(BUILD_GEMMA_DLL) + add_library(gemma_shared SHARED ${SOURCES}) +set_property(TARGET gemma_shared PROPERTY CXX_STANDARD 17) +set_target_properties(gemma_shared PROPERTIES + PREFIX "" + OUTPUT_NAME "gemma" +) +set_property(TARGET gemma_shared PROPERTY POSITION_INDEPENDENT_CODE ON) +target_include_directories(gemma_shared PUBLIC ./) +target_link_libraries(gemma_shared PRIVATE + $ + $ + $ +) +target_include_directories(gemma_shared PUBLIC ${sentencepiece_SOURCE_DIR}) +target_compile_definitions(gemma_shared + PRIVATE + GEMMA_EXPORTS + $<$:_CRT_SECURE_NO_WARNINGS NOMINMAX> +) +target_compile_options(gemma_shared PRIVATE $<$:-Wno-deprecated-declarations>) +install(TARGETS gemma_shared DESTINATION lib) +install(FILES gemma/c_api.h DESTINATION 
include/gemma)
+install(FILES gemma/GemmaInterop.cs DESTINATION include/gemma)
+endif()
+
 # Executable Target
 add_executable(gemma gemma/run.cc)

@@ -154,17 +190,14 @@ enable_testing()
 include(GoogleTest)

 set(GEMMA_TEST_FILES
-  backprop/backward_scalar_test.cc
-  backprop/backward_test.cc
-  backprop/optimize_test.cc
-  compression/blob_store_test.cc
   compression/compress_test.cc
   compression/distortion_test.cc
-  compression/fields_test.cc
   compression/nuq_test.cc
   compression/sfp_test.cc
   evals/gemma_test.cc
-  gemma/tensor_index_test.cc
+  gemma/tensor_info_test.cc
+  io/blob_store_test.cc
+  io/fields_test.cc
   ops/bench_matmul.cc
   ops/dot_test.cc
   ops/gemma_matvec_test.cc
@@ -197,8 +230,5 @@ endif()  # GEMMA_ENABLE_TESTS

 ## Tools

-add_executable(compress_weights compression/compress_weights.cc)
-target_link_libraries(compress_weights libgemma hwy hwy_contrib)
-
-add_executable(migrate_weights compression/migrate_weights.cc)
+add_executable(migrate_weights io/migrate_weights.cc)
 target_link_libraries(migrate_weights libgemma hwy hwy_contrib)
diff --git a/CMakePresets.json b/CMakePresets.json
index 5fe13c8..a34b5bf 100644
--- a/CMakePresets.json
+++ b/CMakePresets.json
@@ -31,6 +31,24 @@
         "lhs": "${hostSystemName}",
         "rhs": "Windows"
       }
+    },
+    {
+      "name": "windows-dll",
+      "inherits": "__defaults__",
+      "displayName": "Windows DLL",
+      "description": "Visual Studio 2022 with Clang/LLVM frontend (DLL build)",
+      "generator": "Visual Studio 17 2022",
+      "toolset": "ClangCL",
+      "condition": {
+        "type": "equals",
+        "lhs": "${hostSystemName}",
+        "rhs": "Windows"
+      },
+      "cacheVariables": {
+        "BUILD_SHARED_LIBS": "OFF",
+        "CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS": "ON",
+        "BUILD_GEMMA_DLL": "ON"
+      }
     }
   ],
   "buildPresets": [
@@ -54,6 +72,15 @@
       "displayName": "Windows",
       "configuration": "Release",
       "configurePreset": "windows"
+    },
+    {
+      "name": "windows-dll",
+      "displayName": "Windows DLL",
+      "configuration": "Release",
+      "configurePreset": "windows-dll",
+      "targets": [
+        "gemma_shared"
+      ]
     }
   ]
 }
diff --git a/DEVELOPERS.md b/DEVELOPERS.md
index fdebad4..5d70fdb 100644
--- a/DEVELOPERS.md
+++ b/DEVELOPERS.md
@@ -96,21 +96,10 @@
 https://github.com/keras-team/keras-nlp/blob/master/tools/gemma/export_gemma_to_

 From Pytorch, use the following script to generate uncompressed weights:
 https://github.com/google/gemma.cpp/blob/dev/compression/convert_weights.py

-Then run `compression/compress_weights.cc` (Bazel target
-`compression:compress_weights`), specifying the resulting file as `--weights`
-and the desired .sbs name as the `--compressed_weights`.
+For PaliGemma, use `python/convert_from_safetensors` to create an SBS file
+directly, as sketched below.
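+
+A minimal sketch of the conversion invocation (the `[...]` path is a
+placeholder for your checkpoint; see the README FAQ on converting fine-tunes
+for environment setup):
+
+```sh
+python3 python/convert_from_safetensors.py \
+  --load_path [...].safetensors.index.json
+```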
-
-## Compile-Time Flags (Advanced)
-
-There are several compile-time flags to be aware of (note these may or may not
-be exposed to the build system):
-
-- `GEMMA_MAX_SEQ_LEN` : Sets maximum sequence length to preallocate for the KV
-  Cache. The default is 4096 tokens but can be overridden. This is not exposed
-  through `CMakeLists.txt` yet.
-
-In the medium term this will likely be deprecated in favor of handling options
-at runtime - dynamically resizing the KV cache as needed.
+For other models, `gemma_export_main.py` is not yet open sourced.

 ## Using gemma.cpp as a Library (Advanced)

@@ -164,7 +153,7 @@
 constrained decoding type of use cases where you want to force the generation
 to fit a grammar. If you're not doing this, you can send an empty lambda or
 `std::function` as a no-op which is what `run.cc` does.

-### `Transformer()` implements the inference (i.e. `forward()` method in PyTorch or Jax) computation of the neural network
+### `Transformer()` implements inference (i.e. `forward()` in PyTorch or Jax)

 For high-level applications, you might only call `model.Generate()` and never
 interact directly with the neural network, but if you're doing something a bit
@@ -172,9 +161,6 @@
 more custom you can call transformer which performs a single inference
 operation on a single token and mutates the Activations and the KVCache through
 the neural network computation.

-Note that an experimental backward pass is available in backprop/, which may be
-useful for fine tuning.
-
 ### For low level operations, defining new architectures, call `ops.h` functions directly

 You use `ops.h` if you're writing other NN architectures or modifying the
diff --git a/MODULE.bazel b/MODULE.bazel
index 77690fa..95fb5cc 100644
--- a/MODULE.bazel
+++ b/MODULE.bazel
@@ -18,7 +18,7 @@ bazel_dep(name = "google_benchmark", version = "1.8.5")
 # Require a more recent version.
 git_override(
     module_name = "highway",
-    commit = "c5bebf84ad01edec97e336f5c97ca4e0df6b4d06",
+    commit = "9414b48aeec251b69e6cadbfa42bebb5ddae1c34",
     remote = "https://github.com/google/highway",
 )

@@ -71,6 +71,7 @@ pip.parse(
     requirements_lock = "//compression/python:requirements.txt",
 )
 use_repo(pip, "compression_deps")
+
 pip.parse(
     hub_name = "python_deps",
     python_version = "3.11",
diff --git a/README.md b/README.md
index e9a6745..b389934 100644
--- a/README.md
+++ b/README.md
@@ -6,7 +6,7 @@ foundation models from Google. For additional information about Gemma, see
 [ai.google.dev/gemma](https://ai.google.dev/gemma).

 Model weights, including gemma.cpp specific artifacts, are
-[available on kaggle](https://www.kaggle.com/models/google/gemma).
+[available on kaggle](https://www.kaggle.com/models/google/gemma-2).

 ## Who is this project for?

@@ -18,8 +18,8 @@ deployment-oriented C++ inference runtimes, which are not designed for
 experimentation, and Python-centric ML research frameworks, which abstract away
 low-level computation through compilation.

-gemma.cpp provides a minimalist implementation of Gemma-1, Gemma-2, Gemma-3, and
-PaliGemma models, focusing on simplicity and directness rather than full
+gemma.cpp provides a minimalist implementation of Gemma-2, Gemma-3, and
+PaliGemma-2 models, focusing on simplicity and directness rather than full
 generality. This is inspired by vertically-integrated model implementations
 such as [ggml](https://github.com/ggerganov/ggml),
 [llama.c](https://github.com/karpathy/llama2.c), and
@@ -45,9 +45,41 @@ this invite link](https://discord.gg/H5jCBAWxAe). This project follows
 [Google's Open Source Community Guidelines](https://opensource.google.com/conduct/).

-*Active development is currently done on the `dev` branch. Please open pull
-requests targeting `dev` branch instead of `main`, which is intended to be more
-stable.*
+> [!NOTE]
+> Active development is currently done on the `dev` branch. Please open
+> pull requests targeting the `dev` branch instead of `main`, which is intended
+> to be more stable.
+
+## What's inside?
+
+- LLM
+
+    - CPU-only inference for: Gemma 2-3, Griffin (SSM), PaliGemma 2.
+    - Sampling with TopK and temperature.
+    - Backward pass (VJP) and Adam optimizer for Gemma research.
+
+- Optimizations
+
+    - Mixed-precision (fp8, bf16, fp32, fp64) GEMM:
+        - Designed for BF16 instructions; can efficiently emulate them where
+          unavailable.
+        - Automatic runtime autotuning of 7 parameters per matrix shape.
+    - Weight compression integrated directly into GEMM:
+        - Custom fp8 format with 2..3 mantissa bits; tensor scaling.
+        - Also bf16, f32 and non-uniform 4-bit (NUQ); easy to add new formats.
+
+- Infrastructure
+
+    - SIMD: single implementation via Highway. Chooses ISA at runtime.
+    - Tensor parallelism: CCX-aware, multi-socket thread pool.
+    - Disk I/O: memory map or parallel read (heuristic with user override).
+    - Custom format with forward/backward-compatible metadata serialization.
+    - Model conversion from Safetensors, not yet open sourced.
+    - Portability: Linux, Windows and macOS supported. CMake/Bazel. 'Any' CPU.
+
+- Frontends
+
+    - C++ APIs with streaming for single query and batched inference.
+    - Basic interactive command-line app.
+    - Basic Python bindings (pybind11).

 ## Quick Start

@@ -74,57 +106,20 @@ winget install --id Microsoft.VisualStudio.2022.BuildTools --force --override "-

 Visit the
 [Kaggle page for Gemma-2](https://www.kaggle.com/models/google/gemma-2/gemmaCpp)
-[or Gemma-1](https://www.kaggle.com/models/google/gemma/frameworks/gemmaCpp),
 and select `Model Variations |> Gemma C++`. On this tab, the `Variation`
 dropdown includes the options below. Note bfloat16 weights are higher fidelity,
 while 8-bit switched floating point weights enable faster inference. In
 general, we recommend starting with the `-sfp` checkpoints.

-If you are unsure which model to start with, we recommend starting with the
-smallest Gemma-2 model, i.e. `2.0-2b-it-sfp`.
-
-Alternatively, visit the
-[gemma.cpp](https://huggingface.co/models?other=gemma.cpp) models on the Hugging
-Face Hub. First go the model repository of the model of interest (see
-recommendations below). Then, click the `Files and versions` tab and download
-the model and tokenizer files. For programmatic downloading, if you have
-`huggingface_hub` installed, you can also download by running:
-
-```
-huggingface-cli login # Just the first time
-huggingface-cli download google/gemma-2b-sfp-cpp --local-dir build/
-```
-
-Gemma-1 2B instruction-tuned (`it`) and pre-trained (`pt`) models:
-
-| Model name | Description |
-| ----------- | ----------- |
-| `2b-it` | 2 billion parameter instruction-tuned model, bfloat16 |
-| `2b-it-sfp` | 2 billion parameter instruction-tuned model, 8-bit switched floating point |
-| `2b-pt` | 2 billion parameter pre-trained model, bfloat16 |
-| `2b-pt-sfp` | 2 billion parameter pre-trained model, 8-bit switched floating point |
-
-Gemma-1 7B instruction-tuned (`it`) and pre-trained (`pt`) models:
-
-| Model name | Description |
-| ----------- | ----------- |
-| `7b-it` | 7 billion parameter instruction-tuned model, bfloat16 |
-| `7b-it-sfp` | 7 billion parameter instruction-tuned model, 8-bit switched floating point |
-| `7b-pt` | 7 billion parameter pre-trained model, bfloat16 |
-| `7b-pt-sfp` | 7 billion parameter pre-trained model, 8-bit switched floating point |
-
-> [!NOTE]
-> **Important**: We strongly recommend starting off with the `2b-it-sfp` model to
-> get up and running.
+> [!NOTE]
+> **Important**: We strongly recommend starting off with the
+> `gemma2-2b-it-sfp` model to get up and running.

 Gemma 2 models are named `gemma2-2b-it` for 2B and `9b-it` or `27b-it`. See the
-`kModelFlags` definition in `common.cc`.
+`ModelPrefix` function in `configs.cc`.

 ### Step 2: Extract Files

-If you downloaded the models from Hugging Face, skip to step 3.
-
 After filling out the consent form, the download should proceed to retrieve a
 tar archive file `archive.tar.gz`.

 Extract files from `archive.tar.gz` (this can take a few minutes):

@@ -162,10 +157,9 @@
 cmake --build --preset make -j [number of parallel threads to use]
 ```

 Replace `[number of parallel threads to use]` with a number - the number of
-cores available on your system is a reasonable heuristic. For example,
-`make -j4 gemma` will build using 4 threads. If the `nproc` command is
-available, you can use `make -j$(nproc) gemma` as a reasonable default
-for the number of threads.
+cores available on your system is a reasonable heuristic. For example, `make -j4
+gemma` will build using 4 threads. If the `nproc` command is available, you can
+use `make -j$(nproc) gemma` as a reasonable default for the number of threads.

 If you aren't sure of the right value for the `-j` flag, you can simply run
 `make gemma` instead and it should still build the `./gemma` executable.

@@ -174,7 +168,8 @@ If you aren't sure of the right value for the `-j` flag, you can simply run
 > On Windows Subsystem for Linux (WSL) users should set the number of
 > parallel threads to 1. Using a larger number may result in errors.

-If the build is successful, you should now have a `gemma` executable in the `build/` directory.
+If the build is successful, you should now have a `gemma` executable in the
+`build/` directory.

 #### Windows

 ```sh
 cmake --preset windows
 cmake --build --preset windows -j [number of parallel threads to use]
 ```

-If the build is successful, you should now have a `gemma.exe` executable in the `build/` directory.
+If the build is successful, you should now have a `gemma.exe` executable in the
+`build/` directory.
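+
+To additionally build the new shared library for the C API (intended for C#
+interop), a minimal sketch using the `windows-dll` preset defined in
+`CMakePresets.json` (it enables `BUILD_GEMMA_DLL` and builds only the
+`gemma_shared` target):
+
+```sh
+cmake --preset windows-dll
+cmake --build --preset windows-dll -j [number of parallel threads to use]
+```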
 #### Bazel

 ```
 bazel build -c opt --cxxopt=-std=c++20 :gemma
 ```

-If the build is successful, you should now have a `gemma` executable in the `bazel-bin/` directory.
+If the build is successful, you should now have a `gemma` executable in the
+`bazel-bin/` directory.

 #### Make

@@ -208,33 +205,21 @@
 You can now run `gemma` from inside the `build/` directory.

 `gemma` has the following required arguments:

-Argument | Description | Example value
---------------- | ---------------------------- | -----------------------
-`--model` | The model type. | `2b-it` ... (see below)
-`--weights` | The compressed weights file. | `2b-it-sfp.sbs`
-`--weight_type` | The compressed weight type. | `sfp`
-`--tokenizer` | The tokenizer file. | `tokenizer.spm`
-
-`gemma` is invoked as:
-
-```sh
-./gemma \
---tokenizer [tokenizer file] \
---weights [compressed weights file] \
---weight_type [f32 or bf16 or sfp (default:sfp)] \
---model [2b-it or 2b-pt or 7b-it or 7b-pt or ...]
-```
+Argument | Description | Example value
+------------- | ---------------------------- | ---------------
+`--weights` | The compressed weights file. | `2b-it-sfp.sbs`
+`--tokenizer` | The tokenizer file. | `tokenizer.spm`

 Example invocation for the following configuration:

-- Compressed weights file `2b-it-sfp.sbs` (2B instruction-tuned model, 8-bit
-  switched floating point).
-- Tokenizer file `tokenizer.spm`.
+- Weights file `gemma2-2b-it-sfp.sbs` (Gemma2 2B instruction-tuned model,
+  8-bit switched floating point).
+- Tokenizer file `tokenizer.spm` (can omit for single-file weights files
+  created after 2025-05-06, or those output by migrate_weights.cc).

 ```sh
 ./gemma \
---tokenizer tokenizer.spm \
---weights 2b-it-sfp.sbs --model 2b-it
+--tokenizer tokenizer.spm --weights gemma2-2b-it-sfp.sbs
 ```
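+
+For non-interactive use, a prompt can also be piped on stdin; a minimal sketch
+(same weights and tokenizer as above; `--verbosity 0` gives minimal output, as
+in the pipe example later in this README):
+
+```sh
+echo "What is switched floating point?" | ./gemma \
+--tokenizer tokenizer.spm --weights gemma2-2b-it-sfp.sbs --verbosity 0
+```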

 ### RecurrentGemma

@@ -256,23 +241,20 @@ Step 1, and run the binary as follows:

 ### PaliGemma Vision-Language Model

-This repository includes a version of the PaliGemma VLM
-([paper](https://arxiv.org/abs/2407.07726),
-[code](https://github.com/google-research/big_vision/tree/main/big_vision/configs/proj/paligemma))
-and its successor PaliGemma 2 ([paper](https://arxiv.org/abs/2412.03555)). We
-provide a C++ implementation of the PaliGemma model family here.
+This repository includes a version of the PaliGemma 2 VLM
+([paper](https://arxiv.org/abs/2412.03555)). We provide a C++ implementation of
+the PaliGemma 2 model here.

 To use the version of PaliGemma included in this repository, build the gemma
 binary as noted above in Step 3. Download the compressed weights and tokenizer
 from
-[Kaggle](https://www.kaggle.com/models/google/paligemma/gemmaCpp/paligemma-3b-mix-224)
+[Kaggle](https://www.kaggle.com/models/google/paligemma-2/gemmaCpp/paligemma2-3b-mix-224)
 and run the binary as follows:

 ```sh
 ./gemma \
 --tokenizer paligemma_tokenizer.model \
---model paligemma-224 \
---weights paligemma-3b-mix-224-sfp.sbs \
+--weights paligemma2-3b-mix-224-sfp.sbs \
 --image_file paligemma/testdata/image.ppm
 ```

@@ -312,12 +294,12 @@
 allows the file to contain the tokenizer (and the model type) directly. A tool
 to migrate from the multi-file format to the single-file format is available.

 ```sh
-compression/migrate_weights \
+io/migrate_weights \
   --tokenizer .../tokenizer.spm --weights .../gemma2-2b-it-sfp.sbs \
-  --model gemma2-2b-it --output_weights .../gemma2-2b-it-sfp-single.sbs
+  --output_weights .../gemma2-2b-it-sfp-single.sbs
 ```

-After migration, you can use the new weights file with gemma.cpp like this:
+After migration, you can omit the tokenizer argument like this:

 ```sh
 ./gemma --weights .../gemma2-2b-it-sfp-single.sbs
 ```

 ### Troubleshooting and FAQs

-**Running `./gemma` fails with "Failed to read cache gating_ein_0 (error 294) ..."**
-
-The most common problem is that the `--weight_type` argument does not match that
-of the model file. Revisit step #3 and check which weights you downloaded.
-
-Note that we have already moved weight type from a compile-time decision to a
-runtime argument. In a subsequent step, we plan to bake this information into
-the weights.
-
 **Problems building in Windows / Visual Studio**

 Currently if you're using Windows, we recommend building in WSL (Windows
 Subsystem for Linux). There are also experimental native build
 configurations, see issues for active discussion.

 **Model does not respond to instructions and produces strange output**

 A common issue is that you are using a pre-trained model, which is not
 instruction-tuned and thus does not respond to instructions. Make sure you are
-using an instruction-tuned model (`2b-it-sfp`, `2b-it`, `7b-it-sfp`, `7b-it`)
-and not a pre-trained model (any model with a `-pt` suffix).
+using an instruction-tuned model (`gemma2-2b-it-sfp`) and not a pre-trained
+model (any model with a `-pt` suffix).

 **What sequence lengths are supported?**

-See `seq_len` in `configs.cc`. For the Gemma 3 models larger than 1B, this is
-typically 32K but 128K would also work given enough RAM. Note that long
-sequences will be slow due to the quadratic cost of attention.
+See `max_seq_len` in `configs.cc` and `InferenceArgs.seq_len`. For the Gemma 3
+models larger than 1B, this is typically 32K but 128K would also work given
+enough RAM. Note that long sequences will be slow due to the quadratic cost of
+attention.
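+
+For example, to request a longer context at runtime (a sketch only: this
+assumes `InferenceArgs.seq_len` is exposed as a `--seq_len` flag, so check
+`gemma/gemma_args.h` for the actual argument name):
+
+```sh
+./gemma --weights gemma2-2b-it-sfp.sbs --seq_len 8192
+```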

 **How do I convert my fine-tune to a `.sbs` compressed model file?**

-For PaliGemma (1 and 2) checkpoints, you can use
-python/convert_from_safetensors.py to convert from safetensors format (tested
-with building via bazel). For an adapter model, you will likely need to call
-merge_and_unload() to convert the adapter model to a single-file format before
-converting it.
+For PaliGemma 2 checkpoints, you can use python/convert_from_safetensors.py to
+convert from safetensors format (tested with building via bazel). For an adapter
+model, you will likely need to call merge_and_unload() to convert the adapter
+model to a single-file format before converting it.

 Here is how to use it with a bazel build of the compression library, assuming
 locally installed (venv) torch, numpy, safetensors, absl-py, etc.:

@@ -373,22 +346,18 @@
 ln -s $BAZEL_OUTPUT_DIR [...]/site-packages/compression
 python3 python/convert_from_safetensors.py --load_path [...].safetensors.index.json
 ```

-See also compression/convert_weights.py for a slightly older option to convert a
-pytorch checkpoint. (The code may need updates to work with Gemma-2 models.)
-
 **What are some easy ways to make the model run faster?**

 1. Make sure you are using the 8-bit switched floating point `-sfp` models.
    These are half the size of bf16 and thus use less memory bandwidth and cache
    space.
-2. If you're on a laptop, make sure power mode is set to maximize performance
+2. Due to auto-tuning, the second and especially third query will be faster.
+3. If you're on a laptop, make sure power mode is set to maximize performance
    and saving mode is **off**. For most laptops, the power saving modes get
    activated automatically if the computer is not plugged in.
-3. Close other unused cpu-intensive applications.
-4. On macs, anecdotally we observe a "warm-up" ramp-up in speed as performance
+4. Close other unused cpu-intensive applications.
+5. On Macs, anecdotally we observe a "warm-up" ramp-up in speed as performance
    cores get engaged.
-5. Experiment with the `--num_threads` argument value. Depending on the device,
-   larger numbers don't always mean better performance.

 We're also working on algorithmic and optimization approaches for faster
 inference, stay tuned.

@@ -411,7 +380,7 @@ newline input.

 By default, verbosity is set to 1, bringing up a terminal-based interactive
 interface when `gemma` is invoked:

-```console
+```sh
 $ ./gemma [...]
   __ _  ___ _ __ ___  _ __ ___   __ _   ___ _ __  _ __
  / _` |/ _ \ '_ ` _ \| '_ ` _ \ / _` | / __| '_ \| '_ \
 | (_| |  __/ | | | | | | | | | | (_| | | (__| |_) | |_) |
  \__, |\___|_| |_| |_|_| |_| |_|\__,_| (_) \___| .__/| .__/
   __/ |                                        | |   | |
  |___/                                         |_|   |_|

-tokenizer            : tokenizer.spm
-compressed_weights   : 2b-it-sfp.sbs
-model                : 2b-it
-weights              : [no path specified]
-max_generated_tokens : 2048
+...

 *Usage*
   Enter an instruction and press enter (%C reset conversation, %Q quits).
@@ -462,7 +427,7 @@ For using the `gemma` executable as a command line tool, it may be useful to create an alias for gemma.cpp with arguments fully specified: ```sh -alias gemma2b="~/gemma.cpp/build/gemma -- --tokenizer ~/gemma.cpp/build/tokenizer.spm --weights ~/gemma.cpp/build/gemma2-2b-it-sfp.sbs --model gemma2-2b-it --verbosity 0" +alias gemma2b="~/gemma.cpp/build/gemma -- --tokenizer ~/gemma.cpp/build/tokenizer.spm --weights ~/gemma.cpp/build/gemma2-2b-it-sfp.sbs --verbosity 0" ``` Replace the above paths with your own paths to the model and tokenizer paths @@ -481,7 +446,7 @@ cat configs.h | tail -n 35 | tr '\n' ' ' | xargs -0 echo "What does this C++ cod The output of the above command should look like: -```console +```sh [ Reading prompt ] [...] This C++ code snippet defines a set of **constants** used in a large language model (LLM) implementation, likely related to the **attention mechanism**. @@ -492,8 +457,8 @@ Let's break down the code: ### Incorporating gemma.cpp as a Library in your Project The easiest way to incorporate gemma.cpp in your own project is to pull in -gemma.cpp and dependencies using `FetchContent`. You can add the following to your -CMakeLists.txt: +gemma.cpp and dependencies using `FetchContent`. You can add the following to +your CMakeLists.txt: ``` include(FetchContent) @@ -562,9 +527,10 @@ submit a PR with a `README.md` edit. ## Acknowledgements and Contacts -gemma.cpp was started in fall 2023 by [Austin Huang](mailto:austinvhuang@google.com) -and [Jan Wassenberg](mailto:janwas@google.com), and subsequently released February 2024 -thanks to contributions from Phil Culliton, Paul Chang, and Dan Zheng. +gemma.cpp was started in fall 2023 by +[Austin Huang](mailto:austinvhuang@google.com) and +[Jan Wassenberg](mailto:janwas@google.com), and subsequently released February +2024 thanks to contributions from Phil Culliton, Paul Chang, and Dan Zheng. Griffin support was implemented in April 2024 thanks to contributions by Andrey Mikhaylov, Eugene Kliuchnikov, Jan Wassenberg, Jyrki Alakuijala, Lode diff --git a/backprop/activations.h b/backprop/activations.h deleted file mode 100644 index c616759..0000000 --- a/backprop/activations.h +++ /dev/null @@ -1,81 +0,0 @@ -// Copyright 2024 Google LLC -// SPDX-License-Identifier: Apache-2.0 -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// https://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#ifndef THIRD_PARTY_GEMMA_CPP_BACKPROP_ACTIVATIONS_H_ -#define THIRD_PARTY_GEMMA_CPP_BACKPROP_ACTIVATIONS_H_ - -#include - -#include - -#include "compression/compress.h" // MatStorageT -#include "gemma/configs.h" // ModelConfig - -namespace gcpp { - -template -struct ForwardLayer { - ForwardLayer(const LayerConfig& config, size_t seq_len) - : input("input", seq_len, config.model_dim), - pre_att_rms_out("pre_att_rms_out", seq_len, config.model_dim), - qkv("qkv", seq_len * (config.heads + 2), config.qkv_dim), - att("att", seq_len * config.heads, seq_len), - att_out("att_out", seq_len * config.heads, config.qkv_dim), - att_post1("att_post1", seq_len, config.model_dim), - attention_out("attention_out", seq_len, config.model_dim), - bf_pre_ffw_rms_out("bf_pre_ffw_rms_out", seq_len, config.model_dim), - ffw_hidden("ffw_hidden", seq_len, config.ff_hidden_dim * 2), - ffw_hidden_gated("ffw_hidden_gated", seq_len, config.ff_hidden_dim), - layer_config(config) {} - - MatStorageT input; - MatStorageT pre_att_rms_out; - MatStorageT qkv; - MatStorageT att; - MatStorageT att_out; - MatStorageT att_post1; - MatStorageT attention_out; - MatStorageT bf_pre_ffw_rms_out; - MatStorageT ffw_hidden; - MatStorageT ffw_hidden_gated; - const LayerConfig& layer_config; -}; - -template -struct ForwardPass { - ForwardPass(const ModelConfig& config) - : final_layer_output("final_layer_output", config.seq_len, - config.model_dim), - final_norm_output("final_norm_output", config.seq_len, - config.model_dim), - logits("logits", config.seq_len, config.vocab_size), - probs("probs", config.seq_len, config.vocab_size), - weights_config(config) { - for (const auto& layer_config : config.layer_configs) { - layers.emplace_back(layer_config, config.seq_len); - } - } - - std::vector> layers; - MatStorageT final_layer_output; - MatStorageT final_norm_output; - MatStorageT logits; - MatStorageT probs; - const ModelConfig& weights_config; -}; - -} // namespace gcpp - -#endif // THIRD_PARTY_GEMMA_CPP_BACKPROP_ACTIVATIONS_H_ diff --git a/backprop/backward-inl.h b/backprop/backward-inl.h deleted file mode 100644 index 2a0f330..0000000 --- a/backprop/backward-inl.h +++ /dev/null @@ -1,404 +0,0 @@ -// Copyright 2024 Google LLC -// SPDX-License-Identifier: Apache-2.0 -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// https://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// Implementation of the Vector-Jacobian Products (VJP) of the individual -// operations of the forward pass. - -// Include guard for non-SIMD code. -#ifndef THIRD_PARTY_GEMMA_CPP_GEMMA_BACKWARD_INL_H_ -#define THIRD_PARTY_GEMMA_CPP_GEMMA_BACKWARD_INL_H_ - -#include - -#include -#include - -#include "backprop/activations.h" -#include "backprop/prompt.h" -#include "gemma/common.h" -#include "gemma/weights.h" -#include "util/allocator.h" -#include "hwy/base.h" -#include "hwy/contrib/thread_pool/thread_pool.h" - -#endif // THIRD_PARTY_GEMMA_CPP_GEMMA_BACKWARD_INL_H_ - -// Include guard for (potentially) SIMD code. 
-#if defined(THIRD_PARTY_GEMMA_CPP_BACKWARD_TOGGLE) == defined(HWY_TARGET_TOGGLE) -#ifdef THIRD_PARTY_GEMMA_CPP_BACKWARD_TOGGLE -#undef THIRD_PARTY_GEMMA_CPP_BACKWARD_TOGGLE -#else -#define THIRD_PARTY_GEMMA_CPP_BACKWARD_TOGGLE -#endif - -#include "hwy/highway.h" -// After highway.h -#include "ops/matmul-inl.h" -#include "ops/ops-inl.h" -#include "hwy/contrib/dot/dot-inl.h" - -HWY_BEFORE_NAMESPACE(); -namespace gcpp { -namespace HWY_NAMESPACE { -namespace hn = hwy::HWY_NAMESPACE; - -HWY_INLINE void MatMulVJP(const float* HWY_RESTRICT weights, // kRows * kCols, - const float* HWY_RESTRICT x, // num_tokens * kCols - const float* HWY_RESTRICT v, // num_tokens * kRows - size_t cols, size_t rows, size_t num_tokens, - float* HWY_RESTRICT grad_w, // kRows * kCols, - float* HWY_RESTRICT grad_x, // num_tokens * kCols - hwy::ThreadPool& pool) { - hwy::ZeroBytes(grad_x, num_tokens * cols * sizeof(grad_x[0])); - for (size_t pos = 0; pos < num_tokens; ++pos) { - const size_t voffs = pos * rows; - const size_t xoffs = pos * cols; - for (size_t j = 0; j < rows; ++j) { - MulByConstAndAdd(v[voffs + j], &x[xoffs], &grad_w[j * cols], cols); - MulByConstAndAdd(v[voffs + j], &weights[j * cols], &grad_x[xoffs], cols); - } - } -} - -HWY_INLINE void MultiHeadMatMulVJP( - const float* HWY_RESTRICT weights, // heads * kRows * kCols - const float* HWY_RESTRICT x, // num_tokens * heads * kCols - const float* HWY_RESTRICT v, // num_tokens * kRows - size_t heads, size_t cols, size_t rows, size_t num_tokens, - float* HWY_RESTRICT grad_w, // heads * kRows * kCols - float* HWY_RESTRICT grad_x, // num_tokens * heads * kCols - hwy::ThreadPool& pool) { - hwy::ZeroBytes(grad_x, num_tokens * heads * cols * sizeof(grad_x[0])); - for (size_t pos = 0; pos < num_tokens; ++pos) { - for (size_t j = 0; j < rows; ++j) { - for (size_t h = 0; h < heads; ++h) { - MulByConstAndAdd(v[pos * rows + j], &x[pos * heads * cols + h * cols], - &grad_w[h * rows * cols + j * cols], cols); - MulByConstAndAdd(v[pos * rows + j], - &weights[h * rows * cols + j * cols], - &grad_x[pos * heads * cols + h * cols], cols); - } - } - } -} - -template -static HWY_INLINE hn::Vec DGelu(D d, hn::Vec v) { - const hn::Vec kMul = hn::Set(d, 0.044715f); - const hn::Vec kSqrt2OverPi = hn::Set(d, 0.797884560804236f); - const hn::Vec kHalf = hn::Set(d, 0.5f); - const hn::Vec kOne = hn::Set(d, 1.0f); - // kSqrtOverPi*3*kMul - const hn::Vec kMulv2 = hn::Set(d, 0.1070322244f); - - const hn::Vec v2 = hn::Mul(v, v); - const hn::Vec v3 = hn::Mul(v2, v); - const hn::Vec arg = hn::Mul(kSqrt2OverPi, hn::MulAdd(kMul, v3, v)); - const hn::Vec tanh = hn::Tanh(d, arg); - const hn::Vec cdf = hn::MulAdd(kHalf, tanh, kHalf); - const hn::Vec dtanh = hn::Sub(kOne, hn::Mul(tanh, tanh)); - const hn::Vec darg = hn::MulAdd(kMulv2, v2, kSqrt2OverPi); - return hn::MulAdd(kHalf, hn::Mul(v, hn::Mul(dtanh, darg)), cdf); -} - -static HWY_NOINLINE void SoftmaxVJP(const float* HWY_RESTRICT forward, - float* HWY_RESTRICT backward, - const size_t size) { - namespace hn = hwy::HWY_NAMESPACE; - using D = hn::ScalableTag; - const D d; - - const auto offset = - hn::Set(d, hn::Dot::Compute<0>(d, forward, backward, size)); - hn::Transform1( - d, backward, size, forward, - [&offset](const auto d, const auto v, const auto y) - HWY_ATTR { return hn::Mul(y, hn::Sub(v, offset)); }); -} - -static HWY_NOINLINE void RMSNormVJP( - const float* HWY_RESTRICT weights, const float* HWY_RESTRICT x, - const float* HWY_RESTRICT v, size_t model_dim, size_t num_tokens, - float* HWY_RESTRICT grad_w, float* HWY_RESTRICT 
grad_x, - hwy::ThreadPool& pool) { - for (size_t pos = 0; pos < num_tokens; ++pos) { - const size_t offset = pos * model_dim; - const float ss = detail::RMSNormMul(x + offset, model_dim); - - for (size_t i = 0; i < model_dim; ++i) { - grad_w[i] += v[offset + i] * x[offset + i] * ss; - } - const float ss3 = ss * ss * ss / StaticCast(model_dim); - float tmp = 0.0f; - for (size_t i = 0; i < model_dim; ++i) { - tmp += (1.0f + weights[i]) * v[offset + i] * x[offset + i]; - } - tmp *= ss3; - for (size_t i = 0; i < model_dim; ++i) { - grad_x[offset + i] = ss * (1.0f + weights[i]) * v[offset + i] - - tmp * x[offset + i]; - } - } -} - -static HWY_NOINLINE void InputEmbeddingVJP( - const float* weights, const std::vector& prompt, - const float scaling, const float* HWY_RESTRICT v, - float* HWY_RESTRICT grad, size_t model_dim) { - HWY_ASSERT(!prompt.empty()); - for (size_t pos = 0; pos < prompt.size() - 1; ++pos) { - int token = prompt[pos]; - MulByConstAndAdd(scaling, v + pos * model_dim, - grad + token * model_dim, model_dim); - } -} - -template -void LayerVJP(const LayerWeightsPtrs& weights, - const ForwardLayer& forward, - const float* HWY_RESTRICT next_layer_grad, size_t num_tokens, - LayerWeightsPtrs& grad, ForwardLayer& backward, - const RowVectorBatch& inv_timescale, - hwy::ThreadPool& pool) { - const LayerConfig& config = weights.layer_config; - const size_t model_dim = config.model_dim; - const size_t qkv_dim = config.qkv_dim; - const size_t heads = config.heads; - const size_t seq_len = forward.input.Rows(); - const size_t ff_hidden_dim = config.ff_hidden_dim; - const float query_scale = - static_cast(1.0 / sqrt(static_cast(qkv_dim))); - HWY_ASSERT(num_tokens <= seq_len); - - MatMulVJP(weights.linear_w.data(), forward.ffw_hidden_gated.data(), - next_layer_grad, ff_hidden_dim, model_dim, num_tokens, - grad.linear_w.data(), backward.ffw_hidden_gated.data(), pool); - - for (size_t pos = 0; pos < num_tokens; ++pos) { - const size_t hidden_offset = pos * ff_hidden_dim * 2; - const float* HWY_RESTRICT f_out = forward.ffw_hidden.data() + hidden_offset; - const float* HWY_RESTRICT f_out_mul = f_out + ff_hidden_dim; - const float* HWY_RESTRICT b_out_gated = - backward.ffw_hidden_gated.data() + pos * ff_hidden_dim; - float* HWY_RESTRICT b_out = backward.ffw_hidden.data() + hidden_offset; - float* HWY_RESTRICT b_out_mul = b_out + ff_hidden_dim; - namespace hn = hwy::HWY_NAMESPACE; - using DF = hn::ScalableTag; - DF df; - for (size_t i = 0; i < ff_hidden_dim; i += Lanes(df)) { - const auto y = Load(df, f_out + i); - const auto x = Load(df, f_out_mul + i); - const auto v = Load(df, b_out_gated + i); - hn::Store(hn::Mul(v, Gelu(df, y)), df, b_out_mul + i); - hn::Store(hn::Mul(v, hn::Mul(x, DGelu(df, y))), df, b_out + i); - } - } - - MatMulVJP(weights.gating_einsum_w.data(), forward.bf_pre_ffw_rms_out.data(), - backward.ffw_hidden.data(), model_dim, ff_hidden_dim * 2, - num_tokens, grad.gating_einsum_w.data(), - backward.bf_pre_ffw_rms_out.data(), pool); - RMSNormVJP(weights.pre_ffw_norm_scale.data(), forward.attention_out.data(), - backward.bf_pre_ffw_rms_out.data(), model_dim, num_tokens, - grad.pre_ffw_norm_scale.data(), backward.attention_out.data(), - pool); - - for (size_t pos = 0; pos < num_tokens; ++pos) { - AddFrom(next_layer_grad + pos * model_dim, - backward.attention_out.data() + pos * model_dim, model_dim); - } - - backward.qkv.ZeroInit(); - - MultiHeadMatMulVJP(weights.attn_vec_einsum_w.data(), forward.att_out.data(), - backward.attention_out.data(), heads, qkv_dim, model_dim, - num_tokens, 
-
-  for (size_t head = 0; head < heads; ++head) {
-    for (size_t pos = 0; pos < num_tokens; ++pos) {
-      const size_t aoffset = head * seq_len + pos * heads * seq_len;
-      const float* HWY_RESTRICT f_head_att = forward.att.data() + aoffset;
-      const float* HWY_RESTRICT b_att_out =
-          backward.att_out.data() + (pos * heads + head) * qkv_dim;
-      float* HWY_RESTRICT b_head_att = backward.att.data() + aoffset;
-      for (size_t pos2 = 0; pos2 <= pos; ++pos2) {
-        const size_t v2offs = (pos2 * (heads + 2) + heads + 1) * qkv_dim;
-        const float* HWY_RESTRICT f_v2 = forward.qkv.data() + v2offs;
-        float* HWY_RESTRICT b_v2 = backward.qkv.data() + v2offs;
-        b_head_att[pos2] = Dot(b_att_out, f_v2, qkv_dim);
-        MulByConstAndAdd(f_head_att[pos2], b_att_out, b_v2, qkv_dim);
-      }
-    }
-  }
-
-  for (size_t head = 0; head < heads; ++head) {
-    for (size_t pos = 0; pos < num_tokens; ++pos) {
-      const size_t aoffset = head * seq_len + pos * heads * seq_len;
-      const float* HWY_RESTRICT f_head_att = forward.att.data() + aoffset;
-      float* HWY_RESTRICT b_head_att = backward.att.data() + aoffset;
-      SoftmaxVJP(f_head_att, b_head_att, pos + 1);
-    }
-  }
-
-  for (size_t head = 0; head < heads; ++head) {
-    for (size_t pos = 0; pos < num_tokens; ++pos) {
-      const size_t qoffs = (pos * (heads + 2) + head) * qkv_dim;
-      const size_t aoffs = head * seq_len + pos * heads * seq_len;
-      const float* HWY_RESTRICT f_q = forward.qkv.data() + qoffs;
-      const float* HWY_RESTRICT b_head_att = backward.att.data() + aoffs;
-      float* HWY_RESTRICT b_q = backward.qkv.data() + qoffs;
-      for (size_t pos2 = 0; pos2 <= pos; ++pos2) {
-        const size_t k2offs = (pos2 * (heads + 2) + heads) * qkv_dim;
-        const float* HWY_RESTRICT f_k2 = forward.qkv.data() + k2offs;
-        float* HWY_RESTRICT b_k2 = backward.qkv.data() + k2offs;
-        MulByConstAndAdd(b_head_att[pos2], f_k2, b_q, qkv_dim);
-        MulByConstAndAdd(b_head_att[pos2], f_q, b_k2, qkv_dim);
-      }
-    }
-  }
-
-  for (int pos = 0; pos < static_cast<int>(num_tokens); ++pos) {
-    float* HWY_RESTRICT b_kv =
-        backward.qkv.data() + (pos * (heads + 2) + heads) * qkv_dim;
-    Rope(b_kv, qkv_dim, inv_timescale.Const(), -pos);
-  }
-
-  for (size_t head = 0; head < heads; ++head) {
-    for (size_t pos = 0; pos < num_tokens; ++pos) {
-      float* HWY_RESTRICT b_q =
-          backward.qkv.data() + (pos * (heads + 2) + head) * qkv_dim;
-      MulByConst(query_scale, b_q, qkv_dim);
-      Rope(b_q, qkv_dim, inv_timescale.Const(), -pos);
-    }
-  }
-
-  MatMulVJP(weights.qkv_einsum_w.data(), forward.pre_att_rms_out.data(),
-            backward.qkv.data(), model_dim, (heads + 2) * qkv_dim, num_tokens,
-            grad.qkv_einsum_w.data(), backward.pre_att_rms_out.data(), pool);
-  RMSNormVJP(weights.pre_attention_norm_scale.data(), forward.input.data(),
-             backward.pre_att_rms_out.data(), model_dim, num_tokens,
-             grad.pre_attention_norm_scale.data(), backward.input.data(),
-             pool);
-  for (size_t pos = 0; pos < num_tokens; ++pos) {
-    AddFrom(backward.attention_out.data() + pos * model_dim,
-            backward.input.data() + pos * model_dim, model_dim);
-  }
-}
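LayerVJP undoes RoPE by calling Rope with `-pos`: the forward pass rotates each (x_d, x_{d+N/2}) pair by an angle proportional to the position, and because a rotation is orthogonal, its VJP is the transpose, i.e. the rotation by the negated angle. A self-contained sketch with a local Rope mirroring the scalar reference (base 10000 assumed, as in common_scalar.h below):

```cpp
#include <cmath>
#include <cstddef>
#include <cstdio>
#include <vector>

// Local stand-in for the scalar Rope: rotate pair (x[d], x[d+n/2]) by
// theta = pos / 10000^(2d/n).
void Rope(double* x, size_t n, int pos) {
  const size_t n2 = n / 2;
  for (size_t d = 0; d < n2; ++d) {
    const double timescale = std::pow(10000.0, 2.0 * d / n);
    const double theta = pos / timescale;
    const double c = std::cos(theta), s = std::sin(theta);
    const double x0 = x[d], x1 = x[d + n2];
    x[d] = x0 * c - x1 * s;
    x[d + n2] = x0 * s + x1 * c;
  }
}

int main() {
  std::vector<double> x{1, 2, 3, 4, 5, 6, 7, 8};
  Rope(x.data(), x.size(), 13);   // forward rotation at position 13
  Rope(x.data(), x.size(), -13);  // inverse rotation, as in the VJP above
  for (double v : x) std::printf("%.12f ", v);  // recovers 1 2 ... 8
  std::printf("\n");
}
```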
-
-static HWY_NOINLINE void SoftcapVJP(const float cap,
-                                    const float* HWY_RESTRICT forward,
-                                    float* HWY_RESTRICT backward,
-                                    const size_t size) {
-  namespace hn = hwy::HWY_NAMESPACE;
-  using D = hn::ScalableTag<float>;
-  const D d;
-
-  const auto one = hn::Set(d, 1.0f);
-  const auto vcap = hn::Set(d, cap);
-  const auto vinv_cap = hn::Div(hn::Set(d, 1.0f), vcap);
-  hn::Transform1(d, backward, size, forward,
-                 [&](const auto d, const auto v, const auto y) HWY_ATTR {
-                   const auto scaled = hn::Mul(vinv_cap, y);  // = tanh
-                   return hn::Mul(v, hn::Sub(one, hn::Mul(scaled, scaled)));
-                 });
-}
-
-static HWY_NOINLINE void CrossEntropyLossGrad(
-    const float* HWY_RESTRICT x, float* HWY_RESTRICT grad,
-    const Prompt& prompt, size_t vocab_size) {
-  HWY_ASSERT(!prompt.tokens.empty());
-  const float scaling = -1.0 / std::log(2.0);
-  size_t num_tokens = prompt.tokens.size() - 1;
-  hwy::ZeroBytes(grad, num_tokens * vocab_size * sizeof(grad[0]));
-  for (size_t pos = 0; pos < num_tokens; ++pos) {
-    if (pos + 1 < prompt.context_size) {
-      continue;
-    }
-    const int next_token = prompt.tokens[pos + 1];
-    grad[pos * vocab_size + next_token] =
-        scaling / x[pos * vocab_size + next_token];
-  }
-}
-
-template <typename T>
-void CrossEntropyLossBackwardPassInl(const Prompt& prompt,
-                                     const ModelWeightsPtrs<T>& weights,
-                                     const ForwardPass<T>& forward,
-                                     ModelWeightsPtrs<T>& grad,
-                                     ForwardPass<T>& backward,
-                                     RowVectorBatch<float>& inv_timescale,
-                                     hwy::ThreadPool& pool) {
-  const ModelConfig& config = weights.weights_config;
-  const size_t kVocabSize = config.vocab_size;
-  const size_t model_dim = config.model_dim;
-  const size_t kLayers = config.layer_configs.size();
-  const float kEmbScaling = EmbeddingScaling(model_dim);
-  HWY_ASSERT(!config.absolute_pe);
-  HWY_ASSERT(config.layer_configs[0].post_norm == PostNormType::None);
-  HWY_ASSERT(config.layer_configs[0].kv_heads == 1);
-
-  HWY_DASSERT(prompt.context_size > 0);
-  HWY_DASSERT(prompt.context_size < prompt.tokens.size());
-  const size_t num_tokens = prompt.tokens.size() - 1;
-
-  CrossEntropyLossGrad(forward.probs.data(), backward.logits.data(), prompt,
-                       kVocabSize);
-
-  for (size_t pos = 0; pos < num_tokens; ++pos) {
-    SoftmaxVJP(forward.probs.data() + pos * kVocabSize,
-               backward.logits.data() + pos * kVocabSize,
-               kVocabSize);
-  }
-
-  if (config.final_cap > 0.0f) {
-    for (size_t pos = 0; pos < num_tokens; ++pos) {
-      SoftcapVJP(config.final_cap, forward.logits.data() + pos * kVocabSize,
-                 backward.logits.data() + pos * kVocabSize, kVocabSize);
-    }
-  }
-
-  MatMulVJP(weights.embedder_input_embedding.data(),
-            forward.final_norm_output.data(), backward.logits.data(),
-            model_dim, kVocabSize, num_tokens,
-            grad.embedder_input_embedding.data(),
-            backward.final_norm_output.data(), pool);
-
-  RMSNormVJP(weights.final_norm_scale.data(),
-             forward.final_layer_output.data(),
-             backward.final_norm_output.data(), model_dim, num_tokens,
-             grad.final_norm_scale.data(), backward.final_layer_output.data(),
-             pool);
-
-  for (int layer = static_cast<int>(kLayers) - 1; layer >= 0; --layer) {
-    auto layer_config = config.layer_configs[layer];
-    // TODO(szabadka) Implement Griffin layer vjp.
-    HWY_ASSERT(layer_config.type == LayerAttentionType::kGemma);
-    float* next_layer_grad = layer + 1 < kLayers
-                                 ?
backward.layers[layer + 1].input.data() - : backward.final_layer_output.data(); - LayerVJP(*weights.GetLayer(layer), forward.layers[layer], next_layer_grad, - num_tokens, *grad.GetLayer(layer), backward.layers[layer], - inv_timescale, pool); - } - - InputEmbeddingVJP(weights.embedder_input_embedding.data(), prompt.tokens, - kEmbScaling, backward.layers[0].input.data(), - grad.embedder_input_embedding.data(), model_dim); -} - -// NOLINTNEXTLINE(google-readability-namespace-comments) -} // namespace HWY_NAMESPACE -} // namespace gcpp -HWY_AFTER_NAMESPACE(); - -#endif // NOLINT diff --git a/backprop/backward.cc b/backprop/backward.cc deleted file mode 100644 index 868b391..0000000 --- a/backprop/backward.cc +++ /dev/null @@ -1,73 +0,0 @@ -// Copyright 2024 Google LLC -// SPDX-License-Identifier: Apache-2.0 -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// https://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "backprop/backward.h" - -#include "backprop/activations.h" -#include "backprop/prompt.h" -#include "gemma/common.h" -#include "gemma/weights.h" -#include "util/allocator.h" -#include "hwy/contrib/thread_pool/thread_pool.h" - -// Compiles this file for multiple architectures via "foreach_target.h", to -// which we pass the filename via macro 'argument'. -// clang-format off -#undef HWY_TARGET_INCLUDE -#define HWY_TARGET_INCLUDE "backprop/backward.cc" // NOLINT -// clang-format on -#include "hwy/foreach_target.h" // IWYU pragma: keep - -#include "hwy/highway.h" -// After highway.h -#include "backprop/backward-inl.h" - -HWY_BEFORE_NAMESPACE(); -namespace gcpp { -namespace HWY_NAMESPACE { - -void CrossEntropyLossBackwardPassT(const Prompt& prompt, - const ModelWeightsPtrs& weights, - const ForwardPass& forward, - ModelWeightsPtrs& grad, - ForwardPass& backward, - RowVectorBatch& inv_timescale, - hwy::ThreadPool& pool) { - CrossEntropyLossBackwardPassInl(prompt, weights, forward, grad, backward, - inv_timescale, pool); -} - -} // namespace HWY_NAMESPACE -} // namespace gcpp -HWY_AFTER_NAMESPACE(); - -#if HWY_ONCE -namespace gcpp { - -HWY_EXPORT(CrossEntropyLossBackwardPassT); - -void CrossEntropyLossBackwardPass(const Prompt& prompt, - const ModelWeightsPtrs& weights, - const ForwardPass& forward, - ModelWeightsPtrs& grad, - ForwardPass& backward, - RowVectorBatch& inv_timescale, - hwy::ThreadPool& pool) { - return HWY_DYNAMIC_DISPATCH(CrossEntropyLossBackwardPassT)( - prompt, weights, forward, grad, backward, inv_timescale, pool); -} - -} // namespace gcpp -#endif // HWY_ONCE diff --git a/backprop/backward.h b/backprop/backward.h deleted file mode 100644 index d8e50c7..0000000 --- a/backprop/backward.h +++ /dev/null @@ -1,37 +0,0 @@ -// Copyright 2024 Google LLC -// SPDX-License-Identifier: Apache-2.0 -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
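The deleted backward.cc above is a textbook instance of Highway's foreach_target pattern: the translation unit is recompiled once per SIMD target, each pass lands in its own `HWY_NAMESPACE`, and `HWY_DYNAMIC_DISPATCH` picks the best variant for the running CPU. A minimal sketch of the same pattern, with placeholder names (`demo`, `Add`, `add.cc` are illustrative, not gemma.cpp code):

```cpp
// add.cc -- recompiled once per SIMD target via foreach_target.h.
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "add.cc"  // path of this file, as seen by includes
#include "hwy/foreach_target.h"      // must come before highway.h
#include "hwy/highway.h"

HWY_BEFORE_NAMESPACE();
namespace demo {
namespace HWY_NAMESPACE {  // a distinct namespace per SIMD target

float AddImpl(float a, float b) { return a + b; }  // target-specific code

}  // namespace HWY_NAMESPACE
}  // namespace demo
HWY_AFTER_NAMESPACE();

#if HWY_ONCE  // compiled exactly once, after all per-target passes
namespace demo {

HWY_EXPORT(AddImpl);  // builds the table of per-target implementations

float Add(float a, float b) {
  // Selects the best available implementation at runtime.
  return HWY_DYNAMIC_DISPATCH(AddImpl)(a, b);
}

}  // namespace demo
#endif  // HWY_ONCE
```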
-// You may obtain a copy of the License at -// -// https://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef THIRD_PARTY_GEMMA_CPP_GEMMA_BACKWARD_H_ -#define THIRD_PARTY_GEMMA_CPP_GEMMA_BACKWARD_H_ - -#include "backprop/activations.h" -#include "backprop/prompt.h" -#include "gemma/weights.h" -#include "util/allocator.h" -#include "hwy/contrib/thread_pool/thread_pool.h" - -namespace gcpp { - -void CrossEntropyLossBackwardPass(const Prompt& prompt, - const ModelWeightsPtrs& weights, - const ForwardPass& forward, - ModelWeightsPtrs& grad, - ForwardPass& backward, - RowVectorBatch& inv_timescale, - hwy::ThreadPool& pool); - -} // namespace gcpp - -#endif // THIRD_PARTY_GEMMA_CPP_GEMMA_BACKWARD_H_ diff --git a/backprop/backward_scalar.h b/backprop/backward_scalar.h deleted file mode 100644 index b0a37b3..0000000 --- a/backprop/backward_scalar.h +++ /dev/null @@ -1,349 +0,0 @@ -// Copyright 2024 Google LLC -// SPDX-License-Identifier: Apache-2.0 -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// https://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
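The backward_scalar.h file removed next holds the plain-loop reference VJPs that the SIMD kernels are tested against. Its `MatMulVJPT(w, x, dy, dw, dx, N, M, K)` shape convention is compact but easy to misread; written out in matrix form (row-major, matching the loops in the code below):

```latex
% W \in \mathbb{R}^{N \times M},\; X \in \mathbb{R}^{K \times M},\;
% Y = X W^{\mathsf T} \in \mathbb{R}^{K \times N}.
% Given upstream dY, the two MulByConstAndAddT calls accumulate
dW = dY^{\mathsf T} X \in \mathbb{R}^{N \times M},
\qquad
dX = dY\, W \in \mathbb{R}^{K \times M}.
```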
- -#ifndef THIRD_PARTY_GEMMA_CPP_GEMMA_BACKWARD_SCALAR_H_ -#define THIRD_PARTY_GEMMA_CPP_GEMMA_BACKWARD_SCALAR_H_ - -#include -#include - -#include -#include - -#include "backprop/activations.h" -#include "backprop/common_scalar.h" -#include "backprop/prompt.h" -#include "gemma/common.h" // EmbeddingScaling -#include "gemma/weights.h" - -namespace gcpp { -template -void MatMulVJPT(const T* w, const T* x, const T* dy, T* dw, T* dx, - size_t N, size_t M, size_t K) { - memset(dx, 0, M * K * sizeof(dx[0])); - for (size_t i = 0; i < K; ++i) { - for (size_t j = 0; j < N; ++j) { - MulByConstAndAddT(dy[i * N + j], &x[i * M], &dw[j * M], M); - MulByConstAndAddT(dy[i * N + j], &w[j * M], &dx[i * M], M); - } - } -} -template -void MultiHeadMatMulVJPT(const T* w, const T* x, const T* dy, T* dw, T* dx, - size_t H, size_t N, size_t M, size_t K) { - memset(dx, 0, H * M * K * sizeof(dx[0])); - for (size_t i = 0; i < K; ++i) { - for (size_t j = 0; j < N; ++j) { - for (size_t h = 0; h < H; ++h) { - MulByConstAndAddT(dy[i * N + j], &x[i * H * M + h * M], - &dw[h * N * M + j * M], M); - MulByConstAndAddT(dy[i * N + j], &w[h * N * M + j * M], - &dx[i * H * M + h * M], M); - } - } - } -} - -template -void RMSNormVJPT(const T* w, const T* x, const T* dy, T* dw, T* dx, - size_t N, size_t K) { - for (size_t i = 0; i < K; ++i) { - constexpr T eps(1e-6); - T ss = SquaredL2(x + i * N, N); - ss = T(1.0) / std::sqrt(ss / T(N) + eps); - for (size_t j = 0; j < N; ++j) { - dw[j] += dy[i * N + j] * x[i * N + j] * ss; - } - const T ss3 = ss * ss * ss / T(N); - T tmp = 0.0; - for (size_t j = 0; j < N; ++j) { - tmp += (T(1.0) + w[j]) * dy[i* N + j] * x[i * N + j]; - } - tmp *= ss3; - for (size_t j = 0; j < N; ++j) { - dx[i * N + j] = ss * (T(1.0) + w[j]) * dy[i* N + j] - tmp * x[i * N + j]; - } - } -} -template -void SoftmaxVJPT(const T* y, T* dy, size_t N) { - T sum = {}; - for (size_t i = 0; i < N; ++i) { - sum += y[i] * dy[i]; - } - for (size_t i = 0; i < N; ++i) { - dy[i] = y[i] * (dy[i] - sum); - } -} -template -void SoftmaxVJPT(const T* y, T* dy, size_t N, size_t K) { - for (size_t i = 0; i < K; ++i) { - SoftmaxVJPT(y + i * N, dy + i * N, N); - } -} - -template -T GeluDerivative(T x) { - static const T kMul = 0.044715; - static const T kSqrt2OverPi = 0.797884560804236; - static const T kMul2 = kSqrt2OverPi * T(3.0) * kMul; - - const T x2 = x * x; - const T x3 = x2 * x; - const T arg = kSqrt2OverPi * (kMul * x3 + x); - const T tanh = std::tanh(arg); - const T cdf = T(0.5) * (T(1.0) + tanh); - const T dtanh = T(1.0) - tanh * tanh; - const T darg = kMul2 * x2 + kSqrt2OverPi; - return T(0.5) * x * dtanh * darg + cdf; -} - -template -void GatedGeluVJP(const T* in, const T* d_out, T* d_in, size_t N, size_t K) { - for (size_t i = 0; i < K; ++i) { - const T* x1 = in + i * 2 * N; - const T* x2 = x1 + N; - const T* v = d_out + i * N; - T* dx1 = d_in + i * 2 * N; - T* dx2 = dx1 + N; - for (size_t j = 0; j < N; ++j) { - dx1[j] = v[j] * x2[j] * GeluDerivative(x1[j]); - dx2[j] = v[j] * Gelu(x1[j]); - } - } -} - -template -void MaskedAttentionVJP(const T* qkv, const T* doutput, T* dqkv, - size_t num_tokens, size_t kHeads, size_t qkv_dim, - size_t seq_len) { - for (size_t pos = 0; pos < num_tokens; ++pos) { - const size_t offset = pos * (kHeads + 2) * qkv_dim; - memset(dqkv + offset, 0, (kHeads + 1) * qkv_dim * sizeof(qkv[0])); - } - for (size_t head = 0; head < kHeads; ++head) { - for (size_t pos = 0; pos < num_tokens; ++pos) { - const size_t qoffs = (pos * (kHeads + 2) + head) * qkv_dim; - const size_t aoffs = head * seq_len + pos * 
kHeads * seq_len; - const T* q = qkv + qoffs; - const T* dout = doutput + aoffs; - T* dq = dqkv + qoffs; - for (size_t pos2 = 0; pos2 <= pos; ++pos2) { - const size_t koffs = (pos2 * (kHeads + 2) + kHeads) * qkv_dim; - const T* k = qkv + koffs; - T* dk = dqkv + koffs; - MulByConstAndAddT(dout[pos2], k, dq, qkv_dim); - MulByConstAndAddT(dout[pos2], q, dk, qkv_dim); - } - } - } -} - -template -void MaskedSoftmaxVJPT(const T* y, T* dy, size_t num_tokens, size_t kHeads, - size_t seq_len) { - for (size_t head = 0; head < kHeads; ++head) { - for (size_t pos = 0; pos < num_tokens; ++pos) { - size_t offset = pos * kHeads * seq_len + head * seq_len; - SoftmaxVJPT(y + offset, dy + offset, pos + 1); - memset(dy + offset + pos + 1, 0, (seq_len - pos - 1) * sizeof(T)); - } - } -} - -template -void MixByAttentionVJP(const T* qkv, const T* attention, const T* doutput, - T* dqkv, T* dattention, size_t num_tokens, size_t kHeads, - size_t qkv_dim, size_t seq_len) { - auto v_offset = [&](size_t pos) { - return (pos * (kHeads + 2) + kHeads + 1) * qkv_dim; - }; - for (size_t pos = 0; pos < num_tokens; ++pos) { - memset(&dqkv[v_offset(pos)], 0, qkv_dim * sizeof(qkv[0])); - } - for (size_t head = 0; head < kHeads; ++head) { - for (size_t pos = 0; pos < num_tokens; ++pos) { - const size_t offset = head * qkv_dim + pos * kHeads * qkv_dim; - const size_t aoffset = head * seq_len + pos * kHeads * seq_len; - const T* att = &attention[aoffset]; - const T* dout = &doutput[offset]; - T* datt = &dattention[aoffset]; - for (size_t pos2 = 0; pos2 <= pos; ++pos2) { - datt[pos2] = DotT(dout, &qkv[v_offset(pos2)], qkv_dim); - MulByConstAndAddT(att[pos2], dout, &dqkv[v_offset(pos2)], qkv_dim); - } - } - } -} - -template -void InputEmbeddingVJPT(const T* w, const std::vector& tokens, T scaling, - const T* dy, T* dw, size_t N) { - const size_t num_tokens = tokens.empty() ? 
0 : tokens.size() - 1; - for (size_t i = 0; i < num_tokens; ++i) { - int token = tokens[i]; - MulByConstAndAddT(scaling, dy + i * N, dw + token * N, N); - } -} - -template -void LayerVJP(const LayerWeightsPtrs& weights, - const ForwardLayer& forward, const T* dy, - LayerWeightsPtrs& grad, ForwardLayer& backward, - size_t num_tokens) { - const LayerConfig& layer_config = weights.layer_config; - const size_t model_dim = layer_config.model_dim; - const size_t seq_len = forward.input.Rows(); - const size_t qkv_dim = layer_config.qkv_dim; - const size_t kHeads = layer_config.heads; - const size_t kFFHiddenDim = layer_config.ff_hidden_dim; - const T kQueryScale = 1.0 / std::sqrt(T(qkv_dim)); - - MatMulVJPT(weights.linear_w.data(), forward.ffw_hidden_gated.data(), dy, - grad.linear_w.data(), backward.ffw_hidden_gated.data(), model_dim, - kFFHiddenDim, num_tokens); - - GatedGeluVJP(forward.ffw_hidden.data(), backward.ffw_hidden_gated.data(), - backward.ffw_hidden.data(), kFFHiddenDim, num_tokens); - - MatMulVJPT(weights.gating_einsum_w.data(), forward.bf_pre_ffw_rms_out.data(), - backward.ffw_hidden.data(), grad.gating_einsum_w.data(), - backward.bf_pre_ffw_rms_out.data(), kFFHiddenDim * 2, model_dim, - num_tokens); - - RMSNormVJPT(weights.pre_ffw_norm_scale.data(), forward.attention_out.data(), - backward.bf_pre_ffw_rms_out.data(), - grad.pre_ffw_norm_scale.data(), backward.attention_out.data(), - model_dim, num_tokens); - - AddFromT(dy, backward.attention_out.data(), num_tokens * model_dim); - - MultiHeadMatMulVJPT(weights.attn_vec_einsum_w.data(), forward.att_out.data(), - backward.attention_out.data(), - grad.attn_vec_einsum_w.data(), backward.att_out.data(), - kHeads, model_dim, qkv_dim, num_tokens); - - MixByAttentionVJP(forward.qkv.data(), forward.att.data(), - backward.att_out.data(), backward.qkv.data(), - backward.att.data(), num_tokens, kHeads, qkv_dim, seq_len); - - MaskedSoftmaxVJPT(forward.att.data(), backward.att.data(), num_tokens, kHeads, - seq_len); - - MaskedAttentionVJP(forward.qkv.data(), backward.att.data(), - backward.qkv.data(), num_tokens, kHeads, qkv_dim, seq_len); - - for (size_t pos = 0; pos < num_tokens; ++pos) { - T* qkv = backward.qkv.data() + pos * (kHeads + 2) * qkv_dim; - MulByConstT(kQueryScale, qkv, kHeads * qkv_dim); - } - - for (int pos = 0; pos < num_tokens; ++pos) { - T* qkv = backward.qkv.data() + pos * (kHeads + 2) * qkv_dim; - for (size_t h = 0; h <= kHeads; ++h) { - Rope(qkv + h * qkv_dim, qkv_dim, -pos); - } - } - - MatMulVJPT(weights.qkv_einsum_w.data(), forward.pre_att_rms_out.data(), - backward.qkv.data(), grad.qkv_einsum_w.data(), - backward.pre_att_rms_out.data(), (kHeads + 2) * qkv_dim, model_dim, - num_tokens); - RMSNormVJPT(weights.pre_attention_norm_scale.data(), forward.input.data(), - backward.pre_att_rms_out.data(), - grad.pre_attention_norm_scale.data(), backward.input.data(), - model_dim, num_tokens); - - AddFromT(backward.attention_out.data(), backward.input.data(), - num_tokens * model_dim); -} - -template -void SoftcapVJPT(float cap, const T* y, T* dy, size_t N) { - const T inv_cap = T{1.0} / static_cast(cap); - for (size_t i = 0; i < N; ++i) { - T scaled = y[i] * inv_cap; // tanh - dy[i] *= (T{1.0} - scaled * scaled); - } -} - -template -void CrossEntropyLossGrad(const T* x, T* dx, const Prompt& prompt, size_t V) { - T scaling = -1.0 / std::log(2.0); - const std::vector tokens = prompt.tokens; - const size_t num_tokens = tokens.empty() ? 
0 : tokens.size() - 1; - memset(dx, 0, V * num_tokens * sizeof(x[0])); - for (size_t i = 0; i < num_tokens; ++i) { - if (i + 1 < prompt.context_size) { - continue; - } - const int next_token = tokens[i + 1]; - dx[i * V + next_token] = scaling / x[i * V + next_token]; - } -} - -template -void CrossEntropyLossBackwardPass(const Prompt& prompt, - const ModelWeightsPtrs& weights, - const ForwardPass& forward, - ModelWeightsPtrs& grad, - ForwardPass& backward) { - const ModelConfig& config = weights.weights_config; - const size_t model_dim = config.model_dim; - const size_t vocab_size = config.vocab_size; - const size_t layers = config.layer_configs.size(); - const std::vector tokens = prompt.tokens; - const size_t num_tokens = tokens.empty() ? 0 : tokens.size() - 1; - - CrossEntropyLossGrad(forward.probs.data(), backward.logits.data(), prompt, - vocab_size); - - SoftmaxVJPT(forward.probs.data(), backward.logits.data(), vocab_size, - num_tokens); - - if (config.final_cap > 0.0f) { - for (size_t i = 0; i < num_tokens; ++i) { - SoftcapVJPT(config.final_cap, forward.logits.data() + i * vocab_size, - backward.logits.data() + i * vocab_size, vocab_size); - } - } - - MatMulVJPT( - weights.embedder_input_embedding.data(), forward.final_norm_output.data(), - backward.logits.data(), grad.embedder_input_embedding.data(), - backward.final_norm_output.data(), vocab_size, model_dim, num_tokens); - - RMSNormVJPT(weights.final_norm_scale.data(), - forward.final_layer_output.data(), - backward.final_norm_output.data(), grad.final_norm_scale.data(), - backward.final_layer_output.data(), model_dim, num_tokens); - - for (int layer = static_cast(layers) - 1; layer >= 0; --layer) { - T* next_layer_grad = layer + 1 < layers - ? backward.layers[layer + 1].input.data() - : backward.final_layer_output.data(); - LayerVJP(*weights.GetLayer(layer), forward.layers[layer], next_layer_grad, - *grad.GetLayer(layer), backward.layers[layer], num_tokens); - } - - const T kEmbScaling = EmbeddingScaling(model_dim); - InputEmbeddingVJPT(weights.embedder_input_embedding.data(), tokens, - kEmbScaling, backward.layers[0].input.data(), - grad.embedder_input_embedding.data(), model_dim); -} - -} // namespace gcpp - -#endif // THIRD_PARTY_GEMMA_CPP_GEMMA_BACKWARD_SCALAR_H_ diff --git a/backprop/backward_scalar_test.cc b/backprop/backward_scalar_test.cc deleted file mode 100644 index e40f3ed..0000000 --- a/backprop/backward_scalar_test.cc +++ /dev/null @@ -1,635 +0,0 @@ -// Copyright 2024 Google LLC -// SPDX-License-Identifier: Apache-2.0 -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// https://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
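The `CrossEntropyLossGrad` and `SoftcapVJPT` definitions above are direct chain-rule consequences. The loss is measured in bits, hence the 1/ln 2 factor, and the softcap derivative can reuse the saved forward output:

```latex
% Loss in bits for the observed next token, with probability p:
L = -\log_2 p = -\frac{\ln p}{\ln 2}
\;\Longrightarrow\;
\frac{\partial L}{\partial p} = \frac{-1}{p \ln 2},
% which is exactly "scaling / x[i * V + next_token]" with
% scaling = -1 / log(2); all other entries of dx stay zero.
%
% Softcap y = c \tanh(x/c):
\frac{dy}{dx} = 1 - \tanh^{2}\!\frac{x}{c} = 1 - \left(\frac{y}{c}\right)^{2},
% so the VJP needs only the saved y ("scaled" = y/c in the code).
```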
- -#include "backprop/backward_scalar.h" - -#include -#include -#include // memcpy - -#include -#include -#include -#include - -#include "gtest/gtest.h" -#include "backprop/activations.h" -#include "backprop/common_scalar.h" -#include "backprop/forward_scalar.h" -#include "backprop/prompt.h" -#include "backprop/sampler.h" -#include "backprop/test_util.h" -#include "compression/compress.h" -#include "gemma/configs.h" -#include "gemma/weights.h" - -namespace gcpp { - -TEST(BackPropTest, MatMulVJP) { - static const size_t kRows = 8; - static const size_t kCols = 64; - static const size_t kTokens = 5; - std::mt19937 gen(42); - using T = double; - using TC = std::complex; - MatStorageT weights("weights", kRows, kCols); - MatStorageT x("x", kTokens, kCols); - MatStorageT grad("grad", kRows, kCols); - MatStorageT dx("dx", kTokens, kCols); - MatStorageT c_weights("c_weights", kRows, kCols); - MatStorageT c_x("c_x", kTokens, kCols); - MatStorageT c_y("c_y", kTokens, kRows); - MatStorageT dy("dy", kTokens, kRows); - - for (int iter = 0; iter < 10; ++iter) { - RandInit(weights, 1.0 * (1 << iter), gen); - RandInit(x, 1.0 * (1 << iter), gen); - RandInit(dy, 1.0, gen); - Complexify(weights, c_weights); - Complexify(x, c_x); - auto func = [&]() { - MatMulT(c_weights.data(), c_x.data(), c_y.data(), kRows, kCols, kTokens); - return DotT(dy.data(), c_y.data(), kTokens * kRows); - }; - grad.ZeroInit(); - MatMulVJPT(weights.data(), x.data(), dy.data(), grad.data(), dx.data(), - kRows, kCols, kTokens); - TestGradient(dx, c_x, func, 1e-11, 1e-12, __LINE__); - TestGradient(grad, c_weights, func, 1e-14, 1e-12, __LINE__); - } -} - -TEST(BackPropTest, MultiHeadMatMulVJP) { - static const size_t kRows = 2; - static const size_t kCols = 16; - static const size_t kHeads = 4; - static const size_t kTokens = 3; - std::mt19937 gen(42); - using T = double; - using TC = std::complex; - MatStorageT weights("weights", kRows, kCols * kHeads); - MatStorageT x("x", kTokens, kCols * kHeads); - MatStorageT grad("grad", kRows, kCols * kHeads); - MatStorageT dx("dx", kTokens, kCols * kHeads); - MatStorageT c_weights("c_weights", kRows, kCols * kHeads); - MatStorageT c_x("c_x", kTokens, kCols * kHeads); - MatStorageT c_y("c_y", kTokens, kRows); - MatStorageT dy("dy", kTokens, kRows); - - for (int iter = 0; iter < 10; ++iter) { - RandInit(weights, 1.0 * (1 << iter), gen); - RandInit(x, 1.0 * (1 << iter), gen); - RandInit(dy, 1.0, gen); - Complexify(weights, c_weights); - Complexify(x, c_x); - auto func = [&]() { - MultiHeadMatMul(c_weights.data(), c_x.data(), c_y.data(), kHeads, kRows, - kCols, kTokens); - return DotT(dy.data(), c_y.data(), kTokens * kRows); - }; - grad.ZeroInit(); - MultiHeadMatMulVJPT(weights.data(), x.data(), dy.data(), grad.data(), - dx.data(), kHeads, kRows, kCols, kTokens); - TestGradient(dx, c_x, func, 1e-15, 1e-13, __LINE__); - TestGradient(grad, c_weights, func, 1e-15, 1e-13, __LINE__); - } -} - -TEST(BackPropTest, RMSNormVJP) { - static const size_t K = 2; - static const size_t N = 64; - std::mt19937 gen(42); - using T = double; - using TC = std::complex; - MatStorageT weights("weights", N, 1); - MatStorageT grad("grad", N, 1); - MatStorageT x("x", K, N); - MatStorageT dx("dx", K, N); - MatStorageT dy("dy", K, N); - MatStorageT c_weights("c_weights", N, 1); - MatStorageT c_x("c_x", K, N); - MatStorageT c_y("c_y", K, N); - - for (int iter = 0; iter < 10; ++iter) { - RandInit(weights, 1.0 * (1 << iter), gen); - RandInit(x, 1.0 * (1 << iter), gen); - Complexify(weights, c_weights); - Complexify(x, c_x); - 
RandInit(dy, 1.0, gen); - auto func = [&]() { - RMSNormT(c_weights.data(), c_x.data(), c_y.data(), N, K); - return DotT(dy.data(), c_y.data(), K * N); - }; - grad.ZeroInit(); - RMSNormVJPT(weights.data(), x.data(), dy.data(), grad.data(), dx.data(), - N, K); - TestGradient(dx, c_x, func, 1e-15, 1e-14, __LINE__); - TestGradient(grad, c_weights, func, 1e-15, 1e-14, __LINE__); - } -} - -TEST(BackPropTest, SoftmaxVJP) { - static const size_t N = 64; - std::mt19937 gen(42); - using T = double; - using TC = std::complex; - MatStorageT x("x", N, 1); - MatStorageT dx("dx", N, 1); - MatStorageT dy("dy", N, 1); - MatStorageT c_x("c_x", N, 1); - MatStorageT c_y("c_y", N, 1); - - for (int iter = 0; iter < 10; ++iter) { - RandInit(x, 1.0 * (1 << iter), gen); - Complexify(x, c_x); - RandInit(dy, 1.0, gen); - auto func = [&]() { - memcpy(c_y.data(), c_x.data(), c_x.SizeBytes()); - Softmax(c_y.data(), N); - return DotT(dy.data(), c_y.data(), N); - }; - Softmax(x.data(), N); - memcpy(dx.data(), dy.data(), dx.SizeBytes()); - SoftmaxVJPT(x.data(), dx.data(), N); - TestGradient(dx, c_x, func, 1e-15, 1e-15, __LINE__); - } -} - -TEST(BackPropTest, MaskedSoftmaxVJP) { - static const size_t kSeqLen = 16; - static const size_t kHeads = 2; - static const size_t kTokens = 14; - static const size_t N = kTokens * kHeads * kSeqLen; - std::mt19937 gen(42); - using T = double; - using TC = std::complex; - MatStorageT x("x", N, 1); - MatStorageT dy("dy", N, 1); - MatStorageT dx("dx", N, 1); - MatStorageT c_x("c_x", N, 1); - MatStorageT c_y("c_y", N, 1); - dx.ZeroInit(); - - for (int iter = 0; iter < 10; ++iter) { - RandInit(x, 1.0 * (1 << iter), gen); - Complexify(x, c_x); - RandInit(dy, 1.0, gen); - auto func = [&]() { - memcpy(c_y.data(), c_x.data(), - kTokens * kHeads * kSeqLen * sizeof(c_x.At(0))); - MaskedSoftmax(c_y.data(), kTokens, kHeads, kSeqLen); - return DotT(dy.data(), c_y.data(), N); - }; - MaskedSoftmax(x.data(), kTokens, kHeads, kSeqLen); - memcpy(dx.data(), dy.data(), kTokens * kHeads * kSeqLen * sizeof(dx.At(0))); - MaskedSoftmaxVJPT(x.data(), dx.data(), kTokens, kHeads, kSeqLen); - TestGradient(dx, c_x, func, 1e-14, 1e-15, __LINE__); - } -} - -TEST(BackPropTest, SoftcapVJP) { - static const size_t N = 64; - std::mt19937 gen(42); - using T = double; - using TC = std::complex; - MatStorageT x("x", N, 1); - MatStorageT dx("dx", N, 1); - MatStorageT dy("dy", N, 1); - MatStorageT c_x("c_x", N, 1); - MatStorageT c_y("c_y", N, 1); - - constexpr float kCap = 30.0f; - for (int iter = 0; iter < 10; ++iter) { - RandInit(x, 1.0 * (1 << iter), gen); - Complexify(x, c_x); - RandInit(dy, 1.0, gen); - auto func = [&]() { - memcpy(c_y.data(), c_x.data(), N * sizeof(c_x.At(0))); - Softcap(kCap, c_y.data(), N); - return DotT(dy.data(), c_y.data(), N); - }; - Softcap(kCap, x.data(), N); - memcpy(dx.data(), dy.data(), dx.SizeBytes()); - SoftcapVJPT(kCap, x.data(), dx.data(), N); - TestGradient(dx, c_x, func, 1e-15, 1e-14, __LINE__); - } -} - -TEST(BackPropTest, CrossEntropyLossGrad) { - static const size_t K = 8; - static const size_t V = 64; - std::mt19937 gen(42); - using T = double; - using TC = std::complex; - MatStorageT x("x", K, V); - MatStorageT dx("dx", K, V); - MatStorageT c_x("c_x", K, V); - Prompt prompt; - prompt.tokens = { 0, 1, 2, 3, 0, 3, 2, 1, 0 }; - - const float kCap = 30.0f; - for (int iter = 0; iter < 10; ++iter) { - prompt.context_size = 1 + (iter % 6); - RandInit(x, 1.0 * (1 << iter), gen); - Softcap(kCap, x.data(), V * K); - Softmax(x.data(), V, K); - CrossEntropyLossGrad(x.data(), dx.data(), prompt, 
V); - Complexify(x, c_x); - auto func = [&]() { - return CrossEntropyLoss(c_x.data(), prompt, V); - }; - TestGradient(dx, c_x, func, 1e-100, 1e-15, __LINE__); - } -} - -TEST(BackPropTest, GatedGeluVJP) { - static const size_t K = 2; - static const size_t N = 64; - std::mt19937 gen(42); - using T = double; - using TC = std::complex; - MatStorageT x("x", K, 2 * N); - MatStorageT dx("dx", K, 2 * N); - MatStorageT dy("dy", K, N); - MatStorageT c_x("c_x", K, 2 * N); - MatStorageT c_y("c_y", K, N); - - for (int iter = 0; iter < 10; ++iter) { - RandInit(x, 1.0, gen); - Complexify(x, c_x); - RandInit(dy, 1.0, gen); - auto func = [&]() { - GatedGelu(c_x.data(), c_y.data(), N, K); - return DotT(dy.data(), c_y.data(), N * K); - }; - GatedGeluVJP(x.data(), dy.data(), dx.data(), N, K); - TestGradient(dx, c_x, func, 1e-15, 1e-15, __LINE__); - } -} - -TEST(BackPropTest, MaskedAttentionVJP) { - static const size_t kSeqLen = 16; - static const size_t kHeads = 2; - static const size_t kQKVDim = 8; - static const size_t kTokens = 14; - static const size_t kQKVSize = kSeqLen * (kHeads + 2) * kQKVDim; - static const size_t kOutSize = kTokens * kHeads * kSeqLen; - std::mt19937 gen(42); - using T = double; - using TC = std::complex; - MatStorageT x("x", kQKVSize, 1); - MatStorageT dx("dx", kQKVSize, 1); - MatStorageT dy("dy", kOutSize, 1); - MatStorageT c_x("c_x", kQKVSize, 1); - MatStorageT c_y("c_y", kOutSize, 1); - dx.ZeroInit(); - c_y.ZeroInit(); - - for (int iter = 0; iter < 10; ++iter) { - RandInit(x, 1.0, gen); - Complexify(x, c_x); - RandInit(dy, 1.0, gen); - auto func = [&]() { - MaskedAttention(c_x.data(), c_y.data(), kTokens, kHeads, kQKVDim, - kSeqLen); - return DotT(dy.data(), c_y.data(), kOutSize); - }; - MaskedAttentionVJP(x.data(), dy.data(), dx.data(), - kTokens, kHeads, kQKVDim, kSeqLen); - TestGradient(dx, c_x, func, 1e-14, 1e-15, __LINE__); - } -} - -TEST(BackPropTest, MixByAttentionVJP) { - static const size_t kSeqLen = 16; - static const size_t kHeads = 2; - static const size_t kQKVDim = 8; - static const size_t kTokens = 14; - static const size_t kQKVSize = kSeqLen * (kHeads + 2) * kQKVDim; - static const size_t kAttnSize = kSeqLen * kHeads * kSeqLen; - static const size_t kOutSize = kSeqLen * kHeads * kQKVDim; - std::mt19937 gen(42); - using T = double; - using TC = std::complex; - MatStorageT qkv("qkv", kQKVSize, 1); - MatStorageT dqkv("dqkv", kQKVSize, 1); - MatStorageT attn("attn", kAttnSize, 1); - MatStorageT dattn("dattn", kAttnSize, 1); - MatStorageT dy("dy", kOutSize, 1); - MatStorageT c_qkv("c_qkv", kQKVSize, 1); - MatStorageT c_attn("c_attn", kAttnSize, 1); - MatStorageT c_y("c_y", kOutSize, 1); - dqkv.ZeroInit(); - dattn.ZeroInit(); - c_y.ZeroInit(); - - for (int iter = 0; iter < 10; ++iter) { - RandInit(qkv, 1.0, gen); - RandInit(attn, 1.0, gen); - Complexify(qkv, c_qkv); - Complexify(attn, c_attn); - RandInit(dy, 1.0, gen); - auto func = [&]() { - MixByAttention(c_qkv.data(), c_attn.data(), c_y.data(), - kTokens, kHeads, kQKVDim, kSeqLen); - return DotT(dy.data(), c_y.data(), kOutSize); - }; - MixByAttentionVJP(qkv.data(), attn.data(), dy.data(), dqkv.data(), - dattn.data(), kTokens, kHeads, kQKVDim, kSeqLen); - TestGradient(dqkv, c_qkv, func, 1e-14, 1e-15, __LINE__); - TestGradient(dattn, c_attn, func, 1e-14, 1e-15, __LINE__); - } -} - -TEST(BackPropTest, InputEmbeddingVJP) { - static const size_t kSeqLen = 8; - static const size_t kVocabSize = 4; - static const size_t kModelDim = 16; - std::mt19937 gen(42); - using T = double; - using TC = std::complex; - MatStorageT 
weights("weights", kVocabSize, kModelDim); - MatStorageT grad("grad", kVocabSize, kModelDim); - MatStorageT dy("dy", kSeqLen, kModelDim); - MatStorageT c_weights("c_weights", kVocabSize, kModelDim); - MatStorageT c_y("c_y", kSeqLen, kModelDim); - std::vector tokens = { 0, 1, 2, 3, 0, 1, 2 }; - size_t num_tokens = tokens.size() - 1; - - for (size_t iter = 0; iter < 10; ++iter) { - RandInit(weights, 1.0, gen); - RandInit(dy, 1.0, gen); - Complexify(weights, c_weights); - auto func = [&]() { - InputEmbedding(c_weights.data(), tokens, TC(3.0), c_y.data(), kModelDim); - return DotT(dy.data(), c_y.data(), num_tokens * kModelDim); - }; - grad.ZeroInit(); - InputEmbeddingVJPT(weights.data(), tokens, 3.0, dy.data(), grad.data(), - kModelDim); - TestGradient(grad, c_weights, func, 1e-16, 1e-14, __LINE__); - } -} - -static ModelConfig TestConfig() { - ModelConfig config; - config.scale_names = {"att_ein", "qkv_ein", "gr_lin_x_w", "gr_lin_y_w", - "gr_lin_out_w", "gr_gate_w", "gating_ein", "linear_w"}; - config.model_dim = 32; - config.vocab_size = 12; - config.seq_len = 18; - LayerConfig layer_config; - layer_config.model_dim = config.model_dim; - layer_config.ff_hidden_dim = 48; - layer_config.heads = 3; - layer_config.kv_heads = 1; - layer_config.qkv_dim = 12; - config.layer_configs = {2, layer_config}; - config.num_tensor_scales = 4 * config.layer_configs.size(); - config.query_scale = QueryScaleType::SqrtKeySize; - config.attention_window_sizes = FixedAttentionWindowSizes<2>(32); - // This is required for optimize_test to pass. - config.final_cap = 30.0f; - return config; -} - -TEST(BackPropTest, LayerVJP) { - std::mt19937 gen(42); - using T = double; - using TC = std::complex; - ModelConfig config = TestConfig(); - TensorIndex tensor_index(config, /*llm_layer_idx=*/0, /*img_layer_idx=*/-1, - /*reshape_att=*/false); - const size_t kOutputSize = config.seq_len * config.model_dim; - LayerWeightsPtrs weights(config.layer_configs[0], tensor_index); - LayerWeightsPtrs grad(config.layer_configs[0], tensor_index); - ForwardLayer forward(config.layer_configs[0], config.seq_len); - ForwardLayer backward(config.layer_configs[0], config.seq_len); - LayerWeightsPtrs c_weights(config.layer_configs[0], tensor_index); - ForwardLayer c_forward(config.layer_configs[0], config.seq_len); - MatStorageT y("y", kOutputSize, 1); - MatStorageT dy("dy", kOutputSize, 1); - MatStorageT c_y("c_y", kOutputSize, 1); - const size_t num_tokens = 3; - std::vector layer_storage; - weights.Allocate(layer_storage); - grad.Allocate(layer_storage); - c_weights.Allocate(layer_storage); - backward.input.ZeroInit(); - - for (size_t iter = 0; iter < 10; ++iter) { - RandInit(weights, 1.0, gen); - RandInit(forward.input, 1.0, gen); - RandInit(dy, 1.0, gen); - Complexify(weights, c_weights); - Complexify(forward.input, c_forward.input); - auto func = [&]() { - ApplyLayer(c_weights, c_forward, num_tokens, c_y.data()); - return DotT(dy.data(), c_y.data(), num_tokens * config.model_dim); - }; - grad.ZeroInit(/*layer_idx=*/0); - ApplyLayer(weights, forward, num_tokens, y.data()); - LayerVJP(weights, forward, dy.data(), grad, backward, num_tokens); - TestGradient(backward.input, c_forward.input, func, 1e-11, 5e-11, - __LINE__); - TestGradient(grad, c_weights, func, 1e-11); - } -} - -TEST(BackPropTest, EndToEnd) { - std::mt19937 gen(42); - using T = double; - using TC = std::complex; - ModelConfig config = TestConfig(); - WeightsWrapper weights(config); - WeightsWrapper grad(config); - ForwardPass forward(config); - ForwardPass backward(config); - 
WeightsWrapper c_weights(config); - ForwardPass c_forward(config); - - ReverseSequenceSampler training_task({0, 0, 1, 1}); - std::vector batch = training_task.SampleBatch(3, gen); - - for (const Prompt& prompt : batch) { - ReverseSequenceSampler::LogPrompt(prompt); - RandInit(weights.get(), 1.0, gen); - CrossEntropyLossForwardPass(prompt, weights.get(), forward); - grad.ZeroInit(); - CrossEntropyLossBackwardPass( - prompt, weights.get(), forward, grad.get(), backward); - - Complexify(weights.get(), c_weights.get()); - auto func = [&]() { - return CrossEntropyLossForwardPass(prompt, c_weights.get(), c_forward); - }; - - TestGradient(grad.get(), c_weights.get(), func, 1e-11); - } -} - -template -void MulByConstAndAddT(T c, const LayerWeightsPtrs& x, - LayerWeightsPtrs& out) { - MulByConstAndAddT(c, x.pre_attention_norm_scale, - out.pre_attention_norm_scale); - MulByConstAndAddT(c, x.attn_vec_einsum_w, out.attn_vec_einsum_w); - MulByConstAndAddT(c, x.qkv_einsum_w, out.qkv_einsum_w); - MulByConstAndAddT(c, x.pre_ffw_norm_scale, out.pre_ffw_norm_scale); - MulByConstAndAddT(c, x.gating_einsum_w, out.gating_einsum_w); - MulByConstAndAddT(c, x.linear_w, out.linear_w); -} - -template -void MulByConstAndAddT(T c, const ModelWeightsPtrs& x, - ModelWeightsPtrs& out) { - const size_t layers = x.c_layers.size(); - MulByConstAndAddT(c, x.embedder_input_embedding, - out.embedder_input_embedding); - MulByConstAndAddT(c, x.final_norm_scale, out.final_norm_scale); - for (size_t i = 0; i < layers; ++i) { - MulByConstAndAddT(c, *x.GetLayer(i), *out.GetLayer(i)); - } -} - -// Evaluates forward pass on a batch. -template -T CrossEntropyLossForwardPass(const std::vector& batch, - const WeightsWrapper& weights, - ForwardPass& forward) { - T loss = 0.0; - for (const Prompt& prompt : batch) { - loss += CrossEntropyLossForwardPass(prompt, weights.get(), forward); - } - T scale = 1.0 / batch.size(); - return loss * scale; -} - -// Evaluates forward pass on a batch by applying gradient with the given -// learning rate. Does not update weights, but uses the given tmp weights -// instead. -template -T CrossEntropyLossForwardPass(T learning_rate, const std::vector& batch, - const WeightsWrapper& weights, - const WeightsWrapper& grad, - WeightsWrapper& tmp, ForwardPass& forward) { - tmp.CopyFrom(weights); - const T scale = -learning_rate / batch.size(); - MulByConstAndAddT(scale, grad.get(), tmp.get()); - return CrossEntropyLossForwardPass(batch, tmp, forward); -} - -// Uses line search in the negative gradient direction to update weights. We do -// this so that we can test that each step during the gradient descent can -// decrease the objective function value. 
-template -T FindOptimalUpdate(const WeightsWrapper& grad, WeightsWrapper& weights, - WeightsWrapper& tmp, ForwardPass& forward, - const std::vector& batch, T loss, - T initial_learning_rate) { - T lr0 = initial_learning_rate; - T loss0 = CrossEntropyLossForwardPass( - lr0, batch, weights, grad, tmp, forward); - for (size_t iter = 0; iter < 30; ++iter) { - T lr1 = lr0 * 0.5; - T loss1 = CrossEntropyLossForwardPass( - lr1, batch, weights, grad, tmp, forward); - if (loss0 < loss && loss1 >= loss0) { - break; - } - loss0 = loss1; - lr0 = lr1; - } - for (size_t iter = 0; iter < 30; ++iter) { - T lr1 = lr0 * 2.0; - T loss1 = CrossEntropyLossForwardPass( - lr1, batch, weights, grad, tmp, forward); - if (loss1 >= loss0) { - break; - } - loss0 = loss1; - lr0 = lr1; - } - const T scale = -lr0 / batch.size(); - MulByConstAndAddT(scale, grad.get(), weights.get()); - return lr0; -} - -TEST(BackProptest, Convergence) { - std::mt19937 gen(42); - using T = float; - using TC = std::complex; - ModelConfig config = TestConfig(); - WeightsWrapper weights(config); - WeightsWrapper grad(config); - WeightsWrapper tmp(config); - ForwardPass forward(config); - ForwardPass backward(config); - WeightsWrapper c_weights(config); - ForwardPass c_forward(config); - constexpr size_t kBatchSize = 5; - ReverseSequenceSampler training_task({0, 0, 0, 1, 1}); - T learning_rate = 0.01; - - RandInit(weights.get(), T(1.0), gen); - - printf("Sample batch:\n"); - for (size_t i = 0; i < 10; ++i) { - ReverseSequenceSampler::LogPrompt(training_task.Sample(gen)); - } - - T prev_loss = std::numeric_limits::max(); - bool stop = false; - size_t step = 0; - while (!stop) { - T loss = 0.0; - grad.ZeroInit(); - std::mt19937 sgen(42); - std::vector batch = training_task.SampleBatch(kBatchSize, sgen); - for (const Prompt& prompt : batch) { - loss += CrossEntropyLossForwardPass(prompt, weights.get(), forward); - CrossEntropyLossBackwardPass( - prompt, weights.get(), forward, grad.get(), backward); - } - - if (step % 250 == 0) { - printf("Checking gradient...\n"); - Complexify(weights.get(), c_weights.get()); - auto func = [&]() { - TC scale = batch.size(); - return CrossEntropyLossForwardPass(batch, c_weights, c_forward) * scale; - }; - - TestGradient(grad.get(), c_weights.get(), func, 5e-3f); - } - - loss /= batch.size(); - EXPECT_LT(loss, prev_loss); - stop = step >= 10000 || loss < 1e-2; - if (step % 10 == 0 || stop) { - printf("step: %5zu loss: %.15f learning_rate: %.15f\n", - step, loss, learning_rate); - } - if (!stop) { - learning_rate = FindOptimalUpdate( - grad, weights, tmp, forward, batch, loss, learning_rate); - ++step; - } - prev_loss = loss; - } - EXPECT_LT(step, 1000); -} - -} // namespace gcpp diff --git a/backprop/backward_test.cc b/backprop/backward_test.cc deleted file mode 100644 index f1c97b2..0000000 --- a/backprop/backward_test.cc +++ /dev/null @@ -1,279 +0,0 @@ -// Copyright 2023 Google LLC -// SPDX-License-Identifier: Apache-2.0 -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#ifndef HWY_DISABLED_TARGETS -#define HWY_DISABLED_TARGETS HWY_SCALAR -#endif - -#include - -#include -#include // std::abs -#include -#include - -#include "backprop/activations.h" -#include "backprop/backward_scalar.h" -#include "backprop/common_scalar.h" -#include "backprop/forward_scalar.h" -#include "backprop/prompt.h" -#include "backprop/sampler.h" -#include "backprop/test_util.h" -#include "gemma/configs.h" -#include "ops/ops.h" -#include "util/threading.h" -#include "hwy/base.h" - -// clang-format off -#undef HWY_TARGET_INCLUDE -#define HWY_TARGET_INCLUDE "backprop/backward_test.cc" //NOLINT -// clang-format on -#include "hwy/foreach_target.h" // IWYU pragma: keep -#include "hwy/highway.h" -#include "hwy/tests/test_util-inl.h" -// After highway.h -#include "backprop/backward-inl.h" -#include "backprop/forward-inl.h" -#include "compression/compress.h" -#include "ops/ops-inl.h" -#include "util/allocator.h" - -HWY_BEFORE_NAMESPACE(); -namespace gcpp { -namespace HWY_NAMESPACE { - -void TestMatMulVJP() { - static const size_t kRows = 8; - static const size_t kCols = 64; - static const size_t kTokens = 5; - const BoundedTopology topology(BoundedSlice(0, 1), BoundedSlice(0, 8)); - Allocator::Init(topology); - gcpp::NestedPools pools(topology, 1, /*pin=*/Tristate::kFalse); - std::mt19937 gen(42); - MatStorageT weights("weights", kRows, kCols); - MatStorageT x("x", kTokens, kCols); - MatStorageT dy("dy", kTokens, kRows); - MatStorageT grad("grad", kRows, kCols); - MatStorageT dx("dx", kTokens, kCols); - MatStorageT grad_scalar("grad_scalar", kRows, kCols); - MatStorageT dx_scalar("dx_scalar", kTokens, kCols); - using TC = std::complex; - MatStorageT c_weights("c_weights", kRows, kCols); - MatStorageT c_x("c_x", kTokens, kCols); - MatStorageT c_y("c_y", kTokens, kRows); - - for (int iter = 0; iter < 10; ++iter) { - RandInit(weights, 1.0f * (1 << iter), gen); - RandInit(x, 1.0f * (1 << iter), gen); - RandInit(dy, 1.0f, gen); - Complexify(weights, c_weights); - Complexify(x, c_x); - auto func = [&]() { - MatMulT(c_weights.data(), c_x.data(), c_y.data(), kRows, kCols, kTokens); - return DotT(dy.data(), c_y.data(), kTokens * kRows); - }; - - grad.ZeroInit(); - MatMulVJP(weights.data(), x.data(), dy.data(), kCols, kRows, kTokens, - grad.data(), dx.data(), pools.Pool()); - TestGradient(dx, c_x, func, 5e-5f, 5e-5f, __LINE__); - TestGradient(grad, c_weights, func, 5e-5f, 5e-5f, __LINE__); - - grad_scalar.ZeroInit(); - MatMulVJPT(weights.data(), x.data(), dy.data(), grad_scalar.data(), - dx_scalar.data(), kRows, kCols, kTokens); - TestNear(dx, dx_scalar, 5e-5, 1e-4, __LINE__); - TestNear(grad, grad_scalar, 5e-5, 5e-5, __LINE__); - } -} - -void TestMultiHeadMatMulVJP() { - static const size_t kRows = 2; - static const size_t kCols = 16; - static const size_t kHeads = 4; - static const size_t kTokens = 3; - const BoundedTopology topology(BoundedSlice(0, 1), BoundedSlice(0, 8)); - Allocator::Init(topology); - gcpp::NestedPools pools(topology, 1, /*pin=*/Tristate::kFalse); - std::mt19937 gen(42); - MatStorageT weights("weights", kRows, kCols * kHeads); - MatStorageT x("x", kTokens, kCols * kHeads); - MatStorageT grad("grad", kRows, kCols * kHeads); - MatStorageT dx("dx", kTokens, kCols * kHeads); - MatStorageT dy("dy", kTokens, kRows); - MatStorageT grad_scalar("grad_scalar", kRows, kCols * kHeads); - MatStorageT dx_scalar("dx_scalar", kTokens, kCols * kHeads); - using TC = std::complex; - MatStorageT c_weights("c_weights", kRows, kCols * kHeads); - MatStorageT c_x("c_x", kTokens, kCols * kHeads); - 
MatStorageT c_y("c_y", kTokens, kRows); - - for (int iter = 0; iter < 10; ++iter) { - RandInit(weights, 1.0f * (1 << iter), gen); - RandInit(x, 1.0f * (1 << iter), gen); - RandInit(dy, 1.0f, gen); - Complexify(weights, c_weights); - Complexify(x, c_x); - auto func = [&]() { - MultiHeadMatMul(c_weights.data(), c_x.data(), c_y.data(), kHeads, kRows, - kCols, kTokens); - return DotT(dy.data(), c_y.data(), kTokens * kRows); - }; - - grad.ZeroInit(); - MultiHeadMatMulVJP(weights.data(), x.data(), dy.data(), kHeads, kCols, - kRows, kTokens, grad.data(), dx.data(), pools.Pool()); - TestGradient(dx, c_x, func, 5e-5f, 5e-5f, __LINE__); - TestGradient(grad, c_weights, func, 5e-5f, 5e-5f, __LINE__); - - grad_scalar.ZeroInit(); - MultiHeadMatMulVJPT(weights.data(), x.data(), dy.data(), grad_scalar.data(), - dx_scalar.data(), kHeads, kRows, kCols, kTokens); - TestNear(dx, dx_scalar, 5e-5, 5e-5, __LINE__); - TestNear(grad, grad_scalar, 5e-5, 5e-5, __LINE__); - } -} - -void TestRMSNormVJP() { - static const size_t K = 2; - static const size_t N = 64; - const BoundedTopology topology(BoundedSlice(0, 1), BoundedSlice(0, 8)); - Allocator::Init(topology); - gcpp::NestedPools pools(topology, 1, /*pin=*/Tristate::kFalse); - std::mt19937 gen(42); - MatStorageT weights("weights", N, 1); - MatStorageT x("x", K, N); - MatStorageT grad("grad", N, 1); - MatStorageT dx("dx", K, N); - MatStorageT dy("dy", K, N); - MatStorageT grad_scalar("grad_scalar", N, 1); - MatStorageT dx_scalar("dx_scalar", K, N); - using TC = std::complex; - MatStorageT c_weights("c_weights", N, 1); - MatStorageT c_x("c_x", K, N); - MatStorageT c_y("c_y", K, N); - - for (int iter = 0; iter < 10; ++iter) { - RandInit(weights, 1.0f * (1 << iter), gen); - RandInit(x, 1.0f * (1 << iter), gen); - RandInit(dy, 1.0f, gen); - Complexify(weights, c_weights); - Complexify(x, c_x); - auto func = [&]() { - RMSNormT(c_weights.data(), c_x.data(), c_y.data(), N, K); - return DotT(dy.data(), c_y.data(), K * N); - }; - - grad.ZeroInit(); - RMSNormVJP(weights.data(), x.data(), dy.data(), N, K, grad.data(), - dx.data(), pools.Pool()); - TestGradient(dx, c_x, func, 5e-5f, 5e-5f, __LINE__); - TestGradient(grad, c_weights, func, 5e-5f, 5e-5f, __LINE__); - - grad_scalar.ZeroInit(); - RMSNormVJPT(weights.data(), x.data(), dy.data(), grad_scalar.data(), - dx_scalar.data(), N, K); - TestNear(dx, dx_scalar, 0, 2e-5, __LINE__); - TestNear(grad, grad_scalar, 0, 2e-5, __LINE__); - } -} - -static ModelConfig TestConfig() { - ModelConfig config; - config.scale_names = {"att_ein", "qkv_ein", "gr_lin_x_w", "gr_lin_y_w", - "gr_lin_out_w", "gr_gate_w", "gating_ein", "linear_w"}; - config.model_dim = 32; - config.vocab_size = 16; - config.seq_len = 24; - LayerConfig layer_config; - layer_config.model_dim = config.model_dim; - layer_config.ff_hidden_dim = 64; - layer_config.heads = 3; - layer_config.kv_heads = 1; - layer_config.qkv_dim = 16; - config.layer_configs = {2, layer_config}; - config.num_tensor_scales = 4 * config.layer_configs.size(); - config.query_scale = QueryScaleType::SqrtKeySize; - config.attention_window_sizes = FixedAttentionWindowSizes<2>(32); - // This is required for optimize_test to pass. 
- config.att_cap = 50.0f; - config.final_cap = 30.0f; - return config; -} - -void TestEndToEnd() { - std::mt19937 gen(42); - const BoundedTopology topology(BoundedSlice(0, 1), BoundedSlice(0, 1)); - Allocator::Init(topology); - gcpp::NestedPools pools(topology, 1, /*pin=*/Tristate::kFalse); - ModelConfig config = TestConfig(); - WeightsWrapper weights(config); - WeightsWrapper grad(config); - ForwardPass forward0(config); - ForwardPass forward1(config); - ForwardPass backward(config); - using TC = std::complex; - WeightsWrapper c_weights(config); - ForwardPass c_forward(config); - - ReverseSequenceSampler training_task({0, 0, 1, 1}); - std::vector batch = training_task.SampleBatch(3, gen); - - RowVectorBatch inv_timescale = CreateInvTimescale( - config.layer_configs[0].qkv_dim, - config.layer_configs[0].post_qk == PostQKType::HalfRope); - for (const Prompt& prompt : batch) { - ReverseSequenceSampler::LogPrompt(prompt); - RandInit(weights.get(), 1.0f, gen); - - float loss0 = CrossEntropyLossForwardPass(prompt, weights.get(), forward0); - - float loss1 = CrossEntropyLossForwardPass( - prompt.tokens, prompt.context_size, weights.get(), forward1, - inv_timescale, pools.Pool()); - - EXPECT_NEAR(loss1, loss0, std::abs(loss0) * 2e-5); - - grad.ZeroInit(); - CrossEntropyLossBackwardPassInl(prompt, weights.get(), forward1, grad.get(), - backward, inv_timescale, pools.Pool()); - - Complexify(weights.get(), c_weights.get()); - auto func = [&]() { - return CrossEntropyLossForwardPass(prompt, c_weights.get(), c_forward); - }; - - TestGradient(grad.get(), c_weights.get(), func, 2e-3f); - } -} - -// NOLINTNEXTLINE(google-readability-namespace-comments) -} // namespace HWY_NAMESPACE -} // namespace gcpp -HWY_AFTER_NAMESPACE(); - -#if HWY_ONCE - -namespace gcpp { -HWY_BEFORE_TEST(BackwardTest); -HWY_EXPORT_AND_TEST_P(BackwardTest, TestMatMulVJP); -HWY_EXPORT_AND_TEST_P(BackwardTest, TestMultiHeadMatMulVJP); -HWY_EXPORT_AND_TEST_P(BackwardTest, TestRMSNormVJP); -HWY_EXPORT_AND_TEST_P(BackwardTest, TestEndToEnd); -HWY_AFTER_TEST(); - -} // namespace gcpp - -#endif diff --git a/backprop/common_scalar.h b/backprop/common_scalar.h deleted file mode 100644 index c61086d..0000000 --- a/backprop/common_scalar.h +++ /dev/null @@ -1,121 +0,0 @@ -// Copyright 2024 Google LLC -// SPDX-License-Identifier: Apache-2.0 -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// https://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
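The common_scalar.h file deleted next contains the scalar RoPE reference these tests compare against. Per dimension pair d it applies a plain 2-D rotation whose angle grows with the token position:

```latex
% Rope(x, base, N, i): each pair (x_d, x_{d+N/2}) is rotated by
\theta_{d,i} = \frac{i}{\text{base}^{\,2d/N}}, \qquad
\begin{pmatrix} x_d \\ x_{d+N/2} \end{pmatrix} \mapsto
\begin{pmatrix} \cos\theta_{d,i} & -\sin\theta_{d,i} \\
                \sin\theta_{d,i} & \phantom{-}\cos\theta_{d,i} \end{pmatrix}
\begin{pmatrix} x_d \\ x_{d+N/2} \end{pmatrix},
% with base = 10000 in the two convenience overloads below.
```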
- -#ifndef THIRD_PARTY_GEMMA_CPP_GEMMA_COMMON_SCALAR_H_ -#define THIRD_PARTY_GEMMA_CPP_GEMMA_COMMON_SCALAR_H_ - -#include - -#include - -#include "compression/compress.h" // MatStorageT - -namespace gcpp { - -template -U DotT(const T* a, const U* b, size_t N) { - U sum = {}; - for (size_t i = 0; i < N; ++i) { - sum += a[i] * b[i]; - } - return sum; -} - -template<> -inline std::complex DotT(const float* a, const std::complex* b, - size_t N) { - std::complex sum = {}; - for (size_t i = 0; i < N; ++i) { - sum += static_cast(a[i]) * b[i]; - } - return sum; -} - -template -void MulByConstT(T c, T* x, size_t N) { - for (size_t i = 0; i < N; ++i) { - x[i] *= c; - } -} - -// out += c * x -template -void MulByConstAndAddT(T c, const T* x, T* out, size_t N) { - for (size_t i = 0; i < N; ++i) { - out[i] += c * x[i]; - } -} - -template -void MulByConstAndAddT(T c, const MatPtrT& x, MatPtrT& out) { - MulByConstAndAddT(c, x.data(), out.data(), x.NumElements()); -} - -template -void AddFromT(const T* a, T* out, size_t N) { - for (size_t i = 0; i < N; ++i) { - out[i] += a[i]; - } -} - -template -T SquaredL2(const T* x, size_t N) { - T sum = {}; - for (size_t i = 0; i < N; ++i) { - sum += x[i] * x[i]; - } - return sum; -} - -template -T Gelu(T x) { - static const T kMul = 0.044715; - static const T kSqrt2OverPi = 0.797884560804236; - - const T x3 = x * x * x; - const T arg = kSqrt2OverPi * (kMul * x3 + x); - const T cdf = T(0.5) * (T(1.0) + std::tanh(arg)); - return x * cdf; -} - -template -void Rope(T* x, U base, size_t N, int i) { - const size_t N2 = N / 2; - for (size_t dim = 0; dim < N2; ++dim) { - const T freq_exponents = T(2 * dim) / T(N); - const T timescale = std::pow(base, freq_exponents); - const T theta = T(i) / timescale; - const T cos_val = std::cos(theta); - const T sin_val = std::sin(theta); - const T x0 = x[dim]; - const T x1 = x[dim + N2]; - x[dim] = x0 * cos_val - x1 * sin_val; - x[dim + N2] = x0 * sin_val + x1 * cos_val; - } -} - -template -void Rope(T* x, size_t N, int i) { - Rope(x, T(10000.0), N, i); -} - -template -void Rope(std::complex* x, size_t N, int i) { - Rope(x, T(10000.0), N, i); -} - -} // namespace gcpp - -#endif // THIRD_PARTY_GEMMA_CPP_GEMMA_COMMON_SCALAR_H_ diff --git a/backprop/forward-inl.h b/backprop/forward-inl.h deleted file mode 100644 index ca969c4..0000000 --- a/backprop/forward-inl.h +++ /dev/null @@ -1,296 +0,0 @@ -// Copyright 2024 Google LLC -// SPDX-License-Identifier: Apache-2.0 -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// https://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// Include guard for non-SIMD code. -#ifndef THIRD_PARTY_GEMMA_CPP_GEMMA_FORWARD_INL_H_ -#define THIRD_PARTY_GEMMA_CPP_GEMMA_FORWARD_INL_H_ - -#include -#include - -#include -#include - -#include "backprop/activations.h" -#include "gemma/common.h" -#include "gemma/configs.h" -#include "gemma/weights.h" -#include "util/allocator.h" -#include "hwy/base.h" -#include "hwy/contrib/thread_pool/thread_pool.h" - -#endif // THIRD_PARTY_GEMMA_CPP_GEMMA_FORWARD_INL_H_ - -// Include guard for (potentially) SIMD code. 
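The `Gelu` in common_scalar.h above (and the SIMD `Gelu`/`DGelu` used by the forward and backward kernels) implements the usual tanh approximation of GELU; the constants `kMul` and `kSqrt2OverPi` in the deleted code are exactly those in:

```latex
\operatorname{GELU}(x) \approx \frac{x}{2}\left(1 + \tanh\!\Big(
  \sqrt{\tfrac{2}{\pi}}\,\big(x + 0.044715\,x^{3}\big)\Big)\right),
% with kSqrt2OverPi = sqrt(2/pi) = 0.797884560804236 and kMul = 0.044715;
% GeluDerivative in backward_scalar.h is this expression differentiated
% by the product and chain rules.
```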
-#if defined(THIRD_PARTY_GEMMA_CPP_FORWARD_TOGGLE) == defined(HWY_TARGET_TOGGLE) -#ifdef THIRD_PARTY_GEMMA_CPP_FORWARD_TOGGLE -#undef THIRD_PARTY_GEMMA_CPP_FORWARD_TOGGLE -#else -#define THIRD_PARTY_GEMMA_CPP_FORWARD_TOGGLE -#endif - -#include "hwy/highway.h" -// After highway.h -#include "ops/matvec-inl.h" -#include "ops/ops-inl.h" - -HWY_BEFORE_NAMESPACE(); -namespace gcpp { -namespace HWY_NAMESPACE { - -template -void InputEmbedding(const ArrayT& weights, const std::vector& prompt, - const float scaling, float* HWY_RESTRICT output, - size_t model_dim, size_t vocab_size) { - const hn::ScalableTag df; - HWY_ASSERT(!prompt.empty()); - for (size_t pos = 0; pos < prompt.size() - 1; ++pos) { - int token = prompt[pos]; - DecompressAndZeroPad(df, MakeSpan(weights.data(), model_dim * vocab_size), - token * model_dim, output + pos * model_dim, - model_dim); - MulByConst(scaling, output + pos * model_dim, model_dim); - } -} - -template -void ApplyRMSNorm(const WT* HWY_RESTRICT weights, const XT* HWY_RESTRICT x, - size_t model_dim, size_t num_tokens, - OutT* HWY_RESTRICT output, - hwy::ThreadPool& pool) { - for (size_t pos = 0; pos < num_tokens; ++pos) { - const size_t offset = pos * model_dim; - RMSNorm(x + offset, weights, output + offset, model_dim); - } -} - -static HWY_NOINLINE float CrossEntropyLoss(const float* HWY_RESTRICT probs, - const std::vector& prompt, - size_t context_size, - size_t vocab_size, - hwy::ThreadPool& pool) { - HWY_ASSERT(!prompt.empty()); - float loss = 0.0f; - for (size_t pos = 0; pos < prompt.size() - 1; ++pos) { - if (pos + 1 < context_size) { - continue; // next token is part of context, don't try to predict it - } - const int next_token = prompt[pos + 1]; - loss += std::log(probs[pos * vocab_size + next_token]); - } - float scaling = -1.0 / std::log(2.0); - return loss * scaling; -} - -template -void ApplyForwardLayer(const LayerWeightsPtrs& weights, - ForwardLayer& activations, size_t num_tokens, - float* HWY_RESTRICT output, - const RowVectorBatch& inv_timescale, - hwy::ThreadPool& pool) { - const LayerConfig& config = weights.layer_config; - const size_t model_dim = config.model_dim; - const size_t kSeqLen = activations.input.Rows(); - const size_t kQKVDim = config.qkv_dim; - const size_t kHeads = config.heads; - static const float query_scale = - static_cast(1.0 / sqrt(static_cast(kQKVDim))); - HWY_ASSERT(num_tokens <= kSeqLen); - - ApplyRMSNorm(weights.pre_attention_norm_scale.data(), - activations.input.data(), model_dim, num_tokens, - activations.pre_att_rms_out.data(), pool); - - for (size_t pos = 0; pos < num_tokens; ++pos) { - MatVec(weights.qkv_einsum_w, 0, (kHeads + 2) * kQKVDim, model_dim, - activations.pre_att_rms_out.data() + pos * model_dim, - activations.qkv.data() + pos * (kHeads + 2) * kQKVDim, pool); - } - const size_t num_tasks = kHeads * num_tokens; - - for (size_t pos = 0; pos < num_tokens; ++pos) { - float* HWY_RESTRICT k = - activations.qkv.data() + (pos * (kHeads + 2) + kHeads) * kQKVDim; - Rope(k, kQKVDim, inv_timescale.Const(), pos); - } - pool.Run(0, num_tasks, [&](const uint64_t task, size_t thread) HWY_ATTR { - const size_t head = task % kHeads; - const size_t pos = task / kHeads; - float* HWY_RESTRICT q = - activations.qkv.data() + (pos * (kHeads + 2) + head) * kQKVDim; - Rope(q, kQKVDim, inv_timescale.Const(), pos); - MulByConst(query_scale, q, kQKVDim); - }); - - pool.Run(0, num_tasks, [&](const uint64_t task, size_t thread) HWY_ATTR { - const size_t head = task % kHeads; - const size_t pos = task / kHeads; - const float* 
HWY_RESTRICT q = - activations.qkv.data() + (pos * (kHeads + 2) + head) * kQKVDim; - float* HWY_RESTRICT head_att = - activations.att.data() + (pos * kHeads + head) * kSeqLen; - for (size_t pos2 = 0; pos2 <= pos; ++pos2) { - const float* HWY_RESTRICT k2 = - activations.qkv.data() + (pos2 * (kHeads + 2) + kHeads) * kQKVDim; - const float score = Dot(q, k2, kQKVDim); - head_att[pos2] = score; - } - }); - - pool.Run(0, num_tasks, [&](const uint64_t task, size_t thread) HWY_ATTR { - const size_t head = task % kHeads; - const size_t pos = task / kHeads; - float* HWY_RESTRICT head_att = - activations.att.data() + (pos * kHeads + head) * kSeqLen; - Softmax(head_att, pos + 1); - }); - - pool.Run(0, num_tasks, [&](const uint64_t task, size_t thread) HWY_ATTR { - const size_t head = task % kHeads; - const size_t pos = task / kHeads; - const float* HWY_RESTRICT head_att = - activations.att.data() + (pos * kHeads + head) * kSeqLen; - float* HWY_RESTRICT att_out = - activations.att_out.data() + (pos * kHeads + head) * kQKVDim; - hwy::ZeroBytes(att_out, kQKVDim * sizeof(*att_out)); - for (size_t pos2 = 0; pos2 <= pos; ++pos2) { - float* HWY_RESTRICT v2 = - activations.qkv.data() + (pos2 * (kHeads + 2) + kHeads + 1) * kQKVDim; - MulByConstAndAdd(head_att[pos2], v2, att_out, kQKVDim); - } - }); - - activations.attention_out.ZeroInit(); - for (size_t pos = 0; pos < num_tokens; ++pos) { - for (size_t head = 0; head < kHeads; ++head) { - MatVec( - weights.attn_vec_einsum_w, head * model_dim * kQKVDim, model_dim, - kQKVDim, - activations.att_out.data() + pos * kHeads * kQKVDim + head * kQKVDim, - activations.att_post1.data() + pos * model_dim, pool); - AddFrom(activations.att_post1.data() + pos * model_dim, - activations.attention_out.data() + pos * model_dim, model_dim); - } - } - - for (size_t pos = 0; pos < num_tokens; ++pos) { - AddFrom(activations.input.data() + pos * model_dim, - activations.attention_out.data() + pos * model_dim, model_dim); - } - - ApplyRMSNorm(weights.pre_ffw_norm_scale.data(), - activations.attention_out.data(), model_dim, num_tokens, - activations.bf_pre_ffw_rms_out.data(), pool); - const size_t kFFHiddenDim = config.ff_hidden_dim; - for (size_t pos = 0; pos < num_tokens; ++pos) { - MatVec(weights.gating_einsum_w, 0, kFFHiddenDim * 2, model_dim, - activations.bf_pre_ffw_rms_out.data() + pos * model_dim, - activations.ffw_hidden.data() + pos * kFFHiddenDim * 2, pool); - } - for (size_t pos = 0; pos < num_tokens; ++pos) { - const size_t hidden_offset = pos * kFFHiddenDim * 2; - const float* HWY_RESTRICT out = - activations.ffw_hidden.data() + hidden_offset; - const float* HWY_RESTRICT out_mul = out + kFFHiddenDim; - float* HWY_RESTRICT out_gated = - activations.ffw_hidden_gated.data() + pos * kFFHiddenDim; - namespace hn = hwy::HWY_NAMESPACE; - using DF = hn::ScalableTag; - DF df; - for (size_t i = 0; i < kFFHiddenDim; i += Lanes(df)) { - const auto y = hn::Load(df, out + i); - const auto x = hn::Load(df, out_mul + i); - hn::Store(hn::Mul(x, Gelu(df, y)), df, out_gated + i); - } - } - for (size_t pos = 0; pos < num_tokens; ++pos) { - MatVec(weights.linear_w, 0, model_dim, kFFHiddenDim, - activations.ffw_hidden_gated.data() + pos * kFFHiddenDim, - output + pos * model_dim, pool); - } - for (size_t pos = 0; pos < num_tokens; ++pos) { - AddFrom(activations.attention_out.data() + pos * model_dim, - output + pos * model_dim, model_dim); - } -} - -template -float CrossEntropyLossForwardPass(const std::vector& prompt, - size_t context_size, - const ModelWeightsPtrs& weights, - ForwardPass& 
forward, - const RowVectorBatch& inv_timescale, - hwy::ThreadPool& pool) { - const ModelConfig& config = weights.weights_config; - const size_t vocab_size = config.vocab_size; - const size_t model_dim = config.model_dim; - const size_t layers = config.layer_configs.size(); - const float emb_scaling = EmbeddingScaling(model_dim); - HWY_ASSERT(!config.absolute_pe); - HWY_ASSERT(config.layer_configs[0].post_norm == PostNormType::None); - HWY_ASSERT(config.layer_configs[0].kv_heads == 1); - - HWY_DASSERT(context_size > 0); - HWY_DASSERT(context_size < prompt.size()); - const size_t num_tokens = prompt.size() - 1; - - InputEmbedding(weights.embedder_input_embedding, prompt, emb_scaling, - forward.layers[0].input.data(), model_dim, vocab_size); - - for (size_t layer = 0; layer < config.layer_configs.size(); ++layer) { - auto type = config.layer_configs[layer].type; - // TODO(szabadka) Implement Griffin layer. - HWY_ASSERT(type == LayerAttentionType::kGemma); - float* HWY_RESTRICT output = layer + 1 < layers - ? forward.layers[layer + 1].input.data() - : forward.final_layer_output.data(); - ApplyForwardLayer(*weights.GetLayer(layer), forward.layers[layer], - num_tokens, output, inv_timescale, pool); - } - - ApplyRMSNorm(weights.final_norm_scale.data(), - forward.final_layer_output.data(), model_dim, num_tokens, - forward.final_norm_output.data(), pool); - - for (size_t pos = 0; pos < num_tokens; ++pos) { - MatVec(weights.embedder_input_embedding, 0, vocab_size, model_dim, - forward.final_norm_output.data() + pos * model_dim, - forward.logits.data() + pos * vocab_size, pool); - } - - if (config.final_cap > 0.0f) { - for (size_t pos = 0; pos < num_tokens; ++pos) { - LogitsSoftCap(config.final_cap, forward.logits.data() + pos * vocab_size, - vocab_size); - } - } - - hwy::CopyBytes(forward.logits.data(), forward.probs.data(), - num_tokens * vocab_size * sizeof(forward.logits.At(0))); - - for (size_t pos = 0; pos < num_tokens; ++pos) { - Softmax(forward.probs.data() + pos * vocab_size, vocab_size); - } - - return CrossEntropyLoss(forward.probs.data(), prompt, context_size, - vocab_size, pool); -} - -// NOLINTNEXTLINE(google-readability-namespace-comments) -} // namespace HWY_NAMESPACE -} // namespace gcpp -HWY_AFTER_NAMESPACE(); - -#endif // NOLINT diff --git a/backprop/forward.cc b/backprop/forward.cc deleted file mode 100644 index 0c6cc5c..0000000 --- a/backprop/forward.cc +++ /dev/null @@ -1,68 +0,0 @@ -// Copyright 2024 Google LLC -// SPDX-License-Identifier: Apache-2.0 -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// https://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "backprop/forward.h" - -#include "backprop/activations.h" -#include "backprop/prompt.h" -#include "gemma/common.h" -#include "gemma/configs.h" -#include "util/allocator.h" -#include "hwy/contrib/thread_pool/thread_pool.h" - -// Compiles this file for multiple architectures via "foreach_target.h", to -// which we pass the filename via macro 'argument'. 
-#undef HWY_TARGET_INCLUDE
-#define HWY_TARGET_INCLUDE "backprop/forward.cc"  // NOLINT
-#include "hwy/foreach_target.h"  // IWYU pragma: keep
-
-#include "hwy/highway.h"
-// After highway.h
-#include "backprop/forward-inl.h"
-#include "gemma/weights.h"
-
-HWY_BEFORE_NAMESPACE();
-namespace gcpp {
-namespace HWY_NAMESPACE {
-
-float CrossEntropyLossForwardPassT(const Prompt& prompt,
-                                   const ModelWeightsPtrs<float>& weights,
-                                   ForwardPass<float>& forward,
-                                   RowVectorBatch<float>& inv_timescale,
-                                   hwy::ThreadPool& pool) {
-  return CrossEntropyLossForwardPass(prompt.tokens, prompt.context_size,
-                                     weights, forward, inv_timescale, pool);
-}
-
-}  // namespace HWY_NAMESPACE
-}  // namespace gcpp
-HWY_AFTER_NAMESPACE();
-
-#if HWY_ONCE
-namespace gcpp {
-
-HWY_EXPORT(CrossEntropyLossForwardPassT);
-
-float CrossEntropyLossForwardPass(const Prompt& prompt,
-                                  const ModelWeightsPtrs<float>& weights,
-                                  ForwardPass<float>& forward,
-                                  RowVectorBatch<float>& inv_timescale,
-                                  hwy::ThreadPool& pool) {
-  return HWY_DYNAMIC_DISPATCH(CrossEntropyLossForwardPassT)(
-      prompt, weights, forward, inv_timescale, pool);
-}
-
-}  // namespace gcpp
-#endif  // HWY_ONCE
diff --git a/backprop/forward.h b/backprop/forward.h
deleted file mode 100644
index 3b42298..0000000
--- a/backprop/forward.h
+++ /dev/null
@@ -1,35 +0,0 @@
-// Copyright 2024 Google LLC
-// SPDX-License-Identifier: Apache-2.0
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// https://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef THIRD_PARTY_GEMMA_CPP_GEMMA_FORWARD_H_
-#define THIRD_PARTY_GEMMA_CPP_GEMMA_FORWARD_H_
-
-#include "backprop/activations.h"
-#include "backprop/prompt.h"
-#include "gemma/weights.h"
-#include "util/allocator.h"
-#include "hwy/contrib/thread_pool/thread_pool.h"
-
-namespace gcpp {
-
-float CrossEntropyLossForwardPass(const Prompt& prompt,
-                                  const ModelWeightsPtrs<float>& weights,
-                                  ForwardPass<float>& forward,
-                                  RowVectorBatch<float>& inv_timescale,
-                                  hwy::ThreadPool& pool);
-
-}  // namespace gcpp
-
-#endif  // THIRD_PARTY_GEMMA_CPP_GEMMA_FORWARD_H_
diff --git a/backprop/forward_scalar.h b/backprop/forward_scalar.h
deleted file mode 100644
index 617d0c3..0000000
--- a/backprop/forward_scalar.h
+++ /dev/null
@@ -1,294 +0,0 @@
-// Copyright 2024 Google LLC
-// SPDX-License-Identifier: Apache-2.0
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// https://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
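[Editor's note] The deleted backprop/forward.cc above uses Highway's foreach_target pattern: the translation unit is recompiled once per SIMD target, and HWY_EXPORT / HWY_DYNAMIC_DISPATCH select the best variant at runtime. A minimal sketch of the same pattern with a toy function (hypothetical file mylib.cc, not part of gemma.cpp; requires Highway):

// mylib.cc
#include <stddef.h>

#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "mylib.cc"  // this file is re-included per target
#include "hwy/foreach_target.h"
#include "hwy/highway.h"

HWY_BEFORE_NAMESPACE();
namespace mylib {
namespace HWY_NAMESPACE {  // distinct namespace per SIMD target

float SumT(const float* x, size_t n) {  // compiled once per target
  float sum = 0.0f;
  for (size_t i = 0; i < n; ++i) sum += x[i];
  return sum;
}

}  // namespace HWY_NAMESPACE
}  // namespace mylib
HWY_AFTER_NAMESPACE();

#if HWY_ONCE  // emitted only once, after all per-target passes
namespace mylib {
HWY_EXPORT(SumT);  // builds the table of per-target function pointers
float Sum(const float* x, size_t n) {
  return HWY_DYNAMIC_DISPATCH(SumT)(x, n);  // picks the best target at runtime
}
}  // namespace mylib
#endif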
-
-#ifndef THIRD_PARTY_GEMMA_CPP_GEMMA_FORWARD_SCALAR_H_
-#define THIRD_PARTY_GEMMA_CPP_GEMMA_FORWARD_SCALAR_H_
-
-#include <stddef.h>
-#include <string.h>
-
-#include <cmath>
-#include <complex>
-#include <vector>
-
-#include "backprop/activations.h"
-#include "backprop/common_scalar.h"
-#include "backprop/prompt.h"
-#include "gemma/common.h"  // EmbeddingScaling
-#include "gemma/weights.h"
-#include "hwy/base.h"
-
-namespace gcpp {
-
-// w is N x M matrix in row-major order, x is M x K matrix in column-major
-// order. y = w * x is N x K matrix in column-major order.
-template <typename T>
-void MatMulT(const T* w, const T* x, T* y, size_t N, size_t M, size_t K) {
-  for (size_t i = 0; i < K; ++i) {
-    for (size_t j = 0; j < N; ++j) {
-      y[i * N + j] = DotT(&w[j * M], &x[i * M], M);
-    }
-  }
-}
-
-// w is H concatenated N x M matrix in row-major order, x is HM x K matrix in
-// column-major order and y = w' * x is N x K matrix in column-major order,
-// where w' is the rearrangement of w into an N x HM matrix.
-template <typename T>
-void MultiHeadMatMul(const T* w, const T* x, T* y, size_t H, size_t N,
-                     size_t M, size_t K) {
-  memset(y, 0, N * K * sizeof(y[0]));
-  for (size_t i = 0; i < K; ++i) {
-    for (size_t h = 0; h < H; ++h) {
-      for (size_t j = 0; j < N; ++j) {
-        y[i * N + j] += DotT(&w[h * N * M + j * M], &x[i * H * M + h * M], M);
-      }
-    }
-  }
-}
-
-template <typename T>
-void RMSNormT(const T* w, const T* x, T* out, size_t N, size_t K) {
-  constexpr T eps(1e-6);
-  for (size_t i = 0; i < K; ++i) {
-    T ss = SquaredL2(x + i * N, N);
-    ss = T(1.0) / std::sqrt(ss / T(N) + eps);
-    for (size_t j = 0; j < N; j++) {
-      out[i * N + j] = (T(1.0) + w[j]) * (ss * x[i * N + j]);
-    }
-  }
-}
-template <typename T>
-void Softmax(T* x, size_t N) {
-  T sum = {};
-  auto maxreal = std::real(x[0]);
-  for (size_t i = 1; i < N; ++i) {
-    if (std::real(x[i]) > maxreal) {
-      maxreal = std::real(x[i]);
-    }
-  }
-  for (size_t i = 0; i < N; ++i) {
-    x[i] = std::exp(x[i] - maxreal);
-    sum += x[i];
-  }
-  T scale = T(1.0) / sum;
-  for (size_t i = 0; i < N; ++i) {
-    x[i] *= scale;
-  }
-}
-template <typename T>
-void Softmax(T* x, size_t N, size_t K) {
-  for (size_t i = 0; i < K; ++i) {
-    Softmax(x + i * N, N);
-  }
-}
-template <typename T>
-void Softcap(float cap, T* x, size_t N) {
-  const T inv_cap = T{1.0} / static_cast<T>(cap);
-  for (size_t i = 0; i < N; ++i) {
-    x[i] = static_cast<T>(cap) * std::tanh(x[i] * inv_cap);
-  }
-}
-
-template <typename T>
-void GatedGelu(const T* in, T* out, size_t N, size_t K) {
-  for (size_t i = 0; i < K; ++i) {
-    const T* x1 = in + i * 2 * N;
-    const T* x2 = x1 + N;
-    T* y = out + i * N;
-    for (size_t j = 0; j < N; ++j) {
-      y[j] = x2[j] * Gelu(x1[j]);
-    }
-  }
-}
-
-template <typename T>
-void InputEmbedding(const T* w, const std::vector<int>& tokens, T scaling,
-                    T* y, size_t N) {
-  HWY_ASSERT(w != nullptr);
-  HWY_ASSERT(y != nullptr);
-  const size_t num_tokens = tokens.empty() ? 0 : tokens.size() - 1;
-  for (size_t i = 0; i < num_tokens; ++i) {
-    int token = tokens[i];
-    memcpy(y + i * N, w + token * N, N * sizeof(y[0]));
-    MulByConstT(scaling, y + i * N, N);
-  }
-}
-
-template <typename T>
-void MaskedAttention(const T* qkv, T* output, size_t num_tokens, size_t heads,
-                     size_t qkv_dim, size_t seq_len) {
-  for (size_t pos = 0; pos < num_tokens; ++pos) {
-    for (size_t head = 0; head < heads; ++head) {
-      const size_t qoffset = pos * (heads + 2) * qkv_dim;
-      const size_t aoffset = pos * heads * seq_len + head * seq_len;
-      const T* q = qkv + qoffset + head * qkv_dim;
-      for (size_t pos2 = 0; pos2 <= pos; ++pos2) {
-        const T* k = qkv + (pos2 * (heads + 2) + heads) * qkv_dim;
-        output[aoffset + pos2] = DotT(q, k, qkv_dim);
-      }
-    }
-  }
-}
-template <typename T>
-void MaskedSoftmax(T* x, size_t num_tokens, size_t heads, size_t seq_len) {
-  for (size_t pos = 0; pos < num_tokens; ++pos) {
-    for (size_t head = 0; head < heads; ++head) {
-      size_t offset = pos * heads * seq_len + head * seq_len;
-      Softmax(x + offset, pos + 1);
-      memset(x + offset + pos + 1, 0, (seq_len - pos - 1) * sizeof(T));
-    }
-  }
-}
-template <typename T>
-void MixByAttention(const T* qkv, const T* attention, T* output,
-                    size_t num_tokens, size_t heads, size_t qkv_dim,
-                    size_t seq_len) {
-  for (size_t pos = 0; pos < num_tokens; ++pos) {
-    for (size_t head = 0; head < heads; ++head) {
-      const T* att = &attention[pos * heads * seq_len + head * seq_len];
-      T* out = &output[head * qkv_dim + pos * heads * qkv_dim];
-      memset(out, 0, qkv_dim * sizeof(out[0]));
-      for (size_t pos2 = 0; pos2 <= pos; ++pos2) {
-        size_t v_offset = (pos2 * (heads + 2) + heads + 1) * qkv_dim;
-        const T* v = &qkv[v_offset];
-        MulByConstAndAddT(att[pos2], v, out, qkv_dim);
-      }
-    }
-  }
-}
-template <typename T>
-void ApplyLayer(const LayerWeightsPtrs<T>& weights,
-                ForwardLayer<T>& activations, size_t num_tokens, T* output) {
-  const LayerConfig& layer_config = weights.layer_config;
-  const size_t model_dim = layer_config.model_dim;
-  const size_t seq_len = activations.input.Rows();
-  const size_t qkv_dim = layer_config.qkv_dim;
-  const size_t heads = layer_config.heads;
-  const size_t ff_hidden_dim = layer_config.ff_hidden_dim;
-  static const T query_scale = T(1.0) / std::sqrt(T(qkv_dim));
-
-  RMSNormT(weights.pre_attention_norm_scale.data(), activations.input.data(),
-           activations.pre_att_rms_out.data(), model_dim, num_tokens);
-
-  MatMulT(weights.qkv_einsum_w.data(), activations.pre_att_rms_out.data(),
-          activations.qkv.data(), (heads + 2) * qkv_dim, model_dim,
-          num_tokens);
-
-  for (size_t pos = 0; pos < num_tokens; ++pos) {
-    T* qkv = activations.qkv.data() + pos * (heads + 2) * qkv_dim;
-    for (size_t h = 0; h <= heads; ++h) {
-      Rope(qkv + h * qkv_dim, qkv_dim, pos);
-    }
-  }
-
-  for (size_t pos = 0; pos < num_tokens; ++pos) {
-    T* qkv = activations.qkv.data() + pos * (heads + 2) * qkv_dim;
-    MulByConstT(query_scale, qkv, heads * qkv_dim);
-  }
-
-  MaskedAttention(activations.qkv.data(), activations.att.data(), num_tokens,
-                  heads, qkv_dim, seq_len);
-
-  MaskedSoftmax(activations.att.data(), num_tokens, heads, seq_len);
-
-  MixByAttention(activations.qkv.data(), activations.att.data(),
                 activations.att_out.data(), num_tokens, heads, qkv_dim,
-                 seq_len);
-
-  MultiHeadMatMul(weights.attn_vec_einsum_w.data(),
-                  activations.att_out.data(), activations.attention_out.data(),
-                  heads, model_dim, qkv_dim, num_tokens);
-
-  AddFromT(activations.input.data(), activations.attention_out.data(),
-           num_tokens * model_dim);
-
-  RMSNormT(weights.pre_ffw_norm_scale.data(),
-           activations.attention_out.data(),
-           activations.bf_pre_ffw_rms_out.data(), model_dim, num_tokens);
-
-  MatMulT(weights.gating_einsum_w.data(),
-          activations.bf_pre_ffw_rms_out.data(), activations.ffw_hidden.data(),
-          ff_hidden_dim * 2, model_dim, num_tokens);
-
-  GatedGelu(activations.ffw_hidden.data(),
-            activations.ffw_hidden_gated.data(), ff_hidden_dim, num_tokens);
-
-  MatMulT(weights.linear_w.data(), activations.ffw_hidden_gated.data(),
-          output, model_dim, ff_hidden_dim, num_tokens);
-
-  AddFromT(activations.attention_out.data(), output, num_tokens * model_dim);
-}
-
-template <typename T>
-T CrossEntropyLoss(const T* x, const Prompt& prompt, size_t V) {
-  T loss = {};
-  const std::vector<int> tokens = prompt.tokens;
-  const size_t num_tokens = tokens.empty() ? 0 : tokens.size() - 1;
-  for (size_t i = 0; i < num_tokens; ++i) {
-    if (i + 1 < prompt.context_size) {
-      continue;  // next token is part of context, don't try to predict it
-    }
-    const int next_token = tokens[i + 1];
-    loss += std::log(x[i * V + next_token]);
-  }
-  T scaling = -1.0 / std::log(2.0);
-  return loss * scaling;
-}
-
-template <typename T>
-T CrossEntropyLossForwardPass(const Prompt& prompt,
-                              const ModelWeightsPtrs<T>& weights,
-                              ForwardPass<T>& forward) {
-  const ModelConfig& config = weights.weights_config;
-  const size_t model_dim = config.model_dim;
-  const size_t vocab_size = config.vocab_size;
-  const size_t layers = config.layer_configs.size();
-  const std::vector<int> tokens = prompt.tokens;
-  const size_t num_tokens = tokens.empty() ? 0 : tokens.size() - 1;
-
-  const T kEmbScaling = EmbeddingScaling(model_dim);
-  InputEmbedding(weights.embedder_input_embedding.data(), tokens, kEmbScaling,
-                 forward.layers[0].input.data(), model_dim);
-
-  for (size_t layer = 0; layer < layers; ++layer) {
-    T* output = layer + 1 < layers ? forward.layers[layer + 1].input.data()
-                                   : forward.final_layer_output.data();
-    ApplyLayer(*weights.GetLayer(layer), forward.layers[layer], num_tokens,
-               output);
-  }
-
-  RMSNormT(weights.final_norm_scale.data(),
-           forward.final_layer_output.data(),
-           forward.final_norm_output.data(), model_dim, num_tokens);
-
-  MatMulT(weights.embedder_input_embedding.data(),
-          forward.final_norm_output.data(), forward.logits.data(), vocab_size,
-          model_dim, num_tokens);
-
-  for (size_t pos = 0; pos < num_tokens; ++pos) {
-    if (config.final_cap > 0.0f) {
-      Softcap(config.final_cap, forward.logits.data() + pos * vocab_size,
-              vocab_size);
-    }
-  }
-
-  memcpy(forward.probs.data(), forward.logits.data(),
-         num_tokens * vocab_size * sizeof(forward.logits.At(0)));
-  Softmax(forward.probs.data(), vocab_size, num_tokens);
-
-  return CrossEntropyLoss(forward.probs.data(), prompt, vocab_size);
-}
-
-}  // namespace gcpp
-
-#endif  // THIRD_PARTY_GEMMA_CPP_GEMMA_FORWARD_SCALAR_H_
diff --git a/backprop/optimize_test.cc b/backprop/optimize_test.cc
deleted file mode 100644
index 6f08bf0..0000000
--- a/backprop/optimize_test.cc
+++ /dev/null
@@ -1,155 +0,0 @@
-// Copyright 2024 Google LLC
-// SPDX-License-Identifier: Apache-2.0
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// https://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
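[Editor's note] The MatMulT convention in the forward_scalar.h deletion above (w is N x M row-major, x is M x K column-major, y is N x K column-major) is easy to misread. A standalone numeric check with made-up values, using the same index arithmetic:

#include <cstddef>
#include <cstdio>

int main() {
  const size_t N = 2, M = 3, K = 1;
  // w (2x3, row-major): [[1 2 3], [4 5 6]]
  const float w[] = {1, 2, 3, 4, 5, 6};
  // x (3x1, column-major): single column (1, 1, 1), stored contiguously.
  const float x[] = {1, 1, 1};
  float y[N * K];
  for (size_t i = 0; i < K; ++i) {    // over columns of x / y
    for (size_t j = 0; j < N; ++j) {  // over rows of w
      float dot = 0.0f;
      for (size_t m = 0; m < M; ++m) dot += w[j * M + m] * x[i * M + m];
      y[i * N + j] = dot;  // column i of y is contiguous
    }
  }
  printf("%g %g\n", y[0], y[1]);  // prints 6 15, the row sums of w
}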
- -#include - -#include -#include -#include -#include - -#include "gtest/gtest.h" -#include "backprop/activations.h" -#include "backprop/backward.h" -#include "backprop/forward.h" -#include "backprop/optimizer.h" -#include "backprop/prompt.h" -#include "backprop/sampler.h" -#include "compression/shared.h" -#include "gemma/common.h" -#include "gemma/configs.h" -#include "gemma/gemma.h" -#include "gemma/weights.h" -#include "ops/ops.h" -#include "util/allocator.h" -#include "util/basics.h" -#include "util/threading.h" -#include "hwy/contrib/thread_pool/thread_pool.h" - -namespace gcpp { - -TEST(OptimizeTest, GradientDescent) { - const BoundedTopology topology(BoundedSlice(0, 1), BoundedSlice(0, 1)); - Allocator::Init(topology); - NestedPools pools(topology, 1, /*pin=*/Tristate::kFalse); - MatMulEnv env(topology, pools); - hwy::ThreadPool& pool = pools.Pool(); - std::mt19937 gen(42); - - const ModelInfo info = { - .model = Model::GEMMA_TINY, - .wrapping = PromptWrapping::GEMMA_IT, - .weight = Type::kF32, - }; - ModelConfig config = ConfigFromModel(info.model); - ModelWeightsStorage grad, grad_m, grad_v; - grad.Allocate(info.model, info.weight, pool); - grad_m.Allocate(info.model, info.weight, pool); - grad_v.Allocate(info.model, info.weight, pool); - grad_m.ZeroInit(); - grad_v.ZeroInit(); - ForwardPass forward(config), backward(config); - KVCache kv_cache = KVCache::Create(config, /*prefill_tbatch_size=*/16); - - RowVectorBatch inv_timescale = CreateInvTimescale( - config.layer_configs[0].qkv_dim, - config.layer_configs[0].post_qk == PostQKType::HalfRope); - - Gemma gemma(GemmaTokenizer(), info, env); - - const auto generate = [&](const std::vector& prompt) { - std::vector reply; - auto stream_token = [&reply](int token, float) { - reply.push_back(token); - return token != ReverseSequenceSampler::kEndToken; - }; - RuntimeConfig runtime = { - .max_generated_tokens = 16, - .temperature = 1.0f, - .gen = &gen, - .verbosity = 0, - .stream_token = stream_token, - .eos_id = ReverseSequenceSampler::kEndToken, - }; - TimingInfo timing_info; - gemma.Generate(runtime, prompt, 0, kv_cache, timing_info); - return reply; - }; - - // Sanity check of reply tokens. - // 1) Its length should be greater than the prompt. - // 2) The prompt should be a prefix of the reply. 
- auto verify = [&](const Prompt& prompt) { - const std::vector& context = prompt.context(); - std::vector reply = generate(context); - if (reply.size() <= context.size()) return false; - return std::equal(context.begin(), context.end(), reply.begin(), - reply.begin() + context.size()); - }; - - gemma.MutableWeights().RandInit(gen); - gemma.MutableWeights().AllocAndCopyWithTranspose(pool); - - printf("Initial weights:\n"); - gemma.MutableWeights().LogWeightStats(); - - constexpr size_t kBatchSize = 8; - const float alpha = 0.001f; - const float beta1 = 0.9f; - const float beta2 = 0.999f; - const float epsilon = 1e-8f; - - ReverseSequenceSampler training_task({ - 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1}); - size_t steps = 0; - size_t num_ok; - for (; steps < 1000000; ++steps) { - std::mt19937 sgen(42); - grad.ZeroInit(); - float total_loss = 0.0f; - num_ok = 0; - for (size_t i = 0; i < kBatchSize; ++i) { - Prompt prompt = training_task.Sample(sgen); - total_loss += CrossEntropyLossForwardPass( - prompt, *gemma.Weights().GetWeightsOfType(), forward, - inv_timescale, pool); - CrossEntropyLossBackwardPass( - prompt, *gemma.Weights().GetWeightsOfType(), forward, - *grad.GetWeightsOfType(), backward, inv_timescale, pool); - gemma.MutableWeights().CopyWithTranspose(pool); - num_ok += verify(prompt) ? 1 : 0; - } - total_loss /= kBatchSize; - - AdamUpdate(info.weight, grad, alpha, beta1, beta2, epsilon, steps + 1, - gemma.Weights(), grad_m, grad_v, pool); - printf("step: %zu total_loss: %.15f num_ok: %zu/%zu\n", - steps, total_loss, num_ok, kBatchSize); - if (steps % 100 == 0) { - printf("Batch gradient:\n"); - grad.LogWeightStats(); - } - if (total_loss < 0.5f) { - break; - } - } - printf("Num steps: %zu\n", steps); - printf("Final weights:\n"); - gemma.MutableWeights().LogWeightStats(); - EXPECT_LT(steps, 300); - EXPECT_EQ(num_ok, kBatchSize); -} - -} // namespace gcpp diff --git a/backprop/optimizer.cc b/backprop/optimizer.cc deleted file mode 100644 index 9187bf7..0000000 --- a/backprop/optimizer.cc +++ /dev/null @@ -1,95 +0,0 @@ -// Copyright 2024 Google LLC -// SPDX-License-Identifier: Apache-2.0 -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// https://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "backprop/optimizer.h" - -#include - -#include "compression/compress.h" -#include "gemma/common.h" -#include "gemma/weights.h" -#include "util/allocator.h" -#include "hwy/aligned_allocator.h" -#include "hwy/base.h" -#include "hwy/contrib/thread_pool/thread_pool.h" - -namespace gcpp { - -namespace { - -class AdamUpdater { - public: - explicit AdamUpdater(float alpha, float beta1, float beta2, float epsilon, - size_t t) - : alpha_(alpha), beta1_(beta1), beta2_(beta2), cbeta1_(1.0f - beta1), - cbeta2_(1.0f - beta2), norm1_(1.0 / (1.0 - std::pow(beta1, t))), - norm2_(1.0 / (1.0 - std::pow(beta2, t))), epsilon_(epsilon) {} - - void operator()(const char* name, const MatPtr& grad, MatPtr& weights, - MatPtr& grad_m, MatPtr& grad_v) { - const float* HWY_RESTRICT g = grad.data(); - float* HWY_RESTRICT w = weights.data(); - float* HWY_RESTRICT m = grad_m.data(); - float* HWY_RESTRICT v = grad_v.data(); - for (size_t i = 0; i < grad.NumElements(); ++i) { - m[i] *= beta1_; - m[i] += cbeta1_ * g[i]; - v[i] *= beta2_; - v[i] += cbeta2_ * g[i] * g[i]; - const float mhat = m[i] * norm1_; - const float vhat = v[i] * norm2_; - w[i] -= alpha_ * mhat / (std::sqrt(vhat) + epsilon_); - } - } - - private: - float alpha_; - float beta1_; - float beta2_; - float cbeta1_; - float cbeta2_; - float norm1_; - float norm2_; - float epsilon_; -}; - -void AdamUpdate(ModelWeightsPtrs* grad, float alpha, float beta1, - float beta2, float epsilon, size_t t, - ModelWeightsPtrs* weights, - ModelWeightsPtrs* grad_m, - ModelWeightsPtrs* grad_v, hwy::ThreadPool& pool) { - AdamUpdater updater(alpha, beta1, beta2, epsilon, t); - ModelWeightsPtrs::ForEachTensor( - {grad, weights, grad_m, grad_v}, ForEachType::kLoadNoToc, - [&updater](const char* name, hwy::Span tensors) { - updater(name, *tensors[0], *tensors[1], *tensors[2], *tensors[3]); - }); -} - -} // namespace - -void AdamUpdate(Type weight_type, const ModelWeightsStorage& grad, float alpha, - float beta1, float beta2, float epsilon, size_t t, - const ModelWeightsStorage& weights, - const ModelWeightsStorage& grad_m, - const ModelWeightsStorage& grad_v, hwy::ThreadPool& pool) { - HWY_ASSERT(weight_type == Type::kF32); - AdamUpdate(grad.GetWeightsOfType(), alpha, beta1, beta2, epsilon, t, - weights.GetWeightsOfType(), - grad_m.GetWeightsOfType(), grad_v.GetWeightsOfType(), - pool); -} - -} // namespace gcpp diff --git a/backprop/optimizer.h b/backprop/optimizer.h deleted file mode 100644 index 8b25c52..0000000 --- a/backprop/optimizer.h +++ /dev/null @@ -1,33 +0,0 @@ -// Copyright 2024 Google LLC -// SPDX-License-Identifier: Apache-2.0 -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// https://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-
-#ifndef THIRD_PARTY_GEMMA_CPP_GEMMA_OPTIMIZER_H_
-#define THIRD_PARTY_GEMMA_CPP_GEMMA_OPTIMIZER_H_
-
-#include "gemma/common.h"
-#include "gemma/weights.h"
-#include "hwy/contrib/thread_pool/thread_pool.h"
-
-namespace gcpp {
-
-void AdamUpdate(Type weight_type, const ModelWeightsStorage& grad, float alpha,
-                float beta1, float beta2, float epsilon, size_t t,
-                const ModelWeightsStorage& weights,
-                const ModelWeightsStorage& grad_m,
-                const ModelWeightsStorage& grad_v, hwy::ThreadPool& pool);
-
-}  // namespace gcpp
-
-#endif  // THIRD_PARTY_GEMMA_CPP_GEMMA_OPTIMIZER_H_
diff --git a/backprop/sampler.h b/backprop/sampler.h
deleted file mode 100644
index 17f5762..0000000
--- a/backprop/sampler.h
+++ /dev/null
@@ -1,90 +0,0 @@
-// Copyright 2024 Google LLC
-// SPDX-License-Identifier: Apache-2.0
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// https://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef THIRD_PARTY_GEMMA_CPP_GEMMA_SAMPLER_H_
-#define THIRD_PARTY_GEMMA_CPP_GEMMA_SAMPLER_H_
-
-#include <stddef.h>
-#include <stdio.h>
-
-#include <random>
-#include <vector>
-
-#include "backprop/prompt.h"
-
-namespace gcpp {
-
-class PromptSampler {
- public:
-  virtual Prompt Sample(std::mt19937& gen) = 0;
-  virtual ~PromptSampler() = default;
-
-  std::vector<Prompt> SampleBatch(size_t batch_size, std::mt19937& gen) {
-    std::vector<Prompt> batch;
-    batch.reserve(batch_size);
-    for (size_t i = 0; i < batch_size; ++i) {
-      batch.emplace_back(Sample(gen));
-    }
-    return batch;
-  }
-};
-
-class ReverseSequenceSampler : public PromptSampler {
- public:
-  explicit ReverseSequenceSampler(const std::vector<int>& length_histo)
-      : token_dist_(0, 9) {
-    for (int i = 0; i < length_histo.size(); ++i) {
-      const int count = length_histo[i];
-      for (int j = 0; j < count; ++j) {
-        length_lut_.push_back(i + 1);
-      }
-    }
-    length_dist_ = std::uniform_int_distribution<>(0, length_lut_.size() - 1);
-  }
-  virtual ~ReverseSequenceSampler() = default;
-
-  static constexpr int kReverseToken = 10;
-  static constexpr int kEndToken = 11;
-
-  Prompt Sample(std::mt19937& gen) override {
-    Prompt prompt;
-    int len = length_lut_[length_dist_(gen)];
-    prompt.tokens.resize(2 * len + 2);
-    prompt.tokens[len] = kReverseToken;
-    prompt.tokens[2 * len + 1] = kEndToken;
-    for (size_t i = 0; i < len; ++i) {
-      prompt.tokens[i] = prompt.tokens[2 * len - i] = token_dist_(gen);
-    }
-    prompt.context_size = len + 1;
-    return prompt;
-  }
-
-  static void LogPrompt(const Prompt& prompt) {
-    static const char* kVocab[] = {
-        "0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "-->", "|",
-    };
-    for (int token : prompt.tokens) printf("%s", kVocab[token]);
-    printf(" [context_size: %zu]\n", prompt.context_size);
-  }
-
- private:
-  std::uniform_int_distribution<> token_dist_;
-  std::uniform_int_distribution<> length_dist_;
-  std::vector<int> length_lut_;
-};
-
-}  // namespace gcpp
-
-#endif  // THIRD_PARTY_GEMMA_CPP_GEMMA_SAMPLER_H_
diff --git a/backprop/test_util.h b/backprop/test_util.h
deleted file mode 100644
index a83e3d5..0000000
--- a/backprop/test_util.h
+++ /dev/null
@@ -1,209 +0,0 @@
-// Copyright 2024 Google LLC
-// SPDX-License-Identifier: Apache-2.0
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// https://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef THIRD_PARTY_GEMMA_CPP_GEMMA_TEST_UTIL_H_
-#define THIRD_PARTY_GEMMA_CPP_GEMMA_TEST_UTIL_H_
-
-#include <stddef.h>
-
-#include <cmath>
-#include <complex>
-#include <random>
-#include <vector>
-
-#include "gtest/gtest.h"
-#include "compression/compress.h"
-#include "gemma/configs.h"
-#include "gemma/weights.h"
-#include "hwy/contrib/thread_pool/thread_pool.h"
-
-namespace gcpp {
-
-template <typename T>
-void RandInit(MatPtrT<T>& x, T stddev, std::mt19937& gen) {
-  std::normal_distribution<T> dist(0.0, stddev);
-  for (size_t i = 0; i < x.NumElements(); ++i) {
-    x.At(i) = dist(gen);
-  }
-}
-
-// TODO: make a member of Layer.
-template <typename T>
-void RandInit(LayerWeightsPtrs<T>& w, T stddev, std::mt19937& gen) {
-  RandInit(w.pre_attention_norm_scale, stddev, gen);
-  RandInit(w.attn_vec_einsum_w, stddev, gen);
-  RandInit(w.qkv_einsum_w, stddev, gen);
-  RandInit(w.pre_ffw_norm_scale, stddev, gen);
-  RandInit(w.gating_einsum_w, stddev, gen);
-  RandInit(w.linear_w, stddev, gen);
-}
-
-template <typename T>
-void RandInit(ModelWeightsPtrs<T>& w, T stddev, std::mt19937& gen) {
-  const size_t kLayers = w.c_layers.size();
-  RandInit(w.embedder_input_embedding, stddev, gen);
-  RandInit(w.final_norm_scale, stddev, gen);
-  for (size_t i = 0; i < kLayers; ++i) {
-    RandInit(*w.GetLayer(i), stddev, gen);
-  }
-}
-
-template <typename T, typename U>
-void Complexify(const MatPtrT<T>& x, MatPtrT<std::complex<U>>& c_x) {
-  for (size_t i = 0; i < x.NumElements(); ++i) {
-    c_x.At(i) = std::complex<U>(x.At(i), 0.0);
-  }
-}
-
-template <typename T, typename TC>
-void Complexify(const LayerWeightsPtrs<T>& w, LayerWeightsPtrs<TC>& c_w) {
-  Complexify(w.pre_attention_norm_scale, c_w.pre_attention_norm_scale);
-  Complexify(w.attn_vec_einsum_w, c_w.attn_vec_einsum_w);
-  Complexify(w.qkv_einsum_w, c_w.qkv_einsum_w);
-  Complexify(w.pre_ffw_norm_scale, c_w.pre_ffw_norm_scale);
-  Complexify(w.gating_einsum_w, c_w.gating_einsum_w);
-  Complexify(w.linear_w, c_w.linear_w);
-}
-
-template <typename T, typename TC>
-void Complexify(const ModelWeightsPtrs<T>& w, ModelWeightsPtrs<TC>& c_w) {
-  const size_t kLayers = w.c_layers.size();
-  Complexify(w.embedder_input_embedding, c_w.embedder_input_embedding);
-  Complexify(w.final_norm_scale, c_w.final_norm_scale);
-  for (size_t i = 0; i < kLayers; ++i) {
-    Complexify(*w.GetLayer(i), *c_w.GetLayer(i));
-  }
-}
-
-// Somewhat duplicates ModelWeightsStorage, but that has neither double nor
-// complex types allowed and it would cause code bloat to add them there.
-template <typename T>
-class WeightsWrapper {
- public:
-  explicit WeightsWrapper(const ModelConfig& config)
-      : pool_(0), weights_(config) {
-    weights_.Allocate(data_, pool_);
-  }
-
-  const ModelWeightsPtrs<T>& get() const { return weights_; }
-  ModelWeightsPtrs<T>& get() { return weights_; }
-  void ZeroInit() { weights_.ZeroInit(); }
-  void CopyFrom(const WeightsWrapper<T>& other) {
-    weights_.CopyFrom(other.weights_);
-  }
-
- private:
-  hwy::ThreadPool pool_;
-  std::vector<MatStorage> data_;
-  ModelWeightsPtrs<T> weights_;
-};
-
-template <typename T, typename U>
-void TestNear(const MatPtrT<T>& actual, const MatPtrT<U>& expected,
-              double max_abs_err, double max_rel_err, int line) {
-  double sum0 = 0;
-  double sum1 = 0;
-  double sum01 = 0;
-  for (size_t i = 0; i < actual.NumElements(); ++i) {
-    sum0 += actual.At(i) * actual.At(i);
-    sum1 += expected.At(i) * expected.At(i);
-    sum01 += actual.At(i) * expected.At(i);
-    ASSERT_NEAR(actual.At(i), expected.At(i),
-                std::max(max_abs_err, std::abs(expected.At(i)) * max_rel_err))
-        << "line: " << line << " dim=" << expected.NumElements()
-        << " i=" << i;
-  }
-  if (sum0 > 1e-40) {
-    double norm_dot = sum01 / std::sqrt(sum0) / std::sqrt(sum1);
-    ASSERT_NEAR(norm_dot, 1.0, 1e-7)
-        << "line: " << line << " sum0: " << sum0 << " sum1: " << sum1
-        << " sum01: " << sum01;
-  }
-}
-
-// Compute gradient with the finite difference method in the complex plane.
-// If f : R->R is the tested function and F : C->C is its extension on the
-// complex plane so that F is complex differentiable in x, then
-//
-//   F(x + ih) = F(x) + ih F'(x) + O(h^2) F''(x)
-//
-// which means that
-//
-//   F'(x) ~= Imag(F(x + ih)) / h
-//
-// This method is more numerically stable than the real-valued finite
-// difference method since we don't need to subtract floating point numbers
-// that are near to each other.
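[Editor's note] A standalone demo of the complex-step derivative described in the comment above, applied to a toy f(x) = x * exp(x) (not a gemma.cpp function). The exact derivative is (1 + x) * exp(x); the complex step recovers it to full precision because no subtraction of nearby values occurs, so the step can be as small as 1e-50:

#include <cmath>
#include <complex>
#include <cstdio>

int main() {
  const double x = 0.7, h = 1e-50;  // tiny step is fine: no cancellation
  const std::complex<double> zx(x, h);
  const std::complex<double> fx = zx * std::exp(zx);  // F(x + ih)
  const double grad = std::imag(fx) / h;              // ~= f'(x)
  const double exact = (1.0 + x) * std::exp(x);
  printf("complex-step: %.17g exact: %.17g\n", grad, exact);
}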
-template <typename T, typename U, typename FUNC>
-void TestGradient(const MatPtrT<T>& grad, MatPtrT<std::complex<U>>& x,
-                  FUNC func, U step, T max_abs_err, T max_rel_err, int line) {
-  MatStorageT<U> exp_grad("exp_grad", x.Rows(), x.Cols());
-  const U inv_step = 1.0 / step;
-  for (size_t i = 0; i < x.NumElements(); ++i) {
-    const U x0 = std::real(x.At(i));
-    const std::complex<U> x1 = std::complex<U>(x0, step);
-    x.At(i) = x1;
-    const std::complex<U> f1 = func();
-    exp_grad.At(i) = std::imag(f1) * inv_step;
-    x.At(i) = x0;
-  }
-  TestNear(grad, exp_grad, max_abs_err, max_rel_err, line);
-}
-
-template <typename T, typename FUNC>
-void TestGradient(const MatPtrT<T>& grad, MatPtrT<std::complex<float>>& x,
-                  FUNC func, float max_abs_err, float max_rel_error,
-                  int line) {
-  TestGradient(grad, x, func, 1e-30f, max_abs_err, max_rel_error, line);
-}
-
-template <typename T, typename FUNC>
-void TestGradient(const MatPtrT<T>& grad, MatPtrT<std::complex<double>>& x,
-                  FUNC func, T max_abs_err, T max_rel_error, int line) {
-  TestGradient(grad, x, func, 1e-50, max_abs_err, max_rel_error, line);
-}
-
-template <typename T, typename TC, typename FUNC>
-void TestGradient(const LayerWeightsPtrs<T>& grad,
-                  LayerWeightsPtrs<TC>& c_weights, FUNC func, T max_err) {
-  TestGradient(grad.pre_attention_norm_scale,
-               c_weights.pre_attention_norm_scale,
-               func, max_err, max_err, __LINE__);
-  TestGradient(grad.attn_vec_einsum_w, c_weights.attn_vec_einsum_w,
-               func, max_err, max_err, __LINE__);
-  TestGradient(grad.qkv_einsum_w, c_weights.qkv_einsum_w,
-               func, max_err, max_err, __LINE__);
-  TestGradient(grad.pre_ffw_norm_scale, c_weights.pre_ffw_norm_scale,
-               func, max_err, max_err, __LINE__);
-  TestGradient(grad.gating_einsum_w, c_weights.gating_einsum_w,
-               func, max_err, max_err, __LINE__);
-  TestGradient(grad.linear_w, c_weights.linear_w,
-               func, max_err, max_err, __LINE__);
-}
-
-template <typename T, typename TC, typename FUNC>
-void TestGradient(const ModelWeightsPtrs<T>& grad,
-                  ModelWeightsPtrs<TC>& c_weights, FUNC func, T max_err) {
-  TestGradient(grad.embedder_input_embedding,
-               c_weights.embedder_input_embedding,
-               func, 2 * max_err, max_err, __LINE__);
-  TestGradient(grad.final_norm_scale, c_weights.final_norm_scale,
-               func, max_err, max_err, __LINE__);
-  for (size_t i = 0; i < grad.c_layers.size(); ++i) {
-    TestGradient(*grad.GetLayer(i), *c_weights.GetLayer(i), func, max_err);
-  }
-}
-
-}  // namespace gcpp
-
-#endif  // THIRD_PARTY_GEMMA_CPP_GEMMA_TEST_UTIL_H_
diff --git a/build/.gitignore b/build/.gitignore
deleted file mode 100644
index 3822a0b..0000000
--- a/build/.gitignore
+++ /dev/null
@@ -1,3 +0,0 @@
-*
-!.gitignore
-!.hgignore
\ No newline at end of file
diff --git a/compression/BUILD.bazel b/compression/BUILD.bazel
index f12ca59..0104ef5 100644
--- a/compression/BUILD.bazel
+++ b/compression/BUILD.bazel
@@ -1,4 +1,4 @@
-# Weight compression, I/O and analysis
+# Weight compression and analysis.
package( default_applicable_licenses = [ @@ -20,78 +20,11 @@ config_setting( visibility = ["//visibility:private"], ) -FILE_DEPS = select({ - "//conditions:default": [ - # Placeholder for io deps, do not remove - ], - ":android": [], - # Placeholder for internal build rules, do not remove -}) - -cc_library( - name = "io", - srcs = [ - "io.cc", - # Placeholder for io backend, do not remove - ], - hdrs = ["io.h"], - local_defines = select({ - # Placeholder for internal build rules, do not remove - "//conditions:default": [], - }), - deps = [ - "@highway//:hwy", - ] + FILE_DEPS, -) - -cc_library( - name = "fields", - srcs = ["fields.cc"], - hdrs = ["fields.h"], - deps = [ - "@highway//:hwy", - ], -) - -cc_test( - name = "fields_test", - srcs = ["fields_test.cc"], - deps = [ - ":fields", - "@googletest//:gtest_main", # buildcleaner: keep - "@highway//:hwy_test_util", - ], -) - -cc_library( - name = "blob_store", - srcs = ["blob_store.cc"], - hdrs = ["blob_store.h"], - deps = [ - ":io", - "@highway//:hwy", - "@highway//:thread_pool", - ], -) - -cc_test( - name = "blob_store_test", - srcs = ["blob_store_test.cc"], - deps = [ - ":blob_store", - ":io", - "@googletest//:gtest_main", # buildcleaner: keep - "@highway//:hwy", - "@highway//:hwy_test_util", - "@highway//:thread_pool", - ], -) - cc_library( name = "distortion", hdrs = [ "distortion.h", - "shared.h", + "types.h", ], deps = [ "//:basics", @@ -115,21 +48,29 @@ cc_test( ) cc_library( - name = "sfp", - hdrs = ["shared.h"], - textual_hdrs = ["sfp-inl.h"], + name = "types", + hdrs = ["types.h"], deps = [ "//:basics", "@highway//:hwy", ], ) +cc_library( + name = "sfp", + textual_hdrs = ["sfp-inl.h"], + deps = [ + ":types", + "@highway//:hwy", + ], +) + cc_library( name = "nuq", - hdrs = ["shared.h"], textual_hdrs = ["nuq-inl.h"], deps = [ ":sfp", + ":types", "//:basics", "@highway//:hwy", "@highway//hwy/contrib/sort:vqsort", @@ -144,8 +85,10 @@ cc_library( deps = [ ":compress", ":distortion", + "//:mat", "@highway//:hwy", "@highway//:hwy_test_util", + "@highway//:thread_pool", ], ) @@ -153,7 +96,6 @@ cc_test( name = "sfp_test", size = "small", srcs = ["sfp_test.cc"], - features = ["fully_static_link"], linkstatic = True, local_defines = ["HWY_IS_TEST"], # for test_suite. @@ -174,7 +116,6 @@ cc_test( size = "small", timeout = "long", srcs = ["nuq_test.cc"], - features = ["fully_static_link"], linkstatic = True, local_defines = ["HWY_IS_TEST"], # for test_suite. @@ -182,7 +123,6 @@ cc_test( deps = [ ":distortion", ":nuq", - ":sfp", "@googletest//:gtest_main", # buildcleaner: keep "//:test_util", "@highway//:hwy", @@ -196,21 +136,18 @@ cc_library( srcs = ["compress.cc"], hdrs = [ "compress.h", - "shared.h", + "types.h", ], textual_hdrs = ["compress-inl.h"], deps = [ - ":blob_store", ":distortion", - ":fields", - ":io", ":nuq", ":sfp", - "//:allocator", "//:basics", - "//:common", + "//:mat", "@highway//:hwy", "@highway//:nanobenchmark", + "@highway//:profiler", "@highway//:stats", "@highway//:thread_pool", ], @@ -221,7 +158,6 @@ cc_test( size = "small", timeout = "long", srcs = ["compress_test.cc"], - features = ["fully_static_link"], linkstatic = True, local_defines = ["HWY_IS_TEST"], # for test_suite. 
@@ -245,51 +181,10 @@ cc_library( deps = [ ":nuq", ":sfp", + ":types", "@highway//:hwy", "@highway//:stats", "@highway//:thread_pool", "@highway//hwy/contrib/sort:vqsort", ], ) - -cc_binary( - name = "compress_weights", - srcs = ["compress_weights.cc"], - deps = [ - ":compress", - ":io", - "//:allocator", - "//:args", - "//:common", - "//:tokenizer", - "//:weights", - "@highway//:hwy", - "@highway//:thread_pool", - ], -) - -cc_binary( - name = "blob_compare", - srcs = ["blob_compare.cc"], - deps = [ - ":blob_store", - ":io", - "//:allocator", - "//:basics", - "//:threading", - "@highway//:hwy", - "@highway//:hwy_test_util", - "@highway//:nanobenchmark", - ], -) - -cc_binary( - name = "migrate_weights", - srcs = ["migrate_weights.cc"], - deps = [ - "//:app", - "//:args", - "//:benchmark_helper", - "//:gemma_lib", - ], -) diff --git a/compression/analyze.h b/compression/analyze.h index 38537db..7d41633 100644 --- a/compression/analyze.h +++ b/compression/analyze.h @@ -26,7 +26,7 @@ #include // std::abs #include -#include "compression/shared.h" +#include "compression/types.h" #include "hwy/base.h" #include "hwy/contrib/thread_pool/thread_pool.h" #include "hwy/stats.h" diff --git a/compression/blob_compare.cc b/compression/blob_compare.cc deleted file mode 100644 index c0fe63c..0000000 --- a/compression/blob_compare.cc +++ /dev/null @@ -1,230 +0,0 @@ -// Copyright 2024 Google LLC -// SPDX-License-Identifier: Apache-2.0 -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// https://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include -#include -#include - -#include -#include - -#include "compression/blob_store.h" -#include "compression/io.h" // Path -#include "util/allocator.h" -#include "util/basics.h" // IndexRange -#include "util/threading.h" -#include "hwy/aligned_allocator.h" // Span -#include "hwy/base.h" -#include "hwy/timer.h" - -namespace gcpp { - -using KeySpan = hwy::Span; - -// Returns false if any keys differ, because then blobs are not comparable. -bool CompareKeys(const BlobReader& reader1, const BlobReader& reader2) { - KeySpan keys1 = reader1.Keys(); - KeySpan keys2 = reader2.Keys(); - if (keys1.size() != keys2.size()) { - fprintf(stderr, "#keys mismatch: %zu vs %zu\n", keys1.size(), keys2.size()); - return false; - } - for (size_t i = 0; i < keys1.size(); ++i) { - if (keys1[i] != keys2[i]) { - fprintf(stderr, "key %zu mismatch: %s vs %s\n", i, - StringFromKey(keys1[i]).c_str(), StringFromKey(keys2[i]).c_str()); - return false; - } - } - - return true; -} - -// Total amount to allocate for all blobs. -size_t TotalBytes(BlobReader& reader) { - size_t total_bytes = 0; - for (const hwy::uint128_t key : reader.Keys()) { - total_bytes += reader.BlobSize(key); - } - return total_bytes; -} - -using BytePtr = hwy::AlignedFreeUniquePtr; -using ByteSpan = hwy::Span; // Sections within BytePtr -using BlobVec = std::vector; // in order of keys - -// Allocates memory within the single allocation and updates `pos`. 
-BlobVec ReserveMemory(BlobReader& reader, BytePtr& all_blobs, size_t& pos) { - BlobVec blobs; - for (const hwy::uint128_t key : reader.Keys()) { - const size_t bytes = reader.BlobSize(key); - blobs.push_back(ByteSpan(all_blobs.get() + pos, bytes)); - pos += bytes; - } - return blobs; -} - -// Reads one set of blobs in parallel (helpful if in disk cache). -void ReadBlobs(BlobReader& reader, BlobVec& blobs, hwy::ThreadPool& pool) { - HWY_ASSERT(reader.Keys().size() == blobs.size()); - for (size_t i = 0; i < blobs.size(); ++i) { - reader.Enqueue(reader.Keys()[i], blobs[i].data(), blobs[i].size()); - } - const BlobError err = reader.ReadAll(pool); - if (err != 0) { - HWY_ABORT("Parallel read failed: %d\n", err); - } -} - -// Parallelizes ReadBlobs across (two) packages, if available. -void ReadBothBlobs(BlobReader& reader1, BlobReader& reader2, size_t total_bytes, - BlobVec& blobs1, BlobVec& blobs2, NestedPools& pools) { - const double t0 = hwy::platform::Now(); - fprintf(stderr, "Reading %zu GiB, %zux%zu cores: ", total_bytes >> 30, - pools.AllPackages().NumWorkers(), pools.Pool().NumWorkers()); - pools.AllPackages().Run(0, 2, [&](size_t task, size_t pkg_idx) { - ReadBlobs(task ? reader2 : reader1, task ? blobs2 : blobs1, - pools.Pool(pkg_idx)); - }); - const double t1 = hwy::platform::Now(); - fprintf(stderr, "%.1f GB/s\n", total_bytes / (t1 - t0) * 1E-9); -} - -// Returns number of elements with a mismatch. For float and bf16 blobs, uses -// L1 and relative error, otherwise byte-wise comparison. -size_t BlobDifferences(const ByteSpan& data1, const ByteSpan& data2, - const hwy::uint128_t key) { - if (data1.size() != data2.size() || data1.size() == 0) { - HWY_ABORT("key %s size mismatch: %zu vs %zu\n", StringFromKey(key).c_str(), - data1.size(), data2.size()); - } - - size_t mismatches = 0; - char type; - hwy::CopyBytes(&key, &type, 1); - if (type == 'F') { - HWY_ASSERT(data1.size() % sizeof(float) == 0); - for (size_t j = 0; j < data1.size(); j += sizeof(float)) { - float f1, f2; - hwy::CopyBytes(&data1[j], &f1, sizeof(f1)); - hwy::CopyBytes(&data2[j], &f2, sizeof(f2)); - const float l1 = hwy::ScalarAbs(f1 - f2); - const float rel = hwy::ScalarAbs(f1) == 0.0f ? 0.0f : l1 / f1; - if (l1 > 1E-3f || rel > 1E-2f) { - fprintf(stderr, "key %s %5zu: L1 %.5f rel %.4f\n", - StringFromKey(key).c_str(), j, l1, rel); - ++mismatches; - } - } - } else if (type == 'B') { - for (size_t j = 0; j < data1.size(); j += sizeof(hwy::bfloat16_t)) { - hwy::bfloat16_t b1, b2; - hwy::CopyBytes(&data1[j], &b1, sizeof(b1)); - hwy::CopyBytes(&data2[j], &b2, sizeof(b2)); - const float f1 = hwy::ConvertScalarTo(b1); - const float f2 = hwy::ConvertScalarTo(b2); - const float l1 = hwy::ScalarAbs(f1 - f2); - const float rel = hwy::ScalarAbs(f1) == 0.0f ? 
0.0f : l1 / f1; - if (l1 > 1E-2f || rel > 1E-1f) { - fprintf(stderr, "key %s %5zu: L1 %.5f rel %.4f\n", - StringFromKey(key).c_str(), j, l1, rel); - ++mismatches; - } - } - } else { - for (size_t j = 0; j < data1.size(); ++j) { - if (data1[j] != data2[j]) { - if (mismatches == 0) { - fprintf(stderr, "key %s mismatch at byte %5zu\n", - StringFromKey(key).c_str(), j); - } - ++mismatches; - } - } - } - return mismatches; -} - -void CompareBlobs(const KeySpan& keys, BlobVec& blobs1, BlobVec& blobs2, - size_t total_bytes, NestedPools& pools) { - fprintf(stderr, "Comparing %zu blobs in parallel: ", keys.size()); - const double t0 = hwy::platform::Now(); - std::atomic blobs_equal{}; - std::atomic blobs_diff{}; - const IndexRangePartition ranges = StaticPartition( - IndexRange(0, keys.size()), pools.AllPackages().NumWorkers(), 1); - ParallelizeOneRange( - ranges, pools.AllPackages(), - [&](const IndexRange& range, size_t pkg_idx) { - pools.Pool(pkg_idx).Run( - range.begin(), range.end(), [&](size_t i, size_t /*thread*/) { - const size_t mismatches = - BlobDifferences(blobs1[i], blobs2[i], keys[i]); - if (mismatches != 0) { - fprintf(stderr, "key %s has %zu mismatches in %zu bytes!\n", - StringFromKey(keys[i]).c_str(), mismatches, - blobs1[i].size()); - blobs_diff.fetch_add(1); - } else { - blobs_equal.fetch_add(1); - } - }); - }); - const double t1 = hwy::platform::Now(); - fprintf(stderr, "%.1f GB/s; total blob matches=%zu, mismatches=%zu\n", - total_bytes / (t1 - t0) * 1E-9, blobs_equal.load(), - blobs_diff.load()); -} - -// Compares two sbs files, including blob order. -void ReadAndCompareBlobs(const char* path1, const char* path2) { - // Open files. - BlobReader reader1; - BlobReader reader2; - const BlobError err1 = reader1.Open(Path(path1)); - const BlobError err2 = reader2.Open(Path(path2)); - if (err1 != 0 || err2 != 0) { - HWY_ABORT("Failed to open files: %s %s: %d %d\n", path1, path2, err1, err2); - } - - if (!CompareKeys(reader1, reader2)) return; - - // Single allocation, avoid initializing the memory. - BoundedTopology topology; - Allocator::Init(topology); - NestedPools pools(topology); - const size_t total_bytes = TotalBytes(reader1) + TotalBytes(reader2); - BytePtr all_blobs = hwy::AllocateAligned(total_bytes); - size_t pos = 0; - BlobVec blobs1 = ReserveMemory(reader1, all_blobs, pos); - BlobVec blobs2 = ReserveMemory(reader2, all_blobs, pos); - - ReadBothBlobs(reader1, reader2, total_bytes, blobs1, blobs2, pools); - - CompareBlobs(reader1.Keys(), blobs1, blobs2, total_bytes, pools); -} - -} // namespace gcpp - -int main(int argc, char** argv) { - if (argc != 3) { - HWY_ABORT("Usage: %s \n", argv[0]); - } - if (strcmp(argv[1], argv[2]) == 0) { - HWY_ABORT("Filenames are the same, skipping comparison: %s\n", argv[1]); - } - gcpp::ReadAndCompareBlobs(argv[1], argv[2]); - return 0; -} diff --git a/compression/blob_store.cc b/compression/blob_store.cc deleted file mode 100644 index 06bcb56..0000000 --- a/compression/blob_store.cc +++ /dev/null @@ -1,341 +0,0 @@ -// Copyright 2024 Google LLC -// SPDX-License-Identifier: Apache-2.0 -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// https://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. - -#include "compression/blob_store.h" - -#include -#include - -#include -#include -#include -#include -#include - -#include "compression/io.h" -#include "hwy/aligned_allocator.h" -#include "hwy/base.h" -#include "hwy/contrib/thread_pool/thread_pool.h" -#include "hwy/detect_compiler_arch.h" - -namespace gcpp { - -hwy::uint128_t MakeKey(const char* string) { - size_t length = 0; - for (size_t i = 0; string[i] != '\0'; ++i) { - ++length; - } - if (length > 16) { - HWY_ABORT("Key %s is too long, please truncate to 16 chars.", string); - } - - hwy::uint128_t ret; - hwy::ZeroBytes(&ret); - hwy::CopyBytes(string, &ret, length); - return ret; -} - -std::string StringFromKey(hwy::uint128_t key) { - std::string name(sizeof(key) + 1, '\0'); - hwy::CopyBytes(&key, name.data(), sizeof(key)); - name.resize(name.find('\0')); - return name; -} - -namespace { -void EnqueueChunkRequests(uint64_t offset, uint64_t size, uint8_t* data, - std::vector& requests) { - // Split into chunks for load-balancing even if blob sizes vary. - constexpr size_t kChunkSize = 4 * 1024 * 1024; // bytes - - // Split into whole chunks and possibly one remainder. - uint64_t pos = 0; - if (size >= kChunkSize) { - for (; pos <= size - kChunkSize; pos += kChunkSize) { - requests.emplace_back(offset + pos, kChunkSize, data + pos, 0); - } - } - if (pos != size) { - requests.emplace_back(offset + pos, size - pos, data + pos, 0); - } -} -} // namespace - -static_assert(HWY_IS_LITTLE_ENDIAN, "Assumes little endian"); - -// On-disk representation (little-endian). -// -// Deliberately omits a version number because this file format is unchanging. -// Additional data may be added only inside new blobs. Changes to the blob -// contents or type should be handled by renaming keys. -#pragma pack(push, 1) -class BlobStore { - static constexpr uint32_t kMagic = 0x0A534253; // SBS\n - - public: - // NOT including padding, so that we can also use ZeroFillPadding after - // copying the header. - static constexpr size_t HeaderSize(size_t num_blobs) { - // 16-byte fixed fields plus per-blob: 16-byte key, 16-byte offset/size. - return 16 + 32 * num_blobs; - } - - // Returns how many bytes to allocate for the header without the subsequent - // blobs. Requires num_blobs_ to already be set, typically by reading - // sizeof(BlobStore) bytes from disk. - size_t PaddedHeaderSize() const { - return hwy::RoundUpTo(HeaderSize(num_blobs_), kBlobAlign); - } - - // Returns aligned offset and zero-fills between that and `offset`. - uint64_t ZeroFillPadding(uint64_t offset) { - uint8_t* const bytes = reinterpret_cast(this); - const uint64_t padded = hwy::RoundUpTo(offset, kBlobAlign); - hwy::ZeroBytes(bytes + offset, padded - offset); - return padded; - } - - BlobError CheckValidity(const uint64_t file_size) { - if (magic_ != kMagic) return __LINE__; - if (num_blobs_ == 0) return __LINE__; - if (file_size_ != file_size) return __LINE__; - - // Ensure blobs are back to back, and zero-pad. 
- uint64_t offset = ZeroFillPadding(HeaderSize(num_blobs_)); - for (size_t i = 0; i < num_blobs_; ++i) { - const hwy::uint128_t val = keys_[num_blobs_ + i]; - if (val.lo != offset) return __LINE__; - offset = hwy::RoundUpTo(offset + val.hi, kBlobAlign); - } - - if (offset != file_size_) return __LINE__; - - return 0; // all OK - } - - static BlobStorePtr Allocate(uint64_t total_size) { - uint8_t* bytes = - static_cast<uint8_t*>(hwy::AllocateAlignedBytes(total_size)); - if (!bytes) return BlobStorePtr(); - return BlobStorePtr(new (bytes) BlobStore(), hwy::AlignedFreer()); - } - - static std::vector<BlobIO> PrepareWriteRequests( - const hwy::uint128_t keys[], const hwy::Span<const uint8_t> blobs[], - size_t num_blobs, BlobStore* bs) { - // Sanity check and ensure the cast below is safe. - HWY_ASSERT(num_blobs < (1ULL << 20)); - - // Allocate var-length header. - const size_t header_size = HeaderSize(num_blobs); - const size_t padded_header_size = hwy::RoundUpTo(header_size, kBlobAlign); - const uint64_t padded_header_end = bs->ZeroFillPadding(header_size); - HWY_ASSERT(padded_header_end == padded_header_size); - - // All-zero buffer used to write padding to the file without copying the - // input blobs. - static uint8_t zeros[kBlobAlign] = {0}; - - // Total file size will be the header plus all padded blobs. - uint64_t payload = 0; - for (size_t i = 0; i < num_blobs; ++i) { - payload += hwy::RoundUpTo(blobs[i].size(), kBlobAlign); - } - const size_t total_size = padded_header_size + payload; - - // Fill header. - bs->magic_ = kMagic; - bs->num_blobs_ = static_cast<uint32_t>(num_blobs); - bs->file_size_ = total_size; - hwy::CopyBytes(keys, bs->keys_, num_blobs * sizeof(keys[0])); - - // First IO request is for the header (not yet filled!). - std::vector<BlobIO> requests; - requests.reserve(1 + 2 * num_blobs); - requests.emplace_back(/*offset=*/0, padded_header_size, - reinterpret_cast<uint8_t*>(bs), 0); - - // Fill second half of keys_ with offset/size and prepare IO requests. - uint64_t offset = padded_header_end; - for (size_t i = 0; i < num_blobs; ++i) { - bs->keys_[num_blobs + i].lo = offset; - bs->keys_[num_blobs + i].hi = blobs[i].size(); - - EnqueueChunkRequests(offset, blobs[i].size(), - const_cast<uint8_t*>(blobs[i].data()), requests); - offset += blobs[i].size(); - const size_t padded_size = hwy::RoundUpTo(blobs[i].size(), kBlobAlign); - if (padded_size != blobs[i].size()) { - const size_t padding = padded_size - blobs[i].size(); - HWY_ASSERT(padding <= kBlobAlign); - requests.emplace_back(offset, padding, zeros, 0); - offset += padding; - } - } - - HWY_ASSERT(offset == total_size); - return requests; - } - - bool FindKey(const hwy::uint128_t key, uint64_t& offset, size_t& size) const { - for (size_t i = 0; i < num_blobs_; ++i) { - if (keys_[i] == key) { - const hwy::uint128_t val = keys_[num_blobs_ + i]; - offset = val.lo; - size = val.hi; - return true; - } - } - return false; - } - - hwy::Span<const hwy::uint128_t> Keys() const { - return hwy::Span<const hwy::uint128_t>(keys_, num_blobs_); - } - - private: - uint32_t magic_; - uint32_t num_blobs_; // never 0 - uint64_t file_size_; // must match actual size of file - hwy::uint128_t keys_[1]; // length: 2 * num_blobs - // Padding, then the blob identified by keys[0], then padding etc. -}; -#pragma pack(pop) - -BlobError BlobReader::Open(const Path& filename) { - file_ = OpenFileOrNull(filename, "r"); - if (!file_) return __LINE__; - - // Read first part of header to get actual size.
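// (Reading sizeof(BlobStore) bytes suffices to learn num_blobs_, since the
// fixed fields precede keys_; PaddedHeaderSize() then yields the full header
// length for the second read below. A minimal usage sketch of the reader,
// with placeholder key/destination names:
//   BlobReader reader;
//   if (reader.Open(Path("weights.sbs")) == 0) {
//     const hwy::uint128_t key = MakeKey("blob_name");
//     reader.Enqueue(key, dest, reader.BlobSize(key));
//     reader.ReadAll(pool);
//   })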
- BlobStore bs; - if (!file_->Read(0, sizeof(bs), &bs)) return __LINE__; - const size_t padded_size = bs.PaddedHeaderSize(); - HWY_ASSERT(padded_size >= sizeof(bs)); - - // Allocate full header. - blob_store_ = BlobStore::Allocate(padded_size); - if (!blob_store_) return __LINE__; - - // Copy what we already read (more efficient than seek + re-read). - hwy::CopySameSize(&bs, blob_store_.get()); - // Read the rest of the header, but not the full file. - uint8_t* bytes = reinterpret_cast(blob_store_.get()); - if (!file_->Read(sizeof(bs), padded_size - sizeof(bs), bytes + sizeof(bs))) { - return __LINE__; - } - - return blob_store_->CheckValidity(file_->FileSize()); -} - -size_t BlobReader::BlobSize(hwy::uint128_t key) const { - uint64_t offset; - size_t size; - if (!blob_store_->FindKey(key, offset, size)) return 0; - return size; -} - -BlobError BlobReader::Enqueue(hwy::uint128_t key, void* data, size_t size) { - uint64_t offset; - size_t actual_size; - if (!blob_store_->FindKey(key, offset, actual_size)) return __LINE__; - if (actual_size != size) { - fprintf(stderr, - "Mismatch between expected %d and actual %d KiB size of blob %s. " - "Please see README.md on how to update the weights.\n", - static_cast(size >> 10), static_cast(actual_size >> 10), - StringFromKey(key).c_str()); - return __LINE__; - } - - EnqueueChunkRequests(offset, actual_size, reinterpret_cast(data), - requests_); - return 0; -} - -// Parallel synchronous I/O. Alternatives considered: -// - readv is limited to 0x7FFFF000 bytes on Linux (even 64-bit). Note that -// pread calls preadv with a single iovec. -// - O_DIRECT seems undesirable because we do want to use the OS cache -// between consecutive runs. -// - memory-mapped I/O is less predictable and adds noise to measurements. -BlobError BlobReader::ReadAll(hwy::ThreadPool& pool) { - File* pfile = file_.get(); // not owned - const auto& requests = requests_; - std::atomic_flag err = ATOMIC_FLAG_INIT; - // >5x speedup from parallel reads when cached. - pool.Run(0, requests.size(), - [pfile, &requests, &err](uint64_t i, size_t /*thread*/) { - if (!pfile->Read(requests[i].offset, requests[i].size, - requests[i].data)) { - fprintf(stderr, "Failed to read blob %zu\n", - static_cast(i)); - err.test_and_set(); - } - }); - if (err.test_and_set()) return __LINE__; - return 0; -} - -BlobError BlobReader::ReadOne(hwy::uint128_t key, void* data, - size_t size) const { - uint64_t offset; - size_t actual_size; - if (!blob_store_->FindKey(key, offset, actual_size)) return __LINE__; - if (actual_size != size) { - fprintf(stderr, - "Mismatch between expected %d and actual %d KiB size of blob %s. " - "Please see README.md on how to update the weights.\n", - static_cast(size >> 10), static_cast(actual_size >> 10), - StringFromKey(key).c_str()); - return __LINE__; - } - if (!file_->Read(offset, actual_size, data)) { - return __LINE__; - } - return 0; -} - -hwy::Span BlobReader::Keys() const { - return blob_store_->Keys(); -} - -BlobError BlobWriter::WriteAll(hwy::ThreadPool& pool, const Path& filename) { - HWY_ASSERT(keys_.size() == blobs_.size()); - - // Concatenate blobs in memory. - const size_t header_size = BlobStore::HeaderSize(keys_.size()); - const size_t padded_header_size = hwy::RoundUpTo(header_size, kBlobAlign); - const BlobStorePtr bs = BlobStore::Allocate(padded_header_size); - const std::vector requests = BlobStore::PrepareWriteRequests( - keys_.data(), blobs_.data(), keys_.size(), bs.get()); - - // Create/replace existing file. 
- std::unique_ptr file = OpenFileOrNull(filename, "w+"); - if (!file) return __LINE__; - File* pfile = file.get(); // not owned - - std::atomic_flag err = ATOMIC_FLAG_INIT; - pool.Run(0, requests.size(), - [pfile, &requests, &err](uint64_t i, size_t /*thread*/) { - if (!pfile->Write(requests[i].data, requests[i].size, - requests[i].offset)) { - err.test_and_set(); - } - }); - if (err.test_and_set()) return __LINE__; - return 0; -} - -} // namespace gcpp diff --git a/compression/blob_store.h b/compression/blob_store.h deleted file mode 100644 index d98235c..0000000 --- a/compression/blob_store.h +++ /dev/null @@ -1,117 +0,0 @@ -// Copyright 2024 Google LLC -// SPDX-License-Identifier: Apache-2.0 -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// https://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef THIRD_PARTY_GEMMA_CPP_COMPRESSION_BLOB_STORE_H_ -#define THIRD_PARTY_GEMMA_CPP_COMPRESSION_BLOB_STORE_H_ - -#include -#include - -#include -#include -#include - -#include "compression/io.h" -#include "hwy/aligned_allocator.h" -#include "hwy/base.h" // hwy::uint128_t -#include "hwy/contrib/thread_pool/thread_pool.h" - -namespace gcpp { - -// Convenient way to construct a key from a string (<= 16 chars). -hwy::uint128_t MakeKey(const char* string); - -// Returns a string from a key. -std::string StringFromKey(hwy::uint128_t key); - -// Ordered list of opaque blobs (~hundreds), identified by unique opaque -// 128-bit keys. -class BlobStore; - -// Incomplete type, so dtor will not be called. -using BlobStorePtr = hwy::AlignedFreeUniquePtr; - -// 0 if successful, otherwise the line number of the failing check. -using BlobError = int; - -// Blob offsets on disk and memory addresses are a multiple of this, because -// we pad the header and each blob's size. This matches CUDA alignment and the -// maximum SVE vector size, and exceeds typical x86 cache line sizes (64 or -// 128), which can help performance. -static constexpr size_t kBlobAlign = 256; - -// One I/O request, serviced by threads in a pool. -struct BlobIO { - BlobIO(uint64_t offset, size_t size, void* data, uint64_t padding) - : offset(offset), size(size), data(data), padding(padding) {} - - uint64_t offset; - size_t size; // bytes - void* data; - uint64_t padding; -}; - -class BlobReader { - public: - BlobReader() { requests_.reserve(500); } - ~BlobReader() = default; - - // Opens `filename` and reads its header. - BlobError Open(const Path& filename); - - // Returns the size of the blob identified by `key`, or 0 if not found. - size_t BlobSize(hwy::uint128_t key) const; - - // Enqueues read requests if `key` is found and its size matches `size`, which - // is in units of bytes. - BlobError Enqueue(hwy::uint128_t key, void* data, size_t size); - - // Reads all enqueued requests. - BlobError ReadAll(hwy::ThreadPool& pool); - - // Reads one blob directly. - BlobError ReadOne(hwy::uint128_t key, void* data, size_t size) const; - - // Returns all available blob keys. 
- hwy::Span<const hwy::uint128_t> Keys() const; - - private: - BlobStorePtr blob_store_; // holds header, not the entire file - std::vector<BlobIO> requests_; - std::unique_ptr<File> file_; -}; - -class BlobWriter { - public: - // `size` is in bytes. - void Add(hwy::uint128_t key, const void* data, size_t size) { - keys_.push_back(key); - blobs_.emplace_back(static_cast<const uint8_t*>(data), size); - } - - // Stores all blobs to disk in the given order with padding for alignment. - BlobError WriteAll(hwy::ThreadPool& pool, const Path& filename); - - // Returns the number of blobs added. - size_t DebugNumBlobsAdded() const { return keys_.size(); } - - private: - std::vector<hwy::uint128_t> keys_; - std::vector<hwy::Span<const uint8_t>> blobs_; -}; - -} // namespace gcpp - -#endif // THIRD_PARTY_GEMMA_CPP_COMPRESSION_BLOB_STORE_H_ diff --git a/compression/blob_store_test.cc b/compression/blob_store_test.cc deleted file mode 100644 index dbba55f..0000000 --- a/compression/blob_store_test.cc +++ /dev/null @@ -1,86 +0,0 @@ -// Copyright 2024 Google LLC -// SPDX-License-Identifier: Apache-2.0 -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "compression/blob_store.h" - -#include - -#include -#include - -#include "compression/io.h" -#include "hwy/contrib/thread_pool/thread_pool.h" -#include "hwy/tests/hwy_gtest.h" -#include "hwy/tests/test_util-inl.h" // HWY_ASSERT_EQ - -namespace gcpp { -namespace { - -#if !HWY_TEST_STANDALONE -class BlobStoreTest : public testing::Test {}; -#endif - -#if !HWY_OS_WIN -TEST(BlobStoreTest, TestReadWrite) { - static const std::array kOriginalData = {-1, 0, 3.14159, 2.71828}; - - // mkstemp will modify path_str so it holds a newly-created temporary file.
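// (mkstemp replaces the trailing XXXXXX in place with a unique suffix and
// returns an already-open file descriptor, which the test closes and unlinks
// at the end.)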
- char path_str[] = "/tmp/blob_store_test.sbs-XXXXXX"; - const int fd = mkstemp(path_str); - HWY_ASSERT(fd > 0); - - hwy::ThreadPool pool(4); - const Path path(path_str); - std::array buffer = kOriginalData; - - const hwy::uint128_t keyA = MakeKey("0123456789abcdef"); - const hwy::uint128_t keyB = MakeKey("q"); - BlobWriter writer; - writer.Add(keyA, "DATA", 5); - writer.Add(keyB, buffer.data(), sizeof(buffer)); - HWY_ASSERT_EQ(writer.WriteAll(pool, path), 0); - HWY_ASSERT_ARRAY_EQ(kOriginalData.data(), buffer.data(), buffer.size()); - - std::fill(buffer.begin(), buffer.end(), 0); - BlobReader reader; - HWY_ASSERT_EQ(reader.Open(path), 0); - HWY_ASSERT_EQ(reader.BlobSize(keyA), 5); - HWY_ASSERT_EQ(reader.BlobSize(keyB), sizeof(buffer)); - - HWY_ASSERT_EQ(reader.Enqueue(keyB, buffer.data(), sizeof(buffer)), 0); - HWY_ASSERT_EQ(reader.ReadAll(pool), 0); - HWY_ASSERT_ARRAY_EQ(kOriginalData.data(), buffer.data(), buffer.size()); - - { - std::array buffer; - HWY_ASSERT(reader.ReadOne(keyA, buffer.data(), 1) != 0); - HWY_ASSERT_EQ(reader.ReadOne(keyA, buffer.data(), 5), 0); - HWY_ASSERT_STRING_EQ("DATA", buffer.data()); - } - - const hwy::Span keys = reader.Keys(); - HWY_ASSERT_EQ(keys.size(), 2); - HWY_ASSERT_EQ(keys[0], keyA); - HWY_ASSERT_EQ(keys[1], keyB); - - close(fd); - unlink(path_str); -} -#endif - -} // namespace -} // namespace gcpp - -HWY_TEST_MAIN(); diff --git a/compression/compress-inl.h b/compression/compress-inl.h index 8638b5f..512f8fa 100644 --- a/compression/compress-inl.h +++ b/compression/compress-inl.h @@ -21,19 +21,20 @@ #include #include -#include // lroundf, only if COMPRESS_STATS -#include +#include #include -#include "compression/blob_store.h" #include "compression/compress.h" // IWYU pragma: export #include "compression/distortion.h" -#include "gemma/configs.h" #include "hwy/aligned_allocator.h" #include "hwy/base.h" #include "hwy/contrib/thread_pool/thread_pool.h" #include "hwy/timer.h" +#if COMPRESS_STATS +#include // lroundf +#endif + #endif // THIRD_PARTY_GEMMA_CPP_COMPRESSION_COMPRESS_INL_H_ // Include guard for (potentially) SIMD code. @@ -64,8 +65,8 @@ static constexpr bool kIsTest = false; template // primary, must specialize struct CompressTraits {}; -// Used by backprop/, where weights are currently f32; also MatMul for f32 -// weights or activations, if native `ReorderWidenMulAccumulate` is available. +// Used by MatMul for f32 weights or activations, if native +// `ReorderWidenMulAccumulate` is available. template <> struct CompressTraits { using Packed = float; @@ -379,7 +380,7 @@ struct CompressTraits { using Packed = SfpStream; // Callers are responsible for scaling `raw` such that its magnitudes do not - // exceed `SfpStream::kMax`. See CompressedArray::scale(). + // exceed `SfpStream::kMax`. See CompressedArray::Scale(). 
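// (A sketch of the intended call order, assuming ScaleWeights from
// compression/compress.cc later in this change, and `mat` as a placeholder
// for the MatPtr-like owner of `packed`:
//   const float scale = ScaleWeights(raw, num);  // now |raw[i]| <= kMax
//   Compress(df, raw, num, tls, packed, packed_ofs);
//   mat.set_scale(scale);  // decoded values are restored via this multiplier)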
template static HWY_INLINE void Compress(DF df, const float* HWY_RESTRICT raw, size_t num, CompressPerThread& tls, @@ -387,7 +388,7 @@ struct CompressTraits { const size_t packed_ofs) { SfpCodec::Enc(df, raw, num, packed.ptr + packed_ofs); - if (COMPRESS_STATS) { + if constexpr (COMPRESS_STATS) { const hn::Repartition dbf; auto distorted = hwy::AllocateAligned(hwy::RoundUpTo(num, hn::Lanes(dbf))); @@ -431,9 +432,10 @@ struct CompressTraits { size_t num, CompressPerThread& tls, const PackedSpan& packed, const size_t packed_ofs) { - NuqCodec::Enc(df, raw, num, tls.buf, packed, packed_ofs); + if (!tls.buf) tls.buf = std::make_unique(); + NuqCodec::Enc(df, raw, num, *tls.buf, packed, packed_ofs); - if (COMPRESS_STATS) { + if constexpr (COMPRESS_STATS) { for (size_t i = 0; i < num; ++i) { tls.stats.NotifyIn(static_cast(lroundf(raw[i] * 100.0f + 500.0f))); } @@ -477,7 +479,7 @@ HWY_NOINLINE void Compress(const float* HWY_RESTRICT raw, size_t num, const size_t packed_ofs, hwy::ThreadPool& pool) { packed.BoundsCheck(packed_ofs, num); work.tls.resize(pool.NumWorkers()); - if (COMPRESS_STATS) { + if constexpr (COMPRESS_STATS) { for (auto& tls : work.tls) { tls.stats.Reset(); } @@ -486,7 +488,7 @@ HWY_NOINLINE void Compress(const float* HWY_RESTRICT raw, size_t num, const bool want_bench = COMPRESS_STATS || !kIsTest; const double t0 = want_bench ? hwy::platform::Now() : 0.0; - using Traits = CompressTraits; + using Traits = CompressTraits>; constexpr size_t kBatch = 8192; const size_t num_batches = hwy::DivCeil(num, kBatch); pool.Run(0, num_batches, @@ -507,7 +509,7 @@ HWY_NOINLINE void Compress(const float* HWY_RESTRICT raw, size_t num, fprintf(stderr, "Compress %.1f MB/s\n", mbps); } - if (COMPRESS_STATS) { + if constexpr (COMPRESS_STATS) { for (size_t i = 1; i < work.tls.size(); ++i) { work.tls[0].stats.Assimilate(work.tls[i].stats); } @@ -515,26 +517,25 @@ HWY_NOINLINE void Compress(const float* HWY_RESTRICT raw, size_t num, } } -// Adapter that compresses into `MatStorageT`. `raw` must already be scaled -// to fit the value range, if `Packed` is `SfpStream`. +// Same as above, but without parallelization nor benchmarking. template -HWY_INLINE void CompressScaled(const float* HWY_RESTRICT raw, size_t num, - CompressWorkingSet& work, - MatStorageT& compressed, - hwy::ThreadPool& pool) { - Compress(raw, num, work, - MakeSpan(compressed.data(), compressed.NumElements()), - /*packed_ofs=*/0, pool); +HWY_NOINLINE void Compress(const float* HWY_RESTRICT raw, size_t num, + CompressPerThread& tls, + const PackedSpan& packed, + const size_t packed_ofs) { + packed.BoundsCheck(packed_ofs, num); + using Traits = CompressTraits>; + const hn::ScalableTag df; + Traits::Compress(df, raw, num, tls, packed, packed_ofs); } -// Stores two f32 vectors to f32 or bf16; avoids duplicating RMSNorm and -// RMSNormInplace for the two output types. +// Stores two f32 vectors to f32 or bf16. template > void Compress2(DF df, VF raw0, VF raw1, const PackedSpan& packed, const size_t packed_ofs) { static_assert(hwy::IsSameEither()); packed.BoundsCheck(packed_ofs, 2 * hn::Lanes(df)); - using Traits = CompressTraits; + using Traits = CompressTraits>; Traits::Store2(df, raw0, raw1, packed, packed_ofs); } @@ -566,7 +567,7 @@ HWY_INLINE void Decompress2(DRaw d, const PackedSpan& packed, // Decompresses from any type of `packed`, starting at (any) `packed_ofs`, to // (any) `num` elements in `raw`, then appends `[0, hn::Lanes(d))` zeroes as // required to round `num` up to one vector, if it is not already. 
The caller is -// responsible for scaling `raw` to the original range because `EmbedToken` +// responsible for scaling `raw` to the original range because `EmbedMMToken` // also wants to scale the decompressed elements. // `TRaw` can be `float/BF16`, or `double` if `Packed` is `float`. template > @@ -708,51 +709,6 @@ HWY_INLINE float DecompressAndCall(D, const PackedSpan v, comp3); } -// Functor called for each tensor, which compresses and stores them along with -// their scaling factors to BlobStore. -class Compressor { - public: - explicit Compressor(hwy::ThreadPool& pool) : writer_(pool) {} - - template - void operator()(MatPtrT* compressed, const char* decorated_name, - const float* HWY_RESTRICT weights) { - size_t num_weights = compressed->NumElements(); - if (num_weights == 0 || weights == nullptr || compressed->Ptr() == nullptr) - return; - size_t num_compressed = compressed->NumElements(); - PackedSpan packed = MakeSpan(compressed->data(), num_compressed); - fprintf(stderr, "Compressing %s (%zuM), please wait\n", decorated_name, - num_weights / (1000 * 1000)); - Compress(weights, num_weights, work_, packed, /*packed_ofs=*/0, - writer_.pool()); - writer_(compressed, decorated_name); - } - - void AddTokenizer(const std::string& tokenizer) { - writer_.AddTokenizer(tokenizer); - } - - void AddScales(const float* scales, size_t len) { - writer_.AddScales(scales, len); - } - - // Writes all blobs to disk in the given order. The config is optional and - // if given, it is written to the file, along with the TOC, making it - // single-file format. Otherwise, the file is written in the multi-file format - // without a TOC. - BlobError WriteAll(const Path& blob_filename, const ModelConfig* config) { - return writer_.WriteAll(blob_filename, config); - } - - // Returns the number of blobs added. - size_t DebugNumBlobsAdded() const { return writer_.DebugNumBlobsAdded(); } - - private: - CompressWorkingSet work_; - WriteToBlobStore writer_; -}; - // NOLINTNEXTLINE(google-readability-namespace-comments) } // namespace HWY_NAMESPACE } // namespace gcpp diff --git a/compression/compress.cc b/compression/compress.cc index e858e15..6ef8990 100644 --- a/compression/compress.cc +++ b/compression/compress.cc @@ -15,8 +15,34 @@ #include "compression/compress.h" +#include +#include + +#include "util/mat.h" +#include "hwy/base.h" +#include "hwy/profiler.h" + namespace gcpp { -MatPtr::~MatPtr() {} +float ScaleWeights(float* HWY_RESTRICT raw, size_t num) { + PROFILER_FUNC; + + float maxabs = 0.0; + for (size_t i = 0; i < num; ++i) { + maxabs = HWY_MAX(maxabs, hwy::ScalarAbs(raw[i])); + } + if (maxabs <= SfpStream::kMax) { + return 1.0f; + } + const float scale = maxabs / SfpStream::kMax; + const float inv_scale = static_cast(1.0 / static_cast(scale)); + for (size_t i = 0; i < num; ++i) { + // Clamp because kMax may still be exceeded. 
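// (inv_scale is rounded to float, so raw[i] * inv_scale may land a fraction
// of an ulp above kMax; the HWY_MIN below guarantees the encoder's input
// range nonetheless.)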
+ const float magn = + HWY_MIN(SfpStream::kMax, hwy::ScalarAbs(raw[i] * inv_scale)); + raw[i] = hwy::ScalarCopySign(magn, raw[i]); + } + return scale; +} } // namespace gcpp diff --git a/compression/compress.h b/compression/compress.h index d875c4b..811f483 100644 --- a/compression/compress.h +++ b/compression/compress.h @@ -17,31 +17,19 @@ #ifndef THIRD_PARTY_GEMMA_CPP_COMPRESSION_COMPRESS_H_ #define THIRD_PARTY_GEMMA_CPP_COMPRESSION_COMPRESS_H_ -#include "hwy/base.h" #define COMPRESS_STATS 0 #include #include -#include -#include -#include -#include -#include -#include +#if COMPRESS_STATS +#include +#endif + +#include #include -// IWYU pragma: begin_exports -#include "compression/blob_store.h" -#include "compression/fields.h" -#include "compression/io.h" -#include "compression/shared.h" -#include "gemma/tensor_index.h" -#include "util/basics.h" -// IWYU pragma: end_exports -#include "gemma/configs.h" -#include "util/allocator.h" -#include "hwy/per_target.h" +#include "compression/types.h" // IWYU pragma: export #if COMPRESS_STATS #include "compression/distortion.h" #include "hwy/stats.h" @@ -49,388 +37,6 @@ namespace gcpp { -// Base class for rank-1 or 2 tensors (vector or matrix). -// Supports both dynamic and compile-time sizing. -// Holds metadata and a non-owning pointer to the data, owned by the derived -// MatStorageT class. -// This class also provides easy conversion from/to a table of contents for a -// BlobStore file, and a templated (compile-time) accessor for a 2-d array of -// fixed inner dimension and type. -// It is designed to be put in a vector, and has default copy and operator=, so -// it is easy to read/write a blob_store file. -class MatPtr : public IFields { - public: - // Full constructor for dynamic sizing. - MatPtr(const std::string& name, Type type, size_t element_size, size_t rows, - size_t cols) - : name_(name), - type_(type), - element_size_(element_size), - num_elements_(rows * cols), - rows_(rows), - cols_(cols), - ptr_(nullptr) { - stride_ = cols; - } - // Default is to leave all fields default-initialized. - MatPtr() = default; - virtual ~MatPtr(); - - // Compatibility interface for CompressedArray. - // TODO: remove. - template - T* data() { - return HWY_RCAST_ALIGNED(T*, ptr_); - } - template - const T* data() const { - return HWY_RCAST_ALIGNED(const T*, ptr_); - } - - const void* Ptr() const { return ptr_; } - void* Ptr() { return ptr_; } - // Sets the pointer from another MatPtr. - void SetPtr(const MatPtr& other) { ptr_ = other.ptr_; } - - // Copying allowed as the metadata is small. - MatPtr(const MatPtr& other) = default; - MatPtr& operator=(const MatPtr& other) = default; - - // Returns the name of the blob. - const char* Name() const override { return name_.c_str(); } - void SetName(const std::string& name) { name_ = name; } - - // Returns the type of the blob. - Type GetType() const { return type_; } - - // Returns the size of each element in bytes. - size_t ElementSize() const { return element_size_; } - - // Returns the number of elements in the array. - size_t NumElements() const { return num_elements_; } - - // Returns the number of bytes in the array. - size_t SizeBytes() const { - if (this->GetType() == TypeEnum()) { - return NuqStream::PackedEnd(num_elements_); - } - return num_elements_ * element_size_; - } - - // Returns the number of rows in the 2-d array (outer dimension). - size_t Rows() const { return rows_; } - - // Returns the number of columns in the 2-d array (inner dimension). 
- size_t Cols() const { return cols_; } - - Extents2D Extents() const { return Extents2D(rows_, cols_); } - - // Currently same as cols, but may differ in the future. This is the offset by - // which to advance pointers to the next row. - size_t Stride() const { return stride_; } - - // Decoded elements should be multiplied by this to restore their original - // range. This is required because SfpStream can only encode a limited range - // of magnitudes. - float scale() const { return scale_; } - void set_scale(float scale) { scale_ = scale; } - - std::string LayerName(int layer) const { - std::string name = name_ + std::to_string(layer); - HWY_ASSERT(name.size() <= sizeof(hwy::uint128_t)); - return name; - } - - // Sets all data to zero. - void ZeroInit() { - if (ptr_ == nullptr) - HWY_ABORT("ptr_ is null on tensor %s\n", name_.c_str()); - hwy::ZeroBytes(ptr_, SizeBytes()); - } - - void VisitFields(IFieldsVisitor& visitor) override { - visitor(name_); - visitor(type_); - visitor(element_size_); - visitor(num_elements_); - visitor(rows_); - visitor(cols_); - visitor(scale_); - visitor(stride_); - } - - // Calls func on the upcasted type. Since MatPtr by design is not templated, - // here we provide a way to get to the derived type, provided that `Type()` - // is one of the strings returned by `TypeName()`. - template - decltype(auto) CallUpcasted(FuncT& func, TArgs&&... args); - - protected: - // Arbitrary name for the array of preferably <= 16 characters. - std::string name_; - // Should be the result of TypeEnum for CallUpcasted() to work. - Type type_; - // sizeof(T) - uint32_t element_size_ = 0; - // Number of elements in the array. - uint32_t num_elements_ = 0; // In element_size units. - // Number of rows in the 2-d array (outer dimension). - uint32_t rows_ = 0; - // Number of columns in the 2-d array (inner dimension). - uint32_t cols_ = 0; - // Scaling to apply to each element. - float scale_ = 1.0f; - // Aligned data array. This is always a borrowed pointer. It should never be - // freed. The underlying memory is owned by a subclass or some external class - // and must outlive this object. - void* ptr_ = nullptr; - - uint32_t stride_; -}; - -// MatPtrT adds a single template argument to MatPtr for an explicit type. -// Use this class as a function argument where the type needs to be known. -// Use MatPtr where the type does not need to be known. -template -class MatPtrT : public MatPtr { - public: - // Full constructor for dynamic sizing. - MatPtrT(const std::string& name, size_t rows, size_t cols) - : MatPtr(name, TypeEnum(), sizeof(MatT), rows, cols) {} - // Construction from TensorIndex entry to remove duplication of sizes. - MatPtrT(const std::string& name, const TensorIndex& tensor_index) - : MatPtrT(name, tensor_index.FindName(name)) {} - MatPtrT(const std::string& name, const TensorInfo* tensor) - : MatPtr(name, TypeEnum(), sizeof(MatT), 0, 0) { - if (tensor == nullptr) { - cols_ = 0; - rows_ = 0; - } else { - cols_ = tensor->shape.back(); - rows_ = 1; - if (tensor->cols_take_extra_dims) { - // The columns eat the extra dimensions. - rows_ = tensor->shape[0]; - for (size_t i = 1; i < tensor->shape.size() - 1; ++i) { - cols_ *= tensor->shape[i]; - } - } else { - // The rows eat the extra dimensions. - for (size_t i = 0; i < tensor->shape.size() - 1; ++i) { - rows_ *= tensor->shape[i]; - } - } - } - stride_ = cols_; - num_elements_ = rows_ * cols_; - } - - // Copying allowed as the metadata is small. 
- MatPtrT(const MatPtr& other) : MatPtr(other) {} - MatPtrT& operator=(const MatPtr& other) { - MatPtr::operator=(other); - return *this; - } - MatPtrT(const MatPtrT& other) = default; - MatPtrT& operator=(const MatPtrT& other) = default; - - std::string CacheName(int layer = -1, char separator = ' ', - int index = -1) const { - // Already used/retired: s, S, n, 1 - const char prefix = hwy::IsSame() ? 'F' - : hwy::IsSame() ? 'B' - : hwy::IsSame() ? '$' - : hwy::IsSame() ? '2' - : '?'; - std::string name = std::string(1, prefix) + name_; - if (layer >= 0 || index >= 0) { - name += '_'; - if (layer >= 0) name += std::to_string(layer); - if (index >= 0) { - name += separator + std::to_string(index); - } - } - return name; - } - - // Sets the number of elements in the array. For use when the number of - // elements is != rows * cols ONLY. - void SetNumElements(size_t num_elements) { - num_elements_ = CompressedArrayElements(num_elements); - } - - // 2-d Accessor for a specific type but with a dynamic inner dimension. - template - const T& At(size_t row, size_t col) const { - size_t index = row * cols_ + col; - HWY_DASSERT(index < num_elements_); - return HWY_RCAST_ALIGNED(const T*, ptr_)[index]; - } - - // 1-d Accessor for a specific type. - // TODO: replace this with a Foreach(), or at least a ForEachRow(). - const MatT& At(size_t index) const { - HWY_DASSERT(index < num_elements_); - return HWY_RCAST_ALIGNED(const MatT*, ptr_)[index]; - } - MatT& At(size_t index) { return HWY_RCAST_ALIGNED(MatT*, ptr_)[index]; } - - // Compatibility interface for CompressedArray. - // TODO: remove - template - T* data() { - return HWY_RCAST_ALIGNED(T*, ptr_); - } - template - const T* data() const { - return HWY_RCAST_ALIGNED(const T*, ptr_); - } - // The const accessor data_scale1() asserts (!) that the scale is 1.0f, so - // calling it means "I am sure the scale is 1 and therefore ignore the scale". - // A scale of 0 indicates that the scale has likely never been set, so is - // "implicitly 1". - const MatT* data_scale1() const { - HWY_ASSERT(scale() == 1.f); - return HWY_RCAST_ALIGNED(const MatT*, ptr_); - } -}; - -template -decltype(auto) MatPtr::CallUpcasted(FuncT& func, TArgs&&... args) { - if (type_ == TypeEnum()) { - return func(dynamic_cast*>(this), - std::forward(args)...); - } else if (type_ == TypeEnum()) { - return func(dynamic_cast*>(this), - std::forward(args)...); - } else if (type_ == TypeEnum()) { - return func(dynamic_cast*>(this), - std::forward(args)...); - } else if (type_ == TypeEnum()) { - return func(dynamic_cast*>(this), - std::forward(args)...); - } else { - HWY_ABORT("Type %d unknown.", type_); - } -} - -// MatStorageT adds the actual data storage to MatPtrT. -// TODO: use Extents2D instead of rows and cols. -template -class MatStorageT : public MatPtrT { - public: - // Full constructor for dynamic sizing. - MatStorageT(const std::string& name, size_t rows, size_t cols) - : MatPtrT(name, rows, cols) { - Allocate(); - } - // Can copy the metadata, from a MatPtr, and allocate later. - MatStorageT(const MatPtr& other) : MatPtrT(other) {} - ~MatStorageT() = default; - - // Move-only because this contains a unique_ptr. - MatStorageT(const MatStorageT& other) = delete; - MatStorageT& operator=(const MatStorageT& other) = delete; - MatStorageT(MatStorageT&& other) = default; - MatStorageT& operator=(MatStorageT&& other) = default; - - // Allocate the memory and copy the pointer to the MatPtr. - // num_elements is in elements. 
In the default (zero) case, it is computed - // from the current num_elements_ which was set by the constructor from the - // rows and cols. - void Allocate(size_t num_elements = 0) { - if (num_elements == 0) { - num_elements = hwy::DivCeil(this->SizeBytes(), sizeof(MatT)); - } else { - this->num_elements_ = num_elements; - } - // Pad to allow overrunning the last row by 2 BF16 vectors, hence at most - // `2 * VectorBytes / sizeof(BF16)` elements of MatT. - const size_t padding = hwy::VectorBytes(); - data_ = Allocator::Alloc(num_elements + padding); - hwy::ZeroBytes(&data_[num_elements], padding * sizeof(MatT)); - this->ptr_ = data_.get(); - } - - // Zeros the content. - void ZeroInit() { - HWY_ASSERT(data_ != nullptr); - hwy::ZeroBytes(data_.get(), this->SizeBytes()); - } - - private: - AlignedPtr data_; -}; - -// MatStorage allows heterogeneous tensors to be stored in a single vector. -using MatStorage = MatStorageT; - -// Table of contents for a blob store file. Full metadata, but not actual data. -class BlobToc { - public: - BlobToc() = default; - - // Loads the table of contents from the given reader. - BlobError LoadToc(BlobReader& reader) { - hwy::uint128_t toc_key = MakeKey(kTocName); - size_t toc_size = reader.BlobSize(toc_key); - if (toc_size != 0) { - std::vector toc(toc_size / sizeof(uint32_t)); - BlobError err = reader.ReadOne(toc_key, toc.data(), toc_size); - if (err != 0) { - fprintf(stderr, "Failed to read toc (error %d)\n", err); - return err; - } - size_t consumed = 0; - size_t prev_consumed = static_cast(-1); - while (consumed < toc.size() && prev_consumed != consumed) { - MatPtr blob; - const IFields::ReadResult result = - blob.Read(hwy::Span(toc), consumed); - prev_consumed = consumed; - consumed = result.pos; - if (blob.NumElements() > 0) { - AddToToc(blob); - } - } - } - return 0; - } - - bool Empty() const { return toc_map_.empty(); } - - // Returns true if the table of contents contains the given name. - bool Contains(const std::string& name) const { - return toc_map_.find(name) != toc_map_.end(); - } - - // Returns the blob with the given name, or nullptr if not found. - const MatPtr* Get(const std::string& name) const { - auto it = toc_map_.find(name); - if (it == toc_map_.end()) return nullptr; - return &toc_[it->second]; - } - // The name of the toc in the blob store file. - static constexpr char kTocName[] = "toc"; - - // The name of the config in the blob store file. - static constexpr char kConfigName[] = "config"; - - // The name of the tokenizer in the blob store file. - static constexpr char kTokenizerName[] = "tokenizer"; - - private: - // Adds the blob to the table of contents. - void AddToToc(const MatPtr& blob) { - HWY_ASSERT(!Contains(blob.Name())); - toc_map_[blob.Name()] = toc_.size(); - toc_.push_back(blob); - } - - std::unordered_map toc_map_; - std::vector toc_; -}; - #if COMPRESS_STATS class CompressStats { public: @@ -489,7 +95,8 @@ struct CompressStats { #endif // COMPRESS_STATS struct CompressPerThread { - NuqStream::ClusterBuf buf; + // Allocated the first time NUQ is used. + std::unique_ptr buf; CompressStats stats; }; @@ -497,196 +104,11 @@ struct CompressWorkingSet { std::vector tls; }; -// Class to collect and write a set of tensors to a blob store file. 
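// (When WriteAll below is given a ModelConfig, the TOC and serialized config
// are written as extra blobs, yielding the single-file format; callers such
// as compress_weights.cc also add the tokenizer in that case. Without a
// config, only tensors and scales are written, as in the multi-file format.)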
-class WriteToBlobStore { - public: - explicit WriteToBlobStore(hwy::ThreadPool& pool) : pool_(pool) {} - - template - void operator()(MatPtrT* compressed, const char* decorated_name) { - if (compressed->Ptr() == nullptr) return; - writer_.Add(MakeKey(decorated_name), compressed->Ptr(), - compressed->SizeBytes()); - MatPtr renamed_tensor(*compressed); - renamed_tensor.SetName(decorated_name); - renamed_tensor.AppendTo(toc_); - } - - void AddTokenizer(const std::string& tokenizer) { - writer_.Add(MakeKey(BlobToc::kTokenizerName), tokenizer.data(), - tokenizer.size() * sizeof(tokenizer[0])); - } - - void AddScales(const float* scales, size_t len) { - if (len) { - MatPtrT scales_ptr("scales", 0, 1); - writer_.Add(MakeKey(scales_ptr.CacheName().c_str()), scales, - len * sizeof(scales[0])); - } - } - - // Writes all blobs to disk in the given order. The config is optional and - // if given, it is written to the file, along with the TOC, making it - // single-file format. Otherwise, the file is written in the multi-file format - // without a TOC. - BlobError WriteAll(const Path& blob_filename, const ModelConfig* config) { - if (config) { - writer_.Add(MakeKey(BlobToc::kTocName), toc_.data(), - toc_.size() * sizeof(toc_[0])); - config_buffer_ = config->Write(); - writer_.Add(MakeKey(BlobToc::kConfigName), config_buffer_.data(), - config_buffer_.size() * sizeof(config_buffer_[0])); - } - const BlobError err = writer_.WriteAll(pool_, blob_filename); - if (err != 0) { - fprintf(stderr, "Failed to write blobs to %s (error %d)\n", - blob_filename.path.c_str(), err); - } - return err; - } - - // Returns the number of blobs added. - size_t DebugNumBlobsAdded() const { return writer_.DebugNumBlobsAdded(); } - - hwy::ThreadPool& pool() { return pool_; } - - protected: - hwy::ThreadPool& pool_; - - private: - std::vector toc_; - BlobWriter writer_; - std::vector config_buffer_; -}; - -// Functor called for each tensor, which loads them and their scaling factors -// from BlobStore. -class ReadFromBlobStore { - public: - explicit ReadFromBlobStore(const Path& blob_filename) { - err_ = reader_.Open(blob_filename); - if (HWY_UNLIKELY(err_ != 0)) { - fprintf(stderr, "Error %d opening BlobStore %s.\n", err_, - blob_filename.path.c_str()); - return; // avoid overwriting err_ to ensure ReadAll will fail. - } - err_ = file_toc_.LoadToc(reader_); - if (HWY_UNLIKELY(err_ != 0)) { - fprintf(stderr, "Found a TOC, but failed to load it (code %d)\n", err_); - } - } - - // Returns true if there is a TOC. - bool HaveToc() const { return !file_toc_.Empty(); } - - // Reads the config from the blob store file. - BlobError LoadConfig(ModelConfig& config) { - hwy::uint128_t config_key = MakeKey(BlobToc::kConfigName); - size_t config_size = reader_.BlobSize(config_key); - if (config_size == 0) return __LINE__; - std::vector config_buffer(config_size / sizeof(uint32_t)); - BlobError err = - reader_.ReadOne(config_key, config_buffer.data(), config_size); - if (err != 0) { - fprintf(stderr, "Failed to read config (error %d)\n", err); - return err; - } - config.Read(hwy::Span(config_buffer), 0); - return 0; - } - - // Reads the tokenizer from the blob store file. 
- BlobError LoadTokenizer(std::string& tokenizer) { - hwy::uint128_t key = MakeKey(BlobToc::kTokenizerName); - size_t tokenizer_size = reader_.BlobSize(key); - if (tokenizer_size == 0) return __LINE__; - tokenizer.resize(tokenizer_size); - ; - BlobError err = reader_.ReadOne(key, tokenizer.data(), tokenizer_size); - if (err != 0) { - fprintf(stderr, "Failed to read tokenizer (error %d)\n", err); - return err; - } - return 0; - } - - // Called for each tensor, enqueues read requests. - void operator()(const char* name, hwy::Span tensors) { - if (file_toc_.Empty() || file_toc_.Contains(name)) { - model_toc_.push_back(tensors[0]); - file_keys_.push_back(name); - } - } - - BlobError LoadScales(float* scales, size_t len) { - for (size_t i = 0; i < len; ++i) { - scales[i] = 1.0f; - } - MatPtrT scales_ptr("scales", 0, 1); - auto key = MakeKey(scales_ptr.CacheName().c_str()); - if (reader_.BlobSize(key) == 0) return 0; - return reader_.Enqueue(key, scales, len * sizeof(scales[0])); - } - - // Returns whether all tensors are successfully loaded from cache. - BlobError ReadAll(hwy::ThreadPool& pool, - std::vector& model_memory) { - // reader_ invalid or any Enqueue failed - if (err_ != 0) return err_; - // Setup the model_memory. - for (size_t b = 0; b < model_toc_.size(); ++b) { - const std::string& file_key = file_keys_[b]; - MatPtr* blob = model_toc_[b]; - if (!file_toc_.Empty()) { - const MatPtr* toc_blob = file_toc_.Get(file_key); - if (toc_blob == nullptr) { - fprintf(stderr, "Blob %s not found in TOC\n", file_key.c_str()); - return __LINE__; - } - if (toc_blob->Rows() != blob->Rows() || - toc_blob->Cols() != blob->Cols()) { - fprintf(stderr, "Blob %s has size mismatch TOC\n", file_key.c_str()); - return __LINE__; - } - std::string name = blob->Name(); - *blob = *toc_blob; - blob->SetName(name); - } - model_memory.emplace_back(*blob); - model_memory.back().SetName(file_key); - } - // Allocate in parallel using the pool. - pool.Run(0, model_memory.size(), - [this, &model_memory](uint64_t task, size_t /*thread*/) { - model_memory[task].Allocate(); - model_toc_[task]->SetPtr(model_memory[task]); - }); - // Enqueue the read requests. - for (auto& blob : model_memory) { - err_ = - reader_.Enqueue(MakeKey(blob.Name()), blob.data(), blob.SizeBytes()); - if (err_ != 0) { - fprintf(stderr, - "Failed to read blob %s (error %d) of size %zu x %zu x %zu\n", - blob.Name(), err_, blob.Rows(), blob.Cols(), - blob.ElementSize()); - return err_; - } - } - return reader_.ReadAll(pool); - } - - private: - BlobReader reader_; - BlobError err_ = 0; - // Table of contents from the file, if present. - BlobToc file_toc_; - // Table of contents from the model. Pointers to original MatPtrT so the - // data pointers can be updated. - std::vector model_toc_; - // Mangled names of the tensors in model_toc_ for reading from the file. - std::vector file_keys_; -}; +// Returns 1.0f if all magnitudes are <= `SfpStream::kMax`, otherwise scales +// them such that the largest magnitude is `SfpStream::kMax`, and returns the +// multiplier with which to restore the original values. This is only necessary +// before compressing to `SfpStream` and `NuqStream`. 
+float ScaleWeights(float* HWY_RESTRICT raw, size_t num); } // namespace gcpp #endif // THIRD_PARTY_GEMMA_CPP_COMPRESSION_COMPRESS_H_ diff --git a/compression/compress_test.cc b/compression/compress_test.cc index 13b1982..2270689 100644 --- a/compression/compress_test.cc +++ b/compression/compress_test.cc @@ -13,10 +13,10 @@ // See the License for the specific language governing permissions and // limitations under the License. -// SFP uses ConcatEven/Odd which are not supported; skip SVE for faster tests. +#include "compression/types.h" #ifndef HWY_DISABLED_TARGETS -#define HWY_DISABLED_TARGETS (HWY_SCALAR | HWY_SVE) -#endif +#define HWY_DISABLED_TARGETS GEMMA_DISABLED_TARGETS +#endif // HWY_DISABLED_TARGETS #include "compression/compress.h" @@ -80,7 +80,7 @@ struct TestDecompress2T { stats.Notify(raw[i], hwy::ConvertScalarTo(dec[i])); } - if constexpr (false) { + if constexpr (true) { // leave enabled due to sporadic failures fprintf(stderr, "TypeName() %s TypeName() %s: num %zu: stats.SumL1() " "%f stats.GeomeanValueDivL1() %f stats.WeightedAverageL1() %f " diff --git a/compression/compress_weights.cc b/compression/compress_weights.cc deleted file mode 100644 index cbf7e35..0000000 --- a/compression/compress_weights.cc +++ /dev/null @@ -1,286 +0,0 @@ -// Copyright 2024 Google LLC -// SPDX-License-Identifier: Apache-2.0 -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// https://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// Command line tool to create compressed weights. - -// Compiles this file for multiple architectures via "foreach_target.h", to -// which we pass the filename via macro 'argument'. -#undef HWY_TARGET_INCLUDE -#define HWY_TARGET_INCLUDE \ - "compression/compress_weights.cc" // NOLINT -#include "hwy/foreach_target.h" // IWYU pragma: keep -#include "hwy/highway.h" -// After highway.h -#include "compression/compress-inl.h" -#include "gemma/configs.h" -#include "gemma/tokenizer.h" - -#ifndef GEMMA_COMPRESS_WEIGHTS_ONCE -#define GEMMA_COMPRESS_WEIGHTS_ONCE - -#include -#include - -#include // std::clamp -#include -#include -#include -#include // NOLINT -#include - -#include "compression/compress.h" -#include "compression/io.h" // Path -#include "compression/shared.h" // PromptWrapping -#include "gemma/common.h" // Model -#include "gemma/weights.h" -#include "util/allocator.h" -#include "util/args.h" -#include "hwy/base.h" -#include "hwy/contrib/thread_pool/thread_pool.h" - -namespace gcpp { - -namespace { - -} // namespace - -struct Args : public ArgsBase { - static constexpr size_t kDefaultNumThreads = ~size_t{0}; - - void ChooseNumThreads() { - if (num_threads == kDefaultNumThreads) { - // This is a rough heuristic, replace with something better in the future. - num_threads = static_cast(std::clamp( - static_cast(std::thread::hardware_concurrency()) - 2, 1, 18)); - } - } - - public: - Args(int argc, char* argv[]) { - InitAndParse(argc, argv); - ChooseNumThreads(); - } - - // Returns error string or nullptr if OK. 
- const char* Validate() { - if (const char* err = ParseModelTypeAndWrapping(model_type_str, model_type_, - prompt_wrapping_)) { - return err; - } - if (const char* err = ParseType(weight_type_str, weight_type_)) { - return err; - } - if (weights.path.empty()) { - return "Missing --weights flag, a file for the uncompressed model."; - } - if (compressed_weights.path.empty()) { - return "Missing --compressed_weights flag, a file for the compressed " - "model."; - } - if (!weights.Exists()) { - return "Can't open file specified with --weights flag."; - } - return nullptr; - } - - Path weights; // uncompressed weights file location - Path compressed_weights; // compressed weights file location - std::string model_type_str; - std::string weight_type_str; - size_t num_threads; - // If non-empty, whether to include the config and TOC in the output file, as - // well as the tokenizer. - Path tokenizer; - - template - void ForEach(const Visitor& visitor) { - visitor(weights, "weights", Path(), - "Path to model weights (.bin) file.\n" - " Required argument."); - visitor(model_type_str, "model", std::string(), - "Model type\n 2b-it = 2B parameters, instruction-tuned\n " - "2b-pt = 2B parameters, pretrained\n 7b-it = 7B parameters " - "instruction-tuned\n 7b-pt = 7B parameters, pretrained\n " - "gr2b-it = griffin 2B parameters, instruction-tuned\n " - "gr2b-pt = griffin 2B parameters, pretrained\n " - " Required argument."); - visitor(weight_type_str, "weight_type", std::string("sfp"), - "Weight type\n f32 = float, bf16 = bfloat16, SFP = 8-bit FP\n" - " Required argument."); - visitor(compressed_weights, "compressed_weights", Path(), - "Path name where compressed weights (.sbs) file will be written.\n" - " Required argument."); - visitor(num_threads, "num_threads", - kDefaultNumThreads, // see ChooseNumThreads - "Number of threads to use.\n Default = Estimate of the " - "number of supported concurrent threads.", - 2); - visitor(tokenizer, "tokenizer", Path(), - "Path to tokenizer file. If given, the config and TOC are also " - "added to the output file."); - } - - // Uninitialized before Validate, must call after that. - gcpp::Model ModelType() const { return model_type_; } - gcpp::PromptWrapping PromptWrappingType() const { return prompt_wrapping_; } - gcpp::Type WeightType() const { return weight_type_; } - - private: - Model model_type_; - PromptWrapping prompt_wrapping_; - Type weight_type_; -}; - -void ShowHelp(gcpp::Args& args) { - std::cerr - << "Usage:\n./compress_weights --weights " - " --model --compressed_weights \n"; - std::cerr << "\n*Arguments*\n\n"; - args.Help(); - std::cerr << "\n"; -} - -} // namespace gcpp -#endif // GEMMA_COMPRESS_WEIGHTS_ONCE - -// SIMD code, compiled once per target. 
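// (foreach_target.h re-includes this file once per SIMD target; the HWY_ONCE
// section at the end then selects the best compiled variant at runtime via
// HWY_EXPORT_AND_DYNAMIC_DISPATCH_T.)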
-HWY_BEFORE_NAMESPACE(); -namespace gcpp { -namespace HWY_NAMESPACE { - -template -void CompressWeights(const Path& weights_path, - const Path& compressed_weights_path, Model model_type, - Type weight_type, PromptWrapping wrapping, - const Path& tokenizer_path, hwy::ThreadPool& pool) { - if (!weights_path.Exists()) { - HWY_ABORT("The model weights file '%s' does not exist.", - weights_path.path.c_str()); - } - printf("Compressing weights from %s to %s\n", weights_path.path.c_str(), - compressed_weights_path.path.c_str()); - ModelConfig config = ConfigFromModel(model_type); - config.weight = weight_type; - config.wrapping = wrapping; - std::vector model_storage; - ModelWeightsPtrs c_weights(config); - c_weights.Allocate(model_storage, pool); - ModelWeightsPtrs uc_weights(config); - uc_weights.Allocate(model_storage, pool); - // Get uncompressed weights, compress, and store. - FILE* fptr = fopen(weights_path.path.c_str(), "rb"); - if (fptr == nullptr) { - HWY_ABORT("Failed to open model file %s - does it exist?", - weights_path.path.c_str()); - } - bool ok = true; - uint64_t total_size = 0; - ModelWeightsPtrs::ForEachTensor( - {&uc_weights}, ForEachType::kLoadNoToc, - [&](const char* name, hwy::Span tensors) { - fprintf(stderr, "Loading Parameters (size %zu): %s\n", - tensors[0]->SizeBytes(), name); - ok &= 1 == fread(tensors[0]->Ptr(), tensors[0]->SizeBytes(), 1, fptr); - total_size += tensors[0]->SizeBytes(); - }); - if (!tokenizer_path.path.empty()) { - uc_weights.AllocAndCopyWithTranspose(pool, model_storage); - } - const bool scale_for_compression = config.num_tensor_scales > 0; - std::vector scales; - if (scale_for_compression) { - uc_weights.GetOrApplyScales(scales); - } - Compressor compressor(pool); - ModelWeightsPtrs::ForEachTensor( - {reinterpret_cast*>(&uc_weights), &c_weights}, - tokenizer_path.path.empty() ? ForEachType::kLoadNoToc - : ForEachType::kLoadWithToc, - [&compressor](const char* name, hwy::Span tensors) { - tensors[1]->CallUpcasted( - compressor, name, - reinterpret_cast(tensors[0]->Ptr())); - }); - if (!tokenizer_path.path.empty()) { - std::string tokenizer_proto = ReadFileToString(tokenizer_path); - compressor.AddTokenizer(tokenizer_proto); - } else { - compressor.AddScales(scales.data(), scales.size() * sizeof(scales[0])); - } - compressor.WriteAll(compressed_weights_path, - tokenizer_path.path.empty() ? 
nullptr : &config); -} - -} // namespace HWY_NAMESPACE -} // namespace gcpp -HWY_AFTER_NAMESPACE(); - -#if HWY_ONCE -namespace gcpp { - -void Run(Args& args) { - hwy::ThreadPool pool(args.num_threads); - if (args.PromptWrappingType() == PromptWrapping::PALIGEMMA) { - HWY_ABORT("PaliGemma is not supported in compress_weights."); - } - const Model model_type = args.ModelType(); - const Type weight_type = args.WeightType(); - switch (weight_type) { - case Type::kF32: - HWY_EXPORT_AND_DYNAMIC_DISPATCH_T(CompressWeights) - (args.weights, args.compressed_weights, model_type, weight_type, - args.PromptWrappingType(), args.tokenizer, pool); - break; - case Type::kBF16: - HWY_EXPORT_AND_DYNAMIC_DISPATCH_T(CompressWeights) - (args.weights, args.compressed_weights, model_type, weight_type, - args.PromptWrappingType(), args.tokenizer, pool); - break; - case Type::kSFP: - HWY_EXPORT_AND_DYNAMIC_DISPATCH_T(CompressWeights) - (args.weights, args.compressed_weights, model_type, weight_type, - args.PromptWrappingType(), args.tokenizer, pool); - break; - case Type::kNUQ: - HWY_EXPORT_AND_DYNAMIC_DISPATCH_T(CompressWeights) - (args.weights, args.compressed_weights, model_type, weight_type, - args.PromptWrappingType(), args.tokenizer, pool); - break; - default: - HWY_ABORT("Weight type %d unsupported.", static_cast(weight_type)); - } -} - -} // namespace gcpp - -int main(int argc, char** argv) { - gcpp::Args args(argc, argv); - - if (gcpp::HasHelp(argc, argv)) { - gcpp::ShowHelp(args); - return 0; - } - - if (const char* error = args.Validate()) { - gcpp::ShowHelp(args); - HWY_ABORT("\nInvalid args: %s", error); - } - - gcpp::Run(args); - - return 0; -} - -#endif // HWY_ONCE diff --git a/compression/convert_weights.py b/compression/convert_weights.py deleted file mode 100644 index 3ba1642..0000000 --- a/compression/convert_weights.py +++ /dev/null @@ -1,209 +0,0 @@ -# Copyright 2024 Google LLC -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Converts pytorch to f32 for use by compress_weights.cc.""" - -import argparse -import collections -import os -from gemma import config -from gemma import model as gemma_model -import numpy as np -import torch - -# Requires torch 2.2 and gemma package from -# https://github.com/google/gemma_pytorch - - -def check_file_exists(value): - if not os.path.exists(str(value)): - raise argparse.ArgumentTypeError( - "The file %s does not appear to exist." % value - ) - return value - - -def check_model_types(value): - if str(value).lower() not in ["2b", "7b"]: - raise argparse.ArgumentTypeError( - "Model type value %s is not in [2b, 7b]." 
% value - ) - return value - - -parser = argparse.ArgumentParser() -parser.add_argument( - "--tokenizer", - dest="tokenizer", - default="models/tokenizer.spm", - help="Location of tokenizer file (.model or .spm)", - type=check_file_exists, -) - -parser.add_argument( - "--weights", - dest="weights", - default="models/gemma-2b-it.ckpt", - help="Location of input checkpoint file (.ckpt)", - type=check_file_exists, -) - -parser.add_argument( - "--output_file", - dest="output_file", - default="2bit-f32.sbs", - help="Location to write converted weights", - type=str, -) - -parser.add_argument( - "--model_type", - dest="model_type", - default="2b", - help="Model size / type (2b, 7b)", - type=check_model_types, -) - -args = parser.parse_args() - - -TRANSFORMATIONS = { - "2b": collections.defaultdict( - lambda: lambda x: x, - { - "embedder.weight": lambda x: x, - "self_attn.qkv_proj.weight": lambda x: x.reshape((10, 256, 2048)), - "self_attn.o_proj.weight": lambda x: x.reshape( - (2048, 8, 256) - ).transpose([1, 0, 2]), - "mlp.gate_proj.weight": lambda x: x[np.newaxis, :, :], - "mlp.up_proj.weight": lambda x: x[np.newaxis, :, :], - "mlp.down_proj.weight": lambda x: x, - }, - ), - "7b": collections.defaultdict( - lambda: lambda x: x, - { - "embedder.weight": lambda x: x, - "self_attn.qkv_proj.weight": lambda x: x.reshape( - (3, 16, 256, 3072) - ).transpose([1, 0, 2, 3]), - "self_attn.o_proj.weight": lambda x: x.reshape( - (3072, 16, 256) - ).transpose([1, 0, 2]), - "mlp.gate_proj.weight": lambda x: x[np.newaxis, :, :], - "mlp.up_proj.weight": lambda x: x[np.newaxis, :, :], - "mlp.down_proj.weight": lambda x: x, - }, - ), -} - -VALIDATIONS = { - "2b": { - "embedder.weight": lambda x: x.shape == (256000, 2048), - "model.norm.weight": lambda x: x.shape == (2048,), - "self_attn.qkv_proj.weight": lambda x: x.shape == (10, 256, 2048), - "self_attn.o_proj.weight": lambda x: x.shape == (8, 2048, 256), - "mlp.gate_proj.weight": lambda x: x.shape == (1, 16384, 2048), - "mlp.up_proj.weight": lambda x: x.shape == (1, 16384, 2048), - "mlp.down_proj.weight": lambda x: x.shape == (2048, 16384), - "input_layernorm.weight": lambda x: x.shape == (2048,), - "post_attention_layernorm.weight": lambda x: x.shape == (2048,), - }, - "7b": { - "embedder.weight": lambda x: x.shape == (256000, 3072), - "model.norm.weight": lambda x: x.shape == (3072,), - "self_attn.qkv_proj.weight": lambda x: x.shape == (16, 3, 256, 3072), - "self_attn.o_proj.weight": lambda x: x.shape == (16, 3072, 256), - "mlp.gate_proj.weight": lambda x: x.shape == (1, 24576, 3072), - "mlp.up_proj.weight": lambda x: x.shape == (1, 24576, 3072), - "mlp.down_proj.weight": lambda x: x.shape == (3072, 24576), - "input_layernorm.weight": lambda x: x.shape == (3072,), - "post_attention_layernorm.weight": lambda x: x.shape == (3072,), - }, -} - - -def param_names(num_hidden_layers: int): - """Return parameter names in the order they are expected for deserialization.""" - - # note *weight_scaler params are ignored in the forward computation unless - # quantization is being used. - # - # since we are working with the full precision weights as input, don't - # include these in the parameters being iterated over. 
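#
# For example, with num_hidden_layers=1 the returned order is:
#   embedder.weight, model.norm.weight,
#   model.layers.0.self_attn.o_proj.weight,
#   model.layers.0.self_attn.qkv_proj.weight,
#   model.layers.0.mlp.gate_proj.weight,
#   model.layers.0.mlp.up_proj.weight,
#   model.layers.0.mlp.down_proj.weight,
#   model.layers.0.input_layernorm.weight,
#   model.layers.0.post_attention_layernorm.weight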
- - names = [ - ("embedder.weight",) * 2, # embedder_input_embedding - ("model.norm.weight",) * 2, # final_norm_scale - ] - layer_params = [ - "self_attn.o_proj.weight", # attn_vec_einsum_w - "self_attn.qkv_proj.weight", # qkv_einsum_w - "mlp.gate_proj.weight", # gating_einsum_w - "mlp.up_proj.weight", - "mlp.down_proj.weight", # linear_w - "input_layernorm.weight", # pre_attention_norm_scale - "post_attention_layernorm.weight", # pre_ffw_norm_scale - ] - for layer in range(num_hidden_layers): - for layer_param in layer_params: - names = names + [(f"model.layers.{layer}.{layer_param}", layer_param)] - return names - - -def convert_weights(): - """Main function; loads weights, runs transformations, writes f32.""" - model_type = args.model_type - output_file = args.output_file - - model_config = config.get_model_config(model_type) - model_config.dtype = "float32" - model_config.tokenizer = args.tokenizer - device = torch.device("cpu") - torch.set_default_dtype(torch.float) - model = gemma_model.GemmaForCausalLM(model_config) - - model.load_weights(args.weights) - model.to(device).eval() - - model_dict = dict(model.named_parameters()) - param_order = param_names(model_config.num_hidden_layers) - - all_ok = True - print("Checking transformations ...") - for name, layer_name in param_order: - arr = model_dict[name].detach().numpy() - arr = TRANSFORMATIONS[model_type][layer_name](arr) - check = "OK" if VALIDATIONS[model_type][layer_name](arr) else "FAILED" - - if check == "FAILED": - all_ok = False - print(f" {name : <60}{str(arr.shape) : <20}{check}") - - if all_ok: - print("Writing parameters ...") - with open(output_file, "wb") as bin_handle: - for name, layer_name in param_order: - arr = model_dict[name].detach().numpy() - arr = TRANSFORMATIONS[model_type][layer_name](arr) - check = "OK" if VALIDATIONS[model_type][layer_name](arr) else "FAILED" - print(f" {name : <60}{str(arr.shape) : <20}{check}") - arr.flatten().astype(np.float32).tofile(bin_handle) - - -if __name__ == "__main__": - convert_weights() - print("Done") diff --git a/compression/distortion_test.cc b/compression/distortion_test.cc index 9350b5b..c52ecca 100644 --- a/compression/distortion_test.cc +++ b/compression/distortion_test.cc @@ -17,7 +17,7 @@ #include -#include "compression/shared.h" // SfpStream::kMax +#include "compression/types.h" // SfpStream::kMax #include "util/test_util.h" #include "hwy/nanobenchmark.h" #include "hwy/tests/hwy_gtest.h" diff --git a/compression/io.cc b/compression/io.cc deleted file mode 100644 index 84e3603..0000000 --- a/compression/io.cc +++ /dev/null @@ -1,121 +0,0 @@ -// Copyright 2024 Google LLC -// SPDX-License-Identifier: Apache-2.0 -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// https://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// Safe to be first, does not include POSIX headers. -#include "hwy/detect_compiler_arch.h" -// Only compile this file on non-Windows; it replaces io_win.cc. It is easier to -// check this in source code because we support multiple build systems. 
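// (The _XOPEN_SOURCE, _POSIX_C_SOURCE and _FILE_OFFSET_BITS definitions below
// must appear before the first system header is included in order to take
// effect, which is why this block precedes all POSIX includes.)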
-#if !HWY_OS_WIN - -// Request POSIX 2008, including `pread()` and `posix_fadvise()`. -#if !defined(_XOPEN_SOURCE) || _XOPEN_SOURCE < 700 -#undef _XOPEN_SOURCE -#define _XOPEN_SOURCE 700 -#endif -#if !defined(_POSIX_C_SOURCE) || _POSIX_C_SOURCE < 200809 -#define _POSIX_C_SOURCE 200809 -#endif - -// Make `off_t` 64-bit even on 32-bit systems. Works for Android >= r15c. -#undef _FILE_OFFSET_BITS -#define _FILE_OFFSET_BITS 64 - -#include // open -#include -#include -#include // SEEK_END - unistd isn't enough for IDE. -#include // O_RDONLY -#include // read, write, close - -#include - -#include "compression/io.h" -#include "hwy/base.h" // HWY_ASSERT - -namespace gcpp { - -class FilePosix : public File { - int fd_ = 0; - - public: - explicit FilePosix(int fd) : fd_(fd) { HWY_ASSERT(fd > 0); } - ~FilePosix() override { - if (fd_ != 0) { - HWY_ASSERT(close(fd_) != -1); - } - } - - uint64_t FileSize() const override { - static_assert(sizeof(off_t) == 8, "64-bit off_t required"); - const off_t size = lseek(fd_, 0, SEEK_END); - if (size < 0) { - return 0; - } - return static_cast(size); - } - - bool Read(uint64_t offset, uint64_t size, void* to) const override { - uint8_t* bytes = reinterpret_cast(to); - uint64_t pos = 0; - for (;;) { - // pread seems to be faster than lseek + read when parallelized. - const auto bytes_read = pread(fd_, bytes + pos, size - pos, offset + pos); - if (bytes_read <= 0) break; - pos += bytes_read; - HWY_ASSERT(pos <= size); - if (pos == size) break; - } - return pos == size; // success if managed to read desired size - } - - bool Write(const void* from, uint64_t size, uint64_t offset) override { - const uint8_t* bytes = reinterpret_cast(from); - uint64_t pos = 0; - for (;;) { - const auto bytes_written = - pwrite(fd_, bytes + pos, size - pos, offset + pos); - if (bytes_written <= 0) break; - pos += bytes_written; - HWY_ASSERT(pos <= size); - if (pos == size) break; - } - return pos == size; // success if managed to write desired size - } -}; // FilePosix - -HWY_MAYBE_UNUSED extern std::unique_ptr OpenFileGoogle( - const Path& filename, const char* mode); - -std::unique_ptr OpenFileOrNull(const Path& filename, const char* mode) { - std::unique_ptr file; // OpenFileGoogle omitted - if (file) return file; - - const bool is_read = mode[0] != 'w'; - const int flags = is_read ? O_RDONLY : O_CREAT | O_RDWR | O_TRUNC; - const int fd = open(filename.path.c_str(), flags, 0644); - if (fd < 0) return file; - -#if HWY_OS_LINUX && (!defined(__ANDROID_API__) || __ANDROID_API__ >= 21) - if (is_read) { - // Doubles the readahead window, which seems slightly faster when cached. - (void)posix_fadvise(fd, 0, 0, POSIX_FADV_SEQUENTIAL); - } -#endif - - return std::make_unique(fd); -} - -} // namespace gcpp -#endif // !HWY_OS_WIN diff --git a/compression/nuq-inl.h b/compression/nuq-inl.h index 150ad79..997bb5b 100644 --- a/compression/nuq-inl.h +++ b/compression/nuq-inl.h @@ -23,7 +23,7 @@ #include -#include "compression/shared.h" +#include "compression/types.h" #include "util/basics.h" #include "hwy/base.h" diff --git a/compression/nuq_test.cc b/compression/nuq_test.cc index 6dd5982..df300f4 100644 --- a/compression/nuq_test.cc +++ b/compression/nuq_test.cc @@ -13,20 +13,20 @@ // See the License for the specific language governing permissions and // limitations under the License. -// SFP uses ConcatEven/Odd which are not supported; skip SVE for faster tests. 
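One detail of the POSIX reader deleted above is worth keeping in mind: `pread` may legally return fewer bytes than requested, hence the retry loops in `FilePosix::Read` and `Write`. The same pattern sketched in Python via `os.pread` (POSIX-only, and assuming `fd` is an already-open read descriptor):

    import os

    def read_all(fd: int, offset: int, size: int) -> bytes:
        # Mirrors FilePosix::Read: retry until `size` bytes have arrived;
        # an empty result (EOF or error) aborts, and success means we got
        # exactly the requested size.
        parts, pos = [], 0
        while pos < size:
            chunk = os.pread(fd, size - pos, offset + pos)
            if not chunk:
                break
            parts.append(chunk)
            pos += len(chunk)
        return b"".join(parts) if pos == size else b""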
+#include "compression/types.h" #ifndef HWY_DISABLED_TARGETS -#define HWY_DISABLED_TARGETS (HWY_SCALAR | HWY_SVE) -#endif +#define HWY_DISABLED_TARGETS GEMMA_DISABLED_TARGETS +#endif // HWY_DISABLED_TARGETS #include #include #include #include // std::shuffle +#include #include #include "compression/distortion.h" -#include "compression/shared.h" #include "util/test_util.h" #include "hwy/aligned_allocator.h" #include "hwy/base.h" @@ -104,7 +104,7 @@ struct TestPlateaus { HWY_ASSERT(-0.5f <= in[i] && in[i] < 0.5f); } - std::random_device rd; + std::random_device rd; // NOLINT std::mt19937 rng(rd()); std::shuffle(in.get(), in.get() + kGroupSize, rng); @@ -151,7 +151,7 @@ struct TestRamp { HWY_ASSERT(-0.45f <= in[i] && in[i] < 0.55f); } - std::random_device rd; + std::random_device rd; // NOLINT std::mt19937 rng(rd()); std::shuffle(in.get(), in.get() + kGroupSize, rng); @@ -246,7 +246,8 @@ struct TestOffset { auto in = hwy::AllocateAligned(total); // Enc() requires f32 auto dec1 = hwy::AllocateAligned(total); auto dec2 = hwy::AllocateAligned(kMidLen); - auto nuq = hwy::AllocateAligned(NuqStream::PackedEnd(total)); + auto nuq = hwy::AllocateAligned( + hwy::RoundUpTo(NuqStream::PackedEnd(total), hwy::VectorBytes())); HWY_ASSERT(in && dec1 && dec2 && nuq); const auto nuq_span = MakeSpan(nuq.get(), total); @@ -296,7 +297,8 @@ struct TestUnalignedOffset { auto in = hwy::AllocateAligned(total); // Enc() requires f32 auto dec1 = hwy::AllocateAligned(total); - auto nuq = hwy::AllocateAligned(NuqStream::PackedEnd(total)); + auto nuq = hwy::AllocateAligned( + hwy::RoundUpTo(NuqStream::PackedEnd(total), hwy::VectorBytes())); auto dec2 = hwy::AllocateAligned(num_decompressed); HWY_ASSERT(in && dec1 && dec2 && nuq); const auto nuq_span = MakeSpan(nuq.get(), total); @@ -347,7 +349,8 @@ struct TestDec2 { auto dec0 = hwy::AllocateAligned(total); auto dec1 = hwy::AllocateAligned(total); auto dec2 = hwy::AllocateAligned(kMidLen); - auto nuq = hwy::AllocateAligned(NuqStream::PackedEnd(total)); + auto nuq = hwy::AllocateAligned( + hwy::RoundUpTo(NuqStream::PackedEnd(total), hwy::VectorBytes())); HWY_ASSERT(in && dec0 && dec1 && dec2 && nuq); const auto nuq_span = MakeSpan(nuq.get(), total); @@ -449,7 +452,8 @@ struct TestEncDec { const size_t num = 4 * kGroupSize; auto in = hwy::AllocateAligned(num); // Enc() requires f32 auto out = hwy::AllocateAligned(num); // already padded - auto nuq = hwy::AllocateAligned(NuqStream::PackedEnd(num)); + auto nuq = hwy::AllocateAligned( + hwy::RoundUpTo(NuqStream::PackedEnd(num), hwy::VectorBytes())); HWY_ASSERT(in && out && nuq); const auto nuq_span = MakeSpan(nuq.get(), num); @@ -512,6 +516,7 @@ HWY_AFTER_NAMESPACE(); #if HWY_ONCE namespace gcpp { HWY_BEFORE_TEST(NuqTest); +#if GEMMA_ENABLE_NUQ HWY_EXPORT_AND_TEST_P(NuqTest, TestAllFlat); HWY_EXPORT_AND_TEST_P(NuqTest, TestAllPlateaus); HWY_EXPORT_AND_TEST_P(NuqTest, TestAllRamp); @@ -525,6 +530,9 @@ HWY_EXPORT_AND_TEST_P(NuqTest, TestUnalignedOffsetF32); HWY_EXPORT_AND_TEST_P(NuqTest, TestAllNibble); HWY_EXPORT_AND_TEST_P(NuqTest, TestEncDecBF16); HWY_EXPORT_AND_TEST_P(NuqTest, TestEncDecF32); +#else +GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(NuqTest); +#endif // GEMMA_ENABLE_NUQ HWY_AFTER_TEST(); } // namespace gcpp #endif // HWY_ONCE diff --git a/compression/python/BUILD.bazel b/compression/python/BUILD.bazel index 8bfb391..474511c 100644 --- a/compression/python/BUILD.bazel +++ b/compression/python/BUILD.bazel @@ -14,11 +14,16 @@ cc_library( hdrs = ["compression_clif_aux.h"], visibility = ["//visibility:private"], deps 
= [ - "@abseil-cpp//absl/types:span", - "//:common", + "//:basics", + "//:configs", + "//:mat", + "//:model_store", + "//:tensor_info", + "//:threading_context", "//:tokenizer", "//compression:compress", - "//compression:io", + "//io", + "//io:blob_store", "@highway//:hwy", "@highway//:thread_pool", ], @@ -29,9 +34,9 @@ pybind_extension( srcs = ["compression_extension.cc"], deps = [ ":compression_clif_aux", - "@abseil-cpp//absl/types:span", - "//:common", - "//compression:sfp", + "//:mat", + "//:tensor_info", + "//compression:types", ], ) diff --git a/compression/python/compression_clif_aux.cc b/compression/python/compression_clif_aux.cc index 2705756..2de1b67 100644 --- a/compression/python/compression_clif_aux.cc +++ b/compression/python/compression_clif_aux.cc @@ -15,14 +15,29 @@ #include "compression/python/compression_clif_aux.h" -#include -#include +#include "compression/types.h" // GEMMA_DISABLED_TARGETS +#ifndef HWY_DISABLED_TARGETS +#define HWY_DISABLED_TARGETS GEMMA_DISABLED_TARGETS +#endif // HWY_DISABLED_TARGETS + +#include +#include +#include + #include #include -#include "compression/compress.h" -#include "compression/shared.h" -#include "hwy/aligned_allocator.h" +#include "compression/compress.h" // ScaleWeights +#include "gemma/configs.h" // ModelConfig +#include "gemma/model_store.h" // ModelStore +#include "gemma/tensor_info.h" // TensorInfo +#include "gemma/tokenizer.h" +#include "io/blob_store.h" // BlobWriter +#include "io/io.h" // Path +#include "util/basics.h" +#include "util/mat.h" +#include "util/threading_context.h" +#include "hwy/contrib/thread_pool/thread_pool.h" #undef HWY_TARGET_INCLUDE #define HWY_TARGET_INCLUDE \ @@ -32,157 +47,97 @@ // After highway.h #include "compression/compress-inl.h" -// Non-SIMD includes and types. Note that HWY_ONCE is only true on the last -// compile pass, whereas we want this defined in the first. -#ifndef GEMMA_ONCE -#define GEMMA_ONCE - -#include "absl/types/span.h" -#include "compression/io.h" -#include "gemma/configs.h" -#include "gemma/tensor_index.h" -#include "gemma/tokenizer.h" -#include "hwy/base.h" -#include "hwy/contrib/thread_pool/thread_pool.h" - -namespace gcpp { - -class WriterInterface { - public: - virtual ~WriterInterface() = default; - - virtual void Insert(std::string name, absl::Span weights, - Type type, const TensorInfo& tensor_info, - float scale) = 0; - virtual void InsertSfp(std::string name, absl::Span weights) = 0; - virtual void InsertNUQ(std::string name, absl::Span weights) = 0; - virtual void InsertBfloat16(std::string name, - absl::Span weights) = 0; - virtual void InsertFloat(std::string name, - absl::Span weights) = 0; - virtual void AddScales(const std::vector& scales) = 0; - virtual void AddTokenizer(const std::string& tokenizer_path) = 0; - - virtual size_t DebugNumBlobsAdded() const = 0; - - virtual int WriteWithConfig(std::string path, const ModelConfig* config) = 0; -}; - -} // namespace gcpp - -#endif // GEMMA_ONCE - // SIMD code, compiled once per target. HWY_BEFORE_NAMESPACE(); namespace gcpp { namespace HWY_NAMESPACE { -class SbsWriterImpl : public WriterInterface { +// Implementation for the currently compiled SIMD target. 
+class SbsWriterImpl : public ISbsWriter { template - void AllocateAndCompress(const std::string& name, - absl::Span weights) { - MatPtrT storage(name, 1, weights.size()); - model_memory_.push_back(storage); - model_memory_.back().Allocate(); - storage.SetPtr(model_memory_.back()); - std::string decorated_name = storage.CacheName(); - compressor_(&storage, decorated_name.c_str(), weights.data()); - } - template - void AllocateWithShape(const std::string& name, - absl::Span weights, - const TensorInfo& tensor_info, float scale) { - MatPtrT storage(name, &tensor_info); - storage.set_scale(scale); + void InsertT(const char* name, F32Span weights, + const TensorInfo& tensor_info) { + // TODO(janwas): 1D parallel-for. + hwy::ThreadPool& pool = ctx_.pools.Pool(); - // Don't reset num_elements for NUQ. - if (!hwy::IsSame, NuqStream>()) { - storage.SetNumElements(CompressedArrayElements(weights.size())); + MatPtrT mat(name, ExtentsFromInfo(&tensor_info)); + // SFP and NUQ (which uses SFP for cluster centers) have a limited range + // and depending on the input values may require rescaling. Scaling is + // cheap for matmul and probably not an issue for other ops, but it might be + // beneficial for precision to keep the original data range for other types. + if (mat.GetType() == Type::kSFP || mat.GetType() == Type::kNUQ) { + mat.SetScale(ScaleWeights(weights.data(), weights.size())); } - model_memory_.push_back(storage); - if (mode_ == CompressorMode::kTEST_ONLY) return; - model_memory_.back().Allocate(); - storage.SetPtr(model_memory_.back()); - std::string decorated_name = storage.CacheName(); - compressor_(&storage, decorated_name.c_str(), weights.data()); + if (weights.size() == 0) { + HWY_WARN("Ignoring zero-sized tensor %s.", name); + return; + } + + mat.AppendTo(serialized_mat_ptrs_); + MatOwner mat_owner; + mat_owner.AllocateFor(mat, ctx_.allocator, MatPadding::kPacked); + + // Handle gemma_export_test's MockArray. Write blobs so that the test + // succeeds, but we only have 10 floats, not the full tensor. 
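The scale set above follows the `ScaleWeights` contract (per the include comment above it now lives in compression/compress.h; the scalar reference version is deleted from shared.h later in this diff): if any magnitude exceeds `SfpStream::kMax` (1.875, as implied by the Python test below), all values are divided down in place so the maximum fits, and the divisor is returned and stored as the tensor's scale. A numpy sketch of that contract, reusing the test's input values:

    import numpy as np

    SFP_KMAX = 1.875  # SfpStream::kMax

    def scale_weights(raw: np.ndarray) -> float:
        # Returns 1.0 if everything already fits the SFP range; otherwise
        # rescales in place and returns the multiplier that restores the
        # original magnitudes.
        maxabs = float(np.abs(raw).max())
        if maxabs <= SFP_KMAX:
            return 1.0
        scale = maxabs / SFP_KMAX
        raw *= np.float32(1.0 / scale)
        np.clip(raw, -SFP_KMAX, SFP_KMAX, out=raw)  # clamp residual excess
        return scale

    w = np.array([3.0012] * 128 + [4.001] * 64, dtype=np.float32)
    assert abs(scale_weights(w) - 4.001 / 1.875) < 1e-5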
+ if (weights.size() == 10 && mat.Extents().Area() != 10) { + Compress(weights.data(), weights.size(), working_set_, mat.Span(), + /*packed_ofs=*/0, pool); + writer_.Add(name, mat.Packed(), mat.ElementBytes() * 10); + return; + } + + fprintf(stderr, "Compressing %s (%zu x %zu = %zuM) to %s, please wait\n", + name, mat.Rows(), mat.Cols(), weights.size() / (1000 * 1000), + TypeName(TypeEnum())); + HWY_ASSERT(weights.size() == mat.Extents().Area()); + Compress(weights.data(), weights.size(), working_set_, mat.Span(), + /*packed_ofs=*/0, pool); + writer_.Add(name, mat.Packed(), mat.PackedBytes()); } public: - explicit SbsWriterImpl(CompressorMode mode) - : pool_(0), compressor_(pool_), mode_(mode) {} + SbsWriterImpl(const std::string& sbs_path) + : ctx_(ThreadingArgs()), + writer_(gcpp::Path(sbs_path), ctx_.pools.Pool()) {} - void Insert(std::string name, absl::Span weights, Type type, - const TensorInfo& tensor_info, float scale) override { + void Insert(const char* name, F32Span weights, Type type, + const TensorInfo& tensor_info) override { switch (type) { case Type::kSFP: - AllocateWithShape(name, weights, tensor_info, scale); + InsertT(name, weights, tensor_info); break; case Type::kNUQ: - AllocateWithShape(name, weights, tensor_info, scale); + InsertT(name, weights, tensor_info); break; case Type::kBF16: - AllocateWithShape(name, weights, tensor_info, scale); + InsertT(name, weights, tensor_info); break; case Type::kF32: - AllocateWithShape(name, weights, tensor_info, scale); + InsertT(name, weights, tensor_info); break; default: - HWY_ABORT("Unsupported type"); + HWY_ABORT("Unsupported destination (compressed) type %s", + TypeName(type)); } } - void InsertSfp(std::string name, absl::Span weights) override { - AllocateAndCompress(name, weights); + void Write(const ModelConfig& config, + const std::string& tokenizer_path) override { + const GemmaTokenizer tokenizer( + tokenizer_path.empty() ? kMockTokenizer + : ReadFileToString(Path(tokenizer_path))); + WriteSingleFile(config, tokenizer, serialized_mat_ptrs_, writer_); } - void InsertNUQ(std::string name, absl::Span weights) override { - AllocateAndCompress(name, weights); - } - - void InsertBfloat16(std::string name, - absl::Span weights) override { - AllocateAndCompress(name, weights); - } - - void InsertFloat(std::string name, absl::Span weights) override { - AllocateAndCompress(name, weights); - } - - void AddScales(const std::vector& scales) override { - HWY_ASSERT(scales_.empty()); - scales_ = scales; - compressor_.AddScales(scales_.data(), scales_.size()); - } - - void AddTokenizer(const std::string& tokenizer_path) override { - Path path(tokenizer_path); - GemmaTokenizer tokenizer(path); - std::string tokenizer_proto = tokenizer.Serialize(); - HWY_ASSERT(!tokenizer_proto.empty()); - compressor_.AddTokenizer(tokenizer_proto); - } - - // Returns the number of blobs added. 
- size_t DebugNumBlobsAdded() const { - if (mode_ == CompressorMode::kTEST_ONLY) return model_memory_.size(); - return compressor_.DebugNumBlobsAdded(); - } - - int WriteWithConfig(std::string path, const ModelConfig* config) override { - return compressor_.WriteAll(gcpp::Path(path), config); - } - - hwy::ThreadPool pool_; - Compressor compressor_; + ThreadingContext ctx_; CompressWorkingSet working_set_; - std::vector model_memory_; - std::vector scales_; - CompressorMode mode_; + BlobWriter writer_; + std::vector serialized_mat_ptrs_; }; -WriterInterface* NewSbsWriter(CompressorMode mode) { - return new SbsWriterImpl(mode); +ISbsWriter* NewSbsWriter(const std::string& sbs_path) { + return new SbsWriterImpl(sbs_path); } } // namespace HWY_NAMESPACE @@ -194,43 +149,11 @@ namespace gcpp { HWY_EXPORT(NewSbsWriter); -SbsWriter::SbsWriter(CompressorMode mode) - : impl_(HWY_DYNAMIC_DISPATCH(NewSbsWriter)(mode)) {} -SbsWriter::~SbsWriter() = default; +SbsWriter::SbsWriter(const std::string& path) + : impl_(HWY_DYNAMIC_DISPATCH(NewSbsWriter)(path)) {} -void SbsWriter::Insert(std::string name, absl::Span weights, - Type type, const TensorInfo& tensor_info, float scale) { - impl_->Insert(name, weights, type, tensor_info, scale); -} -void SbsWriter::InsertSfp(std::string name, absl::Span weights) { - impl_->InsertSfp(name, weights); -} -void SbsWriter::InsertNUQ(std::string name, absl::Span weights) { - impl_->InsertNUQ(name, weights); -} -void SbsWriter::InsertBfloat16(std::string name, - absl::Span weights) { - impl_->InsertBfloat16(name, weights); -} -void SbsWriter::InsertFloat(std::string name, absl::Span weights) { - impl_->InsertFloat(name, weights); -} - -void SbsWriter::AddScales(const std::vector& scales) { - impl_->AddScales(scales); -} - -void SbsWriter::AddTokenizer(const std::string& tokenizer_path) { - impl_->AddTokenizer(tokenizer_path); -} - -size_t SbsWriter::DebugNumBlobsAdded() const { - return impl_->DebugNumBlobsAdded(); -} - -int SbsWriter::WriteWithConfig(std::string path, const ModelConfig* config) { - return impl_->WriteWithConfig(path, config); -} +SbsReader::SbsReader(const std::string& path) + : reader_(Path(path)), model_(reader_) {} } // namespace gcpp #endif // HWY_ONCE diff --git a/compression/python/compression_clif_aux.h b/compression/python/compression_clif_aux.h index 4ea5b16..6979865 100644 --- a/compression/python/compression_clif_aux.h +++ b/compression/python/compression_clif_aux.h @@ -16,52 +16,67 @@ #ifndef THIRD_PARTY_GEMMA_CPP_COMPRESSION_PYTHON_COMPRESSION_CLIF_AUX_H_ #define THIRD_PARTY_GEMMA_CPP_COMPRESSION_PYTHON_COMPRESSION_CLIF_AUX_H_ -#include +#include + #include #include -#include -#include "absl/types/span.h" -#include "compression/shared.h" +#include "compression/types.h" // Type #include "gemma/configs.h" -#include "gemma/tensor_index.h" +#include "gemma/model_store.h" +#include "gemma/tensor_info.h" +#include "io/blob_store.h" +#include "util/mat.h" +#include "hwy/aligned_allocator.h" // Span namespace gcpp { -// How to process the data. -enum class CompressorMode { - // No compression, no write to file, just for testing. - kTEST_ONLY, - // Old-style compression, no table of contents. - kNO_TOC, - // New-style compression, with table of contents. - kWITH_TOC, +// Can be modified in place by ScaleWeights. +using F32Span = hwy::Span; + +// Interface because we compile one derived implementation per SIMD target, +// because Compress() uses SIMD. 
+class ISbsWriter { + public: + virtual ~ISbsWriter() = default; + + virtual void Insert(const char* name, F32Span weights, Type type, + const TensorInfo& tensor_info) = 0; + + virtual void Write(const ModelConfig& config, + const std::string& tokenizer_path) = 0; }; -class WriterInterface; - +// Non-virtual class used by pybind that calls the interface's virtual methods. +// This avoids having to register the derived types with pybind. class SbsWriter { public: - explicit SbsWriter(CompressorMode mode); - ~SbsWriter(); + explicit SbsWriter(const std::string& sbs_path); - void Insert(std::string name, absl::Span weights, Type type, - const TensorInfo& tensor_info, float scale); - void InsertSfp(std::string name, absl::Span weights); - void InsertNUQ(std::string name, absl::Span weights); - void InsertBfloat16(std::string name, absl::Span weights); - void InsertFloat(std::string name, absl::Span weights); - void AddScales(const std::vector& scales); - void AddTokenizer(const std::string& tokenizer_path); + void Insert(const char* name, F32Span weights, Type type, + const TensorInfo& tensor_info) { + impl_->Insert(name, weights, type, tensor_info); + } - size_t DebugNumBlobsAdded() const; - - int Write(std::string path) { return WriteWithConfig(path, nullptr); } - int WriteWithConfig(std::string path, const ModelConfig* config); + void Write(const ModelConfig& config, const std::string& tokenizer_path) { + impl_->Write(config, tokenizer_path); + } private: - // Isolates Highway-dispatched types and other internals from CLIF. - std::unique_ptr impl_; + std::unique_ptr impl_; +}; + +// Limited metadata-only reader for tests. +class SbsReader { + public: + SbsReader(const std::string& path); + + const ModelConfig& Config() const { return model_.Config(); } + const MatPtr* FindMat(const char* name) const { return model_.FindMat(name); } + + private: + gcpp::BlobReader reader_; + gcpp::ModelStore model_; }; } // namespace gcpp diff --git a/compression/python/compression_extension.cc b/compression/python/compression_extension.cc index c873a23..e3d1556 100644 --- a/compression/python/compression_extension.cc +++ b/compression/python/compression_extension.cc @@ -15,58 +15,54 @@ #include #include -#include -#include #include -#include "absl/types/span.h" #include "compression/python/compression_clif_aux.h" -#include "compression/shared.h" +#include "compression/types.h" // Type +#include "gemma/tensor_info.h" +#include "util/mat.h" -using gcpp::CompressorMode; +using gcpp::MatPtr; +using gcpp::SbsReader; using gcpp::SbsWriter; -namespace py = pybind11; +namespace pybind11 { -namespace { template -void wrap_span(SbsWriter& writer, std::string name, py::array_t data) { +static void CallWithF32Span(SbsWriter& writer, const char* name, + array_t data, gcpp::Type type, + const gcpp::TensorInfo& tensor_info) { if (data.ndim() != 1 || data.strides(0) != sizeof(float)) { - throw std::domain_error("Input array must be 1D and densely packed."); + HWY_ABORT("Input array must be 1D (not %d) and contiguous floats.", + static_cast(data.ndim())); } - std::invoke(Func, writer, name, absl::MakeSpan(data.data(0), data.size())); + std::invoke(Func, writer, name, + gcpp::F32Span(data.mutable_data(0), data.size()), type, + tensor_info); } -template -void wrap_span_typed(SbsWriter& writer, std::string name, - py::array_t data, gcpp::Type type, - gcpp::TensorInfo tensor_info, float scale) { - if (data.ndim() != 1 || data.strides(0) != sizeof(float)) { - throw std::domain_error("Input array must be 1D and densely packed."); 
- } - std::invoke(Func, writer, name, absl::MakeSpan(data.data(0), data.size()), - type, tensor_info, scale); -} -} // namespace PYBIND11_MODULE(compression, m) { - py::enum_(m, "CompressorMode") - .value("TEST_ONLY", CompressorMode::kTEST_ONLY) - .value("NO_TOC", CompressorMode::kNO_TOC) - .value("WITH_TOC", CompressorMode::kWITH_TOC); + class_(m, "SbsWriter") + .def(init()) + .def("insert", CallWithF32Span<&SbsWriter::Insert>) + .def("write", &SbsWriter::Write, arg("config"), arg("tokenizer_path")); - py::class_(m, "SbsWriter") - .def(py::init()) - // NOTE: Individual compression backends may impose constraints on the - // array length, such as a minimum of (say) 32 elements. - .def("insert", wrap_span_typed<&SbsWriter::Insert>) - .def("insert_sfp", wrap_span<&SbsWriter::InsertSfp>) - .def("insert_nuq", wrap_span<&SbsWriter::InsertNUQ>) - .def("insert_bf16", wrap_span<&SbsWriter::InsertBfloat16>) - .def("insert_float", wrap_span<&SbsWriter::InsertFloat>) - .def("add_scales", &SbsWriter::AddScales) - .def("add_tokenizer", &SbsWriter::AddTokenizer) - .def("debug_num_blobs_added", &SbsWriter::DebugNumBlobsAdded) - .def("write", &SbsWriter::Write) - .def("write_with_config", &SbsWriter::WriteWithConfig); + class_(m, "MatPtr") + // No init, only created within C++. + .def_property_readonly("rows", &MatPtr::Rows, "Number of rows") + .def_property_readonly("cols", &MatPtr::Cols, "Number of cols") + .def_property_readonly("type", &MatPtr::GetType, "Element type") + .def_property_readonly("scale", &MatPtr::Scale, "Scaling factor"); + + class_(m, "SbsReader") + .def(init()) + .def_property_readonly("config", &SbsReader::Config, + return_value_policy::reference_internal, + "ModelConfig") + .def("find_mat", &SbsReader::FindMat, + return_value_policy::reference_internal, + "Returns MatPtr for given name."); } + +} // namespace pybind11 diff --git a/compression/python/compression_test.py b/compression/python/compression_test.py index fdf00e3..e8244ed 100644 --- a/compression/python/compression_test.py +++ b/compression/python/compression_test.py @@ -25,46 +25,120 @@ from python import configs class CompressionTest(absltest.TestCase): def test_sbs_writer(self): + info_192 = configs.TensorInfo() + info_192.name = "ignored_192" + info_192.axes = [0] + info_192.shape = [192] + temp_file = self.create_tempfile("test.sbs") - tensor_info = configs.TensorInfo() - tensor_info.name = "foo" - tensor_info.axes = [0] - tensor_info.shape = [192] - - writer = compression.SbsWriter(compression.CompressorMode.NO_TOC) + writer = compression.SbsWriter(temp_file.full_path) writer.insert( - "foo", - np.array([0.0012] * 128 + [0.001] * 64, dtype=np.float32), + "tensor0", + # Large enough to require scaling. + np.array([3.0012] * 128 + [4.001] * 64, dtype=np.float32), configs.Type.kSFP, - tensor_info, - 1.0, + info_192, ) - tensor_info_nuq = configs.TensorInfo() - tensor_info_nuq.name = "fooNUQ" - tensor_info_nuq.axes = [0] - tensor_info_nuq.shape = [256] + # 2D tensor. + info_2d = configs.TensorInfo() + info_2d.name = "ignored_2d" + info_2d.axes = [0, 1] + info_2d.shape = [96, 192] writer.insert( - "fooNUQ", + "tensor_2d", + np.array([i / 1e3 for i in range(96 * 192)], dtype=np.float32), + configs.Type.kBF16, + info_2d, + ) + + # 3D collapsed into rows. + info_3d = configs.TensorInfo() + info_3d.name = "ignored_3d" + info_3d.axes = [0, 1, 2] + info_3d.shape = [10, 12, 192] + info_3d.cols_take_extra_dims = False + writer.insert( + "tensor_3d", + # Verification of scale below depends on the shape and multiplier here. 
+ np.array([i / 1e3 for i in range(10 * 12 * 192)], dtype=np.float32), + configs.Type.kSFP, + info_3d, + ) + + # Exercise all types supported by Compress. + info_256 = configs.TensorInfo() + info_256.name = "ignored_256" + info_256.axes = [0] + info_256.shape = [256] + writer.insert( + "tensor_sfp", np.array([0.000375] * 128 + [0.00009] * 128, dtype=np.float32), - configs.Type.kNUQ, - tensor_info_nuq, - 1.0, + configs.Type.kSFP, + info_256, ) - writer.insert_sfp( - "bar", np.array([0.000375] * 128 + [0.00009] * 128, dtype=np.float32) + writer.insert( + "tensor_bf", + np.array([0.000375] * 128 + [0.00007] * 128, dtype=np.float32), + configs.Type.kBF16, + info_256, ) - writer.insert_nuq( - "baz", np.array([0.000125] * 128 + [0.00008] * 128, dtype=np.float32) + writer.insert( + "tensor_f32", + np.array([0.000375] * 128 + [0.00006] * 128, dtype=np.float32), + configs.Type.kF32, + info_256, ) - writer.insert_bf16( - "qux", np.array([0.000375] * 128 + [0.00007] * 128, dtype=np.float32) + + config = configs.ModelConfig( + configs.Model.GEMMA_TINY, + configs.Type.kSFP, + configs.PromptWrapping.GEMMA_IT, ) - writer.insert_float( - "quux", np.array([0.000375] * 128 + [0.00006] * 128, dtype=np.float32) - ) - self.assertEqual(writer.debug_num_blobs_added(), 6) - self.assertEqual(writer.write(temp_file.full_path), 0) + tokenizer_path = "" # no tokenizer required for testing + writer.write(config, tokenizer_path) + + print("Ignore next two warnings; test does not enable model deduction.") + reader = compression.SbsReader(temp_file.full_path) + + self.assertEqual(reader.config.model, configs.Model.GEMMA_TINY) + self.assertEqual(reader.config.weight, configs.Type.kSFP) + + mat = reader.find_mat("tensor0") + self.assertEqual(mat.cols, 192) + self.assertEqual(mat.rows, 1) + self.assertEqual(mat.type, configs.Type.kSFP) + self.assertAlmostEqual(mat.scale, 4.001 / 1.875, places=5) + + mat = reader.find_mat("tensor_2d") + self.assertEqual(mat.cols, 192) + self.assertEqual(mat.rows, 96) + self.assertEqual(mat.type, configs.Type.kBF16) + self.assertAlmostEqual(mat.scale, 1.0) + + mat = reader.find_mat("tensor_3d") + self.assertEqual(mat.cols, 192) + self.assertEqual(mat.rows, 10 * 12) + self.assertEqual(mat.type, configs.Type.kSFP) + self.assertAlmostEqual(mat.scale, 192 * 120 / 1e3 / 1.875, places=2) + + mat = reader.find_mat("tensor_sfp") + self.assertEqual(mat.cols, 256) + self.assertEqual(mat.rows, 1) + self.assertEqual(mat.type, configs.Type.kSFP) + self.assertAlmostEqual(mat.scale, 1.0) + + mat = reader.find_mat("tensor_bf") + self.assertEqual(mat.cols, 256) + self.assertEqual(mat.rows, 1) + self.assertEqual(mat.type, configs.Type.kBF16) + self.assertAlmostEqual(mat.scale, 1.0) + + mat = reader.find_mat("tensor_f32") + self.assertEqual(mat.cols, 256) + self.assertEqual(mat.rows, 1) + self.assertEqual(mat.type, configs.Type.kF32) + self.assertAlmostEqual(mat.scale, 1.0) if __name__ == "__main__": diff --git a/compression/sfp-inl.h b/compression/sfp-inl.h index 1be84e9..dad6536 100644 --- a/compression/sfp-inl.h +++ b/compression/sfp-inl.h @@ -20,7 +20,7 @@ #include #include -#include "compression/shared.h" +#include "compression/types.h" #include "hwy/base.h" #endif // THIRD_PARTY_GEMMA_CPP_COMPRESSION_SFP_INL_H_ diff --git a/compression/sfp_test.cc b/compression/sfp_test.cc index f79e600..8e49ceb 100644 --- a/compression/sfp_test.cc +++ b/compression/sfp_test.cc @@ -13,10 +13,10 @@ // See the License for the specific language governing permissions and // limitations under the License. 
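Taken together, the reworked bindings reduce to an insert/write/read cycle. A minimal usage sketch mirroring the test above; `from python import configs` follows the test, while the compression module path, output path, and tensor name are assumptions for illustration:

    import os
    import tempfile

    import numpy as np
    from compression.python import compression  # module path assumed
    from python import configs

    info = configs.TensorInfo()
    info.name = "ignored"
    info.axes = [0]
    info.shape = [256]

    path = os.path.join(tempfile.gettempdir(), "example.sbs")
    writer = compression.SbsWriter(path)
    writer.insert("w", np.zeros(256, np.float32), configs.Type.kSFP, info)
    config = configs.ModelConfig(configs.Model.GEMMA_TINY, configs.Type.kSFP,
                                 configs.PromptWrapping.GEMMA_IT)
    writer.write(config, "")  # empty tokenizer path selects the mock tokenizer

    reader = compression.SbsReader(path)
    mat = reader.find_mat("w")
    assert (mat.rows, mat.cols) == (1, 256)
    assert mat.type == configs.Type.kSFP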
-// We use ConcatEven/Odd which are not supported. Use HWY_EMU128 instead. +#include "compression/types.h" #ifndef HWY_DISABLED_TARGETS -#define HWY_DISABLED_TARGETS HWY_SCALAR -#endif +#define HWY_DISABLED_TARGETS GEMMA_DISABLED_TARGETS +#endif // HWY_DISABLED_TARGETS #include #include @@ -25,7 +25,6 @@ #include #include "compression/distortion.h" -#include "compression/shared.h" #include "util/test_util.h" #include "hwy/aligned_allocator.h" #include "hwy/base.h" diff --git a/compression/test_util-inl.h b/compression/test_util-inl.h index 860644a..7c4f854 100644 --- a/compression/test_util-inl.h +++ b/compression/test_util-inl.h @@ -18,10 +18,13 @@ #define THIRD_PARTY_GEMMA_CPP_COMPRESSION_TEST_UTIL_INL_H_ // IWYU pragma: begin_exports -#include "compression/compress.h" #include "compression/distortion.h" +#include "util/mat.h" // IWYU pragma: end_exports +#include "compression/compress.h" +#include "hwy/contrib/thread_pool/thread_pool.h" + #endif // THIRD_PARTY_GEMMA_CPP_COMPRESSION_TEST_UTIL_INL_H_ // Include guard for (potentially) SIMD code. @@ -59,7 +62,63 @@ void ForeachPackedAndRawType() { ForeachRawType(); ForeachRawType(); ForeachRawType(); - ForeachRawType(); + if constexpr (GEMMA_ENABLE_NUQ) { + ForeachRawType(); + } +} + +// Generates inputs: deterministic, within max SfpStream range. +template +MatStorageT GenerateMat(const Extents2D& extents, + const Allocator& allocator, MatPadding padding, + hwy::ThreadPool& pool) { + gcpp::CompressWorkingSet ws; + ws.tls.resize(pool.NumWorkers()); + MatStorageT raw("raw", extents, allocator, MatPadding::kPacked); + MatStorageT compressed("mat", extents, allocator, padding); + const float scale = SfpStream::kMax / extents.Area(); + pool.Run(0, extents.rows, [&](const size_t r, size_t thread) { + float* HWY_RESTRICT row = raw.Row(r); + for (size_t c = 0; c < extents.cols; c++) { + float f = static_cast(r * extents.cols + c) * scale; + if ((r + c) & 1) f = -f; // Also generate some negative values. + row[c] = f; + } + Compress(raw.Row(r), raw.Cols(), ws.tls[thread], + MakeSpan(compressed.Row(r), compressed.Cols()), + /*packed_ofs=*/0); + }); + + compressed.SetScale(0.6f); // Arbitrary value, different from 1. + return compressed; +} + +// Same, but `extents` describes the transposed matrix. +template +MatStorageT GenerateTransposedMat(const Extents2D extents, + const Allocator& allocator, + MatPadding padding, + hwy::ThreadPool& pool) { + gcpp::CompressWorkingSet ws; + ws.tls.resize(pool.NumWorkers()); + MatStorageT raw("raw", extents, allocator, MatPadding::kPacked); + MatStorageT compressed("trans", extents, allocator, padding); + const float scale = SfpStream::kMax / extents.Area(); + pool.Run(0, extents.rows, [&](const size_t r, size_t thread) { + float* HWY_RESTRICT row = raw.Row(r); + for (size_t c = 0; c < extents.cols; c++) { + float f = static_cast(c * extents.rows + r) * scale; + if ((r + c) & 1) f = -f; // Also generate some negative values. + row[c] = f; + } + Compress(raw.Row(r), raw.Cols(), ws.tls[thread], + MakeSpan(compressed.Row(r), compressed.Cols()), + /*packed_ofs=*/0); + }); + + // Arbitrary value, different from 1, must match `GenerateMat`. 
+  compressed.SetScale(0.6f);
+  return compressed;
 }
 
 // NOLINTNEXTLINE(google-readability-namespace-comments)
diff --git a/compression/shared.h b/compression/types.h
similarity index 76%
rename from compression/shared.h
rename to compression/types.h
index a5c87ae..dc10676 100644
--- a/compression/shared.h
+++ b/compression/types.h
@@ -13,18 +13,14 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-// Definitions shared between the public compress-inl.h interface and the
-// sfp-inl.h and nuq-inl.h implementation details.
+// Types shared between tensor definitions and `compress-inl.h`.
 
-#ifndef THIRD_PARTY_GEMMA_CPP_COMPRESSION_SHARED_H_
-#define THIRD_PARTY_GEMMA_CPP_COMPRESSION_SHARED_H_
+#ifndef THIRD_PARTY_GEMMA_CPP_COMPRESSION_TYPES_H_
+#define THIRD_PARTY_GEMMA_CPP_COMPRESSION_TYPES_H_
 
 #include <stddef.h>
 #include <stdint.h>
 
-#include <complex>
-#include <string>
-
 // IWYU pragma: begin_exports
 #include "util/basics.h"  // BF16
 #include "hwy/aligned_allocator.h"
@@ -33,6 +29,35 @@
 
 namespace gcpp {
 
+// EMU128 must not be disabled because we disable SCALAR.
+#define HWY_BROKEN_EMU128 0
+
+// Allow user override of disabled targets.
+#ifndef GEMMA_DISABLED_TARGETS
+
+// All platforms: exclude SCALAR because we use ReorderWidenMulAccumulate.
+
+#if HWY_ARCH_ARM_V7
+// No NEON because we require double-precision support.
+#define GEMMA_DISABLED_TARGETS (HWY_SCALAR | HWY_ALL_NEON)
+#elif HWY_ARCH_ARM_A64
+// We do not yet use AES (e.g. for random generation), hence NEON is the same
+// as NEON_WITHOUT_AES. Also skip SVE because SVE2_128 and SVE_256 cover most.
+#define GEMMA_DISABLED_TARGETS (HWY_SCALAR | HWY_NEON | HWY_SVE)
+#elif HWY_ARCH_X86
+// Skip anything older than Haswell (2013); also use Zen4 for recent CPUs,
+// because we do not use anything added by SPR (e.g. FP16) nor AVX 10.2.
+#define GEMMA_DISABLED_TARGETS \
+  (HWY_SCALAR | HWY_SSE2 | HWY_SSSE3 | HWY_SSE4 | HWY_AVX3_SPR | HWY_AVX10_2)
+#endif  // HWY_ARCH_*
+
+#endif  // GEMMA_DISABLED_TARGETS
+
+// Only used in experiments, hence disable in default builds.
+#ifndef GEMMA_ENABLE_NUQ
+#define GEMMA_ENABLE_NUQ 0
+#endif
+
 // Switching Floating Point: a hybrid 8-bit float representation of bf16/f32
 // inputs that combines the advantages of e4m3 and e5m2 into a single format.
 // It supports seeking at a granularity of 1 and decoding to bf16/f32.
@@ -63,30 +88,6 @@ struct SfpStream {
 };
 #pragma pack(pop)
 
-// Returns 1.0f if all magnitudes are <= SfpStream::kMax, otherwise scales them
-// such that the largest magnitude is SfpStream::kMax, and returns the
-// multiplier with which to restore the original values. This is only necessary
-// before compressing to SfpStream.
-// TODO: vectorize
-static inline float ScaleWeights(float* HWY_RESTRICT raw, size_t num) {
-  float maxabs = 0.0;
-  for (size_t i = 0; i < num; ++i) {
-    maxabs = HWY_MAX(maxabs, hwy::ScalarAbs(raw[i]));
-  }
-  if (maxabs <= SfpStream::kMax) {
-    return 1.0f;
-  }
-  const float scale = maxabs / SfpStream::kMax;
-  const float inv_scale = static_cast<float>(1.0 / static_cast<double>(scale));
-  for (size_t i = 0; i < num; ++i) {
-    // Clamp because kMax may still be exceeded.
-    const float magn =
-        HWY_MIN(SfpStream::kMax, hwy::ScalarAbs(raw[i] * inv_scale));
-    raw[i] = hwy::ScalarCopySign(magn, raw[i]);
-  }
-  return scale;
-}
-
 // Non-uniform quantization: a compressed representation of f32 inputs that
 // supports seeking at a granularity of 1 (for `DecompressAndZeroPad`) or
 // two vectors (for `Decompress2`), and decoding to bf16/f32.
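To put the remaining Type choices in perspective: per-value storage cost follows the `kTypeBits` table in the next hunk (NUQ is listed there as 4 bits, with a comment that it is closer to 4.5 once its tables are counted). A back-of-the-envelope sketch, borrowing the 2b embedder shape (256000 x 2048) from the converter's validations earlier in this diff:

    # Bits per value, from the kTypeBits table below (NUQ approximated as 4).
    BITS = {"f32": 32, "bf16": 16, "sfp": 8, "nuq": 4}

    def tensor_gib(rows: int, cols: int, kind: str) -> float:
        return rows * cols * BITS[kind] / 8 / 2**30

    for kind in BITS:
        print(f"{kind:>4}: {tensor_gib(256000, 2048, kind):.2f} GiB")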
@@ -185,31 +186,25 @@ constexpr bool IsNuqStream() {
   return hwy::IsSame<hwy::RemoveCvRef<Packed>, NuqStream>();
 }
 
-// Instruction-tuned models require extra 'turn structure' tokens in prompts.
-enum class PromptWrapping {
-  GEMMA_IT,
-  GEMMA_PT,
-  GEMMA_VLM,
-  PALIGEMMA,
-  kSentinel  // must be last
+// Tensor types for loading weights.
+enum class Type { kUnknown, kF32, kBF16, kSFP, kNUQ, kF64 };
+// These are used in `ModelConfig.Specifier`, hence the strings will not
+// change, though new ones may be added.
+static constexpr const char* kTypeStrings[] = {"unknown", "f32", "bf16",
+                                               "sfp", "nuq", "f64"};
+static constexpr size_t kNumTypes =
+    sizeof(kTypeStrings) / sizeof(kTypeStrings[0]);
+static constexpr size_t kTypeBits[] = {
+    0,
+    8 * sizeof(float),
+    8 * sizeof(BF16),
+    8 * sizeof(SfpStream),
+    4 /* NuqStream, actually 4.5 */,
+    8 * sizeof(double),
 };
-
-inline bool EnumValid(PromptWrapping type) {
-  return static_cast<int>(type) >= 0 &&
-         static_cast<int>(type) < static_cast<int>(PromptWrapping::kSentinel);
-}
-
-// Tensor types for loading weights. Note that not all types are supported as
-// weights for a model, but can be used for other purposes, such as types for
-// ModelWeightsPtrs. When adding a new type that is supported, also
-// update gemma.cc, weights.*, and add instantiations/new_one.cc.
-enum class Type { kUnknown, kF32, kBF16, kSFP, kNUQ, kF64, kC64, kU128 };
-constexpr const char* kTypeStrings[] = {"unknown", "f32", "bf16", "sfp",
-                                        "nuq", "f64", "c64", "u128"};
-
-inline bool EnumValid(Type type) {
-  return static_cast<int>(type) >= 0 &&
-         static_cast<int>(type) <= static_cast<int>(Type::kU128);
+static inline bool EnumValid(Type type) {
+  return static_cast<size_t>(type) < kNumTypes;
 }
 
 // Returns a Type enum for the type of the template parameter.
@@ -226,20 +221,22 @@ Type TypeEnum() {
     return Type::kNUQ;
   } else if constexpr (hwy::IsSame<Packed, double>()) {
     return Type::kF64;
-  } else if constexpr (hwy::IsSame<Packed, std::complex<double>>()) {
-    return Type::kC64;
-  } else if constexpr (hwy::IsSame<Packed, hwy::uint128_t>()) {
-    return Type::kU128;
   } else {
     HWY_DASSERT(false);
     return Type::kUnknown;
   }
 }
 
-// Returns a string name for the type of the template parameter.
+static inline size_t TypeBits(Type type) {
+  return kTypeBits[static_cast<size_t>(type)];
+}
+
+static inline const char* TypeName(Type type) {
+  return kTypeStrings[static_cast<size_t>(type)];
+}
 template <typename Packed>
 const char* TypeName() {
-  return kTypeStrings[static_cast<size_t>(TypeEnum<Packed>())];
+  return TypeName(TypeEnum<Packed>());
 }
 
 template <typename Packed>
@@ -248,7 +245,9 @@ constexpr bool IsCompressed() {
 }
 
 // Returns the number of `MatT` elements required to store `capacity` values,
-// which must not be zero.
+// which must not be zero. This is only intended to support the extra tables
+// required for NUQ. `capacity` includes any padding and is `rows * stride`.
+// Deprecated, replaced by fixup within `MatPtr`. Only used by tests.
template constexpr size_t CompressedArrayElements(size_t capacity) { if constexpr (hwy::IsSame, NuqStream>()) { @@ -304,4 +303,4 @@ HWY_INLINE PackedSpan MakeConst(PackedSpan packed) { } } // namespace gcpp -#endif // THIRD_PARTY_GEMMA_CPP_COMPRESSION_SHARED_H_ +#endif // THIRD_PARTY_GEMMA_CPP_COMPRESSION_TYPES_H_ diff --git a/evals/benchmark.cc b/evals/benchmark.cc index 8682189..4dec9ee 100644 --- a/evals/benchmark.cc +++ b/evals/benchmark.cc @@ -6,14 +6,12 @@ #include #include #include -#include // std::pair #include -#include "compression/io.h" // Path #include "evals/benchmark_helper.h" #include "evals/cross_entropy.h" -#include "gemma/common.h" #include "gemma/gemma.h" +#include "io/io.h" // Path #include "util/args.h" #include "hwy/base.h" #include "hwy/timer.h" @@ -27,7 +25,6 @@ class BenchmarkArgs : public ArgsBase { public: BenchmarkArgs(int argc, char* argv[]) { InitAndParse(argc, argv); } - Path goldens; Path summarize_text; Path cross_entropy; Path trivia_qa; @@ -36,8 +33,6 @@ class BenchmarkArgs : public ArgsBase { template void ForEach(const Visitor& visitor) { - visitor(goldens.path, "goldens_dir", std::string(""), - "Directory containing golden files", 2); visitor(summarize_text.path, "summarize_text", std::string(""), "Path to text file to summarize", 2); visitor(cross_entropy.path, "cross_entropy", std::string(""), @@ -53,56 +48,6 @@ class BenchmarkArgs : public ArgsBase { } }; -std::vector> load_goldens( - const std::string& path) { - std::ifstream goldens_file(path); - if (!goldens_file) { - std::cout << "Could not load goldens file: " << path << "\n" << std::flush; - return {}; - } - std::vector> res; - std::string query_separator; - std::string query; - std::string answer_separator; - std::string answer; - while (std::getline(goldens_file, query_separator) && - std::getline(goldens_file, query) && - std::getline(goldens_file, answer_separator) && - std::getline(goldens_file, answer)) { - res.push_back({query, answer}); - } - return res; -} - -int BenchmarkGoldens(GemmaEnv& env, const std::string& golden_path) { - std::vector> queries_answers = - load_goldens(golden_path); - size_t correct_answers = 0; - size_t total_tokens = 0; - const double time_start = hwy::platform::Now(); - for (auto& [question, expected_answer] : queries_answers) { - QueryResult result = env.QueryModel(question); - total_tokens += result.tokens_generated; - if (result.response.find(expected_answer) != std::string::npos) { - correct_answers++; - } else { - std::cout << "Wrong!\n"; - std::cout << "Input: " << question << "\n"; - std::cout << "Expected: " << expected_answer << "\n"; - std::cout << "Output: " << result.response << "\n\n" << std::flush; - } - } - LogSpeedStats(time_start, total_tokens); - - std::cout << "Correct: " << correct_answers << " out of " - << queries_answers.size() << "\n" - << std::flush; - if (correct_answers != queries_answers.size()) { - return EXIT_FAILURE; - } - return EXIT_SUCCESS; -} - int BenchmarkSummary(GemmaEnv& env, const Path& text) { std::string prompt("Here is some text to summarize:\n"); prompt.append(ReadFileToString(text)); @@ -117,6 +62,7 @@ int BenchmarkSummary(GemmaEnv& env, const Path& text) { int BenchmarkCrossEntropy(GemmaEnv& env, const Path& text, size_t batch_tokens) { + const Gemma& gemma = *env.GetGemma(); std::string input = ReadFileToString(text); std::vector prompt = env.Tokenize(input); std::cout << "Number of input tokens: " << prompt.size() << "\n"; @@ -128,10 +74,11 @@ int BenchmarkCrossEntropy(GemmaEnv& env, const Path& text, size_t 
num_tokens = std::min(prompt.size() - pos, batch_tokens); std::vector prompt_slice(prompt.begin() + pos, prompt.begin() + pos + num_tokens); - KVCache kv_cache = KVCache::Create(env.GetModel()->GetModelConfig(), - env.MutableConfig().prefill_tbatch_size); - float entropy = ComputeCrossEntropy( - *env.GetModel(), num_tokens, prompt_slice, kv_cache, env.Verbosity()); + KVCache kv_cache(gemma.Config(), gemma.Inference(), + env.MutableEnv().ctx.allocator); + float entropy = + ComputeCrossEntropy(*env.GetGemma(), num_tokens, prompt_slice, kv_cache, + env.MutableEnv(), env.Verbosity()); total_entropy += entropy; LogSpeedStats(time_start, pos + num_tokens); std::string text_slice = env.StringFromTokens(prompt_slice); @@ -183,14 +130,7 @@ int main(int argc, char** argv) { gcpp::GemmaEnv env(argc, argv); gcpp::BenchmarkArgs benchmark_args(argc, argv); - if (!benchmark_args.goldens.Empty()) { - const std::string golden_path = - benchmark_args.goldens.path + "/" + - gcpp::ModelString(env.GetModel()->Info().model, - env.GetModel()->Info().wrapping) + - ".txt"; - return BenchmarkGoldens(env, golden_path); - } else if (!benchmark_args.summarize_text.Empty()) { + if (!benchmark_args.summarize_text.Empty()) { return BenchmarkSummary(env, benchmark_args.summarize_text); } else if (!benchmark_args.cross_entropy.Empty()) { return BenchmarkCrossEntropy(env, benchmark_args.cross_entropy, diff --git a/evals/benchmark_helper.cc b/evals/benchmark_helper.cc index 2daebdf..3b999b4 100644 --- a/evals/benchmark_helper.cc +++ b/evals/benchmark_helper.cc @@ -18,27 +18,21 @@ #include #include -#include #include -#include #include #include #include #include -// Placeholder for internal header, do not modify. -#include "compression/compress.h" // TypeName +#include "compression/types.h" // TypeName #include "evals/cross_entropy.h" -#include "gemma/common.h" // StringFromType #include "gemma/gemma.h" -#include "gemma/kv_cache.h" -#include "util/app.h" -#include "util/args.h" -#include "util/threading.h" -#include "hwy/base.h" -#include "hwy/contrib/thread_pool/topology.h" +#include "gemma/gemma_args.h" +#include "ops/matmul.h" // MatMulEnv +#include "util/threading_context.h" #include "hwy/highway.h" -#include "hwy/per_target.h" // VectorBytes +#include "hwy/per_target.h" // DispatchedTarget +#include "hwy/profiler.h" // PROFILER_ENABLED #include "hwy/timer.h" namespace gcpp { @@ -49,53 +43,37 @@ void InitGenerator(const InferenceArgs& inference, std::mt19937& gen) { gen.seed(0x12345678); } else { // Depending on the library implementation, this may still be deterministic. - std::random_device rd; + std::random_device rd; // NOLINT gen.seed(rd()); } } -GemmaEnv::GemmaEnv(const LoaderArgs& loader, const InferenceArgs& inference, - const AppArgs& app) - : topology_(CreateTopology(app)), - pools_(CreatePools(topology_, app)), - env_(topology_, pools_) { - InferenceArgs mutable_inference = inference; - AbortIfInvalidArgs(mutable_inference); - LoaderArgs mutable_loader = loader; - if (const char* err = mutable_loader.Validate()) { - mutable_loader.Help(); - fprintf(stderr, "Skipping model load because: %s\n", err); - } else { - fprintf(stderr, "Loading model...\n"); - model_ = AllocateGemma(mutable_loader, env_); - // Only allocate one for starters because GenerateBatch might not be called. 
- kv_caches_.resize(1); - kv_caches_[0] = KVCache::Create(model_->GetModelConfig(), - inference.prefill_tbatch_size); +GemmaEnv::GemmaEnv(const LoaderArgs& loader, const ThreadingArgs& threading, + const InferenceArgs& inference) + : ctx_(threading), env_(ctx_), gemma_(loader, inference, ctx_) { + const ModelConfig& config = gemma_.Config(); + // Only allocate one for starters because GenerateBatch might not be called. + kv_caches_.push_back(KVCache(config, inference, ctx_.allocator)); + + if (inference.verbosity >= 2) { + ShowConfig(loader, threading, inference, config, gemma_.WeightReadMode(), + ctx_); } + InitGenerator(inference, gen_); + runtime_config_ = { .max_generated_tokens = inference.max_generated_tokens, .temperature = inference.temperature, .gen = &gen_, - .verbosity = app.verbosity, + .verbosity = inference.verbosity, }; -} - -// Internal init must run before the GemmaEnv ctor above, hence it cannot occur -// in the argv ctor below because its body runs *after* the delegating ctor. -// This helper function takes care of the init, and could be applied to any of -// the *Args classes, it does not matter which. -static AppArgs MakeAppArgs(int argc, char** argv) { - { // So that indentation matches expectations. - // Placeholder for internal init, do not modify. - } - return AppArgs(argc, argv); + inference.CopyTo(runtime_config_); } GemmaEnv::GemmaEnv(int argc, char** argv) - : GemmaEnv(LoaderArgs(argc, argv), InferenceArgs(argc, argv), - MakeAppArgs(argc, argv)) {} + : GemmaEnv(LoaderArgs(argc, argv), ThreadingArgs(argc, argv), + InferenceArgs(argc, argv)) {} QueryResult GemmaEnv::QueryModel(const std::vector& tokens) { QueryResult result; @@ -117,8 +95,8 @@ QueryResult GemmaEnv::QueryModel(const std::vector& tokens) { } gcpp::TimingInfo timing_info { .verbosity = runtime_config_.verbosity }; runtime_config_.batch_stream_token = batch_stream_token; - model_->Generate(runtime_config_, tokens, /*start_pos=*/0, kv_caches_[0], - timing_info); + gemma_.Generate(runtime_config_, tokens, /*start_pos=*/0, kv_caches_[0], env_, + timing_info); return result; } @@ -127,23 +105,25 @@ void GemmaEnv::QueryModel( gcpp::TimingInfo timing_info { .verbosity = runtime_config_.verbosity }; const StreamFunc previous_stream_token = runtime_config_.stream_token; runtime_config_.stream_token = stream_token; - model_->Generate(runtime_config_, tokens, /*start_pos=*/0, kv_caches_[0], - timing_info); + gemma_.Generate(runtime_config_, tokens, /*start_pos=*/0, kv_caches_[0], env_, + timing_info); runtime_config_.stream_token = previous_stream_token; } std::vector GemmaEnv::BatchQueryModel( - const QueriesPromptTokens& queries_prompt) { + const QueriesPromptTokens& queries_prompt, + const hwy::Span& prefix_end) { const size_t num_queries = queries_prompt.size(); HWY_ASSERT(num_queries != 0); std::vector res(num_queries); - const BatchStreamFunc batch_stream_token = [&res, &queries_prompt, this]( - size_t query_index, size_t pos, - int token, float) { + const BatchStreamFunc batch_stream_token = [&, this](const size_t query_index, + const size_t pos, + const int token, float) { + HWY_ASSERT(query_index < num_queries); std::string token_text; - HWY_ASSERT( - model_->Tokenizer().Decode(std::vector{token}, &token_text)); + HWY_ASSERT(gemma_.Tokenizer().Decode(std::vector{token}, &token_text)); res[query_index].response.append(token_text); + HWY_ASSERT(pos == res[query_index].tokens_generated); res[query_index].tokens_generated += 1; if (res[query_index].tokens_generated == queries_prompt[query_index].size()) { 
@@ -151,6 +131,7 @@ std::vector GemmaEnv::BatchQueryModel( } return true; }; + runtime_config_.batch_stream_token = batch_stream_token; if (runtime_config_.verbosity >= 2) { fprintf(stderr, "Max gen: %zu temp: %f tbatch: %zu qbatch: %zu\n", runtime_config_.max_generated_tokens, runtime_config_.temperature, @@ -158,23 +139,16 @@ std::vector GemmaEnv::BatchQueryModel( runtime_config_.decode_qbatch_size); } - // Ensure we have one KVCache per query. - if (kv_caches_.size() < num_queries) { - kv_caches_.resize(num_queries); - } - for (size_t i = 1; i < num_queries; ++i) { - if (kv_caches_[i].seq_len == 0) { - kv_caches_[i] = KVCache::Create(model_->GetModelConfig(), - runtime_config_.prefill_tbatch_size); - } + // Ensure we have at least one KVCache per query. + while (kv_caches_.size() < num_queries) { + kv_caches_.push_back( + KVCache(gemma_.Config(), gemma_.Inference(), ctx_.allocator)); } + const hwy::Span kv_caches(&kv_caches_[0], num_queries); + gcpp::AllQueries all_queries(queries_prompt, kv_caches, prefix_end); gcpp::TimingInfo timing_info = {.verbosity = runtime_config_.verbosity}; - runtime_config_.batch_stream_token = batch_stream_token; - std::vector queries_pos(num_queries, 0); - model_->GenerateBatch(runtime_config_, queries_prompt, - QueriesPos(queries_pos.data(), num_queries), - KVCaches(&kv_caches_[0], num_queries), timing_info); + gemma_.GenerateBatch(runtime_config_, all_queries, env_, timing_info); return res; } @@ -203,8 +177,8 @@ std::vector GemmaEnv::BatchQueryModel( float GemmaEnv::CrossEntropy(const std::string& input) { std::vector prompt = Tokenize(input); prompt.insert(prompt.begin(), BOS_ID); - return ComputeCrossEntropy(*GetModel(), /*max_generated_tokens=*/3072, prompt, - MutableKVCache(), + return ComputeCrossEntropy(*GetGemma(), /*max_generated_tokens=*/3072, prompt, + MutableKVCache(), env_, /*verbosity=*/0) / static_cast(input.size()); } @@ -236,13 +210,37 @@ std::string CacheString() { return buf; } -void ShowConfig(LoaderArgs& loader, InferenceArgs& inference, AppArgs& app, - const BoundedTopology& topology, NestedPools& pools) { - loader.Print(app.verbosity); - inference.Print(app.verbosity); - app.Print(app.verbosity); +static constexpr const char* CompiledConfig() { + if constexpr (HWY_IS_ASAN) { + return "asan"; + } else if constexpr (HWY_IS_MSAN) { + return "msan"; + } else if constexpr (HWY_IS_TSAN) { + return "tsan"; + } else if constexpr (HWY_IS_HWASAN) { + return "hwasan"; + } else if constexpr (HWY_IS_UBSAN) { + return "ubsan"; + } else if constexpr (HWY_IS_DEBUG_BUILD) { + return "dbg"; + } else { + return "opt"; + } +} - if (app.verbosity >= 2) { +void ShowConfig(const LoaderArgs& loader, const ThreadingArgs& threading, + const InferenceArgs& inference, const ModelConfig& config, + const WeightsPtrs::Mode weight_read_mode, + const ThreadingContext& ctx) { + threading.Print(inference.verbosity); + loader.Print(inference.verbosity); + inference.Print(inference.verbosity); + fprintf( + stderr, "Model : %s, to_bf16 %d, mmap %d => %s\n", + config.Specifier().c_str(), static_cast(loader.to_bf16), + static_cast(loader.map), WeightsPtrs::ToString(weight_read_mode)); + + if (inference.verbosity >= 2) { time_t now = time(nullptr); char* dt = ctime(&now); // NOLINT char cpu100[100] = "unknown"; @@ -250,38 +248,34 @@ void ShowConfig(LoaderArgs& loader, InferenceArgs& inference, AppArgs& app, fprintf(stderr, "Date & Time : %s" // dt includes \n - "CPU : %s\n" + "CPU : %s, bind %d\n" "CPU topology : %s, %s, %s\n" "Instruction set : %s (%zu bits)\n" - 
"Compiled config : %s\n" - "Weight Type : %s\n" - "EmbedderInput Type : %s\n", - dt, cpu100, topology.TopologyString(), pools.PinString(), + "Compiled config : %s, profiler %d\n" + "Memory MiB : %4zu\n", + dt, cpu100, static_cast(threading.bind), + ctx.topology.TopologyString(), ctx.pools.PinString(), CacheString().c_str(), hwy::TargetName(hwy::DispatchedTarget()), - hwy::VectorBytes() * 8, CompiledConfig(), - StringFromType(loader.Info().weight), TypeName()); + ctx.allocator.VectorBytes() * 8, CompiledConfig(), PROFILER_ENABLED, + ctx.allocator.TotalMiB()); } } -void ShowHelp(LoaderArgs& loader, InferenceArgs& inference, AppArgs& app) { +void ShowHelp(const LoaderArgs& loader, const ThreadingArgs& threading, + const InferenceArgs& inference) { std::cerr << "\n\ngemma.cpp : a lightweight, standalone C++ inference engine\n" "==========================================================\n\n" - "To run gemma.cpp, you need to " - "specify 3 required model loading arguments:\n" - " --tokenizer\n" - " --weights\n" - " --model,\n" - " or with the newer weights format, specify just:\n" - " --weights\n"; + "To run with pre-2025 weights, specify --tokenizer and --weights.\n" + "With the single-file weights format, specify just --weights.\n"; std::cerr << "\n*Example Usage*\n\n./gemma --tokenizer tokenizer.spm " - "--weights 2b-it-sfp.sbs --model 2b-it\n"; + "--weights gemma2-2b-it-sfp.sbs\n"; std::cerr << "\n*Model Loading Arguments*\n\n"; loader.Help(); + std::cerr << "\n*Threading Arguments*\n\n"; + threading.Help(); std::cerr << "\n*Inference Arguments*\n\n"; inference.Help(); - std::cerr << "\n*Application Arguments*\n\n"; - app.Help(); std::cerr << "\n"; } diff --git a/evals/benchmark_helper.h b/evals/benchmark_helper.h index f6e32c0..8f4d96f 100644 --- a/evals/benchmark_helper.h +++ b/evals/benchmark_helper.h @@ -18,15 +18,16 @@ #include -#include #include #include #include +#include "gemma/configs.h" #include "gemma/gemma.h" +#include "gemma/gemma_args.h" +#include "gemma/tokenizer.h" // WrapAndTokenize #include "ops/matmul.h" -#include "util/app.h" -#include "util/threading.h" +#include "util/threading_context.h" #include "hwy/base.h" namespace gcpp { @@ -46,19 +47,21 @@ class GemmaEnv { public: // Calls the other constructor with *Args arguments initialized from argv. 
GemmaEnv(int argc, char** argv); - GemmaEnv(const LoaderArgs& loader, const InferenceArgs& inference, - const AppArgs& app); + GemmaEnv(const LoaderArgs& loader, const ThreadingArgs& threading, + const InferenceArgs& inference); + MatMulEnv& Env() { return env_; } size_t MaxGeneratedTokens() const { return runtime_config_.max_generated_tokens; } - void SetMaxGeneratedTokens(size_t max_generated_tokens) { - runtime_config_.max_generated_tokens = max_generated_tokens; + void SetMaxGeneratedTokens(int max_generated_tokens) { + runtime_config_.max_generated_tokens = + static_cast(max_generated_tokens); } std::vector Tokenize(const std::string& input) const { std::vector tokens; - HWY_ASSERT(model_->Tokenizer().Encode(input, &tokens)); + HWY_ASSERT(gemma_.Tokenizer().Encode(input, &tokens)); return tokens; } @@ -69,20 +72,23 @@ class GemmaEnv { } std::vector WrapAndTokenize(std::string& input) const { - return gcpp::WrapAndTokenize(model_->Tokenizer(), model_->Info(), 0, input); + return gcpp::WrapAndTokenize(gemma_.Tokenizer(), gemma_.ChatTemplate(), + gemma_.Config().wrapping, 0, input); } std::string StringFromTokens(const std::vector& tokens) const { std::string string; - HWY_ASSERT(model_->Tokenizer().Decode(tokens, &string)); + HWY_ASSERT(gemma_.Tokenizer().Decode(tokens, &string)); return string; } // Runs inference on the given input and returns the top-1 result string and // the number of tokens that were generated. QueryResult QueryModel(const std::vector& tokens); + // The default prefix_end means "causal attention". std::vector BatchQueryModel( - const QueriesPromptTokens& queries_prompt); + const QueriesPromptTokens& queries_prompt, + const hwy::Span& prefix_end = hwy::Span()); // Adds turn structure to input, tokenizes and calls the above overload. QueryResult QueryModel(std::string& input); std::vector BatchQueryModel( @@ -97,20 +103,19 @@ class GemmaEnv { // number of bits per token. float CrossEntropy(const std::string& input); - // Returns nullptr if the model failed to load. - Gemma* GetModel() const { return model_.get(); } + const Gemma* GetGemma() const { return &gemma_; } int Verbosity() const { return runtime_config_.verbosity; } RuntimeConfig& MutableConfig() { return runtime_config_; } std::mt19937& MutableGen() { return gen_; } KVCache& MutableKVCache() { return kv_caches_[0]; } + MatMulEnv& MutableEnv() { return env_; } private: - BoundedTopology topology_; - NestedPools pools_; // Thread pool. + ThreadingContext ctx_; MatMulEnv env_; - std::mt19937 gen_; // Random number generator. - std::unique_ptr model_; + Gemma gemma_; + std::mt19937 gen_; // Random number generator. std::vector kv_caches_; // Same number as query batch. RuntimeConfig runtime_config_; }; @@ -118,9 +123,12 @@ class GemmaEnv { // Logs the inference speed in tokens/sec. 
@@ -118,9 +123,12 @@
 // Logs the inference speed in tokens/sec.
 void LogSpeedStats(double time_start, size_t total_tokens);

-void ShowConfig(LoaderArgs& loader, InferenceArgs& inference, AppArgs& app,
-                const BoundedTopology& topology, NestedPools& pools);
-void ShowHelp(LoaderArgs& loader, InferenceArgs& inference, AppArgs& app);
+void ShowConfig(const LoaderArgs& loader, const ThreadingArgs& threading,
+                const InferenceArgs& inference, const ModelConfig& config,
+                WeightsPtrs::Mode weight_read_mode,
+                const ThreadingContext& ctx);
+void ShowHelp(const LoaderArgs& loader, const ThreadingArgs& threading,
+              const InferenceArgs& inference);

 }  // namespace gcpp
diff --git a/evals/cross_entropy.cc b/evals/cross_entropy.cc
index e4bf1b1..09c3a42 100644
--- a/evals/cross_entropy.cc
+++ b/evals/cross_entropy.cc
@@ -13,6 +13,11 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

+#include "compression/types.h"  // GEMMA_DISABLED_TARGETS
+#ifndef HWY_DISABLED_TARGETS
+#define HWY_DISABLED_TARGETS GEMMA_DISABLED_TARGETS
+#endif  // HWY_DISABLED_TARGETS
+
 // Compiles this file for multiple architectures via "foreach_target.h", to
 // which we pass the filename via macro 'argument'.
 // clang-format off
@@ -38,17 +43,12 @@
 #include

 #include "evals/cross_entropy.h"
-#include "gemma/common.h"
 #include "gemma/gemma.h"
 #include "hwy/base.h"

 namespace gcpp {
 namespace {

-template <typename TConfig>
-struct GetVocabSize {
-  int operator()() const { return TConfig::kVocabSize; }
-};

 static std::string TokenString(const GemmaTokenizer& tokenizer, int token) {
   std::string token_str;
@@ -85,7 +85,7 @@ namespace gcpp {
 namespace HWY_NAMESPACE {

 void CallSoftmax(float* HWY_RESTRICT logits, size_t vocab_size) {
-  Softmax(logits, vocab_size);
+  Softmax(logits, vocab_size, /*worker=*/0);
 }

 }  // namespace HWY_NAMESPACE
@@ -97,12 +97,12 @@ namespace gcpp {

 HWY_EXPORT(CallSoftmax);

-float ComputeCrossEntropy(Gemma& gemma, size_t max_generated_tokens,
+float ComputeCrossEntropy(const Gemma& gemma, size_t max_generated_tokens,
                           const std::vector<int>& prompt, KVCache& kv_cache,
-                          int verbosity) {
+                          MatMulEnv& env, int verbosity) {
   const StreamFunc stream_token = [](int, float) { return true; };

-  const int vocab_size = gemma.GetModelConfig().vocab_size;
+  const int vocab_size = gemma.Config().vocab_size;
   float cross_entropy = std::log(vocab_size);  // first token; == -log(1/v_s)
   size_t pos = 1;
@@ -145,7 +145,7 @@ float ComputeCrossEntropy(Gemma& gemma, size_t max_generated_tokens,
   };
   TimingInfo timing_info;
-  gemma.Generate(runtime, prompt0, 0, kv_cache, timing_info);
+  gemma.Generate(runtime, prompt0, 0, kv_cache, env, timing_info);

   const float scale = 1.0f / std::log(2.0f);
   return cross_entropy * scale;
diff --git a/evals/cross_entropy.h b/evals/cross_entropy.h
index fed224c..0a143cc 100644
--- a/evals/cross_entropy.h
+++ b/evals/cross_entropy.h
@@ -24,9 +24,9 @@

 namespace gcpp {

-float ComputeCrossEntropy(Gemma& gemma, size_t max_generated_tokens,
+float ComputeCrossEntropy(const Gemma& gemma, size_t max_generated_tokens,
                           const std::vector<int>& prompt, KVCache& kv_cache,
-                          int verbosity);
+                          MatMulEnv& env, int verbosity);

 }  // namespace gcpp
diff --git a/evals/debug_prompt.cc b/evals/debug_prompt.cc
index 2d02b3a..66fa466 100644
--- a/evals/debug_prompt.cc
+++ b/evals/debug_prompt.cc
@@ -18,9 +18,9 @@
 #include
 #include

-#include "compression/io.h"
 #include "evals/benchmark_helper.h"
 #include "gemma/gemma.h"  // LayersOutputFunc
+#include "io/io.h"
 #include "util/args.h"
 #include "hwy/base.h"
 #include "nlohmann/json.hpp"
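A note on units in `ComputeCrossEntropy`: it accumulates `-log p(token)` in nats, seeds the first token with `log(vocab_size)` (the cost under a uniform prior, since nothing has been predicted yet), and the final `1.0f / std::log(2.0f)` factor converts nats to bits per token. A standalone sketch of just that unit bookkeeping, with an illustrative vocabulary size:

```cpp
// Unit bookkeeping behind ComputeCrossEntropy (values illustrative only).
#include <cmath>
#include <cstdio>

int main() {
  const int vocab_size = 256000;       // example, Gemma-sized vocabulary
  double nats = std::log(vocab_size);  // first token: -log(1/vocab_size)
  // ...ComputeCrossEntropy then adds -log p(token) for each later token...
  const double bits = nats * (1.0 / std::log(2.0));  // same scale as the code
  std::printf("first token costs %.2f nats = %.2f bits\n", nats, bits);
  return 0;
}
```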
diff --git a/evals/gemma_batch_bench.cc b/evals/gemma_batch_bench.cc
index 44b803f..6d97c61 100644
--- a/evals/gemma_batch_bench.cc
+++ b/evals/gemma_batch_bench.cc
@@ -13,25 +13,18 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

-#include "gemma/gemma.h"
-
 #include
 #include
 #include

 #include "evals/benchmark_helper.h"
-#include "gemma/common.h"
+#include "gemma/gemma.h"
 #include "hwy/base.h"
+#include "hwy/nanobenchmark.h"
+#include "hwy/profiler.h"
 #include "hwy/tests/hwy_gtest.h"

-// This test can be run manually with the downloaded gemma weights.
-// To run the test, pass the following flags:
-// --model --tokenizer --weights
-// It should pass for the following models:
-// Gemma1: 2b-it (v1 and v1.1), 7b-it (v1 and v1.1), gr2b-it,
-// Gemma2: gemma2-2b-it, 9b-it, 27b-it,
-
 namespace gcpp {
 namespace {
@@ -40,61 +33,23 @@ namespace {
 // non-local static variables with dtors.
 GemmaEnv* s_env = nullptr;

-class GemmaTest : public ::testing::Test {
+class GemmaBatchBench : public ::testing::Test {
  protected:
   std::vector<std::string> BatchGemmaReply(
       const std::vector<std::string>& inputs) {
-    s_env->SetMaxGeneratedTokens(64);
+    s_env->SetMaxGeneratedTokens(24);
     s_env->MutableConfig().temperature = 0.0f;  // deterministic
-    s_env->MutableConfig().verbosity = 5;
+    s_env->MutableConfig().verbosity = 2;
     std::vector<std::string> replies;
-    // Using the turn structure worsens results sometimes.
-    // However, some models need the turn structure to work.
-    // It would be good to make these tests more consistent.
-    if (s_env->GetModel()->Info().model == Model::GEMMA2_27B ||
-        s_env->GetModel()->Info().model == Model::GRIFFIN_2B) {
-      for (QueryResult result : s_env->BatchQueryModel(inputs)) {
-        replies.push_back(result.response);
-      }
-      return replies;
-    }
-    // Otherwise, do not use turn structure.
-    std::vector<std::vector<int>> prompts_vector;
-    prompts_vector.reserve(inputs.size());
-    for (const auto& input_string : inputs) {
-      prompts_vector.push_back(s_env->TokenizeAndPrependBOS(input_string));
-    }
-    std::vector<PromptTokens> prompt_spans;
-    for (const auto& prompt : prompts_vector) {
-      prompt_spans.push_back(PromptTokens(prompt.data(), prompt.size()));
-    }
-    QueriesPromptTokens prompts(prompt_spans.data(), prompt_spans.size());
-    for (const QueryResult& result : s_env->BatchQueryModel(prompts)) {
+    for (const QueryResult& result : s_env->BatchQueryModel(inputs)) {
       replies.push_back(result.response);
     }
     return replies;
   }
-
-  void GenerateTokens(std::vector<std::string>& kQA, size_t num_questions) {
-    ASSERT_NE(s_env->GetModel(), nullptr);
-
-    std::vector<std::string> inputs;
-    for (size_t i = 0; i < num_questions; ++i) {
-      inputs.push_back(kQA[i]);
-    }
-    std::vector<std::string> responses = BatchGemmaReply(inputs);
-    for (size_t i = 0; i < num_questions; ++i) {
-      std::string response = responses.at(i);
-      fprintf(stderr, "Batch answer %zu '%s'\n\n", i + 1, response.c_str());
-    }
-  }
 };

-TEST_F(GemmaTest, RandomQuestionsBatched) {
-  s_env->MutableConfig().decode_qbatch_size = 3;
-  s_env->MutableConfig().verbosity = 5;
-
-  static std::vector<std::string> kQA = {
+TEST_F(GemmaBatchBench, RandomQuestionsBatched) {
+  const std::vector<std::string> questions = {
       {"Write me a poem about Australia?"},
       {"What's the history of Denmark?"},
       {"Write me a comedy story about the USA."},
@@ -128,13 +83,27 @@ TEST_F(GemmaTest, RandomQuestionsBatched) {
       {"Tell me about space travel."},
       {"Explain to me how electric cars work."},
   };
-  static const size_t kNum = kQA.size();
-  GenerateTokens(kQA, kNum);
+
+  // Fills prompts round robin from `questions` until the desired batch size.
+  std::vector<std::string> inputs;
+  inputs.reserve(s_env->MutableConfig().decode_qbatch_size);
+  size_t qpos = 0;
+  for (size_t i = 0; i < inputs.capacity(); ++i) {
+    inputs.push_back(questions[qpos++]);
+    if (qpos == questions.size()) qpos = 0;
+  }
+  std::vector<std::string> responses = BatchGemmaReply(inputs);
+  for (size_t i = 0; i < hwy::Unpredictable1() * 3; ++i) {
+    fprintf(stderr, "Batch answer %zu '%s'\n\n", i, responses[i].c_str());
+  }
+
+  PROFILER_PRINT_RESULTS();
 }

 }  // namespace
 }  // namespace gcpp

 int main(int argc, char** argv) {
+  fprintf(stderr, "GemmaEnv setup..\n");
   gcpp::GemmaEnv env(argc, argv);
   gcpp::s_env = &env;
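One subtlety in the round-robin fill above: it iterates to `inputs.capacity()`, which the C++ standard only guarantees to be at least the value passed to `reserve()`, so the batch could in principle exceed `decode_qbatch_size`. A sketch (hypothetical helper name, not part of this change) that pins the count explicitly:

```cpp
// Round-robin batch fill with an explicit count (sketch; avoids relying on
// capacity() being exactly the reserved size, which is not guaranteed).
#include <string>
#include <vector>

std::vector<std::string> FillBatch(const std::vector<std::string>& questions,
                                   const size_t batch_size) {
  std::vector<std::string> inputs;
  inputs.reserve(batch_size);
  for (size_t i = 0, qpos = 0; i < batch_size; ++i) {
    inputs.push_back(questions[qpos]);
    if (++qpos == questions.size()) qpos = 0;  // wrap around
  }
  return inputs;
}
```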
diff --git a/evals/gemma_test.cc b/evals/gemma_test.cc
index 7674c5e..12080f9 100644
--- a/evals/gemma_test.cc
+++ b/evals/gemma_test.cc
@@ -21,7 +21,8 @@
 #include

 #include "evals/benchmark_helper.h"
-#include "gemma/common.h"
+#include "gemma/configs.h"
+#include "io/io.h"
 #include "hwy/base.h"
 #include "hwy/tests/hwy_gtest.h"
@@ -36,132 +37,75 @@ namespace gcpp {
 namespace {

-// Shared state. Requires argc/argv, so construct in main and use the same raw
-// pointer approach as in benchmarks.cc. Note that the style guide forbids
-// non-local static variables with dtors.
-GemmaEnv* s_env = nullptr;
-
 class GemmaTest : public ::testing::Test {
- protected:
-  std::string GemmaReply(const std::string& prompt) {
-    s_env->SetMaxGeneratedTokens(2048);
-    s_env->MutableConfig().temperature = 0.0f;  // deterministic
-    s_env->MutableConfig().verbosity = 0;
-    // Using the turn structure worsens results sometimes.
-    // However, some models need the turn structure to work.
-    // It would be good to make these tests more consistent.
-    if (s_env->GetModel()->Info().model == Model::GEMMA2_27B ||
-        s_env->GetModel()->Info().model == Model::GRIFFIN_2B) {
-      std::string mutable_prompt = prompt;
-      QueryResult result = s_env->QueryModel(mutable_prompt);  // Uses turns.
-      return result.response;
-    }
-    // Otherwise, do not use turn structure.
-    const std::vector<int> tokens = s_env->TokenizeAndPrependBOS(prompt);
-    QueryResult result = s_env->QueryModel(tokens);
-    return result.response;
+ public:
+  // Requires argc/argv, hence do not use `SetUpTestSuite`.
+  static void InitEnv(int argc, char** argv) {
+    HWY_ASSERT(s_env == nullptr);  // Should only be called once.
+    s_env = new GemmaEnv(argc, argv);
+    const gcpp::ModelConfig& config = s_env->GetGemma()->Config();
+    fprintf(stderr, "Using %s\n", config.Specifier().c_str());
   }

+  static void DeleteEnv() { delete s_env; }
+
+ protected:
   std::vector<std::string> BatchGemmaReply(
       const std::vector<std::string>& inputs) {
+    HWY_ASSERT(s_env);  // must have called InitEnv()
     s_env->SetMaxGeneratedTokens(64);
     s_env->MutableConfig().temperature = 0.0f;  // deterministic
     s_env->MutableConfig().verbosity = 0;
+    // Always use turn structure (WrapAndTokenize).
     std::vector<std::string> replies;
-    // Using the turn structure worsens results sometimes.
-    // However, some models need the turn structure to work.
-    // It would be good to make these tests more consistent.
-    if (s_env->GetModel()->Info().model == Model::GEMMA2_27B ||
-        s_env->GetModel()->Info().model == Model::GRIFFIN_2B) {
-      for (QueryResult result : s_env->BatchQueryModel(inputs)) {
-        replies.push_back(result.response);
-      }
-      return replies;
-    }
-    // Otherwise, do not use turn structure.
-    std::vector<std::vector<int>> prompts_vector;
-    prompts_vector.reserve(inputs.size());
-    for (const auto& input_string : inputs) {
-      prompts_vector.push_back(s_env->TokenizeAndPrependBOS(input_string));
-    }
-    std::vector<PromptTokens> prompt_spans;
-    for (const auto& prompt : prompts_vector) {
-      prompt_spans.push_back(PromptTokens(prompt.data(), prompt.size()));
-    }
-    QueriesPromptTokens prompts(prompt_spans.data(), prompt_spans.size());
-    for (const QueryResult& result : s_env->BatchQueryModel(prompts)) {
+    for (QueryResult result : s_env->BatchQueryModel(inputs)) {
       replies.push_back(result.response);
     }
     return replies;
   }

-  void TestQuestions(const char* kQA[][2], size_t num_questions, bool batch) {
-    ASSERT_NE(s_env->GetModel(), nullptr);
-    if (batch) {
-      std::vector<std::string> inputs;
-      for (size_t i = 0; i < num_questions; ++i) {
-        fprintf(stderr, "Batch Question %zu\n\n", i + 1);
-        inputs.push_back(kQA[i][0]);
-      }
-      std::vector<std::string> responses = BatchGemmaReply(inputs);
-      for (size_t i = 0; i < num_questions; ++i) {
-        std::string response = responses.at(i);
-        fprintf(stderr, "Batch answer %zu '%s'\n\n", i + 1, response.c_str());
-        EXPECT_TRUE(response.find(kQA[i][1]) != std::string::npos);  // NOLINT
-      }
-    } else {
-      for (size_t i = 0; i < num_questions; ++i) {
-        fprintf(stderr, "Question %zu\n\n", i + 1);
-        std::string response = GemmaReply(kQA[i][0]);
-        fprintf(stderr, "'%s'\n\n", response.c_str());
-        EXPECT_TRUE(response.find(kQA[i][1]) != std::string::npos);  // NOLINT
-      }
-    }
-  }
+  // Shared state. Requires argc/argv, so construct in main via InitEnv.
+  // Note that the style guide forbids non-local static variables with dtors.
+  static GemmaEnv* s_env;
 };

-TEST_F(GemmaTest, GeographyBatched) {
-  s_env->MutableConfig().decode_qbatch_size = 3;
-  // 6 are enough to test batching and the loop.
+GemmaEnv* GemmaTest::s_env = nullptr;
+
+TEST_F(GemmaTest, Batched) {
+  // Test remainder handling in MatMul (four rows per tile), but avoid a
+  // second batch in debug builds to speed up the test.
+  s_env->MutableConfig().decode_qbatch_size = HWY_IS_DEBUG_BUILD ? 6 : 3;
  static const char* kQA[][2] = {
      {"What is the capital of Australia?", "Canberra"},
-      {"What is the capital of Denmark?", "Copenhagen"},
-      {"Ljubljana is the capital of which country?", "Slovenia"},
-      {"Is Chicago a country?", "city"},
      {"How many states does the US have?", "50"},
      {"What is the Pacific?", "ocean"},
-  };
-  static const size_t kNum = sizeof(kQA) / sizeof(kQA[0]);
-  TestQuestions(kQA, HWY_MIN(kNum, 3), /*batch=*/false);
-  TestQuestions(kQA, 1, /*batch=*/true);
-  TestQuestions(kQA, kNum, /*batch=*/true);
-}
-
-TEST_F(GemmaTest, History) {
-  static const char* kQA[][2] = {
      {"When was the battle of Hastings?", "1066"},
-  };
-  static const size_t kNum = sizeof(kQA) / sizeof(kQA[0]);
-  TestQuestions(kQA, kNum, /*batch=*/false);
-}
-
-TEST_F(GemmaTest, Arithmetic) {
-  static const char* kQA[][2] = {
      {"what is 13 + 14?", "27"},
      {"what is 7 * 8?", "56"},
  };
-  static const size_t kNum = sizeof(kQA) / sizeof(kQA[0]);
-  TestQuestions(kQA, kNum, /*batch=*/false);
+  const size_t kNum = sizeof(kQA) / sizeof(kQA[0]);
+  std::vector<std::string> inputs;
+  for (size_t i = 0; i < kNum; ++i) {
+    inputs.push_back(kQA[i][0]);
+  }
+  std::vector<std::string> responses = BatchGemmaReply(inputs);
+  HWY_ASSERT(responses.size() == kNum);
+  for (size_t i = 0; i < kNum; ++i) {
+    fprintf(stderr, "#%zu: '%s'\n\n", i, responses[i].c_str());
+    EXPECT_TRUE(responses[i].find(kQA[i][1]) != std::string::npos);  // NOLINT
+  }
 }

 TEST_F(GemmaTest, Multiturn) {
-  Gemma* model = s_env->GetModel();
-  ASSERT_NE(model, nullptr);
+  const Gemma* model = s_env->GetGemma();
+  const ModelConfig& config = model->Config();
   size_t abs_pos = 0;
   std::string response;
-  auto stream_token = [&](int token, float) {
-    if (token == EOS_ID) return true;
+  auto stream_token = [&](size_t query_idx, size_t pos, int token, float) {
+    HWY_ASSERT(query_idx == 0);
+    HWY_ASSERT(pos == abs_pos);
     ++abs_pos;
+    if (config.IsEOS(token)) return true;
     std::string token_text;
     EXPECT_TRUE(
         model->Tokenizer().Decode(std::vector<int>{token}, &token_text));
@@ -173,83 +117,44 @@ TEST_F(GemmaTest, Multiturn) {
       .temperature = 0.0f,
       .gen = &s_env->MutableGen(),
       .verbosity = 2,
-      .stream_token = stream_token,
+      .batch_stream_token = stream_token,
   };
   TimingInfo timing_info{.verbosity = 0};
   // First "say" something slightly unusual.
   std::string mutable_prompt = "I have a car and its color is turquoise.";
-  std::vector<int> tokens = WrapAndTokenize(model->Tokenizer(), model->Info(),
-                                            abs_pos, mutable_prompt);
+  std::vector<int> tokens =
+      WrapAndTokenize(model->Tokenizer(), model->ChatTemplate(),
+                      config.wrapping, abs_pos, mutable_prompt);
+
   model->Generate(runtime_config, tokens, abs_pos, s_env->MutableKVCache(),
-                  timing_info);
+                  s_env->MutableEnv(), timing_info);

   // Note: we do not rewind any tokens here. If the model
   // produced one and WrapAndTokenize() inserts another one, it will just be
   // duplicated.
   mutable_prompt = "Please repeat all prior statements.";
-  tokens = WrapAndTokenize(model->Tokenizer(), model->Info(), abs_pos,
-                           mutable_prompt);
+  tokens = WrapAndTokenize(model->Tokenizer(), model->ChatTemplate(),
+                           config.wrapping, abs_pos, mutable_prompt);
+
  // Reset the `response` string here, then check that the model actually has
  // access to the previous turn by asking to reproduce.
response.clear(); model->Generate(runtime_config, tokens, abs_pos, s_env->MutableKVCache(), - timing_info); - fprintf(stderr, "decoded: %s\n", response.c_str()); + s_env->MutableEnv(), timing_info); + fprintf(stderr, "decoded: '%s'\n", response.c_str()); bool remembered_turquoise = response.find("turquoise") != std::string::npos; // NOLINT bool remembered_car = response.find("car") != std::string::npos; // NOLINT EXPECT_TRUE(remembered_turquoise || remembered_car); } -static const char kJingleBells[] = R"( -Dashing through the snow -In a one-horse open sleigh -O'er the fields we go -Laughing all the way -Bells on bobtails ring -Making spirits bright -What fun it is to ride and sing -A sleighing song tonight -)"; - -// The "Hay Draft" of the Gettysburg Address. -static const char kGettysburg[] = { - "Four score and seven years ago our fathers brought forth, upon this " - "continent, a new nation, conceived in Liberty, and dedicated to the " - "proposition that all men are created equal.\n\nNow we are engaged in a " - "great civil war, testing whether that nation, or any nation, so " - "conceived, and so dedicated, can long endure. We are met here on a great " - "battlefield of that war. We have come to dedicate a portion of it as a " - "final resting place for those who here gave their lives that that nation " - "might live. It is altogether fitting and proper that we should do " - "this.\n\nBut in a larger sense we can not dedicate -- we can not " - "consecrate -- we can not hallow this ground. The brave men, living and " - "dead, who struggled, here, have consecrated it far above our poor power " - "to add or detract. The world will little note, nor long remember, what we " - "say here, but can never forget what they did here. It is for us, the " - "living, rather to be dedicated here to the unfinished work which they " - "have, thus far, so nobly carried on. It is rather for us to be here " - "dedicated to the great task remaining before us -- that from these " - "honored dead we take increased devotion to that cause for which they here " - "gave the last full measure of devotion -- that we here highly resolve " - "that these dead shall not have died in vain; that this nation shall have " - "a new birth of freedom; and that this government of the people, by the " - "people, for the people, shall not perish from the earth.\n"}; - TEST_F(GemmaTest, CrossEntropySmall) { - ASSERT_NE(s_env->GetModel(), nullptr); + HWY_ASSERT(s_env->GetGemma() != nullptr); + const ModelConfig& config = s_env->GetGemma()->Config(); static const char kSmall[] = "The capital of Hungary is Budapest which is located in Europe."; float entropy = s_env->CrossEntropy(kSmall); fprintf(stderr, "per-token entropy: %f\n", entropy); - switch (s_env->GetModel()->Info().model) { - case gcpp::Model::GEMMA_2B: - // 2B v.1 and v.1.1 produce slightly different results. - EXPECT_NEAR(entropy, 2.6f, 0.2f); - break; - case gcpp::Model::GEMMA_7B: - // 7B v.1 and v.1.1 produce slightly different results. - EXPECT_NEAR(entropy, 2.8f, 0.2f); - break; + switch (config.model) { case gcpp::Model::GRIFFIN_2B: EXPECT_NEAR(entropy, 2.61f, 0.02f); break; @@ -268,76 +173,14 @@ TEST_F(GemmaTest, CrossEntropySmall) { } } -TEST_F(GemmaTest, CrossEntropyJingleBells) { - ASSERT_NE(s_env->GetModel(), nullptr); - float entropy = s_env->CrossEntropy(kJingleBells); - fprintf(stderr, "per-token entropy: %f\n", entropy); - switch (s_env->GetModel()->Info().model) { - case gcpp::Model::GEMMA_2B: - // 2B v.1 and v.1.1 produce slightly different results. 
-      EXPECT_NEAR(entropy, 1.9f, 0.2f);
-      break;
-    case gcpp::Model::GEMMA_7B:
-      // 7B v.1 and v.1.1 produce slightly different results.
-      EXPECT_NEAR(entropy, 1.07f, 0.05f);
-      break;
-    case gcpp::Model::GRIFFIN_2B:
-      EXPECT_NEAR(entropy, 1.62f, 0.02f);
-      break;
-    case gcpp::Model::GEMMA2_2B:
-      EXPECT_NEAR(entropy, 0.49f, 0.02f);
-      break;
-    case gcpp::Model::GEMMA2_9B:
-      EXPECT_NEAR(entropy, 0.37f, 0.02f);
-      break;
-    case gcpp::Model::GEMMA2_27B:
-      EXPECT_NEAR(entropy, 0.33f, 0.02f);
-      break;
-    default:
-      FAIL() << "no entropy expectation for this model";
-      break;
-  }
-}
-
-TEST_F(GemmaTest, CrossEntropyGettysburg) {
-  ASSERT_NE(s_env->GetModel(), nullptr);
-  float entropy = s_env->CrossEntropy(kGettysburg);
-  fprintf(stderr, "per-token entropy: %f\n", entropy);
-  switch (s_env->GetModel()->Info().model) {
-    case gcpp::Model::GEMMA_2B:
-      // 2B v.1 and v.1.1 produce slightly different results.
-      EXPECT_NEAR(entropy, 1.1f, 0.1f);
-      break;
-    case gcpp::Model::GEMMA_7B:
-      // 7B v.1 and v.1.1 produce slightly different results.
-      EXPECT_NEAR(entropy, 0.75f, 0.1f);
-      break;
-    case gcpp::Model::GRIFFIN_2B:
-      EXPECT_NEAR(entropy, 0.71f, 0.02f);
-      break;
-    case gcpp::Model::GEMMA2_2B:
-      EXPECT_NEAR(entropy, 0.20f, 0.02f);
-      break;
-    case gcpp::Model::GEMMA2_9B:
-      EXPECT_NEAR(entropy, 0.15f, 0.02f);
-      break;
-    case gcpp::Model::GEMMA2_27B:
-      EXPECT_NEAR(entropy, 0.14f, 0.02f);
-      break;
-    default:
-      FAIL() << "no entropy expectation for this model";
-      break;
-  }
-}
-
 }  // namespace
 }  // namespace gcpp

 int main(int argc, char** argv) {
-  gcpp::GemmaEnv env(argc, argv);
-  gcpp::s_env = &env;
-  testing::InitGoogleTest(&argc, argv);
-
-  return RUN_ALL_TESTS();
+  gcpp::InternalInit();
+  gcpp::GemmaTest::InitEnv(argc, argv);
+  int ret = RUN_ALL_TESTS();
+  gcpp::GemmaTest::DeleteEnv();
+  return ret;
 }
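The `Multiturn` test above hinges on `abs_pos` threading through both turns: the stream callback advances it for every token, so the second prompt is tokenized and generated at positions after the first turn's KV-cache entries. A condensed sketch of that pattern, assuming the post-change `Gemma` API shown in the test and a default-constructible `RuntimeConfig`; `TwoTurns` is a hypothetical helper name:

```cpp
// Two-turn conversation sketch (assumptions noted above; not from the diff).
#include <random>
#include <string>
#include <vector>

#include "gemma/gemma.h"

void TwoTurns(const gcpp::Gemma& model, gcpp::KVCache& kv_cache,
              gcpp::MatMulEnv& env, std::mt19937& gen) {
  size_t abs_pos = 0;  // running KV-cache position across turns
  gcpp::RuntimeConfig runtime_config;
  runtime_config.max_generated_tokens = 64;
  runtime_config.temperature = 0.0f;  // deterministic
  runtime_config.gen = &gen;
  runtime_config.verbosity = 0;
  // Advance abs_pos for every streamed token, as the test's lambda does.
  runtime_config.batch_stream_token = [&](size_t, size_t, int, float) {
    ++abs_pos;
    return true;
  };
  gcpp::TimingInfo timing_info{.verbosity = 0};

  std::vector<std::string> turns = {"I have a car and its color is turquoise.",
                                    "Please repeat all prior statements."};
  for (std::string& turn : turns) {
    // Positions the wrapped prompt after everything already in the KV cache.
    std::vector<int> tokens = gcpp::WrapAndTokenize(
        model.Tokenizer(), model.ChatTemplate(), model.Config().wrapping,
        abs_pos, turn);
    model.Generate(runtime_config, tokens, abs_pos, kv_cache, env,
                   timing_info);
  }
}
```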
"//:threading_context", "//:tokenizer", "@highway//:hwy", - "@highway//:thread_pool", ], ) diff --git a/examples/hello_world/run.cc b/examples/hello_world/run.cc index 4eb8647..96c56bf 100644 --- a/examples/hello_world/run.cc +++ b/examples/hello_world/run.cc @@ -23,19 +23,15 @@ #include #include -// Placeholder for internal header, do not modify. #include "gemma/gemma.h" +#include "gemma/gemma_args.h" // LoaderArgs #include "gemma/tokenizer.h" -#include "util/app.h" // LoaderArgs #include "util/args.h" -#include "util/threading.h" +#include "util/threading_context.h" #include "hwy/base.h" -#include "hwy/contrib/thread_pool/thread_pool.h" -int main(int argc, char **argv) { { - // Placeholder for internal init, do not modify. - } +int main(int argc, char **argv) { gcpp::LoaderArgs loader(argc, argv); gcpp::InferenceArgs inference(argc, argv); gcpp::AppArgs app(argc, argv); diff --git a/examples/simplified_gemma/BUILD.bazel b/examples/simplified_gemma/BUILD.bazel index bedb322..98c0f5e 100644 --- a/examples/simplified_gemma/BUILD.bazel +++ b/examples/simplified_gemma/BUILD.bazel @@ -10,10 +10,10 @@ cc_library( name = "gemma", hdrs = ["gemma.hpp"], deps = [ - "//:app", + "//:gemma_args", "//:gemma_lib", - "//:ops", - "//:threading", + "//:matmul", + "//:threading_context", "//:tokenizer", "@highway//:hwy", ], @@ -24,15 +24,6 @@ cc_binary( srcs = ["run.cc"], deps = [ ":gemma", - # Placeholder for internal dep, do not remove., - "//:app", - "//:args", - "//:common", - "//:gemma_lib", - "//:ops", - "//:threading", - "//:tokenizer", - "@highway//:hwy", - "@highway//:thread_pool", + "//:gemma_args", ], ) diff --git a/examples/simplified_gemma/CMakeLists.txt b/examples/simplified_gemma/CMakeLists.txt index e7e6653..5595164 100644 --- a/examples/simplified_gemma/CMakeLists.txt +++ b/examples/simplified_gemma/CMakeLists.txt @@ -14,10 +14,11 @@ cmake_minimum_required(VERSION 3.11) project(simplified_gemma) +set(CMAKE_CXX_STANDARD 17) set(CMAKE_CXX_STANDARD_REQUIRED ON) include(FetchContent) -FetchContent_Declare(highway GIT_REPOSITORY https://github.com/google/highway.git GIT_TAG c5bebf84ad01edec97e336f5c97ca4e0df6b4d06) +FetchContent_Declare(highway GIT_REPOSITORY https://github.com/google/highway.git GIT_TAG 9414b48aeec251b69e6cadbfa42bebb5ddae1c34) FetchContent_MakeAvailable(highway) FetchContent_Declare(sentencepiece GIT_REPOSITORY https://github.com/google/sentencepiece GIT_TAG 53de76561cfc149d3c01037f0595669ad32a5e7c) FetchContent_MakeAvailable(sentencepiece) @@ -31,7 +32,7 @@ if (NOT BUILD_MODE) endif() if (BUILD_MODE STREQUAL "local") # Relative path to gemma.cpp from examples/simplified_gemma/build/ - FetchContent_Declare(gemma SOURCE_DIR ../../..) + FetchContent_Declare(gemma SOURCE_DIR ../../..) 
 else()
   FetchContent_Declare(gemma GIT_REPOSITORY https://github.com/google/gemma.cpp.git GIT_TAG a9aa63fd2ea6b786ed0706d619588bfe2d43370e)
 endif()
diff --git a/examples/simplified_gemma/README.md b/examples/simplified_gemma/README.md
index d8f9394..37b4f71 100644
--- a/examples/simplified_gemma/README.md
+++ b/examples/simplified_gemma/README.md
@@ -41,7 +41,7 @@ gemma.cpp specifying the tokenizer, compressed weights file, and model type,
 for example:

 ```sh
-./simplified_gemma --tokenizer tokenizer.spm --compressed_weights 2b-it-sfp.sbs --model 2b-it
+./simplified_gemma --tokenizer tokenizer.spm --weights 2b-it-sfp.sbs --model 2b-it
 ```

 Should print a greeting to the terminal:
diff --git a/examples/simplified_gemma/gemma.hpp b/examples/simplified_gemma/gemma.hpp
index a2a7760..7f6e4c2 100644
--- a/examples/simplified_gemma/gemma.hpp
+++ b/examples/simplified_gemma/gemma.hpp
@@ -24,55 +24,39 @@
 #include

 #include "third_party/gemma_cpp/gemma/gemma.h"
+#include "third_party/gemma_cpp/gemma/gemma_args.h"  // LoaderArgs
 #include "third_party/gemma_cpp/gemma/tokenizer.h"
 #include "third_party/gemma_cpp/ops/matmul.h"
-#include "third_party/gemma_cpp/util/app.h"  // LoaderArgs
-#include "third_party/gemma_cpp/util/threading.h"
+#include "third_party/gemma_cpp/util/threading_context.h"
 #include "third_party/highway/hwy/base.h"

 class SimplifiedGemma {
  public:
   SimplifiedGemma(const gcpp::LoaderArgs& loader,
-                  const gcpp::InferenceArgs& inference = gcpp::InferenceArgs(),
-                  const gcpp::AppArgs& app = gcpp::AppArgs())
-      : loader_(loader),
-        inference_(inference),
-        app_(app),
-        topology_(gcpp::CreateTopology(app_)),
-        pools_(gcpp::CreatePools(topology_, app_)),
-        env_(topology_, pools_),
-        model_(gcpp::CreateGemma(loader_, env_)) {
-    Init();
-  }
-
-  SimplifiedGemma(int argc, char** argv)
-      : loader_(argc, argv, /*validate=*/true),
-        inference_(argc, argv),
-        app_(argc, argv),
-        topology_(gcpp::CreateTopology(app_)),
-        pools_(gcpp::CreatePools(topology_, app_)),
-        env_(topology_, pools_),
-        model_(gcpp::CreateGemma(loader_, env_)) {
-    Init();
-  }
-
-  void Init() {
-    // Instantiate model and KV Cache
-    kv_cache_ = gcpp::KVCache::Create(model_.GetModelConfig(),
-                                      inference_.prefill_tbatch_size);
-
+                  const gcpp::ThreadingArgs& threading = gcpp::ThreadingArgs(),
+                  const gcpp::InferenceArgs& inference = gcpp::InferenceArgs())
+      : ctx_(UpdateArgs(threading, inference)),
+        env_(ctx_),
+        gemma_(loader, inference, ctx_),
+        kv_cache_(gemma_.Config(), inference, ctx_.allocator) {
     // Initialize random number generator
     std::random_device rd;
     gen_.seed(rd());
   }

+  SimplifiedGemma(int argc, char** argv)
+      : SimplifiedGemma(gcpp::LoaderArgs(argc, argv),
+                        gcpp::ThreadingArgs(argc, argv),
+                        gcpp::InferenceArgs(argc, argv)) {}
+
   void Generate(std::string& prompt, size_t max_generated_tokens = 1024,
                 float temperature = 0.7,
                 const std::set<int>& reject_tokens = {}) {
     size_t generated = 0;
     const std::vector<int> tokens = gcpp::WrapAndTokenize(
-        model_.Tokenizer(), loader_.Info(), generated, prompt);
+        gemma_.Tokenizer(), gemma_.ChatTemplate(),
+        gemma_.Config().wrapping, generated, prompt);
     const size_t prompt_size = tokens.size();

     // This callback function gets invoked every time a token is generated
@@ -80,9 +64,9 @@ class SimplifiedGemma {
       ++generated;
       if (generated < prompt_size) {
         // print feedback
-      } else if (!this->model_.GetModelConfig().IsEOS(token)) {
+      } else if (!gemma_.Config().IsEOS(token)) {
         std::string token_text;
-        HWY_ASSERT(this->model_.Tokenizer().Decode({token}, &token_text));
+
HWY_ASSERT(gemma_.Tokenizer().Decode({token}, &token_text)); std::cout << token_text << std::flush; } return true; @@ -100,19 +84,15 @@ class SimplifiedGemma { return !reject_tokens.contains(token); }, }; - model_.Generate(runtime_config, tokens, 0, kv_cache_, timing_info); + gemma_.Generate(runtime_config, tokens, 0, kv_cache_, env_, timing_info); } ~SimplifiedGemma() = default; private: - gcpp::LoaderArgs loader_; - gcpp::InferenceArgs inference_; - gcpp::AppArgs app_; - gcpp::BoundedTopology topology_; - gcpp::NestedPools pools_; + gcpp::ThreadingContext ctx_; gcpp::MatMulEnv env_; - gcpp::Gemma model_; + gcpp::Gemma gemma_; gcpp::KVCache kv_cache_; std::mt19937 gen_; std::string validation_error_; -}; \ No newline at end of file +}; diff --git a/examples/simplified_gemma/run.cc b/examples/simplified_gemma/run.cc index f73ddb5..b7af134 100644 --- a/examples/simplified_gemma/run.cc +++ b/examples/simplified_gemma/run.cc @@ -17,30 +17,25 @@ #include -// Placeholder for internal header, do not modify. #include "third_party/gemma_cpp/examples/simplified_gemma/gemma.hpp" -#include "util/app.h" // LoaderArgs +#include "gemma/gemma_args.h" // LoaderArgs int main(int argc, char** argv) { - { - // Placeholder for internal init, do not modify. - } - // Standard usage: LoaderArgs takes argc and argv as input, then parses // necessary flags. - gcpp::LoaderArgs loader(argc, argv, /*validate=*/true); + gcpp::LoaderArgs loader(argc, argv); // Optional: LoaderArgs can also take tokenizer and weights paths directly. // // gcpp::LoaderArgs loader("/path/to/tokenizer", "/path/to/weights", // "model_identifier"); - // Optional: InferenceArgs and AppArgs can be passed in as well. If not + // Optional: ThreadingArgs and InferenceArgs can be passed in as well. If not // specified, default values will be used. 
// // gcpp::InferenceArgs inference(argc, argv); - // gcpp::AppArgs app(argc, argv); - // SimplifiedGemma gemma(loader, inference, app); + // gcpp::ThreadingArgs threading(argc, argv); + // SimplifiedGemma gemma(loader, threading, inference); SimplifiedGemma gemma(loader); std::string prompt = "Write a greeting to the world."; diff --git a/gemma/activations.h b/gemma/activations.h index 86345e2..b222bd9 100644 --- a/gemma/activations.h +++ b/gemma/activations.h @@ -16,104 +16,199 @@ #ifndef THIRD_PARTY_GEMMA_CPP_GEMMA_ACTIVATIONS_H_ #define THIRD_PARTY_GEMMA_CPP_GEMMA_ACTIVATIONS_H_ +#include // sqrtf #include +#include -#include "compression/shared.h" // BF16 -#include "gemma/configs.h" -#include "ops/matmul.h" // MatMulEnv -#include "ops/ops.h" // CreateInvTimescale -#include "util/allocator.h" // RowVectorBatch -#include "util/threading.h" -#include "hwy/base.h" // HWY_DASSERT -#include "hwy/contrib/thread_pool/thread_pool.h" +#include +#include + +#include "gemma/configs.h" // ModelConfig +#include "ops/matmul.h" // MatMulEnv +#include "ops/ops.h" // CreateInvTimescale +#include "util/allocator.h" // Allocator +#include "util/basics.h" // BF16 +#include "util/mat.h" // MatStorageT namespace gcpp { -struct Activations { - explicit Activations(const ModelConfig& config) - : weights_config(config), - layer_config(config.layer_configs[0]), - seq_len(config.seq_len), - cache_pos_size(config.CachePosSize()) {} +struct GriffinActivations { + GriffinActivations(const ModelConfig& config, size_t batch_size, + const Allocator& allocator) + : griffin_x( + MatFactory("griffin_x", batch_size, config.model_dim, allocator)), + griffin_y( + MatFactory("griffin_y", batch_size, config.model_dim, allocator)), + griffin_gate_x(MatFactory("griffin_gate_x", batch_size, + config.model_dim, allocator)), + griffin_multiplier(MatFactory("griffin_mul", batch_size, + config.model_dim, allocator)) {} - RowVectorBatch x; // input - RowVectorBatch q; // query, also KV if MHA. - RowVectorBatch logits; + void SetBatchSize(size_t batch_size) { + if (griffin_x.Rows() == 0) return; + griffin_x.OverrideRows(batch_size); + griffin_y.OverrideRows(batch_size); + griffin_gate_x.OverrideRows(batch_size); + griffin_multiplier.OverrideRows(batch_size); + } - // Attention - RowVectorBatch pre_att_rms_out; - RowVectorBatch att; // attention vector - RowVectorBatch att_out; // attention output + MatStorageT griffin_x; + MatStorageT griffin_y; + MatStorageT griffin_gate_x; + MatStorageT griffin_multiplier; +}; + +struct AttentionActivations { + // Returns the scale value to use for the query in the attention computation. + // Also called by ops_test. + static inline float ChooseQueryScale(const ModelConfig& config) { + if (config.query_scale == QueryScaleType::SqrtModelDimDivNumHeads) + return 1.0f / sqrtf(static_cast(config.model_dim / + config.layer_configs[0].heads)); + // QueryScaleType::SqrtKeySize + return 1.0f / sqrtf(static_cast(config.layer_configs[0].qkv_dim)); + } + + AttentionActivations( + const ModelConfig& config, const LayerConfig& layer_config, + size_t batch_size, size_t seq_len, const Allocator& allocator, + std::vector>& row_ptrs) + : config(config), + + // `vocab_size == 0` means it is for Vit part, VitAttention is still MHA + // and does not use an external KV cache. + q(MatFactory("q", batch_size, + config.vocab_size == 0 + ? 
diff --git a/gemma/activations.h b/gemma/activations.h
index 86345e2..b222bd9 100644
--- a/gemma/activations.h
+++ b/gemma/activations.h
@@ -16,104 +16,199 @@
 #ifndef THIRD_PARTY_GEMMA_CPP_GEMMA_ACTIVATIONS_H_
 #define THIRD_PARTY_GEMMA_CPP_GEMMA_ACTIVATIONS_H_

+#include <math.h>  // sqrtf
 #include
+#include

-#include "compression/shared.h"  // BF16
-#include "gemma/configs.h"
-#include "ops/matmul.h"  // MatMulEnv
-#include "ops/ops.h"  // CreateInvTimescale
-#include "util/allocator.h"  // RowVectorBatch
-#include "util/threading.h"
-#include "hwy/base.h"  // HWY_DASSERT
-#include "hwy/contrib/thread_pool/thread_pool.h"
+#include <atomic>
+#include <vector>
+
+#include "gemma/configs.h"   // ModelConfig
+#include "ops/matmul.h"      // MatMulEnv
+#include "ops/ops.h"         // CreateInvTimescale
+#include "util/allocator.h"  // Allocator
+#include "util/basics.h"     // BF16
+#include "util/mat.h"        // MatStorageT

 namespace gcpp {

-struct Activations {
-  explicit Activations(const ModelConfig& config)
-      : weights_config(config),
-        layer_config(config.layer_configs[0]),
-        seq_len(config.seq_len),
-        cache_pos_size(config.CachePosSize()) {}
+struct GriffinActivations {
+  GriffinActivations(const ModelConfig& config, size_t batch_size,
+                     const Allocator& allocator)
+      : griffin_x(
+            MatFactory("griffin_x", batch_size, config.model_dim, allocator)),
+        griffin_y(
+            MatFactory("griffin_y", batch_size, config.model_dim, allocator)),
+        griffin_gate_x(MatFactory("griffin_gate_x", batch_size,
+                                  config.model_dim, allocator)),
+        griffin_multiplier(MatFactory("griffin_mul", batch_size,
+                                      config.model_dim, allocator)) {}

-  RowVectorBatch<float> x;  // input
-  RowVectorBatch<float> q;  // query, also KV if MHA.
-  RowVectorBatch<float> logits;
+  void SetBatchSize(size_t batch_size) {
+    if (griffin_x.Rows() == 0) return;
+    griffin_x.OverrideRows(batch_size);
+    griffin_y.OverrideRows(batch_size);
+    griffin_gate_x.OverrideRows(batch_size);
+    griffin_multiplier.OverrideRows(batch_size);
+  }

-  // Attention
-  RowVectorBatch<float> pre_att_rms_out;
-  RowVectorBatch<float> att;      // attention vector
-  RowVectorBatch<float> att_out;  // attention output
+  MatStorageT<float> griffin_x;
+  MatStorageT<float> griffin_y;
+  MatStorageT<float> griffin_gate_x;
+  MatStorageT<float> griffin_multiplier;
+};
+
+struct AttentionActivations {
+  // Returns the scale value to use for the query in the attention computation.
+  // Also called by ops_test.
+  static inline float ChooseQueryScale(const ModelConfig& config) {
+    if (config.query_scale == QueryScaleType::SqrtModelDimDivNumHeads)
+      return 1.0f / sqrtf(static_cast<float>(config.model_dim /
+                                             config.layer_configs[0].heads));
+    // QueryScaleType::SqrtKeySize
+    return 1.0f / sqrtf(static_cast<float>(config.layer_configs[0].qkv_dim));
+  }
+
+  AttentionActivations(
+      const ModelConfig& config, const LayerConfig& layer_config,
+      size_t batch_size, size_t seq_len, const Allocator& allocator,
+      std::vector<hwy::AlignedFreeUniquePtr<uint8_t*[]>>& row_ptrs)
+      : config(config),
+
+        // `vocab_size == 0` means it is for Vit part, VitAttention is still
+        // MHA and does not use an external KV cache.
+        q(MatFactory("q", batch_size,
+                     config.vocab_size == 0
+                         ? layer_config.heads * 3 * layer_config.qkv_dim
+                         : layer_config.heads * layer_config.qkv_dim,
+                     allocator)),
+
+        pre_att_rms_out(MatFactory("pre_att_rms_out", batch_size,
+                                   config.model_dim, allocator)),
+        att(MatFactory("att", batch_size, layer_config.heads * seq_len,
+                       allocator)),
+        att_out(MatFactory("att_out", batch_size,
+                           layer_config.heads * layer_config.qkv_dim,
+                           allocator)),
+        att_sums(
+            MatFactory("att_sums", batch_size, config.model_dim, allocator)),
+
+        inv_timescale(
+            CreateInvTimescale(allocator, layer_config.qkv_dim,
+                               layer_config.post_qk == PostQKType::HalfRope)),
+        inv_timescale_global(CreateInvTimescale(
+            allocator, layer_config.qkv_dim,
+            layer_config.post_qk == PostQKType::HalfRope, 1000000.0)),
+
+        div_seq_len(static_cast<uint32_t>(seq_len)),
+        div_heads(static_cast<uint32_t>(layer_config.heads)),
+        query_scale(ChooseQueryScale(config)) {
+    // Batch size can be 0 in experimental code so do not assert.
+    if (batch_size == 0) {
+      static std::atomic_flag warned = ATOMIC_FLAG_INIT;
+      if (!warned.test_and_set()) {
+        HWY_WARN("Creating mostly empty activations with a batch_size of 0.");
+      }
+      return;
+    }
+
+    // For MatMul outputs, precompute their row pointers.
+    // If we forget any MatMul outputs here, debug builds print a warning but
+    // fill them in each MatMul call.
+    q.AllocateAndAttachRowPtrs(row_ptrs);
+    att_sums.AllocateAndAttachRowPtrs(row_ptrs);
+  }
+
+  void SetBatchSize(size_t batch_size) {
+    q.OverrideRows(batch_size);
+
+    pre_att_rms_out.OverrideRows(batch_size);
+    att.OverrideRows(batch_size);
+    att_out.OverrideRows(batch_size);
+    att_sums.OverrideRows(batch_size);
+  }
+
+  const ModelConfig& config;
+
+  MatStorageT<float> q;  // query
+
+  MatStorageT<float> pre_att_rms_out;
+  MatStorageT<float> att;      // attention vector
+  MatStorageT<float> att_out;  // attention output
   // Accumulation of attention outputs over heads
-  RowVectorBatch<float> att_sums;
-
-  // Gated FFW
-  RowVectorBatch<BF16> bf_pre_ffw_rms_out;
-  RowVectorBatch<float> C1;
-  RowVectorBatch<float> C2;
-  RowVectorBatch<float> ffw_out;
-
-  // Griffin
-  RowVectorBatch<float> griffin_x;
-  RowVectorBatch<float> griffin_y;
-  RowVectorBatch<float> griffin_gate_x;
-  RowVectorBatch<float> griffin_multiplier;
+  MatStorageT<float> att_sums;

   // Rope
-  RowVectorBatch<float> inv_timescale;
-  RowVectorBatch<float> inv_timescale_global;
+  MatStorageT<float> inv_timescale;
+  MatStorageT<float> inv_timescale_global;

-  // Dynamic because no default ctor and only initialized in `Allocate`.
-  MatMulEnv* env;
+  hwy::Divisor div_seq_len;
+  // Unfortunately, some models (Griffin) have non-power-of-two heads.
+  hwy::Divisor div_heads;
+  float query_scale;
+};

-  PostQKType post_qk = PostQKType::Rope;
-  // And the config.
-  const ModelConfig& weights_config;
-  const LayerConfig& layer_config;
-  size_t seq_len;
-  size_t cache_pos_size = 0;
+struct Activations {
+  Activations(const ModelConfig& config, size_t batch_size, size_t seq_len,
+              const Allocator& allocator,
+              std::vector<hwy::AlignedFreeUniquePtr<uint8_t*[]>>& row_ptrs)
+      : layer_config(config.layer_configs[0]),

-  void Allocate(size_t batch_size, MatMulEnv* env) {
-    post_qk = layer_config.post_qk;
-    const size_t model_dim = weights_config.model_dim;
-    const size_t ff_hidden_dim = layer_config.ff_hidden_dim;
-    const size_t vocab_size = weights_config.vocab_size;
-    const size_t qkv_dim = layer_config.qkv_dim;
-    const size_t heads = layer_config.heads;
+        x(MatFactory("x", batch_size, config.model_dim, allocator)),
+        logits(MatFactory("logits", batch_size, config.vocab_size, allocator)),

-    x = RowVectorBatch<float>(Extents2D(batch_size, model_dim));
-    q = RowVectorBatch<float>(
-        Extents2D(batch_size, heads * layer_config.QStride()));
-    if (vocab_size > 0) {
-      logits = RowVectorBatch<float>(Extents2D(batch_size, vocab_size));
-    }
+        pre_ffw_rms_out(MatFactory("pre_ffw_rms_out", batch_size,
+                                   config.model_dim, allocator)),
+        C1(MatFactory("C1", batch_size, layer_config.ff_hidden_dim, allocator)),
+        C2(MatFactory("C2", batch_size, layer_config.ff_hidden_dim, allocator)),
+        ffw_out(MatFactory("ffw_out", batch_size, config.model_dim, allocator)),

-    pre_att_rms_out = RowVectorBatch<float>(Extents2D(batch_size, model_dim));
-    att = RowVectorBatch<float>(
-        Extents2D(batch_size, heads * weights_config.seq_len));
-    att_out = RowVectorBatch<float>(Extents2D(batch_size, heads * qkv_dim));
-    att_sums = RowVectorBatch<float>(Extents2D(batch_size, model_dim));
+        attention(config, layer_config, batch_size, seq_len, allocator,
+                  row_ptrs),
+        griffin(config, config.model == Model::GRIFFIN_2B ? batch_size : 0,
+                allocator) {
+    HWY_ASSERT(batch_size != 0);

-    bf_pre_ffw_rms_out = RowVectorBatch<BF16>(Extents2D(batch_size, model_dim));
-    C1 = RowVectorBatch<float>(Extents2D(batch_size, ff_hidden_dim));
-    C2 = RowVectorBatch<float>(Extents2D(batch_size, ff_hidden_dim));
-    ffw_out = RowVectorBatch<float>(Extents2D(batch_size, model_dim));
+    // For MatMul outputs, precompute their row pointers.
+    // If we forget any MatMul outputs here, debug builds print a warning but
+    // fill them in each MatMul call.
+    x.AllocateAndAttachRowPtrs(row_ptrs);
+    logits.AllocateAndAttachRowPtrs(row_ptrs);
+    C1.AllocateAndAttachRowPtrs(row_ptrs);
+    C2.AllocateAndAttachRowPtrs(row_ptrs);
+    ffw_out.AllocateAndAttachRowPtrs(row_ptrs);

-    if (layer_config.type == LayerAttentionType::kGriffinRecurrentBlock) {
-      griffin_x = RowVectorBatch<float>(Extents2D(batch_size, model_dim));
-      griffin_y = RowVectorBatch<float>(Extents2D(batch_size, model_dim));
-      griffin_gate_x = RowVectorBatch<float>(Extents2D(batch_size, model_dim));
-      griffin_multiplier =
-          RowVectorBatch<float>(Extents2D(batch_size, model_dim));
-    }
-
-    inv_timescale = CreateInvTimescale(layer_config.qkv_dim,
-                                       post_qk == PostQKType::HalfRope);
-    inv_timescale_global =
-        CreateInvTimescale(qkv_dim, post_qk == PostQKType::HalfRope, 1000000.0);
-
-    this->env = env;
+    // Note that BindC on any MatMul output considerably slows down Prefill.
   }
+
+  // Negligible CPU time.
+  void SetBatchSize(size_t batch_size) {
+    x.OverrideRows(batch_size);
+    logits.OverrideRows(batch_size);
+
+    pre_ffw_rms_out.OverrideRows(batch_size);
+    C1.OverrideRows(batch_size);
+    C2.OverrideRows(batch_size);
+    ffw_out.OverrideRows(batch_size);
+
+    attention.SetBatchSize(batch_size);
+    griffin.SetBatchSize(batch_size);
+  }
+
+  const LayerConfig& layer_config;
+
+  MatStorageT<float> x;  // input
+  MatStorageT<float> logits;
+
+  // Gated FFW
+  MatStorageT<BF16> pre_ffw_rms_out;
+  // Norm may be large, so prefer to keep as f32.
+  MatStorageT<float> C1;
+  MatStorageT<float> C2;
+  MatStorageT<float> ffw_out;
+
+  AttentionActivations attention;
+  GriffinActivations griffin;
 };

 }  // namespace gcpp
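The restructured `Activations` allocates all buffers once, and `SetBatchSize` merely overrides the logical row count via `OverrideRows` ("negligible CPU time", per the comment). A sketch of the intended allocate-once pattern, assuming `MatMulEnv` exposes the row-pointer storage that `AllocateAndAttachRowPtrs` expects (a hypothetical `env.row_ptrs` member, consistent with its use in `ComputeQKV` below):

```cpp
// Allocate-once / shrink-logically usage sketch (assumptions noted above).
#include "gemma/activations.h"

void RunDecode(const gcpp::ModelConfig& config, gcpp::MatMulEnv& env,
               const gcpp::Allocator& allocator, size_t max_batch,
               size_t seq_len) {
  // Allocate all activation buffers once, at the maximum batch size.
  gcpp::Activations acts(config, max_batch, seq_len, allocator, env.row_ptrs);
  // Shrink logically per step; OverrideRows does not reallocate.
  acts.SetBatchSize(1);          // decode: one token per query
  // ...run the transformer layers with `acts`...
  acts.SetBatchSize(max_batch);  // back to full-size prefill batches
}
```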
diff --git a/gemma/attention.cc b/gemma/attention.cc
new file mode 100644
index 0000000..74ea77a
--- /dev/null
+++ b/gemma/attention.cc
@@ -0,0 +1,358 @@
+// Copyright 2025 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include
+#include
+
+#include
+
+#include "compression/types.h"  // GEMMA_DISABLED_TARGETS
+#ifndef HWY_DISABLED_TARGETS
+#define HWY_DISABLED_TARGETS GEMMA_DISABLED_TARGETS
+#endif  // HWY_DISABLED_TARGETS
+
+#include "gemma/activations.h"
+#include "gemma/configs.h"  // kMaxQKVDim
+#include "gemma/gemma.h"
+#include "gemma/weights.h"
+#include "util/threading.h"
+#include "hwy/contrib/thread_pool/thread_pool.h"
+#include "hwy/profiler.h"
+
+// Compiles this file for multiple architectures via "foreach_target.h", to
+// which we pass the filename via macro 'argument'.
+// clang-format off
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "gemma/attention.cc"  // NOLINT
+// clang-format on
+#include "hwy/foreach_target.h"  // IWYU pragma: keep
+#include "hwy/highway.h"
+// After highway.h
+#include "compression/compress-inl.h"
+#include "ops/ops-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace gcpp {
+namespace HWY_NAMESPACE {
+
+// Computes Q.K scores, which are "logits" (or scores) stored to att.
+// `k` is a strided view of the kv cache with dimensions [seq_len, qkv_dim].
+static HWY_INLINE void QDotK(const size_t start_pos, const size_t last_pos,
+                             const hwy::Divisor& div_seq_len,
+                             const float* HWY_RESTRICT q,
+                             const MatPtrT<KV_t>& k, float* HWY_RESTRICT att,
+                             const size_t worker) {
+  PROFILER_ZONE2(worker, "Gen.Attention.QDotK");
+  if (HWY_LIKELY(last_pos < static_cast<size_t>(div_seq_len.GetDivisor()))) {
+    // Slightly faster: no wraparound.
+    for (size_t pos = start_pos; pos <= last_pos; ++pos) {
+      const float score = Dot(q, k.Row(pos), k.Cols());
+      att[pos] = score;
+    }
+  } else {
+    for (size_t pos = start_pos; pos <= last_pos; ++pos) {
+      const size_t pos_modulo = div_seq_len.Remainder(pos);
+      const float score = Dot(q, k.Row(pos_modulo), k.Cols());
+      att[pos_modulo] = score;
+    }
+  }
+}
+
+static void PositionalEncodingQK(float* qk, const size_t layer_idx,
+                                 const LayerWeightsPtrs& layer,
+                                 const AttentionActivations& activations,
+                                 const size_t worker, const size_t pos,
+                                 const float mul = 1.0f) {
+  const size_t qkv_dim = layer.layer_config.qkv_dim;
+  const PostQKType& post_qk = layer.layer_config.post_qk;
+  // qk is either q or k, so qkv_dim is the length we operate on.
+  const float* inv_timescale = activations.inv_timescale.PackedScale1();
+  const bool is_global_layer = activations.config.IsGlobalLayer(layer_idx);
+  // TODO: add a config flag instead of hardcoding the model.
+  if (is_global_layer && IsVLM(activations.config.model)) {
+    inv_timescale = activations.inv_timescale_global.PackedScale1();
+  }
+  // PostQKType::Rope
+  if (post_qk == PostQKType::HalfRope) {
+    Rope(qk, qkv_dim / 2, inv_timescale, pos, worker);
+    if (mul != 1.0f) MulByConst(mul, qk, qkv_dim, worker);
+  } else {
+    RopeAndMulBy(mul, qk, qkv_dim, inv_timescale, pos, worker);
+  }
+}
+
+// Accumulates the sum of v (from `kv_cache`) * probability (`att`) into
+// `att_out`. Equivalent in gemma/modules.py:
+// encoded = jnp.einsum('BTNS,BSNH->BTNH', probs, value_proj)
+// `v` is a strided view of the kv cache with dimensions [seq_len, qkv_dim].
+static HWY_INLINE void WeightedSumV(
+    const size_t start_pos, const size_t last_pos,
+    const hwy::Divisor& div_seq_len, const float* HWY_RESTRICT att,
+    const MatPtrT<KV_t>& v, float* HWY_RESTRICT att_out, const size_t worker) {
+  if (HWY_LIKELY(last_pos < static_cast<size_t>(div_seq_len.GetDivisor()))) {
+    // Slightly faster: no wraparound. Could be replaced with MatMul(att, v) if
+    // we supported non-transposed B.
+    // TODO: 2..4x unroll
+    MulByConstTo(att[start_pos], v.Row(start_pos), att_out, v.Cols(), worker);
+    for (size_t pos = start_pos + 1; pos <= last_pos; ++pos) {
+      MulByConstAndAdd(att[pos], v.Row(pos), att_out, v.Cols(), worker);
+    }
+  } else {
+    {
+      const size_t pos_mod = div_seq_len.Remainder(start_pos);
+      MulByConstTo(att[pos_mod], v.Row(pos_mod), att_out, v.Cols(), worker);
+    }
+    for (size_t pos = start_pos + 1; pos <= last_pos; ++pos) {
+      const size_t pos_mod = div_seq_len.Remainder(pos);
+      MulByConstAndAdd(att[pos_mod], v.Row(pos_mod), att_out, v.Cols(), worker);
+    }
+  }
+}
+
+// Calculates the attention outputs for a single q, which may be updated
+// in place for RMSNorm.
+void SingleDotSoftmaxWeightedSum(
+    const size_t pos, const size_t start_pos, const size_t last_pos,
+    float* HWY_RESTRICT q, const MatPtrT<KV_t>& k, const MatPtrT<KV_t>& v,
+    const size_t layer_idx, const LayerWeightsPtrs& layer,
+    const AttentionActivations& activations, float* HWY_RESTRICT att,
+    float* HWY_RESTRICT att_out, const size_t worker) {
+  const float att_cap = activations.config.att_cap;
+  const float query_scale = activations.query_scale;
+  const size_t seq_len =
+      static_cast<size_t>(activations.div_seq_len.GetDivisor());
+
+  // Apply rope and scaling to Q.
+  if (layer.query_norm_scale.HasPtr()) {
+    CallUpcasted(&layer.query_norm_scale, [&](const auto* weights_t) {
+      RMSNormInplace(weights_t->PackedScale1(), 0, q,
+                     layer.layer_config.qkv_dim, worker);
+    });
+  }
+
+  PositionalEncodingQK(q, layer_idx, layer, activations, worker, pos,
+                       query_scale);
+
+  QDotK(start_pos, last_pos, activations.div_seq_len, q, k, att, worker);
+
+  // SoftMax with optional SoftCap yields "probabilities" in att.
+  const size_t att_len = HWY_MIN(last_pos + 1, seq_len);
+  MaybeLogitsSoftCap(att_cap, att, att_len, worker);
+  Softmax(att, att_len, worker, /*temperature=*/1.0f);
+
+  WeightedSumV(start_pos, last_pos, activations.div_seq_len, att, v, att_out,
+               worker);
+}
+
+// The attention window usually starts at 0 unless `pos` is larger than
+// the attention window size, then it is `pos` - window_size + 1.
+static HWY_INLINE size_t StartPos(size_t pos, const ModelConfig& config,
+                                  size_t layer_idx) {
+  const size_t att_window_size = config.attention_window_sizes[layer_idx];
+  return pos - HWY_MIN(att_window_size - 1, pos);
+}
+
+void DotSoftmaxWeightedSum(const size_t num_tokens, const size_t layer_idx,
+                           const LayerWeightsPtrs& layer,
+                           AttentionActivations& activations, QBatch& qbatch,
+                           NestedPools& pools) {
+  static const uint32_t HWY_MAYBE_UNUSED zone_id_par =
+      PROFILER_ADD_ZONE("Gen.Attention.DotSoftmax.par");
+
+  const hwy::Divisor div_qbatch(qbatch.Size());
+  const LayerConfig& layer_config = layer.layer_config;
+  const size_t qkv_dim = layer_config.qkv_dim;
+
+  // A "head group" in the context of GQA refers to a collection of query
+  // heads that share the same key and value heads.
+  const size_t kHeadGroups = layer_config.heads / layer_config.kv_heads;
+
+  const size_t cache_layer_size = layer_config.CacheLayerSize();
+  const size_t seq_len =
+      static_cast<size_t>(activations.div_seq_len.GetDivisor());
+  // All layers should have the same number of heads.
+  HWY_DASSERT(activations.div_heads.GetDivisor() == layer_config.heads);
+
+  // For each head/token/query, compute Q.K, softmax, and weighted V.
+  const auto func = [&](const size_t task, size_t worker) HWY_ATTR {
+    const size_t tq_idx = activations.div_heads.Divide(task);
+    const size_t head = activations.div_heads.Remainder(task);
+#if PROFILER_ENABLED
+    const hwy::Zone zone(worker, zone_id_par);
+#endif
+
+    const size_t qi = div_qbatch.Remainder(tq_idx);
+    const size_t batch_idx = div_qbatch.Divide(tq_idx);
+    auto& kv_cache = qbatch.KV(qi).kv_cache;
+
+    // Find the token position in the query and calculate
+    // the range of cache positions to attend to.
+    const size_t pos = qbatch.Pos(qi) + batch_idx;
+    const size_t start_pos = StartPos(pos, activations.config, layer_idx);
+    size_t last_pos = pos;
+    const size_t prefix_end = qbatch.PrefixEnd(qi);
+    if (prefix_end > 0 && prefix_end - 1 > last_pos) {
+      // last_pos in QDotK and WeightedSumV is inclusive.
+      last_pos = prefix_end - 1;
+    }
+
+    float* HWY_RESTRICT q = activations.q.Row(tq_idx) + head * qkv_dim;
+    float* HWY_RESTRICT att = activations.att.Row(tq_idx) + head * seq_len;
+    float* HWY_RESTRICT att_out =
+        activations.att_out.Row(tq_idx) + head * qkv_dim;
+
+    // Make strided read-only views into the kv cache for
+    // this query and head.
+    const size_t head_offset = (head / kHeadGroups) * qkv_dim * 2;
+    const size_t kv_head_offset = layer_idx * cache_layer_size + head_offset;
+    MatPtrT<KV_t> k("k_view", Extents2D(seq_len, qkv_dim));
+    k.SetPtr(kv_cache.Row(0) + kv_head_offset, kv_cache.Stride());
+    MatPtrT<KV_t> v("v_view", Extents2D(seq_len, qkv_dim));
+    v.SetPtr(kv_cache.Row(0) + kv_head_offset + qkv_dim, kv_cache.Stride());
+
+    SingleDotSoftmaxWeightedSum(pos, start_pos, last_pos, q, k, v, layer_idx,
+                                layer, activations, att, att_out, worker);
+  };
+
+  {
+    PROFILER_ZONE("Gen.Attention.DotSoftmax.ForkJoin");
+    const size_t pkg_idx = 0;
+    // Full parallelism is helpful, SmallParallelFor is insufficient.
+    ParallelFor(num_tokens * div_qbatch.GetDivisor() * layer_config.heads,
+                pools, pkg_idx, func);
+  }
+}
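`SingleDotSoftmaxWeightedSum` fuses three stages per head: `QDotK` (scores), an optionally soft-capped `Softmax` (probabilities), and `WeightedSumV` (the output vector). For reference, here is the same math as a scalar function, with RoPE, the query scaling (already applied by `PositionalEncodingQK` before `QDotK`), the soft-cap, and cache wraparound all omitted for clarity; `ReferenceAttention` is a hypothetical name for illustration only:

```cpp
// Scalar reference for one attention head over positions [0, num_pos).
// k and v are row-major [num_pos, qkv_dim]; out has length qkv_dim.
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <vector>

void ReferenceAttention(const float* q, const float* k, const float* v,
                        size_t num_pos, size_t qkv_dim, float* out) {
  std::vector<float> att(num_pos);
  float max = -1e30f, sum = 0.0f;
  for (size_t p = 0; p < num_pos; ++p) {  // QDotK: scores = q . k[p]
    float dot = 0.0f;
    for (size_t d = 0; d < qkv_dim; ++d) dot += q[d] * k[p * qkv_dim + d];
    att[p] = dot;
    max = std::max(max, dot);
  }
  for (size_t p = 0; p < num_pos; ++p) {  // Softmax (numerically stable)
    att[p] = std::exp(att[p] - max);
    sum += att[p];
  }
  for (size_t d = 0; d < qkv_dim; ++d) out[d] = 0.0f;
  for (size_t p = 0; p < num_pos; ++p) {  // WeightedSumV: out += prob * v[p]
    const float prob = att[p] / sum;
    for (size_t d = 0; d < qkv_dim; ++d) out[d] += prob * v[p * qkv_dim + d];
  }
}
```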
+
+// Different functions use different naming conventions for the number of
+// tokens. Functions that are query-independent, such as RMSNorm*, call the
+// count `num_interleaved`. Functions that are query-dependent, such as
+// `Attention`, use separate `num_tokens` and `num_queries`. `num_tokens` is
+// the number of tokens from one query: 1 for decode, otherwise
+// prefill_tbatch_size.
+
+// Fills activations.q and writes to KV cache.
+static HWY_INLINE void ComputeQKV(size_t num_tokens, const size_t layer_idx,
+                                  const LayerWeightsPtrs& layer,
+                                  AttentionActivations& activations,
+                                  const QBatch& qbatch, const int flags,
+                                  MatMulEnv& env) {
+  PROFILER_ZONE("Gen.Attention.QKV");
+  const hwy::Divisor div_qbatch(qbatch.Size());
+  const size_t num_interleaved = num_tokens * div_qbatch.GetDivisor();
+  const LayerConfig& layer_config = layer.layer_config;
+  const size_t qkv_dim = layer_config.qkv_dim;
+  const size_t kv_heads = layer_config.kv_heads;
+  const size_t cache_layer_size = layer_config.CacheLayerSize();
+
+  // The original qkv_einsum_w has shape [(heads + kv_heads * 2), qkv_dim,
+  // model_dim], which we reshaped to (heads + kv_heads * 2) * qkv_dim rows.
+  CallMatMul(activations.pre_att_rms_out, layer.qkv_einsum_w1,
+             /*add=*/nullptr, env, activations.q);
+
+  // Set up MatMul row pointers for writing to KV, which consists of
+  // `kv_heads` pairs of (k, v) vectors. This safely handles wraparound
+  // because rows are computed modulo seq_len.
+  MatPtrT<KV_t> kv_rows("kv", Extents2D(activations.pre_att_rms_out.Rows(),
+                                        layer.qkv_einsum_w2.Rows()));
+  for (size_t interleaved_idx = 0; interleaved_idx < num_interleaved;
+       ++interleaved_idx) {
+    const size_t qi = div_qbatch.Remainder(interleaved_idx);
+    const size_t batch_idx = div_qbatch.Divide(interleaved_idx);
+    const size_t cache_pos =
+        activations.div_seq_len.Remainder(qbatch.Pos(qi) + batch_idx);
+    env.row_ptrs[2][interleaved_idx] = reinterpret_cast<uint8_t*>(
+        qbatch.KV(qi).kv_cache.Row(cache_pos) + layer_idx * cache_layer_size);
+  }
+  kv_rows.AttachRowPtrs(env.row_ptrs[2].get());
+  CallMatMul(activations.pre_att_rms_out, layer.qkv_einsum_w2,
+             /*add=*/nullptr, env, kv_rows);
+
+  // Apply positional encodings for K.
+  // Note that 2D parallelism is not worth the fork/join overhead because the
+  // tasks are very lightweight.
+  env.ctx.pools.Pool(0).Run(
+      0, kv_heads * num_interleaved,
+      [&](uint64_t task, size_t thread) HWY_ATTR {
+        const size_t head = task % kv_heads;
+        const size_t interleaved_idx = task / kv_heads;
+        const size_t qi = div_qbatch.Remainder(interleaved_idx);
+        const size_t batch_idx = div_qbatch.Divide(interleaved_idx);
+        const size_t pos = qbatch.Pos(qi) + batch_idx;
+        const size_t cache_pos = activations.div_seq_len.Remainder(pos);
+        auto& kv_cache = qbatch.KV(qi).kv_cache;
+        KV_t* HWY_RESTRICT kv = kv_cache.Row(cache_pos) +
+                                layer_idx * cache_layer_size +
+                                head * qkv_dim * 2;
+
+        HWY_ALIGN float kv_f32[2 * kMaxQKVDim];
+        const hn::ScalableTag<float> df;
+        DecompressAndZeroPad(df, MakeSpan(kv, 2 * qkv_dim), 0, kv_f32,
+                             2 * qkv_dim);
+
+        // Apply further processing to K.
+        if (layer.key_norm_scale.HasPtr()) {
+          CallUpcasted(&layer.key_norm_scale, [&](const auto* weights_t) {
+            RMSNormInplace(weights_t->PackedScale1(), 0, kv_f32, qkv_dim,
+                           thread);
+          });
+        }
+
+        PositionalEncodingQK(kv_f32, layer_idx, layer, activations, thread,
+                             pos);
+        CompressPerThread tls;
+        Compress(kv_f32, 2 * qkv_dim, tls, MakeSpan(kv, 2 * qkv_dim), 0);
+      });
+}
+
+// Sums encoded (`att_out`) over num_heads (`layer_config.heads`) and
+// head_dim (`qkv_dim`) into output (`layer_out`).
+static HWY_INLINE void SumHeads(const LayerWeightsPtrs& layer,
+                                AttentionActivations& activations,
+                                MatMulEnv& env) {
+  PROFILER_ZONE("Gen.Attention.SumHeads");
+  const LayerConfig& layer_config = layer.layer_config;
+  // att_weights and att_out are concatenated heads, each of length
+  // layer_config.qkv_dim. Thus the [num_interleaved,
+  // layer_config.model_dim] matmul output is the sum over heads. Compare
+  // gemma/modules.py: attn_output = self.attn_vec_einsum('BTNH,NHD->BTD',
+  // encoded)
+  HWY_DASSERT(layer_config.model_dim != 0 && layer_config.heads != 0 &&
+              layer_config.qkv_dim != 0);
+  const float* add = layer_config.softmax_attn_output_biases
+                         ? layer.attention_output_biases.PackedScale1()
+                         : nullptr;
+  CallMatMul(activations.att_out, layer.att_weights, add, env,
+             activations.att_sums);
+}
+
+void GemmaAttention(size_t num_tokens, const size_t layer_idx,
+                    const LayerWeightsPtrs& layer,
+                    AttentionActivations& activations, QBatch& qbatch,
+                    MatMulEnv& env, int flags) {
+  const LayerConfig& layer_config = layer.layer_config;
+  HWY_DASSERT(!layer_config.IsMHA());  // No longer supported.
+  HWY_DASSERT_M((layer_config.heads % layer_config.kv_heads) == 0,
+                "query heads must be a multiple of key-value heads");
+  (void)layer_config;  // only used in HWY_DASSERT
+
+  ComputeQKV(num_tokens, layer_idx, layer, activations, qbatch, flags, env);
+  DotSoftmaxWeightedSum(num_tokens, layer_idx, layer, activations, qbatch,
+                        env.ctx.pools);
+  SumHeads(layer, activations, env);
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace gcpp
+HWY_AFTER_NAMESPACE();
diff --git a/gemma/attention.h b/gemma/attention.h
new file mode 100644
index 0000000..42b2be1
--- /dev/null
+++ b/gemma/attention.h
@@ -0,0 +1,59 @@
+// Copyright 2025 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef THIRD_PARTY_GEMMA_CPP_GEMMA_ATTENTION_H_
+#define THIRD_PARTY_GEMMA_CPP_GEMMA_ATTENTION_H_
+
+// Declares GemmaAttention for all SIMD targets.
+
+#include
+
+#include "gemma/gemma.h"
+#include "hwy/highway.h"
+
+namespace gcpp {
+
+// Passed to HWY_VISIT_TARGETS; declares for one target.
+#define GEMMA_DECL_ATTENTION(TARGET, NAMESPACE)                              \
+  namespace NAMESPACE {                                                      \
+  void SingleDotSoftmaxWeightedSum(                                          \
+      const size_t pos, const size_t start_pos, const size_t last_pos,       \
+      float* HWY_RESTRICT q, const MatPtrT<KV_t>& k, const MatPtrT<KV_t>& v, \
+      size_t layer_idx, const LayerWeightsPtrs& layer,                       \
+      const AttentionActivations& activations, float* HWY_RESTRICT att,      \
+      float* HWY_RESTRICT att_out, size_t worker);                           \
+                                                                             \
+  void DotSoftmaxWeightedSum(const size_t num_tokens, size_t layer_idx,      \
+                             const LayerWeightsPtrs& layer,                  \
+                             AttentionActivations& activations,              \
+                             QBatch& qbatch, NestedPools& pools);            \
+                                                                             \
+  void GemmaAttention(size_t num_tokens, const size_t layer_idx,             \
+                      const LayerWeightsPtrs& layer,                         \
+                      AttentionActivations& activations, QBatch& qbatch,     \
+                      MatMulEnv& env, int flags);                            \
+  /* NOLINTNEXTLINE(google-readability-namespace-comments) */                \
+  }  // namespace NAMESPACE
+
+// Function declarations for each SIMD target. Allows direct call from the
+// per-target namespace. We may later replace this with dynamic dispatch if
+// the overhead is acceptable.
+HWY_VISIT_TARGETS(GEMMA_DECL_ATTENTION)
+
+#undef GEMMA_DECL_ATTENTION
+
+}  // namespace gcpp
+
+#endif  // THIRD_PARTY_GEMMA_CPP_GEMMA_ATTENTION_H_
diff --git a/gemma/bindings/GemmaInterop.cs b/gemma/bindings/GemmaInterop.cs
new file mode 100644
index 0000000..0fb3ee8
--- /dev/null
+++ b/gemma/bindings/GemmaInterop.cs
@@ -0,0 +1,473 @@
+// Copyright 2025 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+using System;
+using System.Diagnostics;
+using System.Runtime.InteropServices;
+using System.Text;
+namespace GemmaCpp
+{
+    public class GemmaException : Exception
+    {
+        public GemmaException(string message) : base(message) { }
+    }
+
+    public class Gemma : IDisposable
+    {
+        private IntPtr _context;
+        private bool _disposed;
+
+        // Optional: Allow setting DLL path
+        public static string DllPath { get; set; } = "gemma.dll";
+
+        [DllImport("kernel32.dll", CharSet = CharSet.Unicode, SetLastError = true)]
+        private static extern IntPtr LoadLibrary(string lpFileName);
+
+        static Gemma()
+        {
+            // Load DLL from specified path
+            if (LoadLibrary(DllPath) == IntPtr.Zero)
+            {
+                throw new DllNotFoundException($"Failed to load {DllPath}.
diff --git a/gemma/bindings/GemmaInterop.cs b/gemma/bindings/GemmaInterop.cs
new file mode 100644
index 0000000..0fb3ee8
--- /dev/null
+++ b/gemma/bindings/GemmaInterop.cs
@@ -0,0 +1,473 @@
+// Copyright 2025 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+using System;
+using System.Diagnostics;
+using System.Runtime.InteropServices;
+using System.Text;
+namespace GemmaCpp
+{
+    public class GemmaException : Exception
+    {
+        public GemmaException(string message) : base(message) { }
+    }
+
+    public class Gemma : IDisposable
+    {
+        private IntPtr _context;
+        private bool _disposed;
+
+        // Optional: Allow setting DLL path
+        public static string DllPath { get; set; } = "gemma.dll";
+
+        [DllImport("kernel32.dll", CharSet = CharSet.Unicode, SetLastError = true)]
+        private static extern IntPtr LoadLibrary(string lpFileName);
+
+        static Gemma()
+        {
+            // Load DLL from specified path
+            if (LoadLibrary(DllPath) == IntPtr.Zero)
+            {
+                throw new DllNotFoundException($"Failed to load {DllPath}. Error: {Marshal.GetLastWin32Error()}");
+            }
+        }
+
+        // Matches the three-parameter GemmaCreate in c_api.h; the old
+        // modelType/weightType parameters no longer exist in the C API.
+        [DllImport("gemma", CallingConvention = CallingConvention.Cdecl)]
+        private static extern IntPtr GemmaCreate(
+            [MarshalAs(UnmanagedType.LPUTF8Str)] string tokenizerPath,
+            [MarshalAs(UnmanagedType.LPUTF8Str)] string weightsPath,
+            int maxGeneratedTokens);
+
+        [DllImport("gemma", CallingConvention = CallingConvention.Cdecl)]
+        private static extern void GemmaDestroy(IntPtr context);
+
+        // Delegate type for token callbacks
+        public delegate bool TokenCallback(string token);
+
+        // Keep delegate alive for duration of calls
+        private GCHandle _callbackHandle;
+
+        [UnmanagedFunctionPointer(CallingConvention.Cdecl)]
+        private delegate bool GemmaTokenCallback(
+            [MarshalAs(UnmanagedType.LPUTF8Str)] string text,
+            IntPtr userData);
+
+        [DllImport("gemma", CallingConvention = CallingConvention.Cdecl)]
+        private static extern int GemmaGenerate(
+            IntPtr context,
+            [MarshalAs(UnmanagedType.LPUTF8Str)] string prompt,
+            [Out] byte[] output,
+            int maxOutputChars,
+            GemmaTokenCallback callback,
+            IntPtr userData);
+
+        [DllImport("gemma", CallingConvention = CallingConvention.Cdecl)]
+        private static extern int GemmaGenerateMultimodal(
+            IntPtr context,
+            [MarshalAs(UnmanagedType.LPUTF8Str)] string prompt,
+            IntPtr image_data,  // Renamed param to match C API
+            int image_width,    // Added dimension
+            int image_height,   // Added dimension
+            [MarshalAs(UnmanagedType.LPUTF8Str)] StringBuilder output, // Output should be StringBuilder for multimodal
+            int maxOutputChars,
+            GemmaTokenCallback callback,
+            IntPtr userData);
+
+        [DllImport("gemma", CallingConvention = CallingConvention.Cdecl)]
+        private static extern int GemmaCountTokens(
+            IntPtr context,
+            [MarshalAs(UnmanagedType.LPUTF8Str)] string text);
+
+        // Configuration function imports
+        [DllImport("gemma", CallingConvention = CallingConvention.Cdecl)]
+        private static extern void GemmaSetMaxGeneratedTokens(IntPtr context, int value);
+
+        [DllImport("gemma", CallingConvention = CallingConvention.Cdecl)]
+        private static extern void GemmaSetMultiturn(IntPtr context, int value);
+
+        [DllImport("gemma", CallingConvention = CallingConvention.Cdecl)]
+        private static extern void GemmaSetTemperature(IntPtr context, float value);
+
+        [DllImport("gemma", CallingConvention = CallingConvention.Cdecl)]
+        private static extern void GemmaSetTopK(IntPtr context, int value);
+
+        [DllImport("gemma", CallingConvention = CallingConvention.Cdecl)]
+        private static extern void GemmaSetDeterministic(IntPtr context, int value);
+
+        [DllImport("gemma", CallingConvention = CallingConvention.Cdecl)]
+        private static extern void GemmaSetPrefillTbatchSize(IntPtr context, int value);
+
+        [DllImport("gemma", CallingConvention = CallingConvention.Cdecl, EntryPoint = "GemmaResetConversation")]
+        private static extern void GemmaResetConversation(IntPtr context);
+
+        // Conversation management function imports
+        [DllImport("gemma", CallingConvention = CallingConvention.Cdecl, EntryPoint = "GemmaCreateConversation")]
+        private static extern int GemmaCreateConversation(
+            IntPtr context,
+            [MarshalAs(UnmanagedType.LPUTF8Str)] string conversationName);
+
+        [DllImport("gemma", CallingConvention = CallingConvention.Cdecl, EntryPoint = "GemmaSwitchConversation")]
+        private static extern int GemmaSwitchConversation(
+            IntPtr context,
+            [MarshalAs(UnmanagedType.LPUTF8Str)] string conversationName);
+
+        [DllImport("gemma",
CallingConvention = CallingConvention.Cdecl, EntryPoint = "GemmaDeleteConversation")] + private static extern int GemmaDeleteConversation( + IntPtr context, + [MarshalAs(UnmanagedType.LPUTF8Str)] string conversationName); + + [DllImport("gemma", CallingConvention = CallingConvention.Cdecl, EntryPoint = "GemmaHasConversation")] + private static extern int GemmaHasConversation( + IntPtr context, + [MarshalAs(UnmanagedType.LPUTF8Str)] string conversationName); + + [DllImport("gemma", CallingConvention = CallingConvention.Cdecl, EntryPoint = "GemmaGetCurrentConversation")] + [return: MarshalAs(UnmanagedType.LPUTF8Str)] // Marshal the const char* return value as a string + private static extern string GemmaGetCurrentConversation(IntPtr context); + + [DllImport("gemma", CallingConvention = CallingConvention.Cdecl, EntryPoint = "GemmaSaveConversation")] + private static extern void GemmaSaveConversation(IntPtr context); + + // Native callback delegate type + [UnmanagedFunctionPointer(CallingConvention.Cdecl)] + private delegate void GemmaLogCallback( + [MarshalAs(UnmanagedType.LPUTF8Str)] string message, + IntPtr userData); + + [DllImport("gemma", CallingConvention = CallingConvention.Cdecl)] + private static extern void GemmaSetLogCallback( + IntPtr context, + GemmaLogCallback callback, + IntPtr userData); + + private GCHandle _logCallbackHandle; + private bool _loggingEnabled = false; + + public Gemma(string tokenizerPath, string weightsPath, int maxGeneratedTokens = 8192) + { + _context = GemmaCreate(tokenizerPath, weightsPath, maxGeneratedTokens); + if (_context == IntPtr.Zero) + { + throw new GemmaException("Failed to create Gemma context"); + } + } + + // Enable debug logging + public void EnableLogging(bool enable = true) + { + if (enable && !_loggingEnabled) + { + GemmaLogCallback logCallback = (message, _) => + { + Debug.WriteLine($"Gemma: {message}"); + }; + _logCallbackHandle = GCHandle.Alloc(logCallback); + GemmaSetLogCallback(_context, logCallback, IntPtr.Zero); + _loggingEnabled = true; + } + else if (!enable && _loggingEnabled) + { + if (_logCallbackHandle.IsAllocated) + _logCallbackHandle.Free(); + GemmaSetLogCallback(_context, null, IntPtr.Zero); + _loggingEnabled = false; + } + } + + // Configuration methods + public void SetMultiturn(bool enable) + { + if (_disposed) + throw new ObjectDisposedException(nameof(Gemma)); + + if (_context == IntPtr.Zero) + throw new GemmaException("Gemma context is invalid"); + + GemmaSetMultiturn(_context, enable ? 1 : 0); + Debug.WriteLine($"Gemma: Set multiturn to {(enable ? "enabled" : "disabled")}"); + } + + public void SetTemperature(float temperature) + { + if (_disposed) + throw new ObjectDisposedException(nameof(Gemma)); + + if (_context == IntPtr.Zero) + throw new GemmaException("Gemma context is invalid"); + + GemmaSetTemperature(_context, temperature); + Debug.WriteLine($"Gemma: Set temperature to {temperature}"); + } + + public void SetTopK(int topK) + { + if (_disposed) + throw new ObjectDisposedException(nameof(Gemma)); + + if (_context == IntPtr.Zero) + throw new GemmaException("Gemma context is invalid"); + + GemmaSetTopK(_context, topK); + Debug.WriteLine($"Gemma: Set topK to {topK}"); + } + + public void SetDeterministic(bool deterministic) + { + if (_disposed) + throw new ObjectDisposedException(nameof(Gemma)); + + if (_context == IntPtr.Zero) + throw new GemmaException("Gemma context is invalid"); + + GemmaSetDeterministic(_context, deterministic ? 1 : 0); + Debug.WriteLine($"Gemma: Set deterministic to {(deterministic ? 
"true" : "false")}"); + } + + // Renamed public method + public void ResetConversation() + { + if (_disposed) + throw new ObjectDisposedException(nameof(Gemma)); + + if (_context == IntPtr.Zero) + throw new GemmaException("Gemma context is invalid"); + + GemmaResetConversation(_context); // Call P/Invoke method + Debug.WriteLine("Gemma: Reset active conversation"); + } + + // Conversation management methods + public bool CreateConversation(string conversationName) + { + if (_disposed) + throw new ObjectDisposedException(nameof(Gemma)); + + if (_context == IntPtr.Zero) + throw new GemmaException("Gemma context is invalid"); + + bool result = GemmaCreateConversation(_context, conversationName) != 0; // Call P/Invoke method + Debug.WriteLine($"Gemma: Create conversation '{conversationName}' - {(result ? "succeeded" : "failed")}"); + return result; + } + + public bool SwitchConversation(string conversationName) + { + if (_disposed) + throw new ObjectDisposedException(nameof(Gemma)); + + if (_context == IntPtr.Zero) + throw new GemmaException("Gemma context is invalid"); + + bool result = GemmaSwitchConversation(_context, conversationName) != 0; // Call P/Invoke method + Debug.WriteLine($"Gemma: Switch to conversation '{conversationName}' - {(result ? "succeeded" : "failed")}"); + return result; + } + + public bool DeleteConversation(string conversationName) + { + if (_disposed) + throw new ObjectDisposedException(nameof(Gemma)); + + if (_context == IntPtr.Zero) + throw new GemmaException("Gemma context is invalid"); + + bool result = GemmaDeleteConversation(_context, conversationName) != 0; // Call P/Invoke method + Debug.WriteLine($"Gemma: Delete conversation '{conversationName}' - {(result ? "succeeded" : "failed")}"); + return result; + } + + public bool HasConversation(string conversationName) + { + if (_disposed) + throw new ObjectDisposedException(nameof(Gemma)); + + if (_context == IntPtr.Zero) + throw new GemmaException("Gemma context is invalid"); + + bool result = GemmaHasConversation(_context, conversationName) != 0; // Call P/Invoke method + Debug.WriteLine($"Gemma: Has conversation '{conversationName}' - {result}"); + return result; + } + + public string GetCurrentConversation() + { + if (_disposed) + throw new ObjectDisposedException(nameof(Gemma)); + + if (_context == IntPtr.Zero) + throw new GemmaException("Gemma context is invalid"); + + string currentConversation = GemmaGetCurrentConversation(_context); // Call P/Invoke method + Debug.WriteLine($"Gemma: Current conversation is '{currentConversation}'"); + return currentConversation; + } + + public void SaveConversation() + { + if (_disposed) + throw new ObjectDisposedException(nameof(Gemma)); + + if (_context == IntPtr.Zero) + throw new GemmaException("Gemma context is invalid"); + + GemmaSaveConversation(_context); + Debug.WriteLine($"Gemma: Saved current conversation ('{GetCurrentConversation()}') to prewarmed cache."); + } + + public int CountTokens(string prompt) + { + if (_disposed) + throw new ObjectDisposedException(nameof(Gemma)); + + if (_context == IntPtr.Zero) + throw new GemmaException("Gemma context is invalid"); + int count = GemmaCountTokens(_context, prompt); + return count; + } + + public string Generate(string prompt, int maxOutputChars = 4096) + { + return Generate(prompt, null, maxOutputChars); + } + + public string Generate(string prompt, TokenCallback callback, int maxOutputChars = 4096) + { + if (_disposed) + throw new ObjectDisposedException(nameof(Gemma)); + + if (_context == IntPtr.Zero) + throw new 
GemmaException("Gemma context is invalid"); + + var outputBuffer = new byte[maxOutputChars * 4]; // Allow for worst case UTF-8 size + GemmaTokenCallback nativeCallback = null; + + // Track token count for debugging + int tokenCount = 0; + + if (callback != null) + { + nativeCallback = (text, _) => + { + tokenCount++; + // Log token for debugging + Debug.WriteLine($"Token {tokenCount}: '{text}'"); + + // Pass token to user callback + return callback(text); + }; + _callbackHandle = GCHandle.Alloc(nativeCallback); + } + + try + { + int length = GemmaGenerate(_context, prompt, outputBuffer, maxOutputChars, + nativeCallback, IntPtr.Zero); + + if (length < 0) + throw new GemmaException("Generation failed"); + + Debug.WriteLine($"Generation complete: {tokenCount} tokens processed, result length: {length}"); + + // Convert the byte buffer to a string using UTF-8 encoding + string result = Encoding.UTF8.GetString(outputBuffer, 0, length); + return result; + } + finally + { + if (_callbackHandle.IsAllocated) + _callbackHandle.Free(); + } + } + + public string GenerateMultimodal(string prompt, float[] imageData, int imageWidth, int imageHeight, int maxOutputChars = 4096) + { + // Pass width and height to the overloaded method + return GenerateMultimodal(prompt, imageData, imageWidth, imageHeight, null, maxOutputChars); + } + + public string GenerateMultimodal(string prompt, float[] imageData, int imageWidth, int imageHeight, TokenCallback callback, int maxOutputChars = 4096) + { + if (_disposed) + throw new ObjectDisposedException(nameof(Gemma)); + + if (_context == IntPtr.Zero) + throw new GemmaException("Gemma context is invalid"); + + if (imageData == null || imageData.Length == 0) + throw new ArgumentException("Image data cannot be null or empty", nameof(imageData)); + + if (imageWidth <= 0 || imageHeight <= 0) + throw new ArgumentException("Image dimensions must be positive"); + + if (imageData.Length < imageWidth * imageHeight * 3) + throw new ArgumentException("Image data array is too small for the specified dimensions"); + + var output = new StringBuilder(maxOutputChars); + GemmaTokenCallback nativeCallback = null; + + if (callback != null) + { + nativeCallback = (text, _) => callback(text); + _callbackHandle = GCHandle.Alloc(nativeCallback); + } + + // Pin the image data so it doesn't move during the native call + GCHandle imageHandle = GCHandle.Alloc(imageData, GCHandleType.Pinned); + + try + { + IntPtr imagePtr = imageHandle.AddrOfPinnedObject(); + + // Pass image dimensions to the native call + int length = GemmaGenerateMultimodal(_context, prompt, imagePtr, imageWidth, imageHeight, output, maxOutputChars, + nativeCallback, IntPtr.Zero); + + if (length < 0) + throw new GemmaException("Multimodal generation failed"); + + return output.ToString(); + } + finally + { + imageHandle.Free(); + + if (_callbackHandle.IsAllocated) + _callbackHandle.Free(); + } + } + + public void Dispose() + { + if (!_disposed) + { + if (_context != IntPtr.Zero) + { + GemmaDestroy(_context); + _context = IntPtr.Zero; + } + if (_logCallbackHandle.IsAllocated) + _logCallbackHandle.Free(); + _disposed = true; + } + } + + ~Gemma() + { + Dispose(); + } + } +} diff --git a/gemma/bindings/c_api.cc b/gemma/bindings/c_api.cc new file mode 100644 index 0000000..cba2ffb --- /dev/null +++ b/gemma/bindings/c_api.cc @@ -0,0 +1,139 @@ +// Copyright 2025 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with 
the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef GEMMA_EXPORTS +#define GEMMA_EXPORTS +#endif + +#include "gemma/bindings/c_api.h" + +extern "C" { + +GEMMA_API GemmaContext* GemmaCreate(const char* tokenizer_path, + const char* weights_path, + int max_generated_tokens) { + try { + GemmaContext* ctx = GemmaContext::Create(tokenizer_path, weights_path, + max_generated_tokens); + return ctx; + } catch (...) { + return nullptr; + } +} + +GEMMA_API void GemmaDestroy(GemmaContext* ctx) { + delete ctx; +} + +GEMMA_API int GemmaGenerate(GemmaContext* ctx, const char* prompt, char* output, + int max_output_chars, GemmaTokenCallback callback, + void* user_data) { + if (!ctx) return -1; + return ctx->Generate(prompt, output, max_output_chars, callback, user_data); +} + +GEMMA_API int GemmaGenerateMultimodal(GemmaContext* ctx, const char* prompt, + const void* image_data, int image_width, + int image_height, char* output, + int max_output_chars, + GemmaTokenCallback callback, + void* user_data) { + if (!ctx) return -1; + + return ctx->GenerateMultimodal(prompt, image_data, image_width, image_height, + output, max_output_chars, callback, user_data); +} + +GEMMA_API int GemmaCountTokens(GemmaContext* ctx, const char* text) { + if (!ctx || !text) return -1; + return ctx->CountTokens(text); +} + +GEMMA_API void GemmaSetLogCallback(GemmaContext* ctx, GemmaLogCallback callback, + void* user_data) { + if (!ctx) return; + ctx->SetLogCallback(callback, user_data); +} + +// Configuration functions implementation +GEMMA_API void GemmaSetMaxGeneratedTokens(GemmaContext* ctx, int value) { + if (!ctx) return; + ctx->SetMaxGeneratedTokens(value); +} + +GEMMA_API void GemmaSetMultiturn(GemmaContext* ctx, int value) { + if (!ctx) return; + ctx->SetMultiturn(value); +} + +GEMMA_API void GemmaSetTemperature(GemmaContext* ctx, float value) { + if (!ctx) return; + ctx->SetTemperature(value); +} + +GEMMA_API void GemmaSetTopK(GemmaContext* ctx, int value) { + if (!ctx) return; + ctx->SetTopK(value); +} + +GEMMA_API void GemmaSetDeterministic(GemmaContext* ctx, int value) { + if (!ctx) return; + ctx->SetDeterministic(value != 0); +} + +GEMMA_API void GemmaSetPrefillTbatchSize(GemmaContext* ctx, int value) { + if (!ctx) return; + ctx->SetPrefillTbatchSize(value); +} + +GEMMA_API void GemmaResetConversation(GemmaContext* ctx) { // Renamed function + if (!ctx) return; + ctx->ResetConversation(); +} + +GEMMA_API int GemmaCreateConversation(GemmaContext* ctx, + const char* conversation_name) { + if (!ctx || !conversation_name) return 0; + return ctx->CreateConversation(conversation_name) ? 1 : 0; +} + +GEMMA_API int GemmaSwitchConversation(GemmaContext* ctx, + const char* conversation_name) { + if (!ctx || !conversation_name) return 0; + return ctx->SwitchConversation(conversation_name) ? 1 : 0; +} + +GEMMA_API int GemmaDeleteConversation(GemmaContext* ctx, + const char* conversation_name) { + if (!ctx || !conversation_name) return 0; + return ctx->DeleteConversation(conversation_name) ? 
1 : 0;
+}
+
+GEMMA_API int GemmaHasConversation(GemmaContext* ctx,
+                                   const char* conversation_name) {
+  if (!ctx || !conversation_name) return 0;
+  return ctx->HasConversation(conversation_name) ? 1 : 0;
+}
+
+GEMMA_API const char* GemmaGetCurrentConversation(GemmaContext* ctx) {
+  if (!ctx) return nullptr;
+  return ctx->GetCurrentConversation();
+}
+
+GEMMA_API void GemmaSaveConversation(GemmaContext* ctx) {
+  if (!ctx) return;
+  ctx->SaveConversation();
+}
+}
diff --git a/gemma/bindings/c_api.h b/gemma/bindings/c_api.h
new file mode 100644
index 0000000..6d369b8
--- /dev/null
+++ b/gemma/bindings/c_api.h
@@ -0,0 +1,88 @@
+// Copyright 2025 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef THIRD_PARTY_GEMMA_C_API_H_
+#define THIRD_PARTY_GEMMA_C_API_H_
+
+#include "gemma/bindings/context.h"
+
+#ifdef _WIN32
+#ifdef GEMMA_EXPORTS
+#define GEMMA_API __declspec(dllexport)
+#else
+#define GEMMA_API __declspec(dllimport)
+#endif
+#else
+#define GEMMA_API __attribute__((visibility("default")))
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifdef __cplusplus
+typedef gcpp::GemmaContext GemmaContext;
+#else
+typedef struct GemmaContext GemmaContext;
+#endif
+
+typedef bool (*GemmaTokenCallback)(const char* text, void* user_data);
+typedef void (*GemmaLogCallback)(const char* message, void* user_data);
+
+GEMMA_API GemmaContext* GemmaCreate(const char* tokenizer_path,
+                                    const char* weights_path,
+                                    int max_generated_tokens);
+GEMMA_API void GemmaDestroy(GemmaContext* ctx);
+GEMMA_API int GemmaGenerate(GemmaContext* ctx, const char* prompt, char* output,
+                            int max_output_chars, GemmaTokenCallback callback,
+                            void* user_data);
+GEMMA_API int GemmaGenerateMultimodal(GemmaContext* ctx, const char* prompt,
+                                      const void* image_data, int image_width,
+                                      int image_height, char* output,
+                                      int max_output_chars,
+                                      GemmaTokenCallback callback,
+                                      void* user_data);
+
+GEMMA_API int GemmaCountTokens(GemmaContext* ctx, const char* text);
+
+GEMMA_API void GemmaSetLogCallback(GemmaContext* ctx, GemmaLogCallback callback,
+                                   void* user_data);
+
+// Configuration functions (one declaration per setter defined in c_api.cc)
+GEMMA_API void GemmaSetMaxGeneratedTokens(GemmaContext* ctx, int value);
+GEMMA_API void GemmaSetMultiturn(GemmaContext* ctx, int value);
+GEMMA_API void GemmaSetTemperature(GemmaContext* ctx, float value);
+GEMMA_API void GemmaSetTopK(GemmaContext* ctx, int value);
+GEMMA_API void GemmaSetDeterministic(GemmaContext* ctx, int value);
+GEMMA_API void GemmaSetPrefillTbatchSize(GemmaContext* ctx, int value);
+GEMMA_API void GemmaResetConversation(GemmaContext* ctx);
+
+// Conversation management functions (renamed)
+GEMMA_API int GemmaCreateConversation(GemmaContext* ctx,
+                                      const char* conversation_name);
+GEMMA_API int GemmaSwitchConversation(GemmaContext* ctx,
+                                      const char* conversation_name);
+GEMMA_API int GemmaDeleteConversation(GemmaContext* ctx,
+                                      const char* conversation_name);
+GEMMA_API int GemmaHasConversation(GemmaContext* ctx,
+                                   const char* conversation_name);
+GEMMA_API const char* GemmaGetCurrentConversation(GemmaContext* ctx);
+GEMMA_API void GemmaSaveConversation(GemmaContext* ctx);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif  // THIRD_PARTY_GEMMA_C_API_H_
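For orientation, a minimal sketch of driving this C API end to end; the file paths are placeholders and error handling is reduced to early returns:

```cpp
#include <cstdio>

#include "gemma/bindings/c_api.h"

// Streams each token fragment to stdout; returning false would stop generation.
static bool OnToken(const char* text, void* /*user_data*/) {
  std::fputs(text, stdout);
  return true;
}

int main() {
  GemmaContext* ctx =
      GemmaCreate("tokenizer.spm", "weights.sbs", /*max_generated_tokens=*/256);
  if (!ctx) return 1;
  GemmaSetTemperature(ctx, 0.7f);
  char output[4096];
  const int len = GemmaGenerate(ctx, "Explain KV caches briefly.", output,
                                static_cast<int>(sizeof(output)), OnToken,
                                nullptr);
  if (len >= 0) std::printf("\n[%d chars]\n", len);
  GemmaDestroy(ctx);
  return 0;
}
```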
diff --git a/gemma/bindings/context.cc b/gemma/bindings/context.cc
new file mode 100644
index 0000000..76ebe1e
--- /dev/null
+++ b/gemma/bindings/context.cc
@@ -0,0 +1,348 @@
+// Copyright 2025 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "gemma/bindings/context.h"
+
+#include <stddef.h>
+#include <string.h>  // strncpy
+
+#include <memory>
+#include <sstream>
+#include <string>
+#include <vector>
+
+#include "evals/benchmark_helper.h"  // InitGenerator
+#include "gemma/gemma.h"
+#include "gemma/gemma_args.h"
+#include "gemma/tokenizer.h"  // WrapAndTokenize
+#include "util/threading.h"
+#include "util/threading_context.h"
+#include "hwy/profiler.h"
+#include "hwy/timer.h"
+
+#ifdef _WIN32
+#include <windows.h>
+#endif
+
+#include "gemma/kv_cache.h"
+#include "paligemma/image.h"
+
+namespace gcpp {
+
+// ConversationData constructor implementation
+ConversationData::ConversationData(const ModelConfig& model_config,
+                                   const InferenceArgs& inference_args,
+                                   const Allocator& allocator)
+    : kv_cache(
+          std::make_unique<KVCache>(model_config, inference_args, allocator)),
+      abs_pos(0) {}
+
+// ConversationData copy constructor implementation
+ConversationData::ConversationData(const ConversationData& other)
+    : kv_cache(nullptr), abs_pos(other.abs_pos) {
+  if (other.kv_cache) {
+    kv_cache = std::make_unique<KVCache>(other.kv_cache->Copy());
+  }
+}
+
+// Initialize static members
+GemmaLogCallback GemmaContext::s_log_callback = nullptr;
+void* GemmaContext::s_log_user_data = nullptr;
+
+GemmaContext* GemmaContext::Create(const char* tokenizer_path,
+                                   const char* weights_path,
+                                   int max_generated_tokens) {
+  std::stringstream ss;
+  ss << "Creating GemmaContext with tokenizer_path: "
+     << (tokenizer_path ? tokenizer_path : "null")
+     << ", weights_path: " << (weights_path ? weights_path : "null")
+     << ", max_generated_tokens: " << max_generated_tokens;
+  LogDebug(ss.str().c_str());
+
+  ThreadingArgs threading_args;
+  threading_args.spin = gcpp::Tristate::kFalse;
+
+  LoaderArgs loader(tokenizer_path, weights_path);
+  LogDebug("LoaderArgs created");
+
+  // Initialize cached args
+  LogDebug("Initializing inference args");
+  InferenceArgs inference_args;
+  inference_args.Init();
+  inference_args.max_generated_tokens = max_generated_tokens;
+  inference_args.temperature = 0.7f;
+  inference_args.top_k = 1;
+  inference_args.deterministic = false;
+
+  ss.str("");
+  ss << "Inference args initialized with max_tokens: " << max_generated_tokens
+     << ", temperature: " << inference_args.temperature
+     << ", top_k: " << inference_args.top_k << ", deterministic: "
+     << (inference_args.deterministic ? "true" : "false");
"true" : "false"); + LogDebug(ss.str().c_str()); + + return new GemmaContext(loader, inference_args, threading_args, + max_generated_tokens); +} + +GemmaContext::GemmaContext(const LoaderArgs& loader, + const InferenceArgs& inference_args, + const ThreadingArgs& threading_args, + int max_generated_tokens) + : inference_args(inference_args), + threading_args(threading_args), + ctx(UpdateArgs(threading_args, inference_args)), + matmul_env(ctx), + active_conversation_name("default"), + model(loader, inference_args, matmul_env.ctx) { + std::stringstream ss; + + LogDebug("Creating initial ConversationData"); + // Create the initial ConversationData object using make_shared + active_conversation = std::make_shared( + model.Config(), inference_args, ctx.allocator); + + LogDebug( + "Storing initial ConversationData in conversation_cache[\"default\"]"); + // Store the shared_ptr in the map under the "default" key + conversation_cache["default"] = active_conversation; + + LogDebug("GemmaContext constructor completed"); +} + +// Internal implementation shared by Generate and GenerateMultimodal +int GemmaContext::GenerateInternal(const char* prompt_string, + const void* image_data, int image_width, + int image_height, char* output, + int max_output_chars, + GemmaTokenCallback callback, + void* user_data) { + PROFILER_ZONE("Gen.Internal"); + size_t tokens_generated_this_turn = 0; // differentiates prefill from reply + size_t prompt_size = 0; + std::stringstream ss; + result_buffer.clear(); + + InitGenerator(inference_args, gen); + + // Ensure we have an active conversation + if (!active_conversation || !active_conversation->kv_cache) { + LogDebug("Generate called with null active_conversation or kv_cache"); + return -1; + } + + // callback function invoked for each generated token. + auto stream_token = [&, callback, user_data](int token, float) { + // Use abs_pos from the active conversation + ++(active_conversation->abs_pos); + const bool in_prompt = tokens_generated_this_turn < prompt_size; + const bool first_response_token = tokens_generated_this_turn == prompt_size; + ++tokens_generated_this_turn; + if (in_prompt || model.Config().IsEOS(token)) { + return true; + } + + std::string token_text; + HWY_ASSERT(model.Tokenizer().Decode(std::vector{token}, &token_text)); + if (first_response_token) { + token_text.erase(0, token_text.find_first_not_of(" \t\n")); + } + + // if we have a managed callback, pass it the token text + if (callback) { + if (!callback(token_text.c_str(), user_data)) { + LogDebug("Callback returned false, stopping generation"); + return false; + } + } + + result_buffer.append(token_text); + return true; + }; + + // set up runtime config + TimingInfo timing_info = {}; + RuntimeConfig runtime_config = {.gen = &gen, + .stream_token = stream_token, + .use_spinning = threading_args.spin}; + inference_args.CopyTo(runtime_config); + size_t prefix_end = 0; + + const ModelConfig& model_config = model.Config(); + + // generate + std::vector prompt; + const size_t pool_dim = model_config.vit_config.pool_dim; + ImageTokens image_tokens( + "image_tokens", + image_data + ? 
+  // generate
+  std::vector<int> prompt;
+  const size_t pool_dim = model_config.vit_config.pool_dim;
+  ImageTokens image_tokens(
+      "image_tokens",
+      image_data
+          ? Extents2D(model_config.vit_config.seq_len / (pool_dim * pool_dim),
+                      model_config.model_dim)
+          : Extents2D(0, 0),
+      ctx.allocator, MatPadding::kOdd);
+  if (image_data != nullptr) {
+    HWY_ASSERT(model_config.wrapping == PromptWrapping::PALIGEMMA ||
+               model_config.wrapping == PromptWrapping::GEMMA_VLM);
+
+    Image image;
+    image.Set(image_width, image_height, static_cast<const float*>(image_data));
+
+    // We may need to resize the supplied image depending on whether we're using
+    // PaliGemma or Gemma 3.
+    const size_t image_size = model_config.vit_config.image_size;
+    image.Resize(image_size, image_size);
+
+    // Reuse the runtime_config defined earlier in this function.
+    double image_tokens_start = hwy::platform::Now();
+    // Pass the populated image object to GenerateImageTokens
+    model.GenerateImageTokens(runtime_config,
+                              active_conversation->kv_cache->SeqLen(), image,
+                              image_tokens, matmul_env);
+    double image_tokens_duration = hwy::platform::Now() - image_tokens_start;
+
+    ss.str("");
+    ss << "\n\n[ Timing info ] Image token generation took: ";
+    ss << static_cast<int>(image_tokens_duration * 1000) << " ms\n";
+    LogDebug(ss.str().c_str());
+
+    prompt = WrapAndTokenize(
+        model.Tokenizer(), model.ChatTemplate(), model_config.wrapping,
+        active_conversation->abs_pos, prompt_string, image_tokens.Rows());
+    runtime_config.image_tokens = &image_tokens;
+    prompt_size = prompt.size();
+    // The end of the prefix for prefix-LM style attention in Paligemma.
+    // See Figure 2 of https://arxiv.org/abs/2407.07726.
+    prefix_end = prompt_size;
+  } else {
+    // Text-only case (original logic)
+    // Use abs_pos from the active conversation
+    prompt = WrapAndTokenize(model.Tokenizer(), model.ChatTemplate(),
+                             model_config.wrapping,
+                             active_conversation->abs_pos, prompt_string);
+    prompt_size = prompt.size();
+  }
+
+  // Check if prompt generation failed (e.g., multimodal not implemented yet)
+  if (prompt.empty() && image_data != nullptr) {
+    // Already logged the error, just ensure we don't proceed.
+    return -1;
+  }
+
+  // Create a span from the prompt vector - Generate() expects a hwy::Span,
+  // which has a different memory layout than std::vector.
+  hwy::Span<const int> prompt_span(prompt.data(), prompt.size());
+
+  // Pass the KVCache object by reference from the active conversation
+  model.Generate(runtime_config, prompt_span, active_conversation->abs_pos,
+                 prefix_end, *active_conversation->kv_cache, matmul_env,
+                 timing_info);
+
+  // prepare for next turn
+  if (!inference_args.multiturn ||
+      model_config.wrapping == PromptWrapping::PALIGEMMA) {
+    // If not multiturn, or Paligemma (which handles turns differently),
+    // reset the *active* conversation's position.
+    active_conversation->abs_pos = 0;
+    InitGenerator(inference_args, gen);
+  } else {
+    // Multi-turn Gemma: Rewind position in the active conversation.
+    // The last token was either EOS, in which case it should be ignored
+    // because it is never part of the dialog; see Table 5 in the Gemma-2
+    // paper: https://arxiv.org/pdf/2408.00118
+    // Or we have hit max_generated_tokens, in which case the last token will
+    // be lost. (We could store it in stream_token and prepend it to the next
+    // turn, but that is not worth the complexity, as multi-turn with
+    // max_generated is not a common use case.)
+    // In either case, we need to rewind the active conversation's abs_pos by
+    // one.
+    HWY_ASSERT(active_conversation->abs_pos > 0);
+    active_conversation->abs_pos--;
+  }
+
+  // Copy result buffer to output C-string (ensure null termination)
+  strncpy(output, result_buffer.c_str(), max_output_chars - 1);
+  output[max_output_chars - 1] = '\0';
+
+  return static_cast<int>(strlen(output));
+}
+
+// Public Generate method (wrapper for text-only)
+int GemmaContext::Generate(const char* prompt_string, char* output,
+                           int max_output_chars, GemmaTokenCallback callback,
+                           void* user_data) {
+  // Call the internal implementation with null image_data and 0 dimensions
+  return GenerateInternal(prompt_string, nullptr, 0, 0, output,
+                          max_output_chars, callback, user_data);
+}
+
+// Public GenerateMultimodal method (wrapper)
+int GemmaContext::GenerateMultimodal(const char* prompt_string,
+                                     const void* image_data, int image_width,
+                                     int image_height, char* output,
+                                     int max_output_chars,
+                                     GemmaTokenCallback callback,
+                                     void* user_data) {
+  if (image_data == nullptr) {
+    LogDebug(
+        "GenerateMultimodal called with null image_data. Use Generate for "
+        "text-only.");
+    // Or potentially call GenerateInternal with null image_data anyway?
+    // Returning an error seems safer.
+    return -1;
+  }
+
+  return GenerateInternal(prompt_string, image_data, image_width, image_height,
+                          output, max_output_chars, callback, user_data);
+}
+
+int GemmaContext::CountTokens(const char* text) {
+  LogDebug("CountTokens method started");
+  std::stringstream ss;
+  ss << "CountTokens called with text: '" << (text ? text : "null") << "'";
+  LogDebug(ss.str().c_str());
+
+  if (!text) {
+    LogDebug("CountTokens failed: text is null");
+    return -1;
+  }
+
+  try {
+    LogDebug("Creating text string");
+    std::string text_str(text);
+
+    LogDebug("Creating tokens vector");
+    std::vector<int> tokens;
+
+    LogDebug("Encoding text to tokens");
+    HWY_ASSERT(model.Tokenizer().Encode(text_str, &tokens));
+
+    ss.str("");
+    ss << "Text tokenized into " << tokens.size() << " tokens";
+    LogDebug(ss.str().c_str());
+
+    LogDebug("CountTokens completed successfully");
+    return static_cast<int>(tokens.size());
+  } catch (...) {
+    LogDebug("Unknown exception in CountTokens");
+    return -1;
+  }
+}
+
+// Get the name of the currently active conversation
+const char* GemmaContext::GetCurrentConversation() {
+  return active_conversation_name.c_str();
+}
+
+}  // namespace gcpp
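The `stream_token` lambda above counts callbacks to tell prefill apart from the reply: the engine streams the prompt's own tokens back first, so the first `prompt_size` invocations are skipped and only later tokens reach the callback and `result_buffer`. A self-contained sketch of that accounting, with a fake token feed in place of the model:

```cpp
#include <cstddef>
#include <cstdio>
#include <string>
#include <vector>

int main() {
  // Pretend the engine re-streams 3 prompt tokens, then the reply.
  const std::vector<std::string> stream = {"Write", " a", " poem",
                                           "Roses", " are", " red"};
  const size_t prompt_size = 3;
  size_t seen = 0;
  std::string reply;
  for (const std::string& tok : stream) {
    const bool in_prompt = seen < prompt_size;
    ++seen;
    if (in_prompt) continue;  // prefill echo: not part of the reply
    reply += tok;
  }
  std::printf("reply:%s\n", reply.c_str());  // "Roses are red"
  return 0;
}
```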
diff --git a/gemma/bindings/context.h b/gemma/bindings/context.h
new file mode 100644
index 0000000..859a644
--- /dev/null
+++ b/gemma/bindings/context.h
@@ -0,0 +1,316 @@
+// Copyright 2025 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef THIRD_PARTY_GEMMA_CPP_GEMMA_CONTEXT_H_
+#define THIRD_PARTY_GEMMA_CPP_GEMMA_CONTEXT_H_
+
+#include <memory>  // For std::shared_ptr, std::make_shared
+#include <random>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+// Logging
+#ifdef _WIN32
+#include <windows.h>
+#else
+#include <stdio.h>
+#endif
+
+#include "gemma/gemma.h"
+#include "gemma/gemma_args.h"
+#include "gemma/kv_cache.h"
+#include "ops/matmul.h"  // MatMulEnv
+#include "hwy/base.h"
+#include "hwy/highway.h"
+
+namespace gcpp {
+
+// Struct to hold data for a single conversation thread
+struct ConversationData {
+  ConversationData(const ModelConfig& model_config,
+                   const InferenceArgs& inference_args,
+                   const Allocator& allocator);
+  ConversationData(const ConversationData& other);
+
+  std::unique_ptr<KVCache> kv_cache;
+  size_t abs_pos = 0;
+};
+
+typedef bool (*GemmaTokenCallback)(const char* text, void* user_data);
+typedef void (*GemmaLogCallback)(const char* message, void* user_data);
+
+class GemmaContext {
+ private:
+  GemmaContext(const LoaderArgs& loader, const InferenceArgs& inference_args,
+               const ThreadingArgs& threading_args, int max_generated_tokens);
+
+ public:
+  static GemmaContext* Create(const char* tokenizer_path,
+                              const char* weights_path,
+                              int max_generated_tokens);
+
+  // Returns length of generated text, or -1 on error
+  int Generate(const char* prompt_string, char* output, int max_output_chars,
+               GemmaTokenCallback callback, void* user_data);
+  // Returns length of generated text, or -1 on error
+  int GenerateMultimodal(const char* prompt_string, const void* image_data,
+                         int image_width, int image_height, char* output,
+                         int max_output_chars, GemmaTokenCallback callback,
+                         void* user_data);
+
+  // Returns number of tokens in text, or -1 on error
+  int CountTokens(const char* text);
+
+  // Add new method to set logger
+  static void SetLogCallback(GemmaLogCallback callback, void* user_data) {
+    s_log_callback = callback;
+    s_log_user_data = user_data;
+  }
+
+  // Set max generated tokens
+  void SetMaxGeneratedTokens(size_t value) {
+    inference_args.max_generated_tokens = value;
+    LogDebug("Setting max_generated_tokens to configured value");
+  }
+
+  // Set multiturn flag (0 = disabled, 1 = enabled)
+  void SetMultiturn(int value) {
+    inference_args.multiturn = value;
+    LogDebug("Setting multiturn to configured value");
+  }
+
+  // Set temperature for token generation
+  void SetTemperature(float value) {
+    inference_args.temperature = value;
+    LogDebug("Setting temperature to configured value");
+  }
+
+  // Set top_k parameter for sampling
+  void SetTopK(int value) {
+    inference_args.top_k = value;
+    LogDebug("Setting top_k to configured value");
+  }
+
+  // Set deterministic flag
+  void SetDeterministic(bool value) {
+    inference_args.deterministic = value;
+    // Reset the random number generator for deterministic generation
+    if (value) {
+      gen.seed(0x87654321);
+    }
+    LogDebug("Setting deterministic flag to configured value");
+  }
+
+  // Set prefill_tbatch_size
+  void SetPrefillTbatchSize(size_t value) {
+    inference_args.prefill_tbatch_size = value;
+    LogDebug("Setting prefill_tbatch_size to configured value");
+  }
+
Cannot " + "save."); + } + return; + } + std::string log_msg = "SaveConversation: Attempting to save '"; + log_msg += active_conversation_name; + log_msg += "' to prewarmed_cache."; + LogDebug(log_msg.c_str()); + + // Create a deep copy of the active_conversation via copy ctor. + auto conversation_copy = + std::make_shared(*active_conversation); + + // Store the deep copy in prewarmed_cache. + // If a conversation with the same name already exists, it will be + // overwritten. std::shared_ptr will handle the destruction of the old + // object if it's being replaced. + prewarmed_cache[active_conversation_name] = conversation_copy; + + log_msg = "SaveConversation: Successfully saved '"; + log_msg += active_conversation_name; + log_msg += "' to prewarmed_cache."; + LogDebug(log_msg.c_str()); + } + + // Reset the currently active conversation + void ResetConversation() { + if (active_conversation) { + std::string log_prefix = "ResetConversation ('"; + log_prefix += active_conversation_name.empty() ? "[unnamed]" + : active_conversation_name; + log_prefix += "'): "; + LogDebug((log_prefix + "Attempting to reset.").c_str()); + // Attempt to restore from prewarmed_cache first, regardless of name. + auto it = prewarmed_cache.find(active_conversation_name); + if (it != prewarmed_cache.end() && it->second && it->second->kv_cache) { + // Found in prewarmed_cache and the cached entry is valid. + LogDebug((log_prefix + "Found in prewarmed_cache. Restoring state.") + .c_str()); + active_conversation->abs_pos = it->second->abs_pos; + // Perform a deep copy of the KVCache from the prewarmed version. + active_conversation->kv_cache = + std::make_unique(it->second->kv_cache->Copy()); + LogDebug((log_prefix + "Successfully restored from prewarmed_cache.") + .c_str()); + return; + } + + // If not found in prewarmed_cache or prewarmed_cache entry is invalid, + // rewind to initial state. 
+      active_conversation->abs_pos = 0;
+      // Replace the cache within the current ConversationData object
+      active_conversation->kv_cache = std::make_unique<KVCache>(
+          model.Config(), inference_args, ctx.allocator);
+
+      LogDebug((log_prefix + "Successfully rewound to initial state.").c_str());
+    } else {
+      LogDebug("Cannot reset conversation: active_conversation is null");
+    }
+  }
+
+  // Create a new named conversation
+  bool CreateConversation(const char* conversation_name) {
+    std::string name(conversation_name);
+    if (conversation_cache.count(name)) {
+      LogDebug("Conversation already exists");
+      return false;
+    }
+    LogDebug("Creating new conversation");
+    // Create a new ConversationData object using make_shared
+    conversation_cache[name] = std::make_shared<ConversationData>(
+        model.Config(), inference_args, ctx.allocator);
+    return true;
+  }
+
+  // Switch to a named conversation
+  bool SwitchConversation(const char* conversation_name) {
+    std::string name(conversation_name);
+    auto it = conversation_cache.find(name);
+    if (it == conversation_cache.end()) {
+      LogDebug("Conversation not found");
+      return false;
+    }
+    LogDebug("Switching active conversation");
+    active_conversation = it->second;
+    active_conversation_name = conversation_name;
+    return true;
+  }
+
+  // Delete a named conversation
+  bool DeleteConversation(const char* conversation_name) {
+    std::string name(conversation_name);
+    auto it = conversation_cache.find(name);
+
+    if (it == conversation_cache.end()) {
+      LogDebug("Conversation not found for deletion");
+      return false;
+    }
+    if (name == "default") {
+      LogDebug("Cannot delete the default conversation");
+      return false;
+    }
+    if (it->second == active_conversation) {
+      LogDebug("Cannot delete the currently active conversation");
+      return false;
+    }
+
+    LogDebug("Deleting conversation");
+    conversation_cache.erase(it);
+
+    auto it2 = prewarmed_cache.find(name);
+    if (it2 != prewarmed_cache.end()) {
+      prewarmed_cache.erase(it2);
+    }
+
+    return true;
+  }
+
+  // Check if a named conversation exists
+  bool HasConversation(const char* conversation_name) {
+    std::string name(conversation_name);
+    return conversation_cache.count(name) != 0;
+  }
+
+  // Get the name of the currently active conversation
+  const char* GetCurrentConversation();
+
+ private:
+  // Internal implementation shared by Generate and GenerateMultimodal
+  int GenerateInternal(const char* prompt_string,
+                       const void* image_data,  // Null for text-only generation
+                       int image_width, int image_height, char* output,
+                       int max_output_chars, GemmaTokenCallback callback,
+                       void* user_data);
+
+  // Pointer to the currently active conversation's data
+  std::shared_ptr<ConversationData> active_conversation;
+
+  // Cache of all named conversations
+  std::unordered_map<std::string, std::shared_ptr<ConversationData>>
+      conversation_cache;
+  std::unordered_map<std::string, std::shared_ptr<ConversationData>>
+      prewarmed_cache;
+
+  // Buffers (potentially could be moved into ConversationData if needed
+  // per-conversation)
+  std::string prompt_buffer;
+  std::string result_buffer;
+  std::vector<int> token_buffer;
+
+  // Cached args (remain global for the context)
+  InferenceArgs inference_args;
+  ThreadingArgs threading_args;
+  ThreadingContext ctx;
+  MatMulEnv matmul_env;
+
+  std::string active_conversation_name;
+
+  // Model itself (don't move this, needs to be below the args above)
+  Gemma model;
+
+  // Random generator (remains global for the context)
+  std::mt19937 gen;
+
+  // Static members for logging
+  static GemmaLogCallback s_log_callback;
+  static void* s_log_user_data;
+
+  // Logging helper: routes messages to the managed callback if one is set,
+  // otherwise falls back to the platform default below.
+  static void LogDebug(const char* message) {
+    if (s_log_callback != nullptr) {
+      s_log_callback(message, s_log_user_data);
+    } else {
+#ifdef _WIN32
+      OutputDebugStringA(message);
+#else
+      printf("%s", message);
+#endif
+    }
+  }
+};
+
+}  // namespace gcpp
+
+#endif  // THIRD_PARTY_GEMMA_CPP_GEMMA_CONTEXT_H_
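A sketch of the conversation lifecycle these methods expose, written against the C wrappers from c_api.h; the names are illustrative. Each named conversation owns its own KVCache and position, so switching is cheap and histories stay independent; `GemmaSaveConversation` snapshots the active state so a later `GemmaResetConversation` restores that snapshot instead of an empty cache:

```cpp
#include "gemma/bindings/c_api.h"

void ConversationDemo(GemmaContext* ctx) {
  char out[2048];
  GemmaCreateConversation(ctx, "support");
  GemmaSwitchConversation(ctx, "support");
  // Prime the conversation, then snapshot it as the prewarmed state.
  GemmaGenerate(ctx, "You are a terse support agent.", out, 2048, nullptr,
                nullptr);
  GemmaSaveConversation(ctx);
  // Later turns mutate the KV cache...
  GemmaGenerate(ctx, "My order is late.", out, 2048, nullptr, nullptr);
  // ...and reset rolls back to the saved snapshot, not to an empty history.
  GemmaResetConversation(ctx);
  GemmaSwitchConversation(ctx, "default");
}
```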
diff --git a/gemma/common.cc b/gemma/common.cc
deleted file mode 100644
index 0d8977b..0000000
--- a/gemma/common.cc
+++ /dev/null
@@ -1,177 +0,0 @@
-// Copyright 2024 Google LLC
-// SPDX-License-Identifier: Apache-2.0
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// https://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "gemma/common.h"
-
-#include <math.h>  // sqrtf
-#include <stddef.h>
-#include <string.h>
-
-#include <algorithm>  // std::transform
-#include <string>
-#include <unordered_map>
-#include <vector>
-
-#include "compression/shared.h"
-#include "hwy/base.h"
-#include "hwy/contrib/thread_pool/thread_pool.h"
-
-namespace gcpp {
-
-constexpr const char* kModelFlags[] = {
-    "2b-pt", "2b-it",                // Gemma 2B
-    "7b-pt", "7b-it",                // Gemma 7B
-    "gr2b-pt", "gr2b-it",            // RecurrentGemma
-    "tiny",                          // Gemma Tiny (mostly for debugging)
-    "gemma2-2b-pt", "gemma2-2b-it",  // Gemma2 2B
-    "9b-pt", "9b-it",                // Gemma2 9B
-    "27b-pt", "27b-it",              // Gemma2 27B
-    "paligemma-224",                 // PaliGemma 224
-    "paligemma-448",                 // PaliGemma 448
-    "paligemma2-3b-224",             // PaliGemma2 3B 224
-    "paligemma2-3b-448",             // PaliGemma2 3B 448
-    "paligemma2-10b-224",            // PaliGemma2 10B 224
-    "paligemma2-10b-448",            // PaliGemma2 10B 448
-    "gemma3-4b",                     // Gemma3 4B
-    "gemma3-1b",                     // Gemma3 1B
-    "gemma3-12b",                    // Gemma3 12B
-    "gemma3-27b",                    // Gemma3 27B
-};
-constexpr Model kModelTypes[] = {
-    Model::GEMMA_2B, Model::GEMMA_2B,      // Gemma 2B
-    Model::GEMMA_7B, Model::GEMMA_7B,      // Gemma 7B
-    Model::GRIFFIN_2B, Model::GRIFFIN_2B,  // RecurrentGemma
-    Model::GEMMA_TINY,                     // Gemma Tiny
-    Model::GEMMA2_2B, Model::GEMMA2_2B,    // Gemma2 2B
-    Model::GEMMA2_9B, Model::GEMMA2_9B,    // Gemma2 9B
-    Model::GEMMA2_27B, Model::GEMMA2_27B,  // Gemma2 27B
-    Model::PALIGEMMA_224,                  // PaliGemma 224
-    Model::PALIGEMMA_448,                  // PaliGemma 448
-    Model::PALIGEMMA2_3B_224,              // PaliGemma2 3B 224
-    Model::PALIGEMMA2_3B_448,              // PaliGemma2 3B 448
-    Model::PALIGEMMA2_10B_224,             // PaliGemma2 10B 224
-    Model::PALIGEMMA2_10B_448,             // PaliGemma2 10B 448
-    Model::GEMMA3_4B,                      // Gemma3 4B
-    Model::GEMMA3_1B,                      // Gemma3 1B
-    Model::GEMMA3_12B,                     // Gemma3 12B
-    Model::GEMMA3_27B,                     // Gemma3 27B
-};
-constexpr PromptWrapping kPromptWrapping[] = {
-    PromptWrapping::GEMMA_PT, PromptWrapping::GEMMA_IT,    // Gemma 2B
-    PromptWrapping::GEMMA_PT, PromptWrapping::GEMMA_IT,    // Gemma 7B
-    PromptWrapping::GEMMA_PT, PromptWrapping::GEMMA_IT,    // RecurrentGemma
-    PromptWrapping::GEMMA_IT,                              // Gemma Tiny
-    PromptWrapping::GEMMA_PT, PromptWrapping::GEMMA_IT,    // Gemma2 2B
-    PromptWrapping::GEMMA_PT, PromptWrapping::GEMMA_IT,    // Gemma2 9B
-    PromptWrapping::GEMMA_PT, PromptWrapping::GEMMA_IT,    // Gemma2 27B
-    PromptWrapping::PALIGEMMA, PromptWrapping::PALIGEMMA,  // PaliGemma 224/448
-    PromptWrapping::PALIGEMMA, PromptWrapping::PALIGEMMA,  // PG2 3B 224/448
-    PromptWrapping::PALIGEMMA, PromptWrapping::PALIGEMMA,  // PG2 10B 224/448
-    PromptWrapping::GEMMA_VLM,                             // Gemma3 4B
-    PromptWrapping::GEMMA_PT,                              // Gemma3 1B
-    PromptWrapping::GEMMA_VLM,                             // Gemma3 12B
-    PromptWrapping::GEMMA_VLM,                             // Gemma3 27B
-};
-
-constexpr size_t kNumModelFlags = std::size(kModelFlags);
-static_assert(kNumModelFlags == std::size(kModelTypes));
-static_assert(kNumModelFlags == std::size(kPromptWrapping));
-
-const char* ParseModelTypeAndWrapping(const std::string& model_flag,
-                                      Model& model, PromptWrapping& wrapping) {
-  static std::string kErrorMessageBuffer =
-      "Invalid or missing model flag, need to specify one of ";
-  for (size_t i = 0; i + 1 < kNumModelFlags; ++i) {
-    kErrorMessageBuffer.append(kModelFlags[i]);
-    kErrorMessageBuffer.append(", ");
-  }
-  kErrorMessageBuffer.append(kModelFlags[kNumModelFlags - 1]);
-  kErrorMessageBuffer.append(".");
-  std::string model_type_lc = model_flag;
-  std::transform(model_type_lc.begin(), model_type_lc.end(),
-                 model_type_lc.begin(), ::tolower);
-  for (size_t i = 0; i < kNumModelFlags; ++i) {
-    if (kModelFlags[i] == model_type_lc) {
-      model = kModelTypes[i];
-      wrapping = kPromptWrapping[i];
-      HWY_ASSERT(std::string(ModelString(model, wrapping)) == model_type_lc);
-      return nullptr;
-    }
-  }
-  return kErrorMessageBuffer.c_str();
-}
-
-const char* ModelString(Model model, PromptWrapping wrapping) {
-  for (size_t i = 0; i < kNumModelFlags; i++) {
-    if (kModelTypes[i] == model && kPromptWrapping[i] == wrapping)
-      return kModelFlags[i];
-  }
-  HWY_ABORT("Unknown model %d wrapping %d\n", static_cast<int>(model),
-            static_cast<int>(wrapping));
-}
-
-const char* StringFromType(Type type) {
-  return kTypeStrings[static_cast<size_t>(type)];
-}
-
-const char* ParseType(const std::string& type_string, Type& type) {
-  constexpr size_t kNum = std::size(kTypeStrings);
-  static std::string kErrorMessageBuffer =
-      "Invalid or missing type, need to specify one of ";
-  for (size_t i = 0; i + 1 < kNum; ++i) {
-    kErrorMessageBuffer.append(kTypeStrings[i]);
-    kErrorMessageBuffer.append(", ");
-  }
-  kErrorMessageBuffer.append(kTypeStrings[kNum - 1]);
-  kErrorMessageBuffer.append(".");
-  std::string type_lc = type_string;
-  std::transform(type_lc.begin(), type_lc.end(), type_lc.begin(), ::tolower);
-  for (size_t i = 0; i < kNum; ++i) {
-    if (kTypeStrings[i] == type_lc) {
-      type = static_cast<Type>(i);
-      HWY_ASSERT(std::string(StringFromType(type)) == type_lc);
-      return nullptr;
-    }
-  }
-  return kErrorMessageBuffer.c_str();
-}
-
-void Wrap(const ModelInfo& info, size_t pos, std::string& prompt) {
-  // Instruction-tuned models are trained to expect control tokens.
-  if (info.wrapping == PromptWrapping::GEMMA_IT) {
-    // Prepend "<end_of_turn>" if this is a multi-turn dialogue continuation.
-    const std::string start = (pos == 0)
-                                  ? "<start_of_turn>user\n"
-                                  : "<end_of_turn>\n<start_of_turn>user\n";
-    prompt = start + prompt + "<end_of_turn>\n<start_of_turn>model\n";
-  }
-}
-
-float EmbeddingScaling(size_t model_dim) {
-  // Round to bf16 to match Gemma's Embedder, which casts before mul.
-  return hwy::ConvertScalarTo<float>(hwy::ConvertScalarTo<hwy::bfloat16_t>(
-      sqrtf(static_cast<float>(model_dim))));
-}
-
-float ChooseQueryScale(const ModelConfig& config) {
-  if (config.query_scale == QueryScaleType::SqrtModelDimDivNumHeads)
-    return 1.0f / sqrtf(static_cast<float>(config.model_dim /
-                                           config.layer_configs[0].heads));
-  // QueryScaleType::SqrtKeySize
-  return 1.0f / sqrtf(static_cast<float>(config.layer_configs[0].qkv_dim));
-}
-
-}  // namespace gcpp
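The deleted `EmbeddingScaling` rounds `sqrt(model_dim)` through bf16 before use, matching the trained Embedder. A small sketch of the equivalent arithmetic; the bf16 round-trip is emulated here by truncating the low 16 mantissa bits, whereas Highway's `ConvertScalarTo` may round to nearest, so treat this as an approximation:

```cpp
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <cstring>

// Keep only the top 16 bits of the f32 representation (truncating bf16).
static float RoundToBF16(float f) {
  uint32_t bits;
  std::memcpy(&bits, &f, sizeof(bits));
  bits &= 0xFFFF0000u;
  std::memcpy(&f, &bits, sizeof(f));
  return f;
}

int main() {
  const float exact = std::sqrt(2048.0f);  // e.g. model_dim = 2048
  std::printf("sqrt = %f, bf16-rounded = %f\n", exact, RoundToBF16(exact));
  return 0;
}
```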
diff --git a/gemma/common.h b/gemma/common.h
deleted file mode 100644
index 984b0ba..0000000
--- a/gemma/common.h
+++ /dev/null
@@ -1,57 +0,0 @@
-// Copyright 2024 Google LLC
-// SPDX-License-Identifier: Apache-2.0
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// https://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef THIRD_PARTY_GEMMA_CPP_GEMMA_COMMON_H_
-#define THIRD_PARTY_GEMMA_CPP_GEMMA_COMMON_H_
-
-#include <stddef.h>
-
-#include <string>
-
-#include "compression/shared.h"  // PromptWrapping
-#include "gemma/configs.h"  // IWYU pragma: export
-#include "hwy/base.h"  // ConvertScalarTo
-
-namespace gcpp {
-
-// Struct to bundle model information.
-struct ModelInfo {
-  Model model;
-  PromptWrapping wrapping;
-  Type weight;
-};
-
-// Returns error string or nullptr if OK.
-// Thread-hostile.
-const char* ParseModelTypeAndWrapping(const std::string& model_flag,
-                                      Model& model, PromptWrapping& wrapping);
-const char* ParseType(const std::string& type_string, Type& type);
-
-// Inverse of ParseModelTypeAndWrapping.
-const char* ModelString(Model model, PromptWrapping wrapping);
-const char* StringFromType(Type type);
-
-// Wraps the given prompt using the expected control tokens for IT models.
-void Wrap(const ModelInfo& info, size_t pos, std::string& prompt);
-
-// Returns the scale value to use for the embedding (basically sqrt model_dim).
-float EmbeddingScaling(size_t model_dim);
-
-// Returns the scale value to use for the query in the attention computation.
-float ChooseQueryScale(const ModelConfig& config);
-
-}  // namespace gcpp
-
-#endif  // THIRD_PARTY_GEMMA_CPP_GEMMA_COMMON_H_
diff --git a/gemma/configs.cc b/gemma/configs.cc
index 276c8f9..562500d 100644
--- a/gemma/configs.cc
+++ b/gemma/configs.cc
@@ -15,22 +15,31 @@
 
 #include "gemma/configs.h"
 
-#include <stddef.h>
-#include <stdint.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <string>
+#include <vector>
+
+#include "compression/types.h"  // Type
+#include "io/fields.h"  // IFields
+#include "io/io.h"  // Path
 #include "hwy/base.h"
 
 namespace gcpp {
 
+static constexpr size_t kVocabSize = 256000;
+
+static constexpr size_t kGemmaV3VocabSize = 262144;
+
 static ModelConfig ConfigNoSSM() {
   ModelConfig config;
-  config.scale_names = {"att_ein", "qkv_ein", "gr_lin_x_w", "gr_lin_y_w",
-                        "gr_lin_out_w", "gr_gate_w", "gating_ein", "linear_w"};
+  config.scale_base_names = {"att_ein", "qkv_ein", "gr_lin_x_w",
+                             "gr_lin_y_w", "gr_lin_out_w", "gr_gate_w",
+                             "gating_ein", "linear_w"};
   return config;
 }
 
-static ModelConfig ConfigBaseGemmaV1() { return ConfigNoSSM(); }
-
 static ModelConfig ConfigBaseGemmaV2() {
   ModelConfig config = ConfigNoSSM();
   config.att_cap = 50.0f;
@@ -54,17 +63,17 @@ static LayerConfig LayerConfigGemma2_27B(size_t model_dim) {
 
 static ModelConfig ConfigGemma2_27B() {
   ModelConfig config = ConfigBaseGemmaV2();
-  config.model_name = "Gemma2_27B";
+  config.display_name = "Gemma2_27B";
   config.model = Model::GEMMA2_27B;
   config.model_dim = 4608;
   config.vocab_size = kVocabSize;
-  config.seq_len = 8192;
+  config.max_seq_len = 8192;
   LayerConfig layer_config = LayerConfigGemma2_27B(config.model_dim);
-  config.layer_configs = {46, layer_config};
-  config.num_tensor_scales = 4 * config.layer_configs.size();
+  config.num_layers = 46;
+  config.layer_configs = {config.num_layers, layer_config};
   config.query_scale = QueryScaleType::SqrtModelDimDivNumHeads;
   config.attention_window_sizes =
-      RepeatedAttentionWindowSizes<46, 2>({4096, 8192});
+      RepeatedAttentionWindowSizes<46, 2>({4096, config.max_seq_len});
   return config;
 }
 
@@ -82,17 +91,17 @@ static LayerConfig LayerConfigGemma2_9B(size_t model_dim) {
 
 static ModelConfig ConfigGemma2_9B() {
   ModelConfig config = ConfigBaseGemmaV2();
-  config.model_name = "Gemma2_9B";
+  config.display_name = "Gemma2_9B";
   config.model = Model::GEMMA2_9B;
   config.model_dim = 3584;
   config.vocab_size = kVocabSize;
-  config.seq_len = 8192;
+  config.max_seq_len = 8192;
   LayerConfig layer_config = LayerConfigGemma2_9B(config.model_dim);
-  config.layer_configs = {42, layer_config};
-  config.num_tensor_scales = 4 * config.layer_configs.size();
+  config.num_layers = 42;
+  config.layer_configs = {config.num_layers, layer_config};
   config.query_scale = QueryScaleType::SqrtKeySize;
   config.attention_window_sizes =
-      RepeatedAttentionWindowSizes<42, 2>({4096, 8192});
+      RepeatedAttentionWindowSizes<42, 2>({4096, config.max_seq_len});
   return config;
 }
 
@@ -110,66 +119,17 @@ static LayerConfig LayerConfigGemma2_2B(size_t model_dim) {
 
 static ModelConfig ConfigGemma2_2B() {
   ModelConfig config = ConfigBaseGemmaV2();
-  config.model_name = "Gemma2_2B";
+  config.display_name = "Gemma2_2B";
   config.model = Model::GEMMA2_2B;
   config.model_dim = 2304;
   config.vocab_size = kVocabSize;
-  config.seq_len = 8192;
+  config.max_seq_len = 8192;
   LayerConfig layer_config = LayerConfigGemma2_2B(config.model_dim);
-  config.layer_configs = {26, layer_config};
-  config.num_tensor_scales = 4 * config.layer_configs.size();
+  config.num_layers = 26;
+  config.layer_configs = {config.num_layers, layer_config};
   config.query_scale = QueryScaleType::SqrtKeySize;
config.attention_window_sizes = - RepeatedAttentionWindowSizes<26, 2>({4096, 8192}); - return config; -} - -static LayerConfig LayerConfigGemma7B(size_t model_dim) { - LayerConfig config; - config.model_dim = model_dim; - config.ff_hidden_dim = 16 * 3072 / 2; // = 24576 - config.heads = 16; - config.kv_heads = 16; - config.qkv_dim = 256; - return config; -} - -static ModelConfig ConfigGemma7B() { - ModelConfig config = ConfigBaseGemmaV1(); - config.model_name = "Gemma7B"; - config.model = Model::GEMMA_7B; - config.model_dim = 3072; - config.vocab_size = kVocabSize; - config.seq_len = kSeqLen; - LayerConfig layer_config = LayerConfigGemma7B(config.model_dim); - config.layer_configs = {28, layer_config}; - config.num_tensor_scales = 4 * config.layer_configs.size(); - config.query_scale = QueryScaleType::SqrtKeySize; - config.attention_window_sizes = FixedAttentionWindowSizes<28>(kSeqLen); - return config; -} - -static LayerConfig LayerConfigGemma2B(size_t model_dim) { - LayerConfig config; - config.model_dim = model_dim; - config.ff_hidden_dim = 16 * 2048 / 2; // = 16384 - config.heads = 8; - config.kv_heads = 1; - config.qkv_dim = 256; - return config; -} - -static ModelConfig ConfigGemma2B() { - ModelConfig config = ConfigBaseGemmaV1(); - config.model_name = "Gemma2B"; - config.model = Model::GEMMA_2B; - config.model_dim = 2048; - config.vocab_size = kVocabSize; - config.seq_len = kSeqLen; - LayerConfig layer_config = LayerConfigGemma2B(config.model_dim); - config.layer_configs = {18, layer_config}; - config.num_tensor_scales = 4 * config.layer_configs.size(); - config.attention_window_sizes = FixedAttentionWindowSizes<18>(kSeqLen); + RepeatedAttentionWindowSizes<26, 2>({4096, config.max_seq_len}); return config; } @@ -185,17 +145,18 @@ static LayerConfig LayerConfigGemmaTiny(size_t model_dim) { static ModelConfig ConfigGemmaTiny() { ModelConfig config = ConfigNoSSM(); - config.model_name = "GemmaTiny"; + config.display_name = "GemmaTiny"; config.model = Model::GEMMA_TINY; - config.model_dim = 128; - config.vocab_size = 64; - config.seq_len = 32; + config.wrapping = PromptWrapping::GEMMA_IT; + config.model_dim = 32; + config.vocab_size = 32; // at least two f32 vectors + config.max_seq_len = 32; LayerConfig layer_config = LayerConfigGemmaTiny(config.model_dim); - config.layer_configs = {3, layer_config}; - config.num_tensor_scales = 4 * config.layer_configs.size(); + config.num_layers = 2; + config.layer_configs = {config.num_layers, layer_config}; config.query_scale = QueryScaleType::SqrtKeySize; - config.attention_window_sizes = FixedAttentionWindowSizes<3>(32); - // This is required for optimize_test to pass. + config.attention_window_sizes = FixedAttentionWindowSizes<2>(32); + config.att_cap = 50.0f; config.final_cap = 30.0f; config.eos_id = 11; config.secondary_eos_id = 11; @@ -223,23 +184,23 @@ static LayerConfig LayerConfigGriffin2B(size_t model_dim) { static ModelConfig ConfigGriffin2B() { ModelConfig config = ConfigNoSSM(); - config.model_name = "Griffin2B"; + config.display_name = "Griffin2B"; config.model = Model::GRIFFIN_2B; - // Griffin uses local attention, so kSeqLen is actually the local attention - // window. + // Griffin uses local attention, so max_seq_len is actually the local + // attention window. 
config.model_dim = 2560; config.vocab_size = kVocabSize; - config.seq_len = 2048; + config.max_seq_len = 2048; LayerConfig layer_config = LayerConfigGriffin2B(config.model_dim); - config.layer_configs = {26, layer_config}; - for (size_t i = 2; i < config.layer_configs.size(); i += 3) { + config.num_layers = 26; + config.layer_configs = {config.num_layers, layer_config}; + for (size_t i = 2; i < config.num_layers; i += 3) { config.layer_configs[i].type = LayerAttentionType::kGemma; config.layer_configs[i].griffin_dim = 0; } - config.num_tensor_scales = 140; - config.attention_window_sizes = FixedAttentionWindowSizes<26>(config.seq_len); + config.attention_window_sizes = + FixedAttentionWindowSizes<26>(config.max_seq_len); config.use_local_attention = true; - // This is required for optimize_test to pass. config.final_cap = 0.0f; return config; } @@ -273,26 +234,10 @@ static void AddVitConfig(ModelConfig& config, size_t image_size = 224) { config.vit_config.num_scales = 4 * config.vit_config.layer_configs.size(); } -static ModelConfig ConfigPaliGemma_224() { - ModelConfig config = ConfigGemma2B(); - config.model_name = "PaliGemma_224"; - config.model = Model::PALIGEMMA_224; - AddVitConfig(config); - return config; -} - -static ModelConfig ConfigPaliGemma_448() { - ModelConfig config = ConfigGemma2B(); - config.model_name = "PaliGemma_448"; - config.model = Model::PALIGEMMA_448; - AddVitConfig(config, /*image_size=*/448); - return config; -} - ModelConfig GetVitConfig(const ModelConfig& config) { ModelConfig vit_config = ConfigNoSSM(); vit_config.model_dim = config.vit_config.model_dim; - vit_config.seq_len = config.vit_config.seq_len; + vit_config.max_seq_len = config.vit_config.seq_len; vit_config.layer_configs = config.vit_config.layer_configs; vit_config.pool_dim = config.vit_config.pool_dim; vit_config.wrapping = config.wrapping; @@ -303,32 +248,36 @@ ModelConfig GetVitConfig(const ModelConfig& config) { static ModelConfig ConfigPaliGemma2_3B_224() { ModelConfig config = ConfigGemma2_2B(); - config.model_name = "PaliGemma2_3B_224"; + config.display_name = "PaliGemma2_3B_224"; config.model = Model::PALIGEMMA2_3B_224; + config.wrapping = PromptWrapping::PALIGEMMA; AddVitConfig(config); return config; } static ModelConfig ConfigPaliGemma2_3B_448() { ModelConfig config = ConfigGemma2_2B(); - config.model_name = "PaliGemma2_3B_448"; + config.display_name = "PaliGemma2_3B_448"; config.model = Model::PALIGEMMA2_3B_448; + config.wrapping = PromptWrapping::PALIGEMMA; AddVitConfig(config, /*image_size=*/448); return config; } static ModelConfig ConfigPaliGemma2_10B_224() { ModelConfig config = ConfigGemma2_9B(); - config.model_name = "PaliGemma2_10B_224"; + config.display_name = "PaliGemma2_10B_224"; config.model = Model::PALIGEMMA2_10B_224; + config.wrapping = PromptWrapping::PALIGEMMA; AddVitConfig(config); return config; } static ModelConfig ConfigPaliGemma2_10B_448() { ModelConfig config = ConfigGemma2_9B(); - config.model_name = "PaliGemma2_10B_448"; + config.display_name = "PaliGemma2_10B_448"; config.model = Model::PALIGEMMA2_10B_448; + config.wrapping = PromptWrapping::PALIGEMMA; AddVitConfig(config, /*image_size=*/448); return config; } @@ -358,18 +307,19 @@ static LayerConfig LayerConfigGemma3_1B_LM(size_t model_dim) { static ModelConfig ConfigGemma3_1B() { ModelConfig config = ConfigBaseGemmaV3(); - config.model_name = "Gemma3_1B"; + config.display_name = "Gemma3_1B"; config.model = Model::GEMMA3_1B; + config.wrapping = PromptWrapping::GEMMA_VLM; config.model_dim = 1152; - 
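The loop above defines Griffin-2B's layer mix: all 26 layers start as recurrent blocks, then indices 2, 5, 8, ... are switched to regular Gemma attention, i.e. a repeating recurrent-recurrent-attention pattern. The same logic in isolation:

#include <cstddef>
#include <cstdio>
#include <vector>

enum class LayerType { kGriffinRecurrentBlock, kGemma };

int main() {
  const size_t num_layers = 26;
  std::vector<LayerType> types(num_layers, LayerType::kGriffinRecurrentBlock);
  for (size_t i = 2; i < num_layers; i += 3) types[i] = LayerType::kGemma;
  for (size_t i = 0; i < 6; ++i) {
    printf("layer %zu: %s\n", i,
           types[i] == LayerType::kGemma ? "attention" : "recurrent");
  }
  return 0;
}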
config.vocab_size = 262144; // new vocab size / tokenizer - config.seq_len = 32 * 1024; + config.vocab_size = kGemmaV3VocabSize; // new vocab size / tokenizer + config.max_seq_len = 32 * 1024; LayerConfig layer_config = LayerConfigGemma3_1B_LM(config.model_dim); - config.layer_configs = {26, layer_config}; - config.num_tensor_scales = 4 * config.layer_configs.size(); + config.num_layers = 26; + config.layer_configs = {config.num_layers, layer_config}; config.query_scale = QueryScaleType::SqrtKeySize; // interleaved local / global attention config.attention_window_sizes = RepeatedAttentionWindowSizes<26, 6>( - {512, 512, 512, 512, 512, config.seq_len}); + {512, 512, 512, 512, 512, config.max_seq_len}); return config; } @@ -389,27 +339,29 @@ static LayerConfig LayerConfigGemma3_4B_LM(size_t model_dim) { // Until we have the SigLIP checkpoints included, we use the LM config directly. static ModelConfig ConfigGemma3_4B_LM() { ModelConfig config = ConfigBaseGemmaV3(); - config.model_name = "Gemma3_4B"; + config.display_name = "Gemma3_4B"; config.model = Model::GEMMA3_4B; + config.wrapping = PromptWrapping::GEMMA_VLM; config.model_dim = 2560; - config.vocab_size = 262144; // new vocab size / tokenizer - config.seq_len = 32 * 1024; + config.vocab_size = kGemmaV3VocabSize; // new vocab size / tokenizer + config.max_seq_len = 32 * 1024; LayerConfig layer_config = LayerConfigGemma3_4B_LM(config.model_dim); - config.layer_configs = {34, layer_config}; - config.num_tensor_scales = 4 * config.layer_configs.size(); + config.num_layers = 34; + config.layer_configs = {config.num_layers, layer_config}; config.query_scale = QueryScaleType::SqrtKeySize; // interleaved local / global attention config.attention_window_sizes = RepeatedAttentionWindowSizes<34, 6>( - {1024, 1024, 1024, 1024, 1024, config.seq_len}); + {1024, 1024, 1024, 1024, 1024, config.max_seq_len}); return config; } static ModelConfig ConfigGemma3_4B() { ModelConfig config = ConfigGemma3_4B_LM(); - config.model_name = "Gemma3_4B"; + config.display_name = "Gemma3_4B"; config.model = Model::GEMMA3_4B; + config.wrapping = PromptWrapping::GEMMA_VLM; AddVitConfig(config, /*image_size=*/896); - config.vocab_size = 262144; + config.vocab_size = kGemmaV3VocabSize; config.vit_config.pool_dim = 4; const size_t num_patches = config.vit_config.image_size / config.vit_config.patch_width; @@ -436,27 +388,29 @@ static LayerConfig LayerConfigGemma3_12B_LM(size_t model_dim) { static ModelConfig ConfigGemma3_12B_LM() { ModelConfig config = ConfigBaseGemmaV3(); - config.model_name = "Gemma3_12B"; + config.display_name = "Gemma3_12B"; config.model = Model::GEMMA3_12B; + config.wrapping = PromptWrapping::GEMMA_VLM; config.model_dim = 3840; - config.vocab_size = 262144; // new vocab size / tokenizer - config.seq_len = 32 * 1024; + config.vocab_size = kGemmaV3VocabSize; // new vocab size / tokenizer + config.max_seq_len = 32 * 1024; LayerConfig layer_config = LayerConfigGemma3_12B_LM(config.model_dim); - config.layer_configs = {48, layer_config}; - config.num_tensor_scales = 4 * config.layer_configs.size(); + config.num_layers = 48; + config.layer_configs = {config.num_layers, layer_config}; config.query_scale = QueryScaleType::SqrtKeySize; // interleaved local / global attention config.attention_window_sizes = RepeatedAttentionWindowSizes<48, 6>( - {1024, 1024, 1024, 1024, 1024, config.seq_len}); + {1024, 1024, 1024, 1024, 1024, config.max_seq_len}); return config; } static ModelConfig ConfigGemma3_12B() { ModelConfig config = ConfigGemma3_12B_LM(); - 
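A back-of-the-envelope check of the ViT geometry configured above, assuming patch_width remains 14 (the SoViT value from the removed test configs later in this diff): an 896x896 image yields (896/14)^2 = 4096 patches, and pool_dim = 4 pools 4x4 patch groups down to 256 image tokens for the LM.

#include <cstddef>
#include <cstdio>

int main() {
  const size_t image_size = 896, patch_width = 14, pool_dim = 4;
  const size_t patches_per_side = image_size / patch_width;         // 64
  const size_t vit_seq_len = patches_per_side * patches_per_side;   // 4096
  const size_t image_tokens = vit_seq_len / (pool_dim * pool_dim);  // 256
  printf("patches/side=%zu vit_seq_len=%zu image_tokens=%zu\n",
         patches_per_side, vit_seq_len, image_tokens);
  return 0;
}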
config.model_name = "Gemma3_12B"; + config.display_name = "Gemma3_12B"; config.model = Model::GEMMA3_12B; + config.wrapping = PromptWrapping::GEMMA_VLM; AddVitConfig(config, /*image_size=*/896); - config.vocab_size = 262144; + config.vocab_size = kGemmaV3VocabSize; config.vit_config.pool_dim = 4; const size_t num_patches = config.vit_config.image_size / config.vit_config.patch_width; @@ -483,27 +437,29 @@ static LayerConfig LayerConfigGemma3_27B_LM(size_t model_dim) { static ModelConfig ConfigGemma3_27B_LM() { ModelConfig config = ConfigBaseGemmaV3(); - config.model_name = "Gemma3_27B"; + config.display_name = "Gemma3_27B"; config.model = Model::GEMMA3_27B; + config.wrapping = PromptWrapping::GEMMA_VLM; config.model_dim = 5376; - config.vocab_size = 262144; // new vocab size / tokenizer - config.seq_len = 32 * 1024; + config.vocab_size = kGemmaV3VocabSize; // new vocab size / tokenizer + config.max_seq_len = 32 * 1024; LayerConfig layer_config = LayerConfigGemma3_27B_LM(config.model_dim); - config.layer_configs = {62, layer_config}; - config.num_tensor_scales = 4 * config.layer_configs.size(); + config.num_layers = 62; + config.layer_configs = {config.num_layers, layer_config}; config.query_scale = QueryScaleType::SqrtKeySize; // interleaved local / global attention config.attention_window_sizes = RepeatedAttentionWindowSizes<62, 6>( - {1024, 1024, 1024, 1024, 1024, config.seq_len}); + {1024, 1024, 1024, 1024, 1024, config.max_seq_len}); return config; } static ModelConfig ConfigGemma3_27B() { ModelConfig config = ConfigGemma3_27B_LM(); - config.model_name = "Gemma3_27B"; + config.display_name = "Gemma3_27B"; config.model = Model::GEMMA3_27B; + config.wrapping = PromptWrapping::GEMMA_VLM; AddVitConfig(config, /*image_size=*/896); - config.vocab_size = 262144; + config.vocab_size = kGemmaV3VocabSize; config.vit_config.pool_dim = 4; const size_t num_patches = config.vit_config.image_size / config.vit_config.patch_width; @@ -515,12 +471,8 @@ static ModelConfig ConfigGemma3_27B() { return config; } -ModelConfig ConfigFromModel(Model model) { +static ModelConfig ConfigFromModel(Model model) { switch (model) { - case Model::GEMMA_2B: - return ConfigGemma2B(); - case Model::GEMMA_7B: - return ConfigGemma7B(); case Model::GEMMA2_2B: return ConfigGemma2_2B(); case Model::GEMMA2_9B: @@ -531,10 +483,6 @@ ModelConfig ConfigFromModel(Model model) { return ConfigGriffin2B(); case Model::GEMMA_TINY: return ConfigGemmaTiny(); - case Model::PALIGEMMA_224: - return ConfigPaliGemma_224(); - case Model::PALIGEMMA_448: - return ConfigPaliGemma_448(); case Model::PALIGEMMA2_3B_224: return ConfigPaliGemma2_3B_224(); case Model::PALIGEMMA2_3B_448: @@ -556,124 +504,249 @@ ModelConfig ConfigFromModel(Model model) { } } -#define TEST_EQUAL(a, b) \ - if (a != b) { \ - if (debug) \ - std::cerr << #a << "=" << a << " != " << #b << "=" << b << "\n"; \ - result = false; \ +const char* ModelPrefix(Model model) { + switch (model) { + case Model::UNKNOWN: + return "unknown"; + case Model::GEMMA2_2B: + return "gemma2-2b"; + case Model::GEMMA2_9B: + return "9b"; + case Model::GEMMA2_27B: + return "27b"; + case Model::GRIFFIN_2B: + return "gr2b"; + case Model::GEMMA_TINY: + return "tiny"; + case Model::PALIGEMMA2_3B_224: + return "paligemma2-3b-224"; + case Model::PALIGEMMA2_3B_448: + return "paligemma2-3b-448"; + case Model::PALIGEMMA2_10B_224: + return "paligemma2-10b-224"; + case Model::PALIGEMMA2_10B_448: + return "paligemma2-10b-448"; + case Model::GEMMA3_4B: + return "gemma3-4b"; + case Model::GEMMA3_1B: + return 
"gemma3-1b"; + case Model::GEMMA3_12B: + return "gemma3-12b"; + case Model::GEMMA3_27B: + return "gemma3-27b"; + default: + HWY_ABORT("Model type %d unknown.", static_cast(model)); } - -#define RETURN_IF_NOT_EQUAL(a, b) \ - if (a != b) { \ - if (debug) \ - std::cerr << #a << "=" << a << " != " << #b << "=" << b << "\n"; \ - return false; \ - } - -#define WARN_IF_NOT_EQUAL(a, b) \ - if (a != b) { \ - std::cerr << #a << "=" << a << " != " << #b << "=" << b << "\n"; \ - } - -bool LayerConfig::TestEqual(const LayerConfig& other, bool partial, - bool debug) const { - bool result = true; - // Optimized gating may not be set correctly in the c++ configs. - if (debug) { - WARN_IF_NOT_EQUAL(optimized_gating, other.optimized_gating) - } - TEST_EQUAL(model_dim, other.model_dim); - TEST_EQUAL(griffin_dim, other.griffin_dim); - TEST_EQUAL(ff_hidden_dim, other.ff_hidden_dim); - TEST_EQUAL(heads, other.heads); - TEST_EQUAL(kv_heads, other.kv_heads); - TEST_EQUAL(qkv_dim, other.qkv_dim); - TEST_EQUAL(conv1d_width, other.conv1d_width); - if (!partial) { - TEST_EQUAL(ff_biases, other.ff_biases); - TEST_EQUAL(softmax_attn_output_biases, other.softmax_attn_output_biases); - } - TEST_EQUAL(static_cast(post_norm), static_cast(other.post_norm)); - TEST_EQUAL(static_cast(type), static_cast(other.type)); - TEST_EQUAL(static_cast(activation), static_cast(other.activation)); - TEST_EQUAL(static_cast(post_qk), static_cast(other.post_qk)); - return result; } -bool VitConfig::TestEqual(const VitConfig& other, bool partial, - bool debug) const { - bool result = true; - TEST_EQUAL(model_dim, other.model_dim); - TEST_EQUAL(seq_len, other.seq_len); - if (!partial) { - TEST_EQUAL(num_scales, other.num_scales); +PromptWrapping ChooseWrapping(const Model model, Tristate wrapping) { + if (IsPaliGemma(model)) { + if (wrapping != Tristate::kDefault) { + HWY_WARN("Ignoring unnecessary --wrapping for PaliGemma models."); + } + return PromptWrapping::PALIGEMMA; } - TEST_EQUAL(patch_width, other.patch_width); - TEST_EQUAL(image_size, other.image_size); - RETURN_IF_NOT_EQUAL(layer_configs.size(), other.layer_configs.size()); - for (size_t i = 0; i < layer_configs.size(); ++i) { - result &= - layer_configs[i].TestEqual(other.layer_configs[i], partial, debug); + if (IsVLM(model)) { + if (wrapping != Tristate::kDefault) { + HWY_WARN("Ignoring unnecessary --wrapping for VLM models."); + } + return PromptWrapping::GEMMA_VLM; } - return result; + // Default to IT unless --wrapping=0. + return wrapping == Tristate::kFalse ? PromptWrapping::GEMMA_PT + : PromptWrapping::GEMMA_IT; } -bool ModelConfig::TestEqual(const ModelConfig& other, bool partial, - bool debug) const { - bool result = true; - TEST_EQUAL(model_family_version, other.model_family_version); - // We don't care about model_name, model, wrapping, or weight being different, - // but will output in debug mode if they are. 
- if (debug) { - WARN_IF_NOT_EQUAL(model_name, other.model_name); - WARN_IF_NOT_EQUAL(static_cast(model), static_cast(other.model)); - WARN_IF_NOT_EQUAL(static_cast(wrapping), - static_cast(other.wrapping)); - WARN_IF_NOT_EQUAL(static_cast(weight), static_cast(other.weight)); +ModelConfig::ModelConfig(const Model model, Type weight, + PromptWrapping wrapping) { + HWY_ASSERT(weight != Type::kUnknown); + HWY_ASSERT(wrapping != PromptWrapping::kSentinel); + this->model = model; + if (model != Model::UNKNOWN) *this = ConfigFromModel(model); + HWY_ASSERT(this->model == model); + this->weight = weight; + this->wrapping = wrapping; +} + +static Model FindModel(const std::string& specifier) { + Model found_model = Model::UNKNOWN; + ForEachModel([&](Model model) { + // Some model names are prefixes of other model names + const std::string prefix = std::string(ModelPrefix(model)) + "-"; + if (specifier.rfind(prefix, 0) == 0) { // Starts with prefix. + // We only expect one match. + HWY_ASSERT_M(found_model == Model::UNKNOWN, specifier.c_str()); + found_model = model; + } + }); + HWY_ASSERT_M(found_model != Model::UNKNOWN, specifier.c_str()); + return found_model; +} + +static Type FindType(const std::string& specifier) { + Type found_type = Type::kUnknown; + for (size_t i = 1; i < kNumTypes; ++i) { + const Type type = static_cast(i); + if (specifier.find(TypeName(type)) != std::string::npos) { // NOLINT + // We only expect one match. + HWY_ASSERT_M(found_type == Type::kUnknown, specifier.c_str()); + found_type = type; + } } - TEST_EQUAL(model_dim, other.model_dim); - TEST_EQUAL(vocab_size, other.vocab_size); - TEST_EQUAL(seq_len, other.seq_len); - if (!partial) { - TEST_EQUAL(num_tensor_scales, other.num_tensor_scales); + HWY_ASSERT_M(found_type != Type::kUnknown, specifier.c_str()); + return found_type; +} + +static PromptWrapping FindWrapping(const std::string& specifier) { + PromptWrapping found_wrapping = PromptWrapping::kSentinel; + for (size_t i = 0; i < static_cast(PromptWrapping::kSentinel); ++i) { + const PromptWrapping w = static_cast(i); + if (specifier.find(WrappingSuffix(w)) != std::string::npos) { // NOLINT + // We expect zero or one match. + HWY_ASSERT_M(found_wrapping == PromptWrapping::kSentinel, + specifier.c_str()); + found_wrapping = w; + } } - TEST_EQUAL(att_cap, other.att_cap); - TEST_EQUAL(final_cap, other.final_cap); - TEST_EQUAL(absolute_pe, other.absolute_pe); - TEST_EQUAL(use_local_attention, other.use_local_attention); - TEST_EQUAL(static_cast(query_scale), - static_cast(other.query_scale)); - RETURN_IF_NOT_EQUAL(layer_configs.size(), other.layer_configs.size()); - for (size_t i = 0; i < layer_configs.size(); ++i) { - result &= - layer_configs[i].TestEqual(other.layer_configs[i], partial, debug); + if (found_wrapping == PromptWrapping::kSentinel) { + return ChooseWrapping(FindModel(specifier)); } - RETURN_IF_NOT_EQUAL(attention_window_sizes.size(), - other.attention_window_sizes.size()); - for (size_t i = 0; i < attention_window_sizes.size(); ++i) { - TEST_EQUAL(attention_window_sizes[i], other.attention_window_sizes[i]); + return found_wrapping; +} + +// Obtains model/weight/wrapping by finding prefix and suffix strings. 
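Concretely, the matching above operates on strings like "gemma3-4b-sfp": FindModel requires the specifier to start with a model prefix plus '-' (since some prefixes are prefixes of others), while FindType looks for a type name anywhere in the string. A toy version with hypothetical tables; the real names come from ModelPrefix and TypeName, and the real code asserts exactly one match:

#include <cstdio>
#include <string>
#include <vector>

int main() {
  const std::string specifier = "gemma3-4b-sfp";
  const std::vector<std::string> prefixes = {"gemma2-2b", "gemma3-1b",
                                             "gemma3-4b"};
  const std::vector<std::string> types = {"f32", "bf16", "sfp"};
  for (const std::string& p : prefixes) {
    if (specifier.rfind(p + "-", 0) == 0) printf("model: %s\n", p.c_str());
  }
  for (const std::string& t : types) {
    if (specifier.find(t) != std::string::npos) printf("type: %s\n", t.c_str());
  }
  return 0;
}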
+ModelConfig::ModelConfig(const std::string& specifier) + : ModelConfig(FindModel(specifier), FindType(specifier), + FindWrapping(specifier)) {} + +std::string ModelConfig::Specifier() const { + HWY_ASSERT(model != Model::UNKNOWN); + HWY_ASSERT(weight != Type::kUnknown); + HWY_ASSERT(wrapping != PromptWrapping::kSentinel); + + std::string base_name = ModelPrefix(model); + + base_name += '-'; + base_name += TypeName(weight); + + if (wrapping != PromptWrapping::GEMMA_VLM && + wrapping != PromptWrapping::PALIGEMMA) { + base_name += WrappingSuffix(wrapping); } - if (!partial) { - if (scale_names != other.scale_names) { - result = false; - if (debug) { - std::cerr << "scale_names mismatch\n"; + + return base_name; +} + +// Returns whether all fields match. +static bool AllEqual(const IFields& a, const IFields& b, bool print) { + const std::vector serialized_a = a.Write(); + const std::vector serialized_b = b.Write(); + if (serialized_a != serialized_b) { + if (print) { + fprintf(stderr, "%s differs. Recommend generating a diff:\n", a.Name()); + a.Print(); + b.Print(); + } + return false; + } + return true; +} + +bool LayerConfig::TestEqual(const LayerConfig& other, bool print) const { + return AllEqual(*this, other, print); +} + +bool VitConfig::TestEqual(const VitConfig& other, bool print) const { + return AllEqual(*this, other, print); +} + +bool ModelConfig::TestEqual(const ModelConfig& other, bool print) const { + // Early out to guard the loop below; a differing number of layers will anyway + // cause a mismatch. + if (layer_configs.size() != other.layer_configs.size()) { + if (print) { + HWY_WARN("Layer configs size mismatch %zu vs %zu", layer_configs.size(), + other.layer_configs.size()); + } + return false; + } + + // Copy so we can 'ignore' fields by setting them to the same value. + ModelConfig a = *this; + ModelConfig b = other; + // Called by `OverwriteWithCanonical`, so ignore the fields it will set. + a.display_name = b.display_name; + a.model = b.model; + + // The following are not yet set by config_converter.py, so we here ignore + // them for purposes of comparison, and there overwrite the converter's config + // with the canonical ModelConfig constructed via (deduced) enum, so that + // these fields will be set. + // `vit_config` is also not yet set, but we must not ignore it because + // otherwise PaliGemma models will be indistinguishable for `configs_test`. + a.pool_dim = b.pool_dim; // ViT + a.eos_id = b.eos_id; + a.secondary_eos_id = b.secondary_eos_id; + a.scale_base_names = b.scale_base_names; + for (size_t i = 0; i < a.layer_configs.size(); ++i) { + a.layer_configs[i].optimized_gating = b.layer_configs[i].optimized_gating; + } + + return AllEqual(a, b, print); +} + +// Constructs the canonical ModelConfig for each model. If there is one for +// which TestEqual returns true, overwrites `*this` with that and returns true. +bool ModelConfig::OverwriteWithCanonical() { + bool found = false; + const bool print = false; + ForEachModel([&](Model model) { + const ModelConfig config(model, weight, wrapping); + if (config.TestEqual(*this, print)) { + HWY_ASSERT(!found); // Should only find one. 
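AllEqual above defines equality as "serializes to the same bytes", so a field appended later participates in comparisons automatically, with no comparison code to update. The same idea with a toy stand-in for IFields:

#include <cstdint>
#include <cstdio>
#include <vector>

struct Toy {
  uint32_t dim = 0, layers = 0;
  std::vector<uint32_t> Write() const { return {dim, layers}; }
};

bool AllEqual(const Toy& a, const Toy& b) { return a.Write() == b.Write(); }

int main() {
  const Toy a{2304, 26}, b{2304, 26}, c{3584, 42};
  printf("%d %d\n", AllEqual(a, b), AllEqual(a, c));  // 1 0
  return 0;
}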
+ found = true; + *this = config; + } + }); + return found; +} + +Model DeduceModel(const Path& blob_path, size_t layers, int layer_types) { + switch (layers) { + case 2: + return Model::GEMMA_TINY; + case 26: + if (layer_types & kDeducedGriffin) return Model::GRIFFIN_2B; + if (layer_types & kDeducedViT) return Model::GEMMA3_1B; + return Model::GEMMA2_2B; + case 27: + return (layer_types & kDeduced448) ? Model::PALIGEMMA2_3B_448 + : Model::PALIGEMMA2_3B_224; + case 34: + return Model::GEMMA3_4B; + case 42: + if (layer_types & kDeducedViT) { + return (layer_types & kDeduced448) ? Model::PALIGEMMA2_10B_448 + : Model::PALIGEMMA2_10B_224; } - } - } - TEST_EQUAL(norm_num_groups, other.norm_num_groups); - result &= vit_config.TestEqual(other.vit_config, partial, debug); - return result; -} + return Model::GEMMA2_9B; + case 46: + return Model::GEMMA2_27B; + case 48: + return Model::GEMMA3_12B; + case 62: + return Model::GEMMA3_27B; -Model ModelFromConfig(const ModelConfig& config) { - for (Model model : kAllModels) { - ModelConfig model_config = ConfigFromModel(model); - if (config.TestEqual(model_config, /*partial=*/true, /*debug=*/false)) { - return model; - } + // TODO: detect these. + /* + return Model::GEMMA2_772M; + return Model::PALIGEMMA2_772M_224; + */ + default: + HWY_WARN("Failed to deduce model type from %s, layer count %zu types %x.", + blob_path.path.c_str(), layers, layer_types); + return Model::UNKNOWN; } - return Model::UNKNOWN; } } // namespace gcpp diff --git a/gemma/configs.h b/gemma/configs.h index 837e067..19e6278 100644 --- a/gemma/configs.h +++ b/gemma/configs.h @@ -19,35 +19,52 @@ // Model configurations #include +#include -#include #include -#include #include -#include #include -#include "compression/fields.h" // IFieldsVisitor -#include "compression/shared.h" // BF16 +#include "compression/types.h" // Type +#include "io/fields.h" // IFieldsVisitor +#include "io/io.h" // Path +#include "util/basics.h" namespace gcpp { -// Allow changing pre-allocated kv cache size as a compiler flag -#ifndef GEMMA_MAX_SEQLEN -#define GEMMA_MAX_SEQLEN 4096 -#endif // !GEMMA_MAX_SEQLEN - -// Allow changing k parameter of `SampleTopK` as a compiler flag -#ifndef GEMMA_TOPK -#define GEMMA_TOPK 1 -#endif // !GEMMA_TOPK - -static constexpr size_t kSeqLen = GEMMA_MAX_SEQLEN; -static constexpr size_t kTopK = GEMMA_TOPK; -static constexpr size_t kVocabSize = 256000; static constexpr size_t kMaxConv1DWidth = 4; +static constexpr size_t kMaxQKVDim = 1024; -using EmbedderInputT = BF16; +// Instruction-tuned models require extra 'turn structure' tokens in prompts. +enum class PromptWrapping { + GEMMA_IT, + GEMMA_PT, + GEMMA_VLM, // for >1B Gemma3 + PALIGEMMA, + kSentinel // must be last +}; + +// This is used in `ModelConfig.Specifier`, so the strings will not change, +// though new ones may be added. 
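A worked example of the layer-count deduction in DeduceModel above, restricted to the ambiguous 26-layer case; the enum mirrors the DeducedLayerTypes bits declared in gemma/configs.h:

#include <cstdio>

enum { kDeducedGriffin = 1, kDeducedViT = 2, kDeduced448 = 4 };

// 26 layers alone could be Griffin-2B, Gemma3-1B, or Gemma2-2B; the tensor
// types present in the blob disambiguate, as in DeduceModel's case 26.
const char* Deduce26(int layer_types) {
  if (layer_types & kDeducedGriffin) return "GRIFFIN_2B";
  if (layer_types & kDeducedViT) return "GEMMA3_1B";
  return "GEMMA2_2B";
}

int main() {
  printf("%s\n", Deduce26(0));                // GEMMA2_2B
  printf("%s\n", Deduce26(kDeducedViT));      // GEMMA3_1B
  printf("%s\n", Deduce26(kDeducedGriffin));  // GRIFFIN_2B
  return 0;
}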
+static inline const char* WrappingSuffix(PromptWrapping wrapping) { + switch (wrapping) { + case PromptWrapping::GEMMA_IT: + return "-it"; + case PromptWrapping::GEMMA_PT: + return "-pt"; + case PromptWrapping::GEMMA_VLM: + return "-vlm"; + case PromptWrapping::PALIGEMMA: + return "-pg"; + default: + return "-?"; + } +} + +static inline bool EnumValid(PromptWrapping wrapping) { + return static_cast<size_t>(wrapping) < + static_cast<size_t>(PromptWrapping::kSentinel); +} enum class LayerAttentionType { kGemma, @@ -55,63 +72,68 @@ enum class LayerAttentionType { kVit, }; -inline bool EnumValid(LayerAttentionType type) { - return static_cast<int>(type) >= 0 && - static_cast<int>(type) <= static_cast<int>(LayerAttentionType::kVit); +static inline bool EnumValid(LayerAttentionType type) { + return type == LayerAttentionType::kGemma || + type == LayerAttentionType::kGriffinRecurrentBlock || + type == LayerAttentionType::kVit; } // Post attention and ffw normalization type. enum class PostNormType { None, Scale, + kSentinel // must be last }; -inline bool EnumValid(PostNormType type) { - return static_cast<int>(type) >= 0 && - static_cast<int>(type) <= static_cast<int>(PostNormType::Scale); +static inline bool EnumValid(PostNormType type) { + return static_cast<size_t>(type) < + static_cast<size_t>(PostNormType::kSentinel); } // Post qk projection operation type. enum class PostQKType { Rope, HalfRope, + kSentinel // must be last }; -inline bool EnumValid(PostQKType type) { - return static_cast<int>(type) >= 0 && - static_cast<int>(type) <= static_cast<int>(PostQKType::HalfRope); +static inline bool EnumValid(PostQKType type) { + return static_cast<size_t>(type) < + static_cast<size_t>(PostQKType::kSentinel); } // FFW activation function. enum class ActivationType { Gelu, + kSentinel // must be last }; -inline bool EnumValid(ActivationType type) { - return static_cast<int>(type) >= 0 && - static_cast<int>(type) <= static_cast<int>(ActivationType::Gelu); +static inline bool EnumValid(ActivationType type) { + return static_cast<size_t>(type) < + static_cast<size_t>(ActivationType::kSentinel); } // Attention query scale. enum class QueryScaleType { SqrtKeySize, SqrtModelDimDivNumHeads, + kSentinel // must be last }; -inline bool EnumValid(QueryScaleType type) { - return static_cast<int>(type) >= 0 && - static_cast<int>(type) <= - static_cast<int>(QueryScaleType::SqrtModelDimDivNumHeads); +static inline bool EnumValid(QueryScaleType type) { + return static_cast<size_t>(type) < + static_cast<size_t>(QueryScaleType::kSentinel); } // Residual connection type. enum class ResidualType { Add, + kSentinel // must be last }; -inline bool EnumValid(ResidualType type) { - return static_cast<int>(type) >= 0 && - static_cast<int>(type) <= static_cast<int>(ResidualType::Add); +static inline bool EnumValid(ResidualType type) { + return static_cast<size_t>(type) < + static_cast<size_t>(ResidualType::kSentinel); } template @@ -137,17 +159,15 @@ std::vector RepeatedAttentionWindowSizes( // Model variants: see configs.cc for details. enum class Model { - UNKNOWN, - GEMMA_2B, - GEMMA_7B, - GEMMA2_9B, + UNKNOWN = 0, + // 1 and 2 are obsolete. + GEMMA2_9B = 3, GEMMA2_27B, GRIFFIN_2B, - GEMMA_TINY, + GEMMA_TINY, // for testing only GEMMA2_2B, - PALIGEMMA_224, - PALIGEMMA_448, - PALIGEMMA2_3B_224, + // 8 and 9 are obsolete. + PALIGEMMA2_3B_224 = 10, PALIGEMMA2_3B_448, PALIGEMMA2_10B_224, PALIGEMMA2_10B_448, @@ -155,43 +175,64 @@ enum class Model { GEMMA3_1B, GEMMA3_12B, GEMMA3_27B, + kSentinel, }; -// Allows the Model enum to be iterated over.
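The kSentinel convention introduced above, in miniature: keeping the sentinel as the last enumerator lets one comparison validate the entire enum, and it stays correct when new values are added before the sentinel:

#include <cstddef>
#include <cstdio>

enum class Wrapping { kIT, kPT, kVLM, kPaliGemma, kSentinel /* must be last */ };

static bool EnumValid(Wrapping w) {
  return static_cast<size_t>(w) < static_cast<size_t>(Wrapping::kSentinel);
}

int main() {
  printf("%d %d\n", EnumValid(Wrapping::kVLM),
         EnumValid(static_cast<Wrapping>(99)));  // 1 0
  return 0;
}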
-static constexpr Model kAllModels[] = { - Model::GEMMA_2B, Model::GEMMA_7B, Model::GEMMA2_9B, Model::GEMMA2_27B, - Model::GRIFFIN_2B, Model::GEMMA_TINY, Model::GEMMA2_2B, - Model::PALIGEMMA_224, Model::PALIGEMMA_448, Model::PALIGEMMA2_3B_224, - Model::PALIGEMMA2_3B_448, Model::PALIGEMMA2_10B_224, - Model::PALIGEMMA2_10B_448, Model::GEMMA3_4B, Model::GEMMA3_1B, - Model::GEMMA3_12B, Model::GEMMA3_27B, -}; +// Returns canonical model name without the PromptWrapping suffix. This is used +// in Specifier and thus does not change. +const char* ModelPrefix(Model model); -inline bool EnumValid(Model model) { - for (Model m : kAllModels) { - if (m == model) return true; +// Gemma3 is multimodal and has a different prompt wrapping than PaliGemma. +// This is used for deducing the PromptWrapping for pre-2025 BlobStore. +static inline bool IsVLM(Model model) { + return model == Model::GEMMA3_4B || model == Model::GEMMA3_1B || + model == Model::GEMMA3_12B || model == Model::GEMMA3_27B; +} + +static inline bool IsPaliGemma(Model model) { + if (model == Model::PALIGEMMA2_3B_224 || model == Model::PALIGEMMA2_3B_448 || + model == Model::PALIGEMMA2_10B_224 || + model == Model::PALIGEMMA2_10B_448) { + return true; } return false; } +// Visits every valid model enum, skipping `UNKNOWN` and `kSentinel`. +template +void ForEachModel(const Func& func) { + for (size_t i = static_cast(Model::GEMMA2_9B); + i < static_cast(Model::kSentinel); ++i) { + if (i == 8 || i == 9) continue; + func(static_cast(i)); + } +} + +static inline bool EnumValid(Model model) { + // Valid for purposes of serialization, even if unknown. + if (model == Model::UNKNOWN) return true; + const size_t i = static_cast(model); + if (i >= static_cast(Model::GEMMA2_9B) && + i < static_cast(Model::kSentinel) && i != 8 && i != 9) { + return true; + } + return false; +} + +struct InternalLayerConfig : public IFields { + const char* Name() const override { return "InternalLayerConfig"; } + + // Source of truth for field ordering. + void VisitFields(IFieldsVisitor& visitor) override { + // Append new fields here, then update `python/configs.cc`. + } +}; + +// Per-layer configuration. struct LayerConfig : public IFields { - // Returns true if *this and other are equal. - // If partial is true, then we don't check for items that are only set after - // the tensors are loaded from the checkpoint. - // If debug is true, then we output the mismatched fields to stderr. - bool TestEqual(const LayerConfig& other, bool partial, bool debug) const; - - size_t CacheLayerSize() const { return kv_heads * qkv_dim * 2; } - - // Multi-Head Attention? - bool IsMHA() const { return heads == kv_heads; } - - // Stride between subsequent queries. Each of Q, K, V are of length kQKVDim, - // but for MHA we store them as Q,K,V, Q,K,V, .. instead of Q..Q, K..K, V..V. - size_t QStride() const { return qkv_dim * (IsMHA() ? 3 : 1); } - const char* Name() const override { return "LayerConfig"; } + // Source of truth for field ordering. void VisitFields(IFieldsVisitor& visitor) override { visitor(model_dim); visitor(griffin_dim); @@ -208,35 +249,41 @@ struct LayerConfig : public IFields { visitor(activation); visitor(post_qk); visitor(use_qk_norm); + internal.VisitFields(visitor); + // Append new fields here, then update `python/configs.cc`. } + // Returns whether all fields match. + bool TestEqual(const LayerConfig& other, bool print) const; + + size_t CacheLayerSize() const { return kv_heads * qkv_dim * 2; } + + // Multi-Head Attention? 
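ForEachModel above walks the enum numerically, starting at the first live value, skipping the retired slots 8 and 9, and stopping before kSentinel, so retired values are never visited. The pattern with a hypothetical enum:

#include <cstdio>

enum class Model { kUnknown = 0, kA = 3, kB, kC, kD, kE, /* 8, 9 retired */ kF = 10, kSentinel };

template <class Func>
void ForEach(const Func& func) {
  for (int i = static_cast<int>(Model::kA);
       i < static_cast<int>(Model::kSentinel); ++i) {
    if (i == 8 || i == 9) continue;  // retired values stay retired
    func(static_cast<Model>(i));
  }
}

int main() {
  ForEach([](Model m) { printf("%d ", static_cast<int>(m)); });  // 3 4 5 6 7 10
  printf("\n");
  return 0;
}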
+ bool IsMHA() const { return heads == kv_heads; } + uint32_t model_dim = 0; uint32_t griffin_dim = 0; uint32_t ff_hidden_dim = 0; uint32_t heads = 0; uint32_t kv_heads = 0; - uint32_t qkv_dim = 0; - uint32_t conv1d_width = 0; // griffin only + uint32_t qkv_dim = 0; // length of Q, K, V vectors (contiguous). + uint32_t conv1d_width = 0; // Griffin only bool ff_biases = false; - bool softmax_attn_output_biases = false; - bool optimized_gating = true; + bool softmax_attn_output_biases = false; // for Griffin + bool optimized_gating = true; // for Gemma3 PostNormType post_norm = PostNormType::None; LayerAttentionType type = LayerAttentionType::kGemma; ActivationType activation = ActivationType::Gelu; PostQKType post_qk = PostQKType::Rope; bool use_qk_norm = false; + InternalLayerConfig internal; }; // Dimensions related to image processing. struct VitConfig : public IFields { - // Returns true if *this and other are equal. - // If partial is true, then we don't check for items that are only set after - // the tensors are loaded from the checkpoint. - // If debug is true, then we output the mismatched fields to stderr. - bool TestEqual(const VitConfig& other, bool partial, bool debug) const; - const char* Name() const override { return "VitConfig"; } + // Source of truth for field ordering. void VisitFields(IFieldsVisitor& visitor) override { visitor(model_dim); visitor(seq_len); @@ -245,8 +292,12 @@ struct VitConfig : public IFields { visitor(image_size); visitor(layer_configs); visitor(pool_dim); + // Append new fields here, then update `python/configs.cc`. } + // Returns whether all fields match. + bool TestEqual(const VitConfig& other, bool print) const; + uint32_t model_dim = 0; uint32_t seq_len = 0; uint32_t num_scales = 0; @@ -256,20 +307,96 @@ struct VitConfig : public IFields { std::vector layer_configs; }; +// Returns a valid `PromptWrapping` for the given `model`, for passing to the +// `ModelConfig` ctor when the caller does not care about the wrapping. The +// wrapping mode is either determined by the model (for PaliGemma and Gemma3), +// or defaults to IT, subject to user override for PT. +PromptWrapping ChooseWrapping(Model model, + Tristate wrapping = Tristate::kDefault); + +struct InternalModelConfig : public IFields { + const char* Name() const override { return "InternalModelConfig"; } + + // Source of truth for field ordering. + void VisitFields(IFieldsVisitor& visitor) override { + // Append new fields here, then update `python/configs.cc`. + } +}; + struct ModelConfig : public IFields { - // Returns true if *this and other are equal. - // If partial is true, then we don't check for items that are only set after - // the tensors are loaded from the checkpoint. - // If debug is true, then we output the mismatched fields to stderr. - bool TestEqual(const ModelConfig& other, bool partial, bool debug) const; + // Preferred usage (single-file format): default-construct, then deserialize + // from a blob. Also used by `config_converter.py`, which sets sufficient + // fields for `TestEqual` and then calls `OverwriteWithCanonical()`. + ModelConfig() = default; + // For use by `model_store.cc` for pre-2025 format after deducing the model + // from tensors plus a user-specified `wrapping` override (`ChooseWrapping`). + ModelConfig(Model model, Type weight, PromptWrapping wrapping); + // Parses a string returned by `Specifier()`. Used by the exporter to select + // the model from command line arguments. 
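A worked example of CacheLayerSize above, which charges kv_heads * qkv_dim * 2 values (K plus V) per cached token per layer. Using the Gemma2-2B layer shape visible in the removed test configs further down (kv_heads = 4, qkv_dim = 256, 26 layers):

#include <cstddef>
#include <cstdio>

int main() {
  const size_t kv_heads = 4, qkv_dim = 256, num_layers = 26;
  const size_t cache_layer_size = kv_heads * qkv_dim * 2;  // 2048: K and V
  printf("per layer: %zu values, per position: %zu values\n",
         cache_layer_size, num_layers * cache_layer_size);  // 2048, 53248
  return 0;
}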
Do not use this elsewhere - the + // second ctor is preferred because it is type-checked. + ModelConfig(const std::string& specifier); + + const char* Name() const override { return "ModelConfig"; } + + // Source of truth for field ordering. + void VisitFields(IFieldsVisitor& visitor) override { + visitor(model_family_version); + visitor(display_name); + visitor(model); + visitor(wrapping); + visitor(weight); + + visitor(num_layers); + visitor(model_dim); + visitor(vocab_size); + visitor(max_seq_len); + + visitor(unused_num_tensor_scales); + + visitor(att_cap); + visitor(final_cap); + + visitor(absolute_pe); + visitor(use_local_attention); + visitor(query_scale); + visitor(layer_configs); + visitor(attention_window_sizes); + visitor(norm_num_groups); + visitor(vit_config); + visitor(pool_dim); + + visitor(eos_id); + visitor(secondary_eos_id); + + visitor(scale_base_names); + + internal.VisitFields(visitor); + + // Append new fields here, then update `python/configs.cc`. + } + + // Returns whether all fields match except `model` and `display_name`, and + // some others that are not yet set by config_converter.py. This is for + // internal use by `OverwriteWithCanonical`, but potentially useful elsewhere. + bool TestEqual(const ModelConfig& other, bool print) const; + + // For each model, constructs its canonical `ModelConfig` and if `TestEqual` + // returns true, overwrites `*this` with that. Otherwise, returns false to + // indicate this is not a known model. Called by `config_converter.py`. + bool OverwriteWithCanonical(); + + // Returns a string encoding of the model family, size, weight, and + // `PromptWrapping`. Stable/unchanging; can be used as the model file name. + // The third ctor also expects a string returned by this. + std::string Specifier() const; void AddLayerConfig(const LayerConfig& layer_config) { layer_configs.push_back(layer_config); + HWY_ASSERT(layer_configs.size() <= num_layers); } - size_t CachePosSize() const { - size_t num_layers = layer_configs.size(); - return num_layers * layer_configs[0].CacheLayerSize(); + bool IsGlobalLayer(size_t layer_idx) const { + return attention_window_sizes[layer_idx] == max_seq_len; } size_t NumLayersOfTypeBefore(LayerAttentionType type, size_t num) const { @@ -287,77 +414,77 @@ struct ModelConfig : public IFields { size_t NumHeads() const { uint32_t num_heads = 0; for (const auto& layer_config : layer_configs) { - num_heads = std::max(num_heads, layer_config.heads); + num_heads = HWY_MAX(num_heads, layer_config.heads); } return num_heads; } - const char* Name() const override { return "ModelConfig"; } + size_t KVCacheCols() const { + size_t num_layers = layer_configs.size(); + return num_layers * layer_configs[0].CacheLayerSize(); + } bool IsEOS(int id) const { return (id == eos_id || id == secondary_eos_id); } - void VisitFields(IFieldsVisitor& visitor) override { - visitor(model_family_version); - visitor(model_name); - visitor(model); - visitor(wrapping); - visitor(weight); - visitor(num_layers); - visitor(model_dim); - visitor(vocab_size); - visitor(seq_len); - visitor(num_tensor_scales); - visitor(att_cap); - visitor(final_cap); - visitor(absolute_pe); - visitor(use_local_attention); - visitor(query_scale); - visitor(layer_configs); - visitor(attention_window_sizes); - visitor(norm_num_groups); - visitor(vit_config); - visitor(pool_dim); - visitor(eos_id); - visitor(secondary_eos_id); - } - - // Major version of the model family. 
It is used as a fallback to distinguish - // between model types when there is no explicit information in the config. + // Major version of the model family, reflecting architecture changes. This is + // more convenient to compare than `Model` because that also includes the + // model size. uint32_t model_family_version = 1; - std::string model_name; - Model model = Model::UNKNOWN; + // For display only, may change. Use `Specifier()` for setting the + // file name. Not checked by `TestEqual` because `config_converter.py` does + // not set this. + std::string display_name; + Model model = Model::UNKNOWN; // Not checked by `TestEqual`, see above. PromptWrapping wrapping = PromptWrapping::GEMMA_PT; Type weight = Type::kUnknown; + uint32_t num_layers = 0; uint32_t model_dim = 0; uint32_t vocab_size = 0; - uint32_t seq_len = 0; - uint32_t num_tensor_scales = 0; + uint32_t max_seq_len = 0; + + // We no longer set nor use this: config_converter is not able to set this, + // and only pre-2025 format stores scales, and we do not require advance + // knowledge of how many there will be. Any scales present will just be + // assigned in order to the tensors matching `scale_base_names`. + uint32_t unused_num_tensor_scales = 0; + float att_cap = 0.0f; float final_cap = 0.0f; + bool absolute_pe = false; - bool use_local_attention = false; // griffin only + bool use_local_attention = false; // Griffin only QueryScaleType query_scale = QueryScaleType::SqrtKeySize; std::vector layer_configs; std::vector attention_window_sizes; - std::unordered_set scale_names; uint32_t norm_num_groups = 1; + // Dimensions related to image processing. VitConfig vit_config; uint32_t pool_dim = 1; // used only for VitConfig copy + int eos_id = 1; int secondary_eos_id = 1; + + // Tensor base names without a layer suffix, used by `ModelStore` only for + // pre-2025 format. + std::vector scale_base_names; + + InternalModelConfig internal; }; -// Returns the config for the given model. -ModelConfig ConfigFromModel(Model model); - -// Returns the model for the given config, if it matches any standard model. -Model ModelFromConfig(const ModelConfig& config); - // Returns the sub-config for the ViT model of the PaliGemma model. ModelConfig GetVitConfig(const ModelConfig& config); +enum DeducedLayerTypes { + kDeducedGriffin = 1, + kDeducedViT = 2, + kDeduced448 = 4, // For ViT, 448x448 resolution instead of 224x224. +}; + +// layer_types is one or more of `DeducedLayerTypes`. 
+Model DeduceModel(const Path& blob_path, size_t layers, int layer_types); + } // namespace gcpp #endif // THIRD_PARTY_GEMMA_CPP_GEMMA_CONFIGS_H_ diff --git a/gemma/configs_test.cc b/gemma/configs_test.cc index 3efd2cb..0ca4a84 100644 --- a/gemma/configs_test.cc +++ b/gemma/configs_test.cc @@ -1,461 +1,44 @@ #include "gemma/configs.h" -#include -#include -#include -#include +#include + +#include #include #include "gtest/gtest.h" -#include "hwy/aligned_allocator.h" +#include "compression/types.h" // Type +#include "io/fields.h" // Type namespace gcpp { -template -constexpr std::array OldFixedLayerConfig( - LayerAttentionType type) { - std::array config = {}; - for (LayerAttentionType& l : config) { - l = type; - } - return config; -} +TEST(ConfigsTest, TestAll) { + ForEachModel([&](Model model) { + ModelConfig config(model, Type::kSFP, ChooseWrapping(model)); + fprintf(stderr, "Testing %s (%s)\n", config.display_name.c_str(), + config.Specifier().c_str()); + HWY_ASSERT(config.model == model); -template -constexpr std::array OldFixedAttentionWindowSizes( - size_t window_size) { - std::array window_size_configs = {}; - for (size_t& l : window_size_configs) { - l = window_size; - } - return window_size_configs; -} + // We can deduce the model/display_name from all other fields. + config.model = Model::UNKNOWN; + const std::string saved_display_name = config.display_name; + config.display_name.clear(); + HWY_ASSERT(config.OverwriteWithCanonical()); + HWY_ASSERT(config.model == model); + HWY_ASSERT(config.display_name == saved_display_name); -// Repeat window_size_pattern for kNum / kPatternSize times. -template -constexpr std::array OldRepeatedAttentionWindowSizes( - const std::array& window_size_pattern) { - static_assert(kNum % kPatternSize == 0, - "kNum must be a multiple of kPatternSize"); - std::array window_size_configs = {}; - for (size_t i = 0; i < kNum; ++i) { - window_size_configs[i] = window_size_pattern[i % kPatternSize]; - } - return window_size_configs; -} - -template -constexpr size_t OldNumLayersOfTypeBefore( - const std::array& layers, - LayerAttentionType type, size_t num) { - size_t count = 0; - for (size_t i = 0; i < num; i++) { - if (layers[i] == type) count++; - } - return count; -} - -template -struct CacheLayerSize { - constexpr size_t operator()() const { - return TConfig::kKVHeads * TConfig::kQKVDim * 2; - } -}; - -template -struct CachePosSize { - constexpr size_t operator()() const { - return TConfig::kGemmaLayers * CacheLayerSize()(); - } -}; - -struct OldConfigNoVit { - struct VitConfig { - // Some of these are needed to make the compiler happy when trying to - // generate code that will actually never be used. - using Weight = float; - static constexpr int kLayers = 0; - static constexpr std::array kLayerConfig = - OldFixedLayerConfig<0>(LayerAttentionType::kVit); - static constexpr int kModelDim = 0; - static constexpr int kFFHiddenDim = 0; - static constexpr int kHeads = 1; // Avoid division by 0 in griffin gate_w. 
- static constexpr int kKVHeads = 0; - static constexpr int kQKVDim = 0; - static constexpr int kSeqLen = 0; - static constexpr ResidualType kResidual = ResidualType::Add; - static constexpr int kGriffinLayers = 0; - static constexpr int kConv1dWidth = 0; - static constexpr bool kFFBiases = false; - static constexpr bool kSoftmaxAttnOutputBiases = false; - static constexpr PostNormType kPostNorm = PostNormType::None; - }; -}; - -struct OldConfigNoSSM : OldConfigNoVit { - static constexpr int kGriffinLayers = 0; - - static constexpr int kConv1dWidth = 0; - static constexpr bool kFFBiases = false; - static constexpr bool kSoftmaxAttnOutputBiases = false; - static constexpr bool kUseHalfRope = false; - static constexpr bool kUseLocalAttention = false; - static constexpr bool kInterleaveQKV = true; - static constexpr PostQKType kPostQK = PostQKType::Rope; - static constexpr ActivationType kActivation = ActivationType::Gelu; - static constexpr ResidualType kResidual = ResidualType::Add; -}; - -struct OldConfigBaseGemmaV1 : OldConfigNoSSM { - static constexpr float kAttCap = 0.0f; - static constexpr float kFinalCap = 0.0f; - static constexpr PostNormType kPostNorm = PostNormType::None; - static constexpr QueryScaleType kQueryScale = QueryScaleType::SqrtKeySize; -}; - -struct OldConfigBaseGemmaV2 : OldConfigNoSSM { - static constexpr float kAttCap = 50.0f; - static constexpr float kFinalCap = 30.0f; - static constexpr PostNormType kPostNorm = PostNormType::Scale; -}; - -template -struct OldConfigGemma2_27B : public OldConfigBaseGemmaV2 { - using Weight = TWeight; // make accessible where we only have a TConfig - - static constexpr int kSeqLen = 8192; - static constexpr int kVocabSize = gcpp::kVocabSize; - static constexpr std::array kLayerConfig = - OldFixedLayerConfig<46>(LayerAttentionType::kGemma); - static constexpr std::array kAttentionWindowSizes = - OldRepeatedAttentionWindowSizes<46, 2>({4096, kSeqLen}); - static constexpr int kLayers = kLayerConfig.size(); - static constexpr int kNumTensorScales = 4 * kLayers; - static constexpr int kGemmaLayers = kLayers; - static constexpr int kModelDim = 4608; - static constexpr int kFFHiddenDim = 16 * 4608 / 2; // = 36864 - static constexpr int kHeads = 32; - static constexpr int kKVHeads = 16; - static constexpr int kQKVDim = 128; // query size == key size == value size - static constexpr int kTopK = gcpp::kTopK; - static constexpr bool kAbsolutePE = false; - static constexpr QueryScaleType kQueryScale = - QueryScaleType::SqrtModelDimDivNumHeads; -}; - -template -struct OldConfigGemma2_9B : public OldConfigBaseGemmaV2 { - using Weight = TWeight; // make accessible where we only have a TConfig - - static constexpr int kSeqLen = 8192; - static constexpr int kVocabSize = gcpp::kVocabSize; - static constexpr std::array kLayerConfig = - OldFixedLayerConfig<42>(LayerAttentionType::kGemma); - static constexpr std::array kAttentionWindowSizes = - OldRepeatedAttentionWindowSizes<42, 2>({4096, kSeqLen}); - static constexpr int kLayers = kLayerConfig.size(); - static constexpr int kNumTensorScales = 4 * kLayers; - static constexpr int kGemmaLayers = kLayers; - static constexpr int kModelDim = 3584; - static constexpr int kFFHiddenDim = 8 * 3584 / 2; // = 14336 - static constexpr int kHeads = 16; - static constexpr int kKVHeads = 8; - static constexpr int kQKVDim = 256; // query size == key size == value size - static constexpr int kTopK = gcpp::kTopK; - static constexpr bool kAbsolutePE = false; - static constexpr QueryScaleType kQueryScale = 
QueryScaleType::SqrtKeySize; -}; - -template -struct OldConfigGemma7B : public OldConfigBaseGemmaV1 { - using Weight = TWeight; // make accessible where we only have a TConfig - - static constexpr int kSeqLen = gcpp::kSeqLen; - static constexpr int kVocabSize = gcpp::kVocabSize; - static constexpr std::array kLayerConfig = - OldFixedLayerConfig<28>(LayerAttentionType::kGemma); - static constexpr std::array kAttentionWindowSizes = - OldFixedAttentionWindowSizes<28>(kSeqLen); - static constexpr int kLayers = kLayerConfig.size(); - static constexpr int kNumTensorScales = 4 * kLayers; - static constexpr int kGemmaLayers = kLayers; - static constexpr int kModelDim = 3072; - static constexpr int kFFHiddenDim = 16 * 3072 / 2; // = 24576 - static constexpr int kHeads = 16; - static constexpr int kKVHeads = 16; // standard MHA - static constexpr int kQKVDim = 256; // query size == key size == value size - static constexpr int kTopK = gcpp::kTopK; - static constexpr bool kAbsolutePE = false; -}; - -template -struct OldConfigGemma2B : public OldConfigBaseGemmaV1 { - using Weight = TWeight; // make accessible where we only have a TConfig - - static constexpr int kSeqLen = gcpp::kSeqLen; - static constexpr int kVocabSize = gcpp::kVocabSize; - static constexpr std::array kLayerConfig = - OldFixedLayerConfig<18>(LayerAttentionType::kGemma); - static constexpr std::array kAttentionWindowSizes = - OldFixedAttentionWindowSizes<18>(kSeqLen); - static constexpr int kLayers = kLayerConfig.size(); - static constexpr int kNumTensorScales = 4 * kLayers; - static constexpr int kGemmaLayers = kLayers; - static constexpr int kModelDim = 2048; - static constexpr int kFFHiddenDim = 16 * 2048 / 2; // = 16384 - static constexpr int kHeads = 8; - static constexpr int kKVHeads = 1; - static constexpr int kQKVDim = 256; // query size == key size == value size - static constexpr int kTopK = gcpp::kTopK; - static constexpr bool kAbsolutePE = false; -}; - -template -struct OldConfigPaliGemma_224 : public OldConfigGemma2B { - // On the LM side, the vocab size is one difference to Gemma1-2B in the - // architecture. PaliGemma adds 1024 and 128 tokens. - static constexpr int kVocabSize = 256000 + 1024 + 128; // = 257152 - - // Sub-config for the Vision-Transformer part. - struct VitConfig : public OldConfigNoSSM { - using Weight = TWeight; - // The ViT parts. https://arxiv.org/abs/2305.13035 - // "SoViT-400m/14 [...] has a width of 1152, depth 27, and MLP dim 4304." - static constexpr std::array kLayerConfig = - OldFixedLayerConfig<27>(LayerAttentionType::kVit); - static constexpr int kLayers = kLayerConfig.size(); - static constexpr int kNumTensorScales = 4 * kLayers; - static constexpr int kModelDim = 1152; - static constexpr int kFFHiddenDim = 4304; - static constexpr int kHeads = 16; - static constexpr int kKVHeads = 16; // standard MHA - static constexpr int kQKVDim = 72; - static constexpr int kSeqLen = 16 * 16; // 256 - static constexpr bool kFFBiases = true; - // The Vit part does not have a vocabulary, the image patches are embedded. - static constexpr int kVocabSize = 0; - // Dimensions related to image processing. - static constexpr int kPatchWidth = 14; - static constexpr int kImageSize = 224; - // Necessary constant for the layer configuration. 
- static constexpr PostNormType kPostNorm = PostNormType::None; - }; -}; - -template -struct OldConfigGemma2_2B : public OldConfigBaseGemmaV2 { - using Weight = TWeight; // make accessible where we only have a TConfig - - static constexpr int kSeqLen = 8192; - static constexpr int kVocabSize = gcpp::kVocabSize; - static constexpr std::array kLayerConfig = - OldFixedLayerConfig<26>(LayerAttentionType::kGemma); - static constexpr std::array kAttentionWindowSizes = - OldRepeatedAttentionWindowSizes<26, 2>({4096, kSeqLen}); - static constexpr int kLayers = kLayerConfig.size(); - static constexpr int kNumTensorScales = 4 * kLayers; - static constexpr int kGemmaLayers = kLayers; - static constexpr int kModelDim = 2304; - static constexpr int kFFHiddenDim = 8 * 2304 / 2; // = 9216 - static constexpr int kHeads = 8; - static constexpr int kKVHeads = 4; - static constexpr int kQKVDim = 256; // query size == key size == value size - static constexpr int kTopK = gcpp::kTopK; - static constexpr bool kAbsolutePE = false; - static constexpr QueryScaleType kQueryScale = QueryScaleType::SqrtKeySize; -}; - -template -struct OldConfigGemmaTiny : public OldConfigNoSSM { - using Weight = TWeight; // make accessible where we only have a TConfig - - static constexpr int kSeqLen = 32; - static constexpr int kVocabSize = 64; - static constexpr std::array kLayerConfig = - OldFixedLayerConfig<3>(LayerAttentionType::kGemma); - static constexpr std::array kAttentionWindowSizes = - OldFixedAttentionWindowSizes<3>(kSeqLen); - static constexpr int kLayers = kLayerConfig.size(); - static constexpr int kNumTensorScales = 4 * kLayers; - static constexpr int kGemmaLayers = kLayers; - static constexpr int kModelDim = 128; - static constexpr int kFFHiddenDim = 256; - static constexpr int kHeads = 4; - static constexpr int kKVHeads = 1; - static constexpr int kQKVDim = 16; // query size == key size == value size - static constexpr int kTopK = gcpp::kTopK; - static constexpr bool kAbsolutePE = false; - static constexpr PostNormType kPostNorm = PostNormType::None; - static constexpr QueryScaleType kQueryScale = QueryScaleType::SqrtKeySize; - - static constexpr float kAttCap = 0.0f; - // This is required for optimize_test to pass. - static constexpr float kFinalCap = 30.0f; -}; - -template -struct OldConfigGriffin2B : OldConfigNoVit { - using Weight = TWeight; // make accessible where we only have a TConfig - - // Griffin uses local attention, so kSeqLen is actually the local attention - // window. 
- static constexpr int kSeqLen = 2048; - static constexpr int kVocabSize = gcpp::kVocabSize; - static constexpr std::array kLayerConfig = { - LayerAttentionType::kGriffinRecurrentBlock, - LayerAttentionType::kGriffinRecurrentBlock, - LayerAttentionType::kGemma, - LayerAttentionType::kGriffinRecurrentBlock, - LayerAttentionType::kGriffinRecurrentBlock, - LayerAttentionType::kGemma, - LayerAttentionType::kGriffinRecurrentBlock, - LayerAttentionType::kGriffinRecurrentBlock, - LayerAttentionType::kGemma, - LayerAttentionType::kGriffinRecurrentBlock, - LayerAttentionType::kGriffinRecurrentBlock, - LayerAttentionType::kGemma, - LayerAttentionType::kGriffinRecurrentBlock, - LayerAttentionType::kGriffinRecurrentBlock, - LayerAttentionType::kGemma, - LayerAttentionType::kGriffinRecurrentBlock, - LayerAttentionType::kGriffinRecurrentBlock, - LayerAttentionType::kGemma, - LayerAttentionType::kGriffinRecurrentBlock, - LayerAttentionType::kGriffinRecurrentBlock, - LayerAttentionType::kGemma, - LayerAttentionType::kGriffinRecurrentBlock, - LayerAttentionType::kGriffinRecurrentBlock, - LayerAttentionType::kGemma, - LayerAttentionType::kGriffinRecurrentBlock, - LayerAttentionType::kGriffinRecurrentBlock, - }; - static constexpr std::array kAttentionWindowSizes = - OldFixedAttentionWindowSizes<26>(kSeqLen); - static constexpr int kLayers = kLayerConfig.size(); - static constexpr int kGemmaLayers = OldNumLayersOfTypeBefore( - kLayerConfig, LayerAttentionType::kGemma, kLayers); - static constexpr int kGriffinLayers = OldNumLayersOfTypeBefore( - kLayerConfig, LayerAttentionType::kGriffinRecurrentBlock, kLayers); - static constexpr int kModelDim = 2560; - static constexpr int kFFHiddenDim = 7680; - static constexpr int kHeads = 10; - static constexpr int kKVHeads = 1; - static constexpr int kQKVDim = 256; // query size == key size == value size - static constexpr int kTopK = gcpp::kTopK; - static constexpr bool kAbsolutePE = false; - static constexpr PostNormType kPostNorm = PostNormType::None; - - // No SoftCap. - static constexpr float kAttCap = 0.0f; - static constexpr float kFinalCap = 0.0f; - - // SSM config. 
- static constexpr int kConv1dWidth = 4; - static constexpr bool kFFBiases = true; - static constexpr bool kSoftmaxAttnOutputBiases = true; - static constexpr bool kUseHalfRope = true; - static constexpr bool kUseLocalAttention = true; - static constexpr bool kInterleaveQKV = false; - static constexpr int kNumTensorScales = 140; - static constexpr PostQKType kPostQK = PostQKType::Rope; - static constexpr ActivationType kActivation = ActivationType::Gelu; - static constexpr QueryScaleType kQueryScale = QueryScaleType::SqrtKeySize; - static constexpr ResidualType kResidual = ResidualType::Add; -}; - -template -void AssertMatch(const ModelConfig& config) { - ASSERT_EQ(TConfig::kModelDim, config.model_dim); - if constexpr (TConfig::VitConfig::kModelDim != 0) { - ASSERT_EQ(TConfig::VitConfig::kModelDim, config.vit_config.model_dim); - ASSERT_EQ(TConfig::VitConfig::kSeqLen, config.vit_config.seq_len); - ASSERT_EQ(TConfig::VitConfig::kNumTensorScales, - config.vit_config.num_scales); - for (size_t i = 0; i < config.vit_config.layer_configs.size(); ++i) { - ASSERT_EQ(TConfig::VitConfig::kLayerConfig[i], - config.vit_config.layer_configs[i].type); - } - } - ASSERT_EQ(TConfig::kVocabSize, config.vocab_size); - ASSERT_EQ(TConfig::kSeqLen, config.seq_len); - ASSERT_EQ(TConfig::kAttCap, config.att_cap); - ASSERT_EQ(TConfig::kFinalCap, config.final_cap); - ASSERT_EQ(TConfig::kAbsolutePE, config.absolute_pe); - ASSERT_EQ(TConfig::kUseLocalAttention, config.use_local_attention); - ASSERT_EQ(TConfig::kQueryScale, config.query_scale); - ASSERT_EQ(TConfig::kGemmaLayers, - config.NumLayersOfType(LayerAttentionType::kGemma)); - ASSERT_EQ(TConfig::kGriffinLayers, - config.NumLayersOfType(LayerAttentionType::kGriffinRecurrentBlock)); - for (size_t i = 0; i < config.layer_configs.size(); ++i) { - ASSERT_EQ(TConfig::kModelDim, config.layer_configs[i].model_dim); - ASSERT_EQ(TConfig::kFFHiddenDim, config.layer_configs[i].ff_hidden_dim); - ASSERT_EQ(TConfig::kHeads, config.layer_configs[i].heads); - ASSERT_EQ(TConfig::kKVHeads, config.layer_configs[i].kv_heads); - ASSERT_EQ(TConfig::kQKVDim, config.layer_configs[i].qkv_dim); - ASSERT_EQ(TConfig::kConv1dWidth, config.layer_configs[i].conv1d_width); - ASSERT_EQ(TConfig::kFFBiases, config.layer_configs[i].ff_biases); - ASSERT_EQ(TConfig::kSoftmaxAttnOutputBiases, - config.layer_configs[i].softmax_attn_output_biases); - ASSERT_EQ(TConfig::kPostNorm, config.layer_configs[i].post_norm); - ASSERT_EQ(TConfig::kLayerConfig[i], config.layer_configs[i].type); - ASSERT_EQ(TConfig::kActivation, config.layer_configs[i].activation); - PostQKType post_qk = TConfig::kPostQK; - if (TConfig::kUseHalfRope) { - post_qk = PostQKType::HalfRope; - } - ASSERT_EQ(post_qk, config.layer_configs[i].post_qk); - } - - ASSERT_EQ(TConfig::kAttentionWindowSizes.size(), - config.attention_window_sizes.size()); - for (size_t i = 0; i < config.attention_window_sizes.size(); ++i) { - ASSERT_EQ(TConfig::kAttentionWindowSizes[i], - config.attention_window_sizes[i]); - } - ASSERT_EQ(TConfig::kNumTensorScales, config.num_tensor_scales); -} - -ModelConfig RoundTripSerialize(const ModelConfig& config) { - std::vector config_buffer = config.Write(); - ModelConfig deserialized; - deserialized.Read(hwy::Span(config_buffer), 0); - return deserialized; -} - -TEST(ConfigsTest, OldConfigGemma2B) { - AssertMatch>(ConfigFromModel(Model::GEMMA_2B)); - ModelConfig config = RoundTripSerialize(ConfigFromModel(Model::GEMMA_2B)); - AssertMatch>(config); -} - -TEST(ConfigsTest, OldConfigGemma7B) { - 
-
-TEST(ConfigsTest, OldConfigGemma2B) {
-  AssertMatch<OldConfigGemma2B<float>>(ConfigFromModel(Model::GEMMA_2B));
-  ModelConfig config = RoundTripSerialize(ConfigFromModel(Model::GEMMA_2B));
-  AssertMatch<OldConfigGemma2B<float>>(config);
-}
-
-TEST(ConfigsTest, OldConfigGemma7B) {
-  AssertMatch<OldConfigGemma7B<float>>(ConfigFromModel(Model::GEMMA_7B));
-}
-
-TEST(ConfigsTest, OldConfigGemma2_2B) {
-  AssertMatch<OldConfigGemma2_2B<float>>(ConfigFromModel(Model::GEMMA2_2B));
-}
-
-TEST(ConfigsTest, OldConfigGemma2_9B) {
-  AssertMatch<OldConfigGemma2_9B<float>>(ConfigFromModel(Model::GEMMA2_9B));
-}
-
-TEST(ConfigsTest, OldConfigGemma2_27B) {
-  AssertMatch<OldConfigGemma2_27B<float>>(ConfigFromModel(Model::GEMMA2_27B));
-}
-
-TEST(ConfigsTest, OldConfigGriffin2B) {
-  AssertMatch<OldConfigGriffin2B<float>>(ConfigFromModel(Model::GRIFFIN_2B));
-}
-
-TEST(ConfigsTest, OldConfigGemmaTiny) {
-  AssertMatch<OldConfigGemmaTiny<float>>(ConfigFromModel(Model::GEMMA_TINY));
-}
-
-TEST(ConfigsTest, OldConfigPaliGemma_224) {
-  AssertMatch<OldConfigPaliGemma_224<float>>(
-      ConfigFromModel(Model::PALIGEMMA_224));
+    const std::vector<uint32_t> serialized = config.Write();
+    ModelConfig deserialized;
+    const IFields::ReadResult result =
+        deserialized.Read(hwy::Span<const uint32_t>(serialized), /*pos=*/0);
+    HWY_ASSERT(result.pos == serialized.size());
+    // We wrote it, so all fields should be known, and no extra.
+    HWY_ASSERT(result.extra_u32 == 0);
+    HWY_ASSERT(result.missing_fields == 0);
+    // All fields should match.
+    HWY_ASSERT(deserialized.TestEqual(config, /*print=*/true));
+    HWY_ASSERT(deserialized.model == model);
+    HWY_ASSERT(deserialized.display_name == saved_display_name);
+  });
 }
 
 }  // namespace gcpp
diff --git a/gemma/gemma-inl.h b/gemma/gemma-inl.h
index ccb34f0..e3f9a19 100644
--- a/gemma/gemma-inl.h
+++ b/gemma/gemma-inl.h
@@ -13,33 +13,18 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-// SIMD functions for Gemma/Griffin transformers.
+// Transformer components shared between vit.cc and attention.cc.
 
-#include <math.h>  // sqrtf
 #include <stddef.h>
-#include <stdio.h>
+#include <stdint.h>
 
-#include <algorithm>  // std::min
-#include <type_traits>
-#include <vector>
-
-#include "compression/compress.h"
 #include "gemma/activations.h"
-#include "gemma/common.h"
 #include "gemma/configs.h"
-#include "gemma/gemma.h"
-#include "gemma/kv_cache.h"
 #include "gemma/weights.h"
-#include "paligemma/image.h"
-#include "util/allocator.h"
-#include "util/basics.h"
+#include "ops/matmul.h"
+#include "util/mat.h"
 #include "util/threading.h"
-#include "hwy/aligned_allocator.h"
-#include "hwy/base.h"
-#include "hwy/bit_set.h"
-#include "hwy/contrib/thread_pool/thread_pool.h"
 #include "hwy/profiler.h"
-#include "hwy/timer.h"
 
 // Include guard (still compiled once per target)
 #if defined(THIRD_PARTY_GEMMA_CPP_GEMMA_GEMMA_INL_H_) == \
@@ -52,709 +37,17 @@
 #include "hwy/highway.h"
 
 // After highway.h
-#include "ops/matmul-inl.h"
-#include "ops/matvec-inl.h"
 #include "ops/ops-inl.h"
 
-#ifndef GEMMA_TYPE
-#if HWY_IDE
-// Provide a definition so the IDE does not complain.
-#define GEMMA_TYPE float
-#else
-#error "Only include from instantiations/*.cc, which must define GEMMA_TYPE"
-#endif  // HWY_IDE
-#endif  // GEMMA_TYPE
-
 HWY_BEFORE_NAMESPACE();
 namespace gcpp {
 namespace HWY_NAMESPACE {
 
-// Different functions use different naming conventions for the number of
-// tokens. Functions that are query-independent, such as RMSNorm*, call the
-// count `num_interleaved`. Functions that are query-dependent, such as
-// `Attention`, use separate `num_tokens` and `num_queries`.
-
-// TODO: add batch query support for Griffin (QueriesPos).
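The naming-convention comment above is worth pinning down: a `num_interleaved` batch packs `num_tokens` tokens from each of `num_queries` queries, with the query index varying fastest. A small helper showing the index arithmetic the attention code below relies on (illustrative, not part of the codebase):

    #include <cstddef>

    struct Interleaved {
      size_t query_idx;  // which query this row belongs to
      size_t batch_idx;  // token offset within that query's batch
    };

    // Matches `interleaved_idx % num_queries_` / `interleaved_idx / num_queries_`
    // as used in ComputeQKV and DotSoftmaxWeightedSum.
    inline Interleaved Split(size_t interleaved_idx, size_t num_queries) {
      return {interleaved_idx % num_queries, interleaved_idx / num_queries};
    }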
-template <typename T>
-HWY_NOINLINE void GriffinRecurrent(size_t batch_start, size_t num_tokens,
-                                   size_t layer, Activations& activations,
-                                   const LayerWeightsPtrs<T>* layer_weights,
-                                   const KVCaches& kv_caches) {
-  PROFILER_ZONE("Gen.Griffin");
-  KVCache& kv_cache = kv_caches[0];
-  hwy::ThreadPool& pool = activations.env->parallel.Pools().Pool(0);
-  namespace hn = hwy::HWY_NAMESPACE;
-  using D = hn::ScalableTag<float>;
-  const size_t model_dim = layer_weights->layer_config.model_dim;
-  const size_t conv_1d_width = layer_weights->layer_config.conv1d_width;
-  const size_t heads = layer_weights->layer_config.heads;
-
-  // X / Y linear layers.
-  for (size_t batch_idx = 0; batch_idx < num_tokens; ++batch_idx) {
-    float* HWY_RESTRICT y = activations.griffin_y.Batch(batch_idx);
-    float* HWY_RESTRICT x = activations.griffin_x.Batch(batch_idx);
-    TwoMatVecAdd(layer_weights->griffin.linear_x_w,
-                 layer_weights->griffin.linear_y_w, 0, model_dim, model_dim,
-                 activations.pre_att_rms_out.Batch(batch_idx),
-                 /*add0=*/layer_weights->griffin.linear_x_biases.data_scale1(),
-                 /*add1=*/layer_weights->griffin.linear_y_biases.data_scale1(),
-                 /*out0=*/x, /*out1=*/y, pool);
-    Gelu(y, model_dim);
-  }
-
-  // Conv1D.
-  for (size_t batch_idx = 0; batch_idx < num_tokens; ++batch_idx) {
-    const size_t pos = batch_start + batch_idx;
-    float* HWY_RESTRICT x = activations.griffin_x.Batch(batch_idx);
-    HWY_FULL(float) df;
-    HWY_DASSERT(model_dim % hn::Lanes(df) == 0);
-    const size_t layer_offset = layer * model_dim * (conv_1d_width - 1);
-
-    // cache[i] = input at time t-i.
-    float* HWY_RESTRICT cache[kMaxConv1DWidth];
-    cache[0] = x;
-    for (size_t i = 1; i < conv_1d_width; i++) {
-      cache[i] =
-          kv_cache.conv1d_cache.get() + layer_offset +
-          ((pos + conv_1d_width - 1 - i) % (conv_1d_width - 1)) * model_dim;
-    }
-    for (size_t i = 0; i < model_dim; i += hn::Lanes(df)) {
-      auto xv = hn::Load(df, x + i);
-      auto accum0 =
-          hn::Load(df, layer_weights->griffin.conv_biases.data_scale1() + i);
-      auto accum1 = hn::Zero(df);
-      HWY_ASSERT_M(conv_1d_width % 2 == 0, "Conv width must be even");
-      for (size_t l = 0; 2 * l < conv_1d_width; l++) {
-        auto wv0 =
-            hn::Load(df, layer_weights->griffin.conv_w.data_scale1() +
-                         (conv_1d_width - 1 - 2 * l) * model_dim + i);
-        auto wv1 =
-            hn::Load(df, layer_weights->griffin.conv_w.data_scale1() +
-                         (conv_1d_width - 2 - 2 * l) * model_dim + i);
-        accum0 = hn::MulAdd(wv0, hn::Load(df, cache[l * 2] + i), accum0);
-        accum1 = hn::MulAdd(wv1, hn::Load(df, cache[l * 2 + 1] + i), accum1);
-      }
-      hn::Store(hn::Add(accum0, accum1), df, x + i);
-      hn::Store(xv, df, cache[HWY_MAX(conv_1d_width, 1) - 1] + i);
-    }
-  }
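The RGLRU section that follows implements Griffin's gated linear recurrence with SIMD vectors. A scalar sketch of the per-channel recurrence, assuming `log_a` already contains the gated log-decay that the code derives from the recurrence gate (an interpretation of the `hn::Exp` / `hn::Sqrt(NegMulAdd(...))` sequence below, not the gemma.cpp API):

    #include <cmath>
    #include <cstddef>

    // h[t] = a * h[t-1] + sqrt(1 - a^2) * x[t], with h[-1] = 0. The
    // sqrt(1 - a^2) input scaling keeps the state magnitude roughly
    // independent of the decay; at t == 0 the code uses multiplier 1 instead.
    inline void RglruScanChannel(const float* log_a, const float* x, float* h,
                                 size_t num_steps) {
      float state = 0.0f;
      for (size_t t = 0; t < num_steps; ++t) {
        const float a = std::exp(log_a[t]);
        const float x_mul = (t == 0) ? 1.0f : std::sqrt(1.0f - a * a);
        state = a * state + x_mul * x[t];
        h[t] = state;
      }
    }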
-
-  // RGLRU
-  for (size_t batch_idx = 0; batch_idx < num_tokens; ++batch_idx) {
-    const size_t pos = batch_start + batch_idx;
-    float* HWY_RESTRICT y = activations.griffin_y.Batch(batch_idx);
-    float* HWY_RESTRICT x = activations.griffin_x.Batch(batch_idx);
-    float* HWY_RESTRICT gate_x = activations.griffin_gate_x.Batch(batch_idx);
-    float* HWY_RESTRICT a = activations.griffin_multiplier.Batch(batch_idx);
-    float* HWY_RESTRICT rnn_state =
-        kv_cache.rglru_cache.get() + layer * model_dim;
-
-    pool.Run(0, heads, [&](const uint64_t head, size_t /*thread*/) HWY_ATTR {
-      const size_t kHeadDim = model_dim / heads;
-      const size_t kMatrixSize = kHeadDim * kHeadDim;
-      size_t head_offset = head * kHeadDim;
-      TwoOfsMatVecAddLoop(
-          layer_weights->griffin.gate_w, kMatrixSize * head,
-          kMatrixSize * (heads + head), kHeadDim, kHeadDim, x + head_offset,
-          /*add0=*/layer_weights->griffin.gate_biases.data_scale1() +
-              head_offset,
-          /*add1=*/layer_weights->griffin.gate_biases.data_scale1() +
-              model_dim + head_offset,
-          /*out0=*/gate_x + head_offset, /*out1=*/a + head_offset);
-      Sigmoid(gate_x + head_offset, kHeadDim);
-      Sigmoid(a + head_offset, kHeadDim);
-      const auto fn_mul = [](D d, hn::Vec<D> x, hn::Vec<D> gate_x)
-                              HWY_ATTR { return hn::Mul(x, gate_x); };
-      hn::Transform1(D(), a + head_offset, kHeadDim,
-                     layer_weights->griffin.a.data_scale1() + head_offset,
-                     fn_mul);
-      hn::Transform1(D(), x + head_offset, kHeadDim, gate_x + head_offset,
-                     fn_mul);
-      // RNN scan
-      HWY_FULL(float) df;
-      HWY_DASSERT(kHeadDim % hn::Lanes(df) == 0);
-      for (size_t i = 0; i < kHeadDim; i += hn::Lanes(df)) {
-        auto log_a = hn::Load(df, a + head_offset + i);
-        auto gated_x = hn::Load(df, x + head_offset + i);
-        auto rnn = hn::Load(df, rnn_state + head_offset + i);
-        auto a = hn::Exp(df, log_a);
-        auto x_multiplier = hn::Sqrt(hn::NegMulAdd(a, a, hn::Set(df, 1.0f)));
-        if (pos == 0) {
-          x_multiplier = hn::Set(df, 1.0f);
-        }
-        auto new_x = hn::MulAdd(x_multiplier, gated_x, hn::Mul(a, rnn));
-        hn::Store(new_x, df, rnn_state + head_offset + i);
-
-        // Join branches.
-        auto yv = hn::Load(df, y + head_offset + i);
-        auto pre_out = hn::Mul(yv, new_x);
-        hn::Store(pre_out, df, x + head_offset + i);
-      }
-    });
-  }
-
-  // Final linear layer.
-  for (size_t batch_idx = 0; batch_idx < num_tokens; ++batch_idx) {
-    float* HWY_RESTRICT x = activations.griffin_x.Batch(batch_idx);
-    float* out_ptr = activations.att_sums.Batch(batch_idx);
-    MatVecAdd(layer_weights->griffin.linear_out_w, 0, model_dim, model_dim, x,
-              layer_weights->griffin.linear_out_biases.data_scale1(), out_ptr,
-              pool);
-  }
-}
-
-// Wrapper class; holds arguments in member variables to shorten call sites.
-template <typename T>
-class GemmaAttention {
-  // The attention window usually starts at 0 unless `pos` is larger than
-  // the attention window size, then it is `pos` - window_size + 1.
-  HWY_INLINE size_t StartPos(size_t pos, size_t layer) {
-    const size_t att_window_size =
-        activations_.weights_config.attention_window_sizes[layer];
-    return pos - std::min(att_window_size - 1, pos);
-  }
-
-  template <typename U>
-  HWY_INLINE void PositionalEncodingQK(U* qk, size_t pos, size_t layer,
-                                       const float mul) {
-    // qk is either q or k, so qkv_dim is the length we operate on.
-    const size_t qkv_dim = layer_config_.qkv_dim;
-    const float* inv_timescale = activations_.inv_timescale.Const();
-    bool is_global_layer =
-        activations_.weights_config.attention_window_sizes[layer] ==
-        activations_.seq_len;
-    // TODO: add a config flag instead of hardcoding the model.
-    if (is_global_layer &&
-        (activations_.weights_config.model == Model::GEMMA3_4B ||
-         activations_.weights_config.model == Model::GEMMA3_12B ||
-         activations_.weights_config.model == Model::GEMMA3_27B ||
-         activations_.weights_config.model == Model::GEMMA3_1B)) {
-      inv_timescale = activations_.inv_timescale_global.Const();
-    }
-    // PostQKType::Rope
-    (void)layer;
-    if (layer_weights_.layer_config.post_qk == PostQKType::HalfRope) {
-      Rope(qk, qkv_dim / 2, inv_timescale, pos);
-      if (mul != 1.0f) MulByConst(mul, qk, qkv_dim);
-    } else {
-      RopeAndMulBy(mul, qk, qkv_dim, inv_timescale, pos);
-    }
-  }
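PositionalEncodingQK above delegates to Rope / RopeAndMulBy. A scalar sketch of rotary position embedding under the split-half pairing convention assumed here (dimension i rotates together with dimension i + N/2); for HalfRope, the same rotation is applied to only the first qkv_dim/2 dimensions, with the optional scale applied separately:

    #include <cmath>
    #include <cstddef>

    inline void RopeScalar(float* x, size_t dim, const float* inv_timescale,
                           size_t pos, float mul) {
      const size_t half = dim / 2;
      for (size_t i = 0; i < half; ++i) {
        const float theta = static_cast<float>(pos) * inv_timescale[i];
        const float c = std::cos(theta), s = std::sin(theta);
        const float x0 = x[i], x1 = x[i + half];
        x[i] = mul * (x0 * c - x1 * s);
        x[i + half] = mul * (x0 * s + x1 * c);
      }
    }

Because the rotation is a rigid rotation of each (x0, x1) pair, dot products between rotated q and k depend only on the relative position difference, which is the property RoPE exists to provide.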
-
-  // Fills activations.q and computes KV. For is_mha_, a single MatMul suffices
-  // and we later copy KV from q to KVCache. Otherwise, a second MatMul writes
-  // KV directly to KVCache.
-  HWY_NOINLINE void ComputeQKV(const size_t num_interleaved) {
-    PROFILER_ZONE("Gen.Attention.QKV");
-    const size_t model_dim = layer_config_.model_dim;
-    const size_t qkv_dim = layer_config_.qkv_dim;
-    const size_t heads = layer_config_.heads;
-    const size_t kv_heads = layer_config_.kv_heads;
-
-    const auto pre_att_rms_out =
-        ConstMatFromBatch(num_interleaved, activations_.pre_att_rms_out);
-    auto w_q1 = layer_weights_.qkv_einsum_w.data()
-                    ? ConstMatFromWeights(layer_weights_.qkv_einsum_w)
-                    : ConstMatFromWeights(layer_weights_.qkv_einsum_w1);
-    // The original qkv_einsum_w has shape [(heads + kv_heads * 2), kQKVDim,
-    // model_dim], which we reshaped to (heads + kv_heads * 2) * kQKVDim rows.
-    // We must shrink to the actual size because MatMul verifies
-    // `B.extents.rows == C.Cols()`. If MHA, `QStride() == 3 * qkv_dim` and all
-    // rows are used. Otherwise, `QStride() == qkv_dim` and KV will be
-    // computed in the second MatMul.
-    const size_t w1_rows = heads * layer_config_.QStride();
-    w_q1.ShrinkRows(w1_rows);
-    MatMul(pre_att_rms_out, w_q1,
-           /*add=*/nullptr, *activations_.env,
-           RowPtrFromBatch(activations_.q));
-
-    if (is_mha_) {
-      // Multi-Head Attention a.k.a. "use_qkv_einsum" computed QKV already.
-    } else {
-      auto w_q2 = layer_weights_.qkv_einsum_w.data()
-                      ? ConstMatFromWeights(layer_weights_.qkv_einsum_w,
-                                            w1_rows * model_dim)
-                      : ConstMatFromWeights(layer_weights_.qkv_einsum_w2);
-      // KV structure is [k, v, k, v, ....] = kv_heads pairs of (k, v).
-      const size_t w_rows_kv_cols = kv_heads * 2 * qkv_dim;
-      w_q2.ShrinkRows(w_rows_kv_cols);
-
-      // Single query and no wraparound means we can use a matmul and write
-      // directly into the KV cache with a stride of cache_pos_size_.
-      if (num_queries_ == 1 &&
-          queries_pos_[0] + num_tokens_ <= div_seq_len_.GetDivisor()) {
-        const size_t kv_ofs =
-            queries_pos_[0] * cache_pos_size_ + layer_ * cache_layer_size_;
-        float* HWY_RESTRICT kv = kv_caches_[0].kv_cache.get() + kv_ofs;
-        RowPtrF kv_rows(kv, w_rows_kv_cols);
-        kv_rows.SetStride(cache_pos_size_);
-        MatMul(pre_att_rms_out, w_q2,
-               /*add=*/nullptr, *activations_.env, kv_rows);
-      } else {
-        // Proceed row by row because there will be wraparound.
-        for (size_t interleaved_idx = 0; interleaved_idx < num_interleaved;
-             ++interleaved_idx) {
-          const float* x = activations_.pre_att_rms_out.Batch(interleaved_idx);
-          const size_t query_idx = interleaved_idx % num_queries_;
-          const size_t batch_idx = interleaved_idx / num_queries_;
-          KVCache& kv_cache = kv_caches_[query_idx];
-          const size_t cache_pos =
-              div_seq_len_.Remainder(queries_pos_[query_idx] + batch_idx);
-          const size_t kv_offset =
-              cache_pos * cache_pos_size_ + layer_ * cache_layer_size_;
-          float* HWY_RESTRICT kv = kv_cache.kv_cache.get() + kv_offset;
-          if (layer_weights_.qkv_einsum_w.data()) {
-            MatVec(layer_weights_.qkv_einsum_w, heads * qkv_dim * model_dim,
-                   w_rows_kv_cols, model_dim, x, kv, pool_);
-          } else {
-            MatVec(layer_weights_.qkv_einsum_w2, 0,  //
-                   w_rows_kv_cols, model_dim, x, kv, pool_);
-          }
-        }
-      }
-    }  // !is_mha_
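The code above and the loop below address the KV cache through the same flat layout. As a standalone helper (names mirror the member variables; this is illustrative, and the real code replaces the modulo with a precomputed hwy::Divisor):

    #include <cstddef>

    // The cache is a flat array indexed by (position ring slot, layer,
    // kv head), where each kv head stores an interleaved [K, V] pair.
    inline size_t KVOffset(size_t pos, size_t seq_len, size_t layer,
                           size_t cache_layer_size, size_t cache_pos_size,
                           size_t head, size_t qkv_dim) {
      const size_t cache_pos = pos % seq_len;  // ring-buffer wraparound
      return cache_pos * cache_pos_size +      // row for this position
             layer * cache_layer_size +       // block for this layer
             head * qkv_dim * 2;              // interleaved K then V
    }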
-
-    // Apply positional encodings for K (and copy KV to cache if MHA).
-    pool_.Run(0, kv_heads * num_interleaved,
-              [&](uint64_t task, size_t /*thread*/) HWY_ATTR {
-                const size_t head = task % kv_heads;
-                const size_t interleaved_idx = task / kv_heads;
-                const size_t query_idx = interleaved_idx % num_queries_;
-                const size_t batch_idx = interleaved_idx / num_queries_;
-                const size_t pos = queries_pos_[query_idx] + batch_idx;
-                const size_t cache_pos = div_seq_len_.Remainder(pos);
-                const size_t kv_offset = cache_pos * cache_pos_size_ +
-                                         layer_ * cache_layer_size_ +
-                                         head * qkv_dim * 2;
-                KVCache& kv_cache = kv_caches_[query_idx];
-                float* HWY_RESTRICT kv = kv_cache.kv_cache.get() + kv_offset;
-                // If MHA, copy computed K and V into KVCache.
-                if (is_mha_) {
-                  const float* HWY_RESTRICT mha_kv =
-                      activations_.q.Batch(interleaved_idx) + head * q_stride_ +
-                      qkv_dim;
-                  hwy::CopyBytes(mha_kv, kv, 2 * qkv_dim * sizeof(*kv));
-                }
-
-                // Apply further processing to K.
-                if (layer_weights_.key_norm_scale.data()) {
-                  RMSNormInplace(layer_weights_.key_norm_scale.data(), kv,
-                                 qkv_dim);
-                }
-                PositionalEncodingQK(kv, pos, layer_, /*mul=*/1.0f);
-              });
-  }
-
-  // Computes Q.K scores, which are "logits" (or scores) stored to head_att.
-  HWY_INLINE void QDotK(const size_t start_pos, const size_t last_pos,
-                        const size_t head_offset, const float* HWY_RESTRICT q,
-                        const KVCache& kv_cache,
-                        float* HWY_RESTRICT head_att) {
-    const size_t qkv_dim = layer_config_.qkv_dim;
-    if (HWY_LIKELY(last_pos < activations_.seq_len)) {
-      // Slightly faster: no wraparound.
-      for (size_t pos = start_pos; pos <= last_pos; ++pos) {
-        const size_t kv_offset =
-            pos * cache_pos_size_ + layer_ * cache_layer_size_ + head_offset;
-        const float* HWY_RESTRICT k = &kv_cache.kv_cache[kv_offset];
-        const float score = Dot(q, k, qkv_dim);
-        head_att[pos] = score;
-      }
-    } else {
-      for (size_t pos = start_pos; pos <= last_pos; ++pos) {
-        const size_t cache_pos = div_seq_len_.Remainder(pos);
-        const size_t kv_offset = cache_pos * cache_pos_size_ +
-                                 layer_ * cache_layer_size_ + head_offset;
-        const float* HWY_RESTRICT k = &kv_cache.kv_cache[kv_offset];
-        const float score = Dot(q, k, qkv_dim);
-        head_att[pos % activations_.seq_len] = score;
-      }
-    }
-  }
-
-  // Accumulates the sum of v (from `kv_cache`) * probability (`head_att`) into
-  // `att_out`. Equivalent in gemma/modules.py:
-  // encoded = jnp.einsum('BTNS,BSNH->BTNH', probs, value_proj)
-  HWY_INLINE void WeightedSumV(const size_t start_pos, const size_t last_pos,
-                               const float* HWY_RESTRICT head_att,
-                               const size_t layer, const size_t head_offset,
-                               const hwy::Divisor& div_seq_len,
-                               const KVCache& kv_cache,
-                               float* HWY_RESTRICT att_out) const {
-    const size_t qkv_dim = layer_config_.qkv_dim;
-    hwy::ZeroBytes(att_out, qkv_dim * sizeof(*att_out));
-
-    if (HWY_LIKELY(last_pos < activations_.seq_len)) {
-      // Slightly faster: no wraparound.
-      for (size_t pos = start_pos; pos <= last_pos; ++pos) {
-        const size_t kv_offset =
-            pos * cache_pos_size_ + layer * cache_layer_size_ + head_offset;
-        const float* HWY_RESTRICT v =
-            kv_cache.kv_cache.get() + kv_offset + qkv_dim;
-        MulByConstAndAdd(head_att[pos], v, att_out, qkv_dim);
-      }
-    } else {
-      for (size_t pos = start_pos; pos <= last_pos; ++pos) {
-        const size_t cache_pos = div_seq_len.Remainder(pos);
-        const size_t kv_offset = cache_pos * cache_pos_size_ +
-                                 layer * cache_layer_size_ + head_offset;
-        const float* HWY_RESTRICT v =
-            kv_cache.kv_cache.get() + kv_offset + qkv_dim;
-        MulByConstAndAdd(head_att[pos % activations_.seq_len], v, att_out,
-                         qkv_dim);
-      }
-    }
-  }
-
-  HWY_NOINLINE void DotSoftmaxWeightedSum(const size_t num_interleaved) {
-    PROFILER_ZONE("Gen.Attention.DotSoftmax");
-    const float query_scale = ChooseQueryScale(activations_.weights_config);
-
-    // A "head group" in the context of GQA refers to a collection of query
-    // heads that share the same key and value heads.
-    const size_t kHeadGroups = layer_config_.heads / layer_config_.kv_heads;
-
-    // For each head (token, query), compute Q.K, softmax, and weighted V.
-    pool_.Run(0, layer_config_.heads * num_interleaved,
-              [&](uint64_t task, size_t /*thread*/) HWY_ATTR {
-                const size_t head = task % layer_config_.heads;
-                const size_t interleaved_idx = task / layer_config_.heads;
-                const size_t query_idx = interleaved_idx % num_queries_;
-                const size_t batch_idx = interleaved_idx / num_queries_;
-                const size_t qkv_dim = layer_config_.qkv_dim;
-                const size_t head_offset = (head / kHeadGroups) * qkv_dim * 2;
-                KVCache& kv_cache = kv_caches_[query_idx];
-                float* HWY_RESTRICT q =
-                    activations_.q.Batch(interleaved_idx) + head * q_stride_;
-
-                // Apply rope and scaling to Q.
-                const size_t pos = queries_pos_[query_idx] + batch_idx;
-                if (layer_weights_.query_norm_scale.data()) {
-                  RMSNormInplace(layer_weights_.query_norm_scale.data(), q,
-                                 qkv_dim);
-                }
-                PositionalEncodingQK(q, pos, layer_, query_scale);
-
-                const size_t start_pos = StartPos(pos, layer_);
-                size_t last_pos = pos;
-                const size_t prefix_end = queries_prefix_end_[query_idx];
-                if (prefix_end > 0 && prefix_end - 1 > last_pos) {
-                  // last_pos in QDotK and WeightedSumV is inclusive.
-                  last_pos = prefix_end - 1;
-                }
-
-                float* HWY_RESTRICT head_att =
-                    activations_.att.Batch(interleaved_idx) +
-                    head * activations_.seq_len;
-                QDotK(start_pos, last_pos, head_offset, q, kv_cache, head_att);
-                // SoftMax with optional SoftCap yields "probabilities" in
-                // head_att.
-                const size_t head_att_len =
-                    std::min(last_pos + 1, activations_.seq_len);
-                MaybeLogitsSoftCap(activations_.weights_config.att_cap,
-                                   head_att, head_att_len);
-                Softmax(head_att, head_att_len);
-
-                float* HWY_RESTRICT att_out =
-                    activations_.att_out.Batch(interleaved_idx) +
-                    head * qkv_dim;
-                WeightedSumV(start_pos, last_pos, head_att, layer_, head_offset,
-                             div_seq_len_, kv_cache, att_out);
-              });
-  }
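DotSoftmaxWeightedSum above fuses QDotK, the optional logit soft-cap, Softmax, and WeightedSumV per head. A scalar reference for one head, assuming no cache wraparound (last_pos < seq_len) and contiguous [seq, dim] K and V arrays rather than the strided cache layout (a sketch, not the SIMD implementation):

    #include <algorithm>
    #include <cmath>
    #include <cstddef>
    #include <vector>

    inline void AttendOneHead(const float* q, const float* k_rows,
                              const float* v_rows, size_t dim,
                              size_t start_pos, size_t last_pos,
                              float att_cap, float* out) {
      std::vector<float> att(last_pos + 1);
      float max_score = -1e30f;
      for (size_t p = start_pos; p <= last_pos; ++p) {
        float score = 0.0f;
        for (size_t d = 0; d < dim; ++d) score += q[d] * k_rows[p * dim + d];
        // Optional soft cap: squash scores into (-att_cap, att_cap).
        if (att_cap > 0.0f) score = att_cap * std::tanh(score / att_cap);
        att[p] = score;
        max_score = std::max(max_score, score);
      }
      float sum = 0.0f;  // Numerically stable softmax over the window.
      for (size_t p = start_pos; p <= last_pos; ++p) {
        att[p] = std::exp(att[p] - max_score);
        sum += att[p];
      }
      for (size_t d = 0; d < dim; ++d) out[d] = 0.0f;
      for (size_t p = start_pos; p <= last_pos; ++p) {
        const float w = att[p] / sum;
        for (size_t d = 0; d < dim; ++d) out[d] += w * v_rows[p * dim + d];
      }
    }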
-
-  // Sums encoded (`att_out`) over num_heads (`layer_config_.heads`) and
-  // head_dim (`qkv_dim`) into output (`layer_out`).
-  HWY_NOINLINE void SumHeads(const size_t num_interleaved) {
-    PROFILER_ZONE("Gen.Attention.SumHeads");
-    // att_weights and att_out are concatenated heads, each of length
-    // layer_config_.qkv_dim. Thus the [num_interleaved,
-    // layer_config_.model_dim] matmul output is the sum over heads. Compare
-    // gemma/modules.py: attn_output = self.attn_vec_einsum('BTNH,NHD->BTD',
-    // encoded)
-    HWY_DASSERT(layer_config_.model_dim > 0);
-    HWY_DASSERT(layer_config_.heads > 0);
-    HWY_DASSERT(layer_config_.qkv_dim > 0);
-    HWY_DASSERT(layer_weights_.att_weights.data() != nullptr);
-    HWY_DASSERT(activations_.att_out.All() != nullptr);
-    HWY_DASSERT(activations_.att_sums.All() != nullptr);
-
-    const float* add =
-        layer_weights_.layer_config.softmax_attn_output_biases
-            ? layer_weights_.attention_output_biases.data_scale1()
-            : nullptr;
-    MatMul(ConstMatFromBatch(num_interleaved, activations_.att_out),
-           ConstMatFromWeights(layer_weights_.att_weights), add,
-           *activations_.env, RowPtrFromBatch(activations_.att_sums));
-  }
-
- public:
-  // Constructor with explicit initialization of queries_prefix_end. This is
-  // needed for the Prefix-LM style attention. For standard causal attention,
-  // the other constructor can be used.
-  GemmaAttention(const QueriesPos& queries_pos,
-                 const QueriesPos& queries_prefix_end, size_t num_tokens,
-                 size_t layer, Activations& activations,
-                 const LayerWeightsPtrs<T>* layer_weights,
-                 const hwy::Divisor& div_seq_len, const KVCaches& kv_caches)
-      : GemmaAttention(queries_pos, &queries_prefix_end, num_tokens, layer,
-                       activations, layer_weights, div_seq_len, kv_caches) {}
-  // Constructor with default initialization to 0 for queries_prefix_end.
-  GemmaAttention(const QueriesPos& queries_pos, size_t num_tokens,
-                 size_t layer, Activations& activations,
-                 const LayerWeightsPtrs<T>* layer_weights,
-                 const hwy::Divisor& div_seq_len, const KVCaches& kv_caches)
-      : GemmaAttention(queries_pos, nullptr, num_tokens, layer, activations,
-                       layer_weights, div_seq_len, kv_caches) {}
-
-  // Full attention computation in three steps.
-  HWY_INLINE void operator()() {
-    const size_t num_interleaved = num_tokens_ * num_queries_;
-    ComputeQKV(num_interleaved);
-    DotSoftmaxWeightedSum(num_interleaved);
-    SumHeads(num_interleaved);
-  }
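For reference, the GQA head grouping used in DotSoftmaxWeightedSum, factored into a standalone helper (illustrative): with `heads` query heads and `kv_heads` KV heads, each group of heads / kv_heads consecutive query heads reads the same (K, V) pair, which is what `head_offset = (head / kHeadGroups) * qkv_dim * 2` encodes.

    #include <cstddef>

    inline size_t KVHeadForQueryHead(size_t head, size_t heads,
                                     size_t kv_heads) {
      const size_t head_groups = heads / kv_heads;  // e.g. 10 / 1 for MQA
      return head / head_groups;
    }

    // Usage: byte/element offset of the shared K for a given query head.
    //   head_offset = KVHeadForQueryHead(head, heads, kv_heads) * qkv_dim * 2;

Multi-query attention (kv_heads == 1, as in the Griffin config above) is the extreme case where every query head shares a single KV pair, shrinking the cache by a factor of `heads`.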
-
- private:
-  // Delegated Constructor that does most of the common work.
-  GemmaAttention(const QueriesPos& queries_pos,
-                 const QueriesPos* queries_prefix_end, size_t num_tokens,
-                 size_t layer, Activations& activations,
-                 const LayerWeightsPtrs<T>* layer_weights,
-                 const hwy::Divisor& div_seq_len, const KVCaches& kv_caches)
-      : queries_pos_(queries_pos),
-        num_queries_(queries_pos.size()),
-        num_tokens_(num_tokens),
-        layer_(layer),
-        layer_config_(layer_weights->layer_config),
-        q_stride_(layer_config_.QStride()),
-        cache_layer_size_(layer_weights->layer_config.CacheLayerSize()),
-        cache_pos_size_(activations.cache_pos_size),
-        is_mha_(layer_config_.IsMHA()),
-        activations_(activations),
-        layer_weights_(*layer_weights),
-        div_seq_len_(div_seq_len),
-        kv_caches_(kv_caches),
-        pool_(activations.env->parallel.Pools().Pool(0)) {
-    HWY_DASSERT(num_queries_ <= kv_caches_.size());
-    HWY_DASSERT_M((layer_config_.heads % layer_config_.kv_heads) == 0,
-                  "query heads must be a multiple of key-value heads");
-    if (queries_prefix_end != nullptr) {
-      queries_prefix_end_ = *queries_prefix_end;
-    } else {
-      queries_prefix_end_vec_.assign(num_queries_, 0);
-      queries_prefix_end_ = QueriesPos(queries_prefix_end_vec_.data(),
-                                       queries_prefix_end_vec_.size());
-    }
-  }
-
-  const QueriesPos& queries_pos_;
-  std::vector<size_t> queries_prefix_end_vec_;
-  QueriesPos queries_prefix_end_;
-  const size_t num_queries_;
-  const size_t num_tokens_;
-  const size_t layer_;
-  const LayerConfig& layer_config_;
-  const size_t q_stride_ = 0;
-  const size_t cache_layer_size_ = 0;
-  const size_t cache_pos_size_ = 0;
-  const bool is_mha_ = false;
-
-  Activations& activations_;
-  const LayerWeightsPtrs<T>& layer_weights_;
-  const hwy::Divisor& div_seq_len_;
-  const KVCaches& kv_caches_;
-  hwy::ThreadPool& pool_;
-};
-
-template <typename T>
-HWY_NOINLINE void Attention(
-    LayerAttentionType type, const QueriesPos& queries_pos,
-    const QueriesPos& queries_prefix_end, size_t num_tokens, size_t layer,
-    Activations& activations, const LayerWeightsPtrs<T>* layer_weights,
-    const hwy::Divisor& div_seq_len, const KVCaches& kv_caches) {
-  if (type == LayerAttentionType::kGemma) {
-    GemmaAttention<T>(queries_pos, queries_prefix_end, num_tokens, layer,
-                      activations, layer_weights, div_seq_len, kv_caches)();
-  } else {
-    // Only reached if the model is Griffin.
-    // The kv_caches are allocated only for the griffin layers, so we need to
-    // map the layer index to the griffin layer index.
-    auto type = layer_weights->layer_config.type;
-    size_t layer_of_type =
-        activations.weights_config.NumLayersOfTypeBefore(type, layer);
-    HWY_ASSERT(queries_pos.size() == 1);
-    GriffinRecurrent(queries_pos[0], num_tokens, layer_of_type, activations,
-                     layer_weights, kv_caches);
-  }
-}
-
-// Wrapper class; holds arguments in member variables to shorten call sites.
-// The main differences to GemmaAttention are:
-// - no KV Cache necessary, attention is always all-to-all and not causal.
-// - no potential wrap-around, attention always goes from 0 to kSeqLen.
-// - no need for batching, as we are always computing attention for kSeqLen
-//   tokens.
-// This results in a much simpler implementation. However, to avoid duplicating
-// code, we should still consider merging the two classes.
-// TODO(keysers): Refactor to share code with GemmaAttention.
-template <typename T>
-class VitAttention {
-  // Computes Q, K, V for all heads, stored in activations_.q.
-  HWY_NOINLINE void ComputeQKV() {
-    PROFILER_ZONE("Gen.VitAttention.QKV");
-    auto& qkv = activations_.q;
-    HWY_ASSERT(qkv.BatchSize() == num_tokens_);
-    HWY_ASSERT(qkv.Cols() == layer_config_.heads * 3 * layer_config_.qkv_dim);
-    MatMul(ConstMatFromBatch(num_tokens_, activations_.pre_att_rms_out),
-           ConstMatFromWeights(layer_weights_.vit.qkv_einsum_w),
-           layer_weights_.vit.qkv_einsum_b.data_scale1(), *activations_.env,
-           RowPtrFromBatch(qkv));
-  }
-
-  // TODO(philculliton): transition fully to MatMul.
-  HWY_NOINLINE void DotSoftmaxWeightedSumMatrix() {
-    const size_t qkv_dim = layer_config_.qkv_dim;
-    const size_t heads = layer_config_.heads;
-    HWY_ASSERT_M(heads == layer_config_.kv_heads, "Vit expects MHA");
-    const size_t seq_len = activations_.seq_len;
-    const float query_scale = 1.0f / sqrtf(static_cast<float>(qkv_dim));
-    PROFILER_ZONE("Gen.VitAttention.DotSoftmax");
-
-    // Shift Q, K, VT to RowVectorBatches with AllocateAlignedRows(extents)
-    RowVectorBatch<float> Q =
-        AllocateAlignedRows<float>(Extents2D(num_tokens_, qkv_dim));
-    RowVectorBatch<float> K =
-        AllocateAlignedRows<float>(Extents2D(seq_len, qkv_dim));
-    RowVectorBatch<float> C(Extents2D(num_tokens_, seq_len));
-
-    // Initialize att_out to zero prior to head loop.
-    hwy::ZeroBytes(activations_.att_out.All(),
-                   num_tokens_ * heads * qkv_dim * sizeof(float));
-
-    for (size_t head = 0; head < heads; ++head) {
-      pool_.Run(0, num_tokens_, [&](uint64_t task, size_t /*thread*/) HWY_ATTR {
-        const size_t token = task;
-        float* HWY_RESTRICT q =
-            activations_.q.Batch(token) + head * 3 * qkv_dim;
-        // TODO: shift to MatMul with A.scale once MatMul is confirmed working
-        MulByConst(query_scale, q, qkv_dim);
-        hwy::CopyBytes(q, Q.Batch(token), qkv_dim * sizeof(float));
-      });
-
-      pool_.Run(0, seq_len, [&](uint64_t task, size_t /*thread*/) HWY_ATTR {
-        const size_t seq_idx = task;
-        float* HWY_RESTRICT k =
-            activations_.q.Batch(seq_idx) + head * 3 * qkv_dim + qkv_dim;
-        hwy::CopyBytes(k, K.Batch(seq_idx), qkv_dim * sizeof(float));
-      });
-
-      // this produces C, a (num_tokens_, seq_len) matrix of dot products
-      MatMul(ConstMatFromBatch(Q.BatchSize(), Q),
-             ConstMatFromBatch(K.BatchSize(), K), nullptr, *activations_.env,
-             RowPtrFromBatch(C));
-
-      pool_.Run(0, num_tokens_, [&](uint64_t task, size_t /*thread*/) HWY_ATTR {
-        float* HWY_RESTRICT c = C.Batch(task);
-        Softmax(c, C.Cols());
-      });
-
-      pool_.Run(0, num_tokens_, [&](uint64_t task, size_t /*thread*/) HWY_ATTR {
-        size_t token = task;
-        float* HWY_RESTRICT att_out =
-            activations_.att_out.Batch(token) + head * qkv_dim;
-        for (size_t i = 0; i < seq_len; ++i) {
-          float* HWY_RESTRICT v =
-              activations_.q.Batch(i) + head * 3 * qkv_dim + 2 * qkv_dim;
-          MulByConstAndAdd(C.Batch(token)[i], v, att_out, qkv_dim);
-        }
-      });
-    }
-  }
-
-  HWY_NOINLINE void DotSoftmaxWeightedSum() {
-    const size_t qkv_dim = layer_config_.qkv_dim;
-    const size_t heads = layer_config_.heads;
-    HWY_ASSERT_M(heads == layer_config_.kv_heads, "Vit expects MHA");
-    const size_t seq_len = activations_.seq_len;
-    const float query_scale = 1.0f / sqrtf(static_cast<float>(qkv_dim));
-    PROFILER_ZONE("Gen.VitAttention.DotSoftmax");
-
-    // Compute Q.K, softmax, and weighted V.
-    pool_.Run(0, layer_config_.heads * num_tokens_,
-              [&](uint64_t task, size_t /*thread*/) HWY_ATTR {
-                const size_t head = task % layer_config_.heads;
-                const size_t token = task / layer_config_.heads;
-                // Compute Q.K scores, which are "logits" stored in head_att.
-                float* HWY_RESTRICT q =
-                    activations_.q.Batch(token) + head * 3 * qkv_dim;
-                MulByConst(query_scale, q, qkv_dim);
-                float* HWY_RESTRICT head_att =
-                    activations_.att.Batch(token) + head * activations_.seq_len;
-                for (size_t i = 0; i < seq_len; ++i) {
-                  float* HWY_RESTRICT k =
-                      activations_.q.Batch(i) + head * 3 * qkv_dim + qkv_dim;
-                  head_att[i] = Dot(q, k, qkv_dim);  // score = q.k
-                }
-                // SoftMax yields "probabilities" in head_att.
-                Softmax(head_att, seq_len);
-                // Compute weighted sum of v into att_out.
-                float* HWY_RESTRICT att_out =
-                    activations_.att_out.Batch(token) + head * qkv_dim;
-                hwy::ZeroBytes(att_out, qkv_dim * sizeof(*att_out));
-                for (size_t i = 0; i < seq_len; ++i) {
-                  float* HWY_RESTRICT v = activations_.q.Batch(i) +
-                                          head * 3 * qkv_dim + 2 * qkv_dim;
-                  MulByConstAndAdd(head_att[i], v, att_out, qkv_dim);
-                }
-              });
-  }
-
-  // Sums encoded (`att_out`) over num_heads (`layer_config_.heads`) and
-  // head_dim (`qkv_dim`) into output (`att_sums`).
-  HWY_NOINLINE void SumHeads() {
-    PROFILER_ZONE("Gen.VitAttention.SumHeads");
-    auto* bias = layer_weights_.vit.attn_out_b.data_scale1();
-    // att_weights and att_out are concatenated heads, each of length
-    // qkv_dim. Thus the [num_tokens_, layer_config_.model_dim]
-    // matmul output is the sum over heads.
-    auto att_out = ConstMatFromBatch(num_tokens_, activations_.att_out);
-    auto att_weights = ConstMatFromWeights(layer_weights_.vit.attn_out_w);
-    auto att_sums = RowPtrFromBatch(activations_.att_sums);
-    MatMul(att_out, att_weights, bias, *activations_.env, att_sums);
-  }
-
- public:
-  VitAttention(size_t num_tokens, size_t layer, Activations& activations,
-               const LayerWeightsPtrs<T>* layer_weights)
-      : num_tokens_(num_tokens),
-        layer_(layer),
-        activations_(activations),
-        layer_weights_(*layer_weights),
-        layer_config_(layer_weights->layer_config),
-        pool_(activations.env->parallel.Pools().Pool(0)) {}
-
-  HWY_INLINE void operator()() {
-    ComputeQKV();
-    if (activations_.weights_config.wrapping == PromptWrapping::GEMMA_VLM) {
-      DotSoftmaxWeightedSumMatrix();
-    } else {
-      DotSoftmaxWeightedSum();
-    }
-    SumHeads();
-  }
-
- private:
-  const size_t num_tokens_;
-  const size_t layer_;
-  Activations& activations_;
-  const LayerWeightsPtrs<T>& layer_weights_;
-  const LayerConfig& layer_config_;
-  hwy::ThreadPool& pool_;
-};
-
-template <typename T>
-HWY_NOINLINE void Activation(ActivationType activation, T* HWY_RESTRICT c1,
-                             T* HWY_RESTRICT c2, size_t count) {
-  PROFILER_ZONE("Gen.Activation");
+template <typename T>
+void Activation(ActivationType activation, T* HWY_RESTRICT c1,
+                const T* HWY_RESTRICT c2, const size_t count,
+                const size_t worker) {
+  PROFILER_ZONE2(worker, "Gen.Activation");
   namespace hn = hwy::HWY_NAMESPACE;
   using DF = hn::ScalableTag<T>;
   using VF = hn::Vec<DF>;
@@ -769,849 +62,88 @@ HWY_NOINLINE void Activation(ActivationType activation, T* HWY_RESTRICT c1,
   });
 }
 
-template <typename T>
-HWY_NOINLINE void FFWNoVit(Activations& activations, size_t num_interleaved,
-                           const LayerWeightsPtrs<T>* layer_weights) {
-  PROFILER_ZONE("Gen.FFW");
-  const size_t model_dim = layer_weights->layer_config.model_dim;
-  const size_t ffh_hidden_dim = layer_weights->layer_config.ff_hidden_dim;
-  HWY_DASSERT(num_interleaved <= activations.bf_pre_ffw_rms_out.BatchSize());
+// No C2 multiplier.
+template <class Mat>
+void ActivationBatched(ActivationType activation, Mat& c1, NestedPools& pools) {
+  using T = typename Mat::T;
+  const size_t pkg_idx = 0;
+  SmallParallelFor(
      c1.Rows(), pools, pkg_idx, [&](uint64_t task, size_t worker) {
        // Cast to correct type so type deduction works.
        Activation(activation, c1.Row(task), static_cast<const T*>(nullptr),
                   c1.Cols(), worker);
      });
+}
 
-  const bool add_bias = layer_weights->layer_config.ff_biases;
+template <class Mat>
+HWY_NOINLINE void ActivationBatched(ActivationType activation, Mat& c1,
+                                    const Mat* c2, NestedPools& pools) {
+  using T = typename Mat::T;
+  HWY_DASSERT(!c2 || c1.SameShape(*c2));
+  const size_t pkg_idx = 0;
+  if (c2 && c2->HasPtr()) {
+    SmallParallelFor(c1.Rows(), pools, pkg_idx,
+                     [&](uint64_t task, size_t worker) {
+                       Activation(activation, c1.Row(task), c2->Row(task),
+                                  c1.Cols(), worker);
+                     });
+  } else {  // No multiplier
+    SmallParallelFor(
        c1.Rows(), pools, pkg_idx, [&](uint64_t task, size_t worker) {
          Activation(activation, c1.Row(task), static_cast<const T*>(nullptr),
                     c1.Cols(), worker);
        });
+  }
+}
+
+template <typename T>
+HWY_NOINLINE void ResidualConnection(const MatPtrT<T>& other,
+                                     MatPtrT<T>& HWY_RESTRICT x,
+                                     const LayerWeights& layer,
+                                     bool is_attention, ThreadingContext& ctx) {
+  // ResidualType::Add
+  AddFromBatched(other, x, ctx);
+}
+
+template <typename T>
+void PostNorm(PostNormType post_norm, const MatPtr& weights,
+              MatPtrT<T>& inout, ThreadingContext& ctx) {
+  HWY_DASSERT(weights.Rows() == 1);
+  if (post_norm == PostNormType::Scale) {
+    RMSNormInplaceBatched(weights, inout, ctx);
+  }
+}
+
+static inline void FFWNoVit(const LayerWeightsPtrs& layer,
+                            Activations& activations, MatMulEnv& env) {
+  PROFILER_ZONE("Gen.FFW");
+  const LayerConfig& layer_config = layer.layer_config;
+  const size_t ffh_hidden_dim = layer_config.ff_hidden_dim;
+
+  const bool add_bias = layer_config.ff_biases;
   const float* bias1 =
-      add_bias ? layer_weights->ffw_gating_biases.data_scale1() : nullptr;
+      add_bias ? layer.ffw_gating_biases.PackedScale1() : nullptr;
   const float* bias2 = add_bias ? bias1 + ffh_hidden_dim : nullptr;
   const float* output_bias =
-      add_bias ? layer_weights->ffw_output_biases.data_scale1() : nullptr;
-
-  // Define slightly more readable names for the weights and activations.
-  const auto x =
-      ConstMatFromBatch(num_interleaved, activations.bf_pre_ffw_rms_out);
-
-  auto hidden_activations = RowPtrFromBatch(activations.C1);
-  auto multiplier = RowPtrFromBatch(activations.C2);
-  auto ffw_out = RowPtrFromBatch(activations.ffw_out);
-
-  // gating_einsum_w holds two half-matrices. We plan to change the importer to
-  // avoid this confusion by splitting into gating_einsum_w1 and
-  // gating_einsum_w2.
-  const bool split = !!layer_weights->gating_einsum_w.data();
-  auto w1 = split ? ConstMatFromWeights(layer_weights->gating_einsum_w)
-                  : ConstMatFromWeights(layer_weights->gating_einsum_w1);
-  auto w2 = split ? ConstMatFromWeights(layer_weights->gating_einsum_w,
-                                        model_dim * ffh_hidden_dim)
-                  : ConstMatFromWeights(layer_weights->gating_einsum_w2);
-  if (split) {
-    // Ensure that B.Extents().row matches C.Cols() because MatMul checks that.
-    w1.ShrinkRows(ffh_hidden_dim);
-    w2.ShrinkRows(ffh_hidden_dim);
-  }
-  auto w_output = ConstMatFromWeights(layer_weights->linear_w);
+      add_bias ? layer.ffw_output_biases.PackedScale1() : nullptr;
 
   // Compute the hidden layer activations.
-  MatMul(x, w1, bias1, *activations.env, hidden_activations);
-  MatMul(x, w2, bias2, *activations.env, multiplier);
+  CallMatMul(activations.pre_ffw_rms_out, layer.gating_einsum_w1, bias1, env,
+             activations.C1);
+  CallMatMul(activations.pre_ffw_rms_out, layer.gating_einsum_w2, bias2, env,
+             activations.C2);
 
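The two CallMatMul products above fill C1 and C2; the activation step that follows combines them into the gated hidden state. A scalar sketch of the gated FFW up to the hidden layer, using the tanh Gelu approximation (plain arrays instead of gemma.cpp matrix types):

    #include <cmath>
    #include <cstddef>

    inline float GeluTanh(float x) {
      const float kSqrt2OverPi = 0.7978845608f;
      return 0.5f * x *
             (1.0f + std::tanh(kSqrt2OverPi * (x + 0.044715f * x * x * x)));
    }

    // hidden = gelu(W1 x) * (W2 x): C1 holds the gelu branch, C2 the gate.
    inline void GatedHidden(const float* w1, const float* w2,  // [hidden, model]
                            const float* x, size_t model_dim, size_t hidden_dim,
                            float* hidden) {
      for (size_t h = 0; h < hidden_dim; ++h) {
        float dot1 = 0.0f, dot2 = 0.0f;
        for (size_t m = 0; m < model_dim; ++m) {
          dot1 += w1[h * model_dim + m] * x[m];
          dot2 += w2[h * model_dim + m] * x[m];
        }
        hidden[h] = GeluTanh(dot1) * dot2;
      }
    }

The final output projection (W_out applied to `hidden`, plus output_bias) then maps back to model_dim, as in the matmul that closes FFWNoVit below.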
   // Activation (Gelu) and maybe multiply by gate. Store activations in act.
-  Activation(layer_weights->layer_config.activation, hidden_activations.Row(0),
-             multiplier.Row(0), ffh_hidden_dim * num_interleaved);
+  ActivationBatched(layer_config.activation, activations.C1, &activations.C2,
+                    env.ctx.pools);
 
   // Hidden layer -> output layer.
-  auto activations_mat = MakeConstMat(
-      hidden_activations.Row(0), Extents2D(num_interleaved, ffh_hidden_dim),
-      hidden_activations.Stride());
-
-  MatMul(activations_mat, w_output, output_bias, *activations.env, ffw_out);
-}
+  CallMatMul(activations.C1, layer.linear_w, output_bias, env,
+             activations.ffw_out);
+}
 
-// Same as FFWNoVit, but with different layer_weights members and no second
-// gating matrix.
-template <typename T>
-HWY_NOINLINE void FFWVit(Activations& activations, size_t num_interleaved,
-                         const LayerWeightsPtrs<T>* layer_weights) {
-  PROFILER_ZONE("Gen.FFW");
-  const size_t ff_hidden_dim = layer_weights->layer_config.ff_hidden_dim;
-  HWY_DASSERT(num_interleaved <= activations.bf_pre_ffw_rms_out.BatchSize());
-
-  const bool add_bias = layer_weights->layer_config.ff_biases;
-  const float* bias1 =
-      add_bias ? layer_weights->vit.linear_0_b.data_scale1() : nullptr;
-  const float* output_bias =
-      add_bias ? layer_weights->vit.linear_1_b.data_scale1() : nullptr;
-
-  // Define slightly more readable names for the weights and activations.
-  const auto x =
-      ConstMatFromBatch(num_interleaved, activations.bf_pre_ffw_rms_out);
-
-  auto hidden_activations = RowPtrFromBatch(activations.C1);
-  auto ffw_out = RowPtrFromBatch(activations.ffw_out);
-
-  auto w1 = ConstMatFromWeights(layer_weights->vit.linear_0_w);
-  auto w_output = ConstMatFromWeights(layer_weights->vit.linear_1_w);
-
-  // Compute the hidden layer activations.
-  MatMul(x, w1, bias1, *activations.env, hidden_activations);
-
-  // Activation (Gelu), store in act.
-  RowPtrF multiplier = RowPtrF(nullptr, 0);
-  Activation(layer_weights->layer_config.activation, hidden_activations.Row(0),
-             multiplier.Row(0), ff_hidden_dim * num_interleaved);
-
-  // Hidden layer -> output layer.
-  auto activations_mat = MakeConstMat(hidden_activations.Row(0),
-                                      Extents2D(num_interleaved, ff_hidden_dim),
-                                      hidden_activations.Stride());
-
-  MatMul(activations_mat, w_output, output_bias, *activations.env, ffw_out);
-}
-
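EmbedMMToken, defined next, falls through to a plain table lookup for text tokens. A minimal float-only sketch of that path, assuming EmbeddingScaling(model_dim) is sqrt(model_dim) as in the Gemma convention, with the tensor's quantization scale folded into the same multiply (names are illustrative):

    #include <cmath>
    #include <cstddef>

    inline void EmbedTokenScalar(const float* embedding_table, size_t model_dim,
                                 int token, float tensor_scale, float* x) {
      // Scale embeddings up by sqrt(model_dim), per the Gemma convention.
      const float emb_scaling = std::sqrt(static_cast<float>(model_dim));
      const float* row = embedding_table + static_cast<size_t>(token) * model_dim;
      for (size_t i = 0; i < model_dim; ++i) {
        x[i] = emb_scaling * tensor_scale * row[i];
      }
    }

The real code additionally decompresses the (possibly quantized) embedding row via DecompressAndZeroPad before scaling, and handles image-token rows by copying from `image_tokens` instead.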
-// `batch_idx` indicates which row of `x` to write to.
-// `pos` is the *token*'s position, not the start of the batch, because this is
-// called for batches of tokens in prefill, but batches of queries in decode.
-//
-// For GEMMA_VLM, image tokens are copied into -2 locations (per the Gemma 3
-// spec) until we run out of image tokens. This allows for a multi-image prompt
-// if -2 locations with appropriate begin/end image tokens are created by the
-// calling application.
-template <typename T>
-HWY_NOINLINE void EmbedMMToken(int token, size_t batch_idx, size_t pos,
-                               size_t pos_in_prompt,
-                               const ModelWeightsPtrs<T>& weights,
-                               RowVectorBatch<float>& x,
-                               const ImageTokens* image_tokens,
-                               size_t& image_token_position) {
-  // Image tokens just need to be copied.
-  if (weights.weights_config.wrapping == PromptWrapping::GEMMA_VLM &&
-      image_tokens != nullptr && token == -2 &&
-      image_token_position < image_tokens->BatchSize()) {
-    hwy::CopyBytes(image_tokens->Batch(image_token_position),
-                   x.Batch(batch_idx), x.Cols() * sizeof(x.Const()[0]));
-    image_token_position++;
-    return;
-  }
-
-  if (weights.weights_config.wrapping == PromptWrapping::PALIGEMMA &&
-      image_tokens != nullptr && pos_in_prompt < image_tokens->BatchSize()) {
-    hwy::CopyBytes(image_tokens->Batch(pos_in_prompt), x.Batch(batch_idx),
-                   x.Cols() * sizeof(x.Const()[0]));
-    return;
-  }
-
-  const size_t model_dim = weights.weights_config.model_dim;
-  const size_t vocab_size = weights.weights_config.vocab_size;
-  const float emb_scaling = EmbeddingScaling(model_dim);
-
-  HWY_DASSERT(token >= 0);
-  HWY_DASSERT(token < static_cast<int>(vocab_size));
-
-  const hn::ScalableTag<float> df;
-  DecompressAndZeroPad(
-      df,
-      MakeSpan(weights.embedder_input_embedding.data(), vocab_size * model_dim),
-      token * model_dim, x.Batch(batch_idx), model_dim);
-  MulByConst(emb_scaling * weights.embedder_input_embedding.scale(),
-             x.Batch(batch_idx), model_dim);
-  if (weights.weights_config.absolute_pe) {
-    AddAbsolutePositionalEmbeddings(x.Batch(batch_idx), model_dim, pos);
-  }
-}
-
-// `batch_idx` indicates which row of `x` to write to.
-// `pos` is the *token*'s position, not the start of the batch, because this is
-// called for batches of tokens in prefill, but batches of queries in decode.
-// This version of the function doesn't track internal image token position.
-template <typename T>
-HWY_NOINLINE void EmbedToken(int token, size_t batch_idx, size_t pos,
-                             size_t pos_in_prompt,
-                             const ModelWeightsPtrs<T>& weights,
-                             RowVectorBatch<float>& x,
-                             const ImageTokens* image_tokens) {
-  size_t image_token_position = 0;
-  EmbedMMToken(token, batch_idx, pos, pos_in_prompt, weights, x,
-               image_tokens, image_token_position);
-}
-
-template <typename T>
-HWY_NOINLINE void ResidualConnection(
-    size_t num_interleaved, T* HWY_RESTRICT other, T* HWY_RESTRICT x,
-    const LayerWeightsPtrs<T>* layer_weights, bool is_attention) {
-  // ResidualType::Add
-  AddFromBatched(num_interleaved, other, x,
-                 layer_weights->layer_config.model_dim);
-}
-
-template <typename WeightT, typename InOutT>
-void PostNorm(PostNormType post_norm, size_t num_interleaved,
-              const WeightT& weights, InOutT* inout) {
-  if (post_norm == PostNormType::Scale) {
-    RMSNormInplaceBatched(num_interleaved, weights.data_scale1(), inout,
-                          weights.NumElements());
-  }
-}
-
-template <typename T>
-HWY_NOINLINE void TransformerLayer(const QueriesPos& queries_pos,
-                                   const QueriesPos& queries_prefix_end,
-                                   size_t num_tokens, size_t cache_layer_idx,
-                                   const LayerWeightsPtrs<T>* layer_weights,
-                                   Activations& activations,
-                                   const hwy::Divisor& div_seq_len,
-                                   const KVCaches& kv_caches) {
-  const size_t model_dim = activations.weights_config.model_dim;
-  const size_t num_interleaved = num_tokens * queries_pos.size();
-  auto type = layer_weights->layer_config.type;
-
-  RMSNormBatched(num_interleaved, activations.x.All(),
-                 layer_weights->pre_attention_norm_scale.data_scale1(),
-                 activations.pre_att_rms_out.All(), model_dim);
-
-  Attention(type, queries_pos, queries_prefix_end, num_tokens, cache_layer_idx,
-            activations, layer_weights, div_seq_len, kv_caches);
-
-  PostNorm(layer_weights->layer_config.post_norm, num_interleaved,
-           layer_weights->post_attention_norm_scale,
-           activations.att_sums.All());
-
-  ResidualConnection(num_interleaved, activations.att_sums.All(),
-                     activations.x.All(), layer_weights, /*is_attention=*/true);
-
-  RMSNormBatched(num_interleaved, activations.x.All(),
-                 layer_weights->pre_ffw_norm_scale.data_scale1(),
-                 activations.bf_pre_ffw_rms_out.All(), model_dim);
-
-  if (layer_weights->layer_config.type == LayerAttentionType::kVit) {
-    FFWVit(activations, num_interleaved, layer_weights);
-  } else {
-    FFWNoVit(activations, num_interleaved, layer_weights);
-  }
-
-  PostNorm(layer_weights->layer_config.post_norm, num_interleaved,
-           layer_weights->post_ffw_norm_scale, activations.ffw_out.All());
-
-  ResidualConnection(num_interleaved, activations.ffw_out.All(),
-                     activations.x.All(), layer_weights,
-                     /*is_attention=*/false);
-}
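TransformerLayer above wires the standard pre-norm residual block. The control flow, stripped of batching and SIMD, with the kernels passed in as stand-ins (an illustrative sketch, not the gemma.cpp API):

    #include <cstddef>
    #include <functional>
    #include <vector>

    using Vec = std::vector<float>;
    using Fn = std::function<Vec(const Vec&)>;

    inline Vec LayerForward(const Vec& x_in, const Fn& rms_norm,
                            const Fn& attention, const Fn& ffw) {
      Vec x = x_in;
      const Vec att = attention(rms_norm(x));  // pre-attention norm
      for (size_t i = 0; i < x.size(); ++i) x[i] += att[i];  // residual add
      const Vec ff = ffw(rms_norm(x));         // pre-FFW norm
      for (size_t i = 0; i < x.size(); ++i) x[i] += ff[i];   // residual add
      return x;
    }

The optional PostNorm (Gemma 2 style) would rescale `att` and `ff` before the residual adds; with PostNormType::None it is the identity, which is why PostNorm above is a no-op unless the config says Scale.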
-
-// Vit transformer layer. Some comments below refer to the Vit implementation
-// in the Big Vision codebase. See
-// github.com/google-research/big_vision/blob/main/big_vision/models/vit.py
-// TODO(keysers): consider adding a wrapper for both LayerNorm with RMSNorm and
-// try merging this with TransformerLayer.
-template <typename T>
-HWY_NOINLINE void VitTransformerLayer(size_t num_tokens, size_t layer,
-                                      const LayerWeightsPtrs<T>* layer_weights,
-                                      Activations& activations) {
-  const size_t model_dim = activations.weights_config.model_dim;
-  auto type = layer_weights->layer_config.type;
-  HWY_DASSERT(type == LayerAttentionType::kVit);
-  (void)type;
-
-  auto& x = activations.x;
-  HWY_DASSERT(x.BatchSize() == num_tokens);
-  HWY_DASSERT(x.Cols() == model_dim);
-
-  // y = nn.LayerNorm()(x)
-  // y ~ pre_att_rms_out
-  LayerNormBatched(num_tokens, x.All(),
-                   layer_weights->vit.layer_norm_0_scale.data_scale1(),
-                   layer_weights->vit.layer_norm_0_bias.data_scale1(),
-                   activations.pre_att_rms_out.All(), model_dim);
-
-  // y = out["sa"] = nn.MultiHeadDotProductAttention(...)(y, y)
-  // y ~ att_sums
-  VitAttention<T>(num_tokens, layer, activations, layer_weights)();
-
-  // x = out["+sa"] = x + y
-  AddFromBatched(num_tokens, activations.att_sums.All(), x.All(), model_dim);
-
-  // y = nn.LayerNorm()(x)
-  // y ~ bf_pre_ffw_rms_out
-  LayerNormBatched(num_tokens, x.All(),
-                   layer_weights->vit.layer_norm_1_scale.data_scale1(),
-                   layer_weights->vit.layer_norm_1_bias.data_scale1(),
-                   activations.bf_pre_ffw_rms_out.All(), model_dim);
-
-  // y = out["mlp"] = MlpBlock(...)(y)
-  // y ~ ffw_out
-  FFWVit(activations, num_tokens, layer_weights);
-
-  // x = out["+mlp"] = x + y
-  AddFromBatched(num_tokens, activations.ffw_out.All(), x.All(), model_dim);
-}
-
-// Prefill() and Transformer() increment positions in-place.
-using QueriesMutablePos = hwy::Span<size_t>;
-
-// Populates KV cache for batches of tokens from one query at a time.
-template <typename T>
-HWY_NOINLINE void Prefill(
-    const QueriesPromptTokens& queries_prompt,
-    const QueriesMutablePos& queries_pos, const QueriesPos& queries_prefix_end,
-    const size_t query_idx_start, const ModelWeightsPtrs<T>& weights,
-    Activations& activations, const RuntimeConfig& runtime_config,
-    const hwy::Divisor& div_seq_len, const KVCaches& kv_caches) {
-  PROFILER_ZONE("Gen.Prefill");
-  const size_t num_queries = queries_prompt.size();
-  HWY_DASSERT(queries_pos.size() == num_queries);
-  HWY_DASSERT(queries_prefix_end.size() == num_queries);
-  HWY_DASSERT(kv_caches.size() == num_queries);
-
-  // Batches are important for amortizing loading weights over multiple tokens.
-  // This is possible in prefill because we know all tokens beforehand, whereas
-  // decode depends on the previous output token. However, each prefill batch
-  // of a query requires that preceding batches already wrote to the KV cache,
-  // hence we sequentially loop over token batches. We can reduce the number of
-  // iterations by increasing the batch size, but this also increases
-  // arithmetic intensity, and so we are eventually compute-limited. We could
-  // devote some threads to parallelizing over queries, but for simplicity we
-  // assign them all to MatMul.
-  const size_t max_tbatch_size = activations.x.BatchSize();
-
-  // For each query. `qi` is within the batch, not the global query index.
-  for (size_t qi = 0; qi < num_queries; ++qi) {
-    // Single query at a time, so pass slices of the spans because
-    // GemmaAttention will only access the first KV cache and position.
-    QueriesPos single_query_pos(&queries_pos[qi], 1);
-    QueriesPos single_query_prefix_end(&queries_prefix_end[qi], 1);
-    KVCaches single_kv_cache(&kv_caches[qi], 1);
-
-    const size_t prompt_size = queries_prompt[qi].size();
-    // In autoregressive mode, we don't need to prefill the last token, so - 1.
-    size_t prefill_this_query = prompt_size - 1;
-    const size_t prefix_end_this_query = queries_prefix_end[qi];
-    // We can't attend beyond the prompt_size.
-    HWY_ASSERT(prefix_end_this_query <= prompt_size);
-    // Special case: if the prefix includes the last token, we need to prefill
-    // the last token, too. However, we need to rewind this for the generation
-    // of the first token. So we need to keep track of this.
-    // TODO: consider implementing masking instead of this logic?
-    const bool attend_to_last_token =
-        (prefill_this_query < prefix_end_this_query);
-    if (attend_to_last_token) {
-      // The difference can be at most 1.
-      prefill_this_query += 1;
-      HWY_ASSERT(prefill_this_query == prefix_end_this_query);
-    }
-    // In prefix-LM mode, we need to look at all the tokens for the prefix in
-    // one iteration through the layers, so we need a large enough batch size.
-    HWY_ASSERT(prefix_end_this_query == 0 ||
-               max_tbatch_size >= prefill_this_query);
-
-    // For each batch of tokens in the query:
-    for (size_t tbatch_start = 0; tbatch_start < prefill_this_query;
-         tbatch_start += max_tbatch_size) {
-      const size_t tbatch_size =
-          HWY_MIN(max_tbatch_size, prefill_this_query - tbatch_start);
-
-      // Fill activations.x (much faster than TransformerLayer).
-      size_t image_token_position = 0;
-      for (size_t ti = 0; ti < tbatch_size; ++ti) {
-        const size_t pos = queries_pos[qi] + ti;
-        const size_t pos_in_prompt = tbatch_start + ti;
-        const int token = queries_prompt[qi][pos_in_prompt];
-        EmbedMMToken(token, ti, pos, pos_in_prompt, weights, activations.x,
-                     runtime_config.image_tokens, image_token_position);
-      }
-
-      // Transformer with one batch of tokens from a single query.
-      for (size_t layer = 0;
-           layer < weights.weights_config.layer_configs.size(); ++layer) {
-        const auto* layer_weights = weights.GetLayer(layer);
-        TransformerLayer(single_query_pos, single_query_prefix_end, tbatch_size,
-                         layer, layer_weights, activations, div_seq_len,
-                         single_kv_cache);
-      }
-
-      // NOTE: we unconditionally call StreamToken, even if EOS.
-      for (size_t ti = 0; ti < tbatch_size; ++ti) {
-        const size_t pos = queries_pos[qi] + ti;
-        const size_t pos_in_prompt = tbatch_start + ti;
-        const int token = queries_prompt[qi][pos_in_prompt];
-        if (pos_in_prompt < prompt_size - 1) {
-          runtime_config.StreamToken(query_idx_start + qi, pos, token, 0.0f);
-        } else {
-          // The last token will be streamed later and we should only get here
-          // if we need to attend to the last token because it is in the
-          // prefix.
-          HWY_ASSERT(attend_to_last_token);
-        }
-      }
-
-      queries_pos[qi] += tbatch_size;
-    }  // for tbatch_start
-    if (attend_to_last_token) {
-      // We need to rewind the position for the last token that we only
-      // attended to to make sure the prefix LM sees everything.
-      // This means we duplicate work on the last prompt token in
-      // autoregressive decoding. Alternatives: (1) real masking; (2) always
-      // prefill the last token and only generate the next one from the
-      // already prefilled activations.
-      queries_pos[qi] -= 1;
-    }
-  }
-}
-
-// Gets the patches of the image and embeds them with the image embedding
-// kernel. The result is stored in activations.x.
-template <typename T>
-HWY_NOINLINE void EmbedImagePatches(const Image& image,
-                                    const ModelWeightsPtrs<T>& weights,
-                                    Activations& activations) {
-  const size_t model_dim = weights.weights_config.vit_config.model_dim;
-  const size_t patch_width = weights.weights_config.vit_config.patch_width;
-  const size_t seq_len = weights.weights_config.vit_config.seq_len;
-  const size_t patch_size = patch_width * patch_width * 3;
-  HWY_DASSERT(weights.vit_img_embedding_kernel.NumElements() ==
-              patch_size * model_dim);
-  HWY_DASSERT(activations.x.Cols() == model_dim);
-  std::vector<hwy::AlignedFreeUniquePtr<float[]>> image_patches(seq_len);
-  for (size_t i = 0; i < seq_len; ++i) {
-    image_patches[i] = hwy::AllocateAligned<float>(patch_size);
-    image.GetPatch(i, image_patches[i].get());
-  }
-  // img/embedding/kernel has original shape (14, 14, 3, 1152)
-  // H x W x C x D transposed to D x (H x W x C) so here (1152, 14 * 14 * 3)
-  // image_patches is (256, 14 * 14 * 3)
-  // This could be done as one MatMul like:
-  //   RowVectorBatch<float> image_patches(kSeqLen, kPatchSize);
-  //   [Get patches]
-  //   MatMul(
-  //       MatFromBatch(kVitSeqLen, image_patches),
-  //       MatFromWeights(weights.vit_img_embedding_kernel),
-  //       weights.vit_img_embedding_bias.data_scale1(), *activations.env,
-  //       RowPtrF(activations.x.All(), kVitModelDim));
-  // However, MatMul currently requires that
-  //   A.cols % (2 * hn::Lanes(hn::ScalableTag<float>())) == 0
-  // which is not the case here. We should relax that requirement on MatMul
-  // and then use the above. For now, we rely on MatVecAdd instead.
-  for (size_t i = 0; i < seq_len; ++i) {
-    MatVecAdd(
-        weights.vit_img_embedding_kernel, 0, model_dim, patch_size,
-        image_patches[i].get(), weights.vit_img_embedding_bias.data_scale1(),
-        activations.x.Batch(i), activations.env->parallel.Pools().Pool(0));
-  }
-  // Add position embeddings.
-  AddFrom(weights.vit_img_pos_embedding.data_scale1(), activations.x.All(),
-          seq_len * model_dim);
-}
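The shape comments in EmbedImagePatches imply the following arithmetic for the PaliGemma 224px configuration (patch width 14, RGB), which is where the 256 and 588 in the comments come from:

    #include <cstddef>

    constexpr size_t kImageSize = 224, kPatchWidth = 14, kChannels = 3;
    constexpr size_t kPatchesPerSide = kImageSize / kPatchWidth;          // 16
    constexpr size_t kVitSeqLen = kPatchesPerSide * kPatchesPerSide;      // 256
    constexpr size_t kPatchSize = kPatchWidth * kPatchWidth * kChannels;  // 588
    static_assert(kVitSeqLen == 256 && kPatchSize == 588,
                  "matches the (256, 14 * 14 * 3) layout in the comments");

Each of the 256 patches is a 588-float vector, multiplied by the [1152, 588] embedding kernel via MatVecAdd to produce one ViT token of model_dim 1152.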
-
-// Prefills the image tokens with the ViT encoder.
-template <typename T>
-HWY_NOINLINE void PrefillVit(const ModelWeightsPtrs<T>& weights,
-                             const RuntimeConfig& runtime_config,
-                             const Image& image, ImageTokens& image_tokens,
-                             Activations& activations) {
-  PROFILER_ZONE("Gen.PrefillVit");
-  const size_t num_tokens = weights.weights_config.vit_config.seq_len;
-  const size_t vit_model_dim = weights.weights_config.vit_config.model_dim;
-  HWY_ASSERT(num_tokens == activations.x.BatchSize());
-  // Embed the image patches.
-  EmbedImagePatches(image, weights, activations);
-  // Go through all layers.
-  for (size_t layer = 0;
-       layer < weights.weights_config.vit_config.layer_configs.size();
-       ++layer) {
-    const auto* layer_weights = weights.GetVitLayer(layer);
-    VitTransformerLayer(num_tokens, layer, layer_weights, activations);
-  }
-  // Final Layernorm.
-  LayerNormBatched(num_tokens, activations.x.All(),
-                   weights.vit_encoder_norm_scale.data_scale1(),
-                   weights.vit_encoder_norm_bias.data_scale1(),
-                   activations.x.All(), vit_model_dim);
-
-  if (weights.weights_config.wrapping == PromptWrapping::GEMMA_VLM) {
-    activations.x = AvgPool4x4(activations.x);
-
-    // Apply soft embedding norm before input projection.
-    RMSNormInplace(weights.mm_embed_norm.data_scale1(), activations.x.All(),
-                   vit_model_dim);
-  }
-
-  // Apply head embedding into image_tokens of size of the LLM kModelDim.
-  MatMul(ConstMatFromBatch(activations.x.BatchSize(), activations.x),
-         ConstMatFromWeights(weights.vit_img_head_kernel),
-         weights.vit_img_head_bias.data_scale1(), *activations.env,
-         RowPtrFromBatch(image_tokens));
-}
-
-// Generates one token for each query. `queries_token` is the previous token
-// from each query, and `queries_pos` are their position in the sequence.
-template <typename T>
-HWY_NOINLINE void Transformer(
-    const QueriesToken& queries_token, const QueriesMutablePos& queries_pos,
-    const QueriesPos& queries_prefix_end, const ModelWeightsPtrs<T>& weights,
-    Activations& activations, const hwy::Divisor& div_seq_len,
-    const KVCaches& kv_caches, const LayersOutputFunc& layers_output,
-    const ActivationsObserverFunc& activations_observer) {
-  const size_t model_dim = weights.weights_config.model_dim;
-  const size_t num_queries = queries_token.size();
-  HWY_DASSERT(queries_pos.size() == num_queries);
-  HWY_DASSERT(queries_prefix_end.size() == num_queries);
-
-  if (layers_output) {
-    for (size_t query_idx = 0; query_idx < num_queries; ++query_idx) {
-      const float token_f = queries_token[query_idx];
-      layers_output(query_idx, queries_pos[query_idx], "tokens", -1, &token_f,
-                    1);
-    }
-  }
-
-  size_t image_token_position = 0;
-  for (size_t query_idx = 0; query_idx < num_queries; ++query_idx) {
-    EmbedMMToken(queries_token[query_idx], query_idx, queries_pos[query_idx],
-                 /*pos_in_prompt=*/0, weights, activations.x,
-                 /*image_tokens=*/nullptr, image_token_position);
-  }
-
-  for (size_t layer = 0; layer < weights.c_layers.size(); ++layer) {
-    const LayerWeightsPtrs<T>* layer_weights = weights.GetLayer(layer);
-    TransformerLayer(queries_pos, queries_prefix_end, /*num_tokens=*/1, layer,
-                     layer_weights, activations, div_seq_len, kv_caches);
-
-    if (activations_observer) {
-      activations_observer(queries_pos, layer, activations);
-    }
-  }
-
-  RMSNormInplaceBatched(num_queries, weights.final_norm_scale.data_scale1(),
-                        activations.x.All(), model_dim);
-
-  if (activations_observer) {
-    activations_observer(queries_pos, -1, activations);
-  }
-  for (size_t query_idx = 0; query_idx < num_queries; ++query_idx) {
-    queries_pos[query_idx] += 1;
-  }
-}
-
-// Placeholder for internal test3, do not remove
-
-// Returns the maximum number of prompt tokens across all queries.
-static size_t MaxQueryLength(const QueriesPromptTokens& queries_prompt) {
-  size_t max_prompt_size = 0;
-  for (size_t i = 0; i < queries_prompt.size(); ++i) {
-    max_prompt_size = std::max(max_prompt_size, queries_prompt[i].size());
-  }
-  return max_prompt_size;
-}
-
-// Holds "is at end of stream" state for each query.
-class TokenStreamer {
- public:
-  explicit TokenStreamer(const RuntimeConfig& runtime_config,
-                         const ModelConfig& model_config)
-      : runtime_config_(runtime_config), model_config_(model_config) {}
-
-  // Returns whether the query was already at, or has just reached, the end of
-  // the stream: either via token == eos_id, or StreamToken returning false.
-  bool operator()(size_t query_idx, size_t pos, int token, float prob) {
-    if (HWY_UNLIKELY(is_eos_.Get(query_idx))) return true;
-
-    if (!runtime_config_.StreamToken(query_idx, pos, token, prob) ||
-        model_config_.IsEOS(token)) {
-      is_eos_.Set(query_idx);
-      return true;
-    }
-
-    return false;
-  }
-
- private:
-  const RuntimeConfig& runtime_config_;
-  const ModelConfig& model_config_;
-  hwy::BitSet4096<> is_eos_;
-};
-
-HWY_INLINE SampleFunc ChooseSampleFunc(const RuntimeConfig& runtime_config) {
-  // If user provided a sample_func, use it.
-  if (runtime_config.sample_func) return runtime_config.sample_func;
-
-  // Fast path for top-1 with no accept_token.
-  if (runtime_config.top_k == 1 && !runtime_config.accept_token) {
-    return [](float* logits, size_t vocab_size) HWY_ATTR -> TokenAndProb {
-      PROFILER_ZONE("Gen.Sample Top1");
-      return Top1OfSoftmax(logits, vocab_size);
-    };
-  }
-
-  // General case: Softmax with top-k sampling.
-  return [&runtime_config](float* logits,
-                           size_t vocab_size) HWY_ATTR -> TokenAndProb {
-    PROFILER_ZONE("Gen.Sample general");
-    return FusedSoftmaxAndSampleTopK(
-        logits, runtime_config.top_k, vocab_size, *runtime_config.gen,
-        runtime_config.temperature, runtime_config.accept_token);
-  };
-}
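ChooseSampleFunc's general case delegates to FusedSoftmaxAndSampleTopK. An unfused, illustrative equivalent (no accept_token filter; assumes k <= vocab_size and temperature > 0), not the library's implementation:

    #include <algorithm>
    #include <cmath>
    #include <cstddef>
    #include <numeric>
    #include <random>
    #include <vector>

    inline int SampleTopK(const float* logits, size_t vocab_size, size_t k,
                          float temperature, std::mt19937& gen) {
      // Indices of the k largest logits, best first.
      std::vector<size_t> idx(vocab_size);
      std::iota(idx.begin(), idx.end(), 0);
      std::partial_sort(idx.begin(), idx.begin() + k, idx.end(),
                        [&](size_t a, size_t b) { return logits[a] > logits[b]; });
      // Temperature-scaled softmax over the k best; subtract max for stability.
      std::vector<float> probs(k);
      const float max_logit = logits[idx[0]];
      for (size_t i = 0; i < k; ++i) {
        probs[i] = std::exp((logits[idx[i]] - max_logit) / temperature);
      }
      // discrete_distribution normalizes the weights internally.
      std::discrete_distribution<size_t> dist(probs.begin(), probs.end());
      return static_cast<int>(idx[dist(gen)]);
    }

The top-1 fast path above is the k == 1 special case: argmax of the logits, with the softmax evaluated only to report the chosen token's probability.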
-
-// Generates one continuation for each query in `queries_prompt`, which is one
-// qbatch whose size is at most the `batch_size` passed to
-// `activations.Allocate`.
-//
-// `queries_pos` stores the KV cache position for each query. In the first turn
-// of a chat, pos = 0; we increment each query's position after each token.
-//
-// `query_idx_start` is the query_idx of the first query in the batch, so that
-// `StreamFunc` gets the global query index, not relative to the batch.
-//
-// `kv_caches` is for the batch, size must match `queries_prompt`.
-template <typename T>
-void GenerateT(const ModelWeightsStorage& model, Activations& activations,
-               const RuntimeConfig& runtime_config,
-               const QueriesPromptTokens& queries_prompt,
-               const QueriesPos& queries_pos_in,
-               const QueriesPos& queries_prefix_end,
-               const size_t query_idx_start, const KVCaches& kv_caches,
-               TimingInfo& timing_info) {
-  // Griffin assumes that the recurrent block cache is zero-initialized.
-  for (size_t i = 0; i < kv_caches.size(); ++i) {
-    if (queries_pos_in[i] == 0) {
-      kv_caches[i].ZeroGriffinCache();  // No-op for non-Griffin models.
-    }
-  }
-
-  // Copy so we can increment without requiring users to pass in a mutable span.
-  std::vector<size_t> queries_pos_copy(queries_pos_in.cbegin(),
-                                       queries_pos_in.cend());
-  const QueriesMutablePos queries_mutable_pos(queries_pos_copy.data(),
-                                              queries_pos_copy.size());
-
-  // Sanity check: prompts should not be empty, nor start with EOS.
-  for (size_t query_idx = 0; query_idx < queries_prompt.size(); ++query_idx) {
-    const PromptTokens& prompt = queries_prompt[query_idx];
-    HWY_ASSERT(prompt.size() != 0 && prompt[0] != runtime_config.eos_id);
-  }
-
-  const size_t num_queries = queries_prompt.size();
-  HWY_ASSERT(num_queries <= 4096);  // TokenStreamer uses BitSet4096.
-  HWY_ASSERT(num_queries <= activations.x.BatchSize());
-  HWY_ASSERT(queries_pos_in.size() == num_queries);
-  HWY_ASSERT(kv_caches.size() == num_queries);
-  const hwy::Divisor div_seq_len(static_cast<uint32_t>(kv_caches[0].seq_len));
-  const ModelWeightsPtrs<T>& weights = *model.GetWeightsOfType<T>();
-  size_t max_prompt_size = MaxQueryLength(queries_prompt);
-  size_t max_generated_tokens = runtime_config.max_generated_tokens;
-  RangeChecks(weights.weights_config, max_generated_tokens, max_prompt_size);
-  const SampleFunc sample_token = ChooseSampleFunc(runtime_config);
-
-  // Prefill stops before each query's prompt_size - 1 because the last prompt
-  // token is the first input token for generation.
-  timing_info.prefill_start = hwy::platform::Now();
-  // If tbatch is larger than the qbatch we already have in `activations`, then
-  // allocate prefill_activations, otherwise reuse.
-  const bool use_prefill_activations =
-      runtime_config.prefill_tbatch_size > activations.x.BatchSize();
-  Activations prefill_activations(weights.weights_config);
-  if (use_prefill_activations) {
-    prefill_activations.Allocate(runtime_config.prefill_tbatch_size,
-                                 activations.env);
-  }
-  Prefill(queries_prompt, queries_mutable_pos, queries_prefix_end,
-          query_idx_start, weights,
-          use_prefill_activations ? prefill_activations : activations,
-          runtime_config, div_seq_len, kv_caches);
-  // Compute the number of tokens that were prefilled and notify timing_info.
-  size_t prefilled_tokens = 0;
-  for (size_t qi = 0; qi < num_queries; ++qi) {
-    prefilled_tokens += queries_prompt[qi].size() - 1;
-  }
-  timing_info.NotifyPrefill(prefilled_tokens);
-  // queries_pos are incremented by Prefill.
-
-  // Storage for the last generated token from each query, passed to the next
-  // Transformer() call.
-  std::vector<int> gen_tokens(num_queries);
-
-  // Stream the last prompt token from each query and fill gen_tokens.
-  TokenStreamer token_streamer(runtime_config, model.Config());
-  for (size_t query_idx = 0; query_idx < num_queries; ++query_idx) {
-    size_t last_token_pos_in_prompt =
-        queries_mutable_pos[query_idx] - queries_pos_in[query_idx];
-    gen_tokens[query_idx] = queries_prompt[query_idx][last_token_pos_in_prompt];
-    (void)token_streamer(query_idx_start + query_idx,
-                         queries_mutable_pos[query_idx], gen_tokens[query_idx],
-                         0.0f);
-  }
-
-  {
-    const size_t vocab_size = model.Config().vocab_size;
-    timing_info.generate_start = hwy::platform::Now();
-    for (size_t gen = 0; gen < max_generated_tokens; ++gen) {
-      bool all_queries_eos = DecodeStepT(
-          weights, runtime_config, queries_prompt, query_idx_start, kv_caches,
-          queries_prefix_end, div_seq_len, vocab_size, sample_token,
-          activations, token_streamer, gen_tokens,
-          timing_info, queries_mutable_pos);
-      if (all_queries_eos) break;
-    }  // foreach token to generate
-    timing_info.NotifyGenerateDone();
-  }
-}
-
-template <typename T>
-void GenerateSingleT(const ModelWeightsStorage& model,
-                     const RuntimeConfig& runtime_config,
-                     const PromptTokens& prompt, size_t pos, size_t prefix_end,
-                     KVCache& kv_cache, MatMulEnv* env,
-                     TimingInfo& timing_info) {
-  constexpr size_t kNumQueries = 1;
-  const size_t qbatch_start = 0;
-
-  // TODO: move into Gemma?
-  Activations activations(model.Config());
-  activations.Allocate(kNumQueries, env);
-
-  const QueriesPromptTokens queries_prompt(&prompt, kNumQueries);
-  QueriesPos queries_pos(&pos, kNumQueries);
-  const QueriesPos queries_prefix_end(&prefix_end, kNumQueries);
-  const KVCaches kv_caches{&kv_cache, kNumQueries};
-
-  GenerateT<T>(model, activations, runtime_config, queries_prompt, queries_pos,
-               queries_prefix_end, qbatch_start, kv_caches, timing_info);
-}
-
-template <typename T>
-void GenerateBatchT(const ModelWeightsStorage& model,
-                    const RuntimeConfig& runtime_config,
-                    const QueriesPromptTokens& queries_prompt,
-                    const QueriesPos& queries_pos,
-                    const QueriesPos& queries_prefix_end,
-                    const KVCaches& kv_caches, MatMulEnv* env,
-                    TimingInfo& timing_info) {
-  const size_t num_queries = queries_prompt.size();
-  HWY_ASSERT(queries_pos.size() == num_queries);
-  HWY_ASSERT(kv_caches.size() == num_queries);
-  // Griffin does not support query batching.
-  size_t max_qbatch_size = runtime_config.decode_qbatch_size;
-  for (const auto& layer_config : model.Config().layer_configs) {
-    if (layer_config.type == LayerAttentionType::kGriffinRecurrentBlock) {
-      max_qbatch_size = 1;
-      break;
-    }
-  }
-
-  Activations activations(model.Config());
-  activations.Allocate(max_qbatch_size, env);
-
-  for (size_t qbatch_start = 0; qbatch_start < num_queries;
-       qbatch_start += max_qbatch_size) {
-    // Generate one batch of tokens from `qbatch_size` queries.
-    const size_t qbatch_size =
-        HWY_MIN(num_queries - qbatch_start, max_qbatch_size);
-    const QueriesPromptTokens qbatch_prompts(&queries_prompt[qbatch_start],
-                                             qbatch_size);
-    QueriesPos qbatch_pos(&queries_pos[qbatch_start], qbatch_size);
-    const QueriesPos qbatch_prefix_end(&queries_prefix_end[qbatch_start],
-                                       qbatch_size);
-    const KVCaches qbatch_kv(&kv_caches[qbatch_start], qbatch_size);
-    GenerateT<T>(model, activations, runtime_config, qbatch_prompts, qbatch_pos,
-                 qbatch_prefix_end, qbatch_start, qbatch_kv, timing_info);
-  }
-}
-
-template <typename T>
-void GenerateImageTokensT(const ModelWeightsStorage& model,
-                          const RuntimeConfig& runtime_config,
-                          const Image& image, ImageTokens& image_tokens,
-                          MatMulEnv* env) {
-  if (model.Config().vit_config.layer_configs.empty()) {
-    HWY_ABORT("Model does not support generating image tokens.");
-  }
-  RuntimeConfig prefill_runtime_config = runtime_config;
-  ModelConfig vit_config = GetVitConfig(model.Config());
-  prefill_runtime_config.prefill_tbatch_size =
-      vit_config.seq_len / (vit_config.pool_dim * vit_config.pool_dim);
-  Activations prefill_activations(vit_config);
-  prefill_activations.Allocate(vit_config.seq_len, env);
-  // Weights are for the full PaliGemma model, not just the ViT part.
-  PrefillVit(*model.GetWeightsOfType<T>(), prefill_runtime_config, image,
-             image_tokens, prefill_activations);
+  CallMatMul(activations.C1, layer.linear_w, output_bias, env,
+             activations.ffw_out);
 }
 
+// NOLINTNEXTLINE(google-readability-namespace-comments)
 }  // namespace HWY_NAMESPACE
-
-#if HWY_ONCE
-
-// These are extern functions defined by instantiations/*.cc, which include this
-// 'header' after defining GEMMA_CONFIG, which is for function overloading.
-void GenerateSingle(  // NOLINT(misc-definitions-in-headers)
-    GEMMA_TYPE, const ModelWeightsStorage& model,
-    const RuntimeConfig& runtime_config, const PromptTokens& prompt, size_t pos,
-    size_t prefix_end, KVCache& kv_cache, MatMulEnv* env,
-    TimingInfo& timing_info) {
-  HWY_EXPORT_AND_DYNAMIC_DISPATCH_T(GenerateSingleT)
-  (model, runtime_config, prompt, pos, prefix_end, kv_cache, env, timing_info);
-}
-
-void GenerateBatch(  // NOLINT(misc-definitions-in-headers)
-    GEMMA_TYPE, const ModelWeightsStorage& model,
-    const RuntimeConfig& runtime_config,
-    const QueriesPromptTokens& queries_prompt, const QueriesPos& queries_pos,
-    const QueriesPos& queries_prefix_end, const KVCaches& kv_caches,
-    MatMulEnv* env, TimingInfo& timing_info) {
-  HWY_EXPORT_AND_DYNAMIC_DISPATCH_T(GenerateBatchT)
-  (model, runtime_config, queries_prompt, queries_pos, queries_prefix_end,
-   kv_caches, env, timing_info);
-}
-
-void GenerateImageTokens(  // NOLINT(misc-definitions-in-headers)
-    GEMMA_TYPE, const ModelWeightsStorage& model,
-    const RuntimeConfig& runtime_config, const Image& image,
-    ImageTokens& image_tokens, MatMulEnv* env) {
-  HWY_EXPORT_AND_DYNAMIC_DISPATCH_T(GenerateImageTokensT)
-  (model, runtime_config, image, image_tokens, env);
-}
-
-#endif  // HWY_ONCE
-
 }  // namespace gcpp
 HWY_AFTER_NAMESPACE();
diff --git a/gemma/gemma.cc b/gemma/gemma.cc
index bfc6534..496c21d 100644
--- a/gemma/gemma.cc
+++ b/gemma/gemma.cc
@@ -13,173 +13,654 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-// Defines Gemma member functions; the actual implementations are in
-// gemma-inl.h, included from instantiations/*.cc.
+// Defines Gemma member functions which dynamic-dispatch into the SIMD
+// implementations in gemma-inl.h.
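For readers new to Highway's multi-target build: the pattern this file now follows (compile the translation unit once per SIMD target via foreach_target.h, then export a single dispatch table) looks roughly like the sketch below. The kernel name and body are hypothetical; the macros are Highway's real API, used exactly as in this diff.

// demo_mul.cc -- hypothetical kernel; macro usage mirrors gemma.cc above.
#include <stddef.h>

#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "demo_mul.cc"
#include "hwy/foreach_target.h"  // re-includes this file once per target
#include "hwy/highway.h"

HWY_BEFORE_NAMESPACE();
namespace demo {
namespace HWY_NAMESPACE {  // unique per target, e.g. N_AVX2
void MulBy2T(float* HWY_RESTRICT data, size_t n) {
  for (size_t i = 0; i < n; ++i) data[i] *= 2.0f;  // real code would use SIMD
}
}  // namespace HWY_NAMESPACE
}  // namespace demo
HWY_AFTER_NAMESPACE();

#if HWY_ONCE
namespace demo {
HWY_EXPORT(MulBy2T);  // one table entry per compiled target
void MulBy2(float* data, size_t n) {
  // Picks the best instantiation for the current CPU, then calls it.
  HWY_DYNAMIC_DISPATCH(MulBy2T)(data, n);
}
}  // namespace demo
#endif  // HWY_ONCE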
#include "gemma/gemma.h" +#include "compression/types.h" // GEMMA_DISABLED_TARGETS +#ifndef HWY_DISABLED_TARGETS +#define HWY_DISABLED_TARGETS GEMMA_DISABLED_TARGETS +#endif // HWY_DISABLED_TARGETS + +// Compiles this file for multiple architectures via "foreach_target.h", to +// which we pass the filename via macro 'argument'. +// clang-format off +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "gemma/gemma.cc" // NOLINT +// clang-format on +#include "hwy/foreach_target.h" // IWYU pragma: keep +#include "hwy/highway.h" +// After highway.h +#include "gemma/attention.h" // includes highway.h +#include "gemma/gemma-inl.h" +#include "gemma/griffin.h" // includes highway.h +#include "gemma/vit.h" // includes highway.h + +#ifndef GEMMA_CC_ONCE +#define GEMMA_CC_ONCE + +#include // sqrtf #include #include #include #include -#include -#include // std::move #include -#include "compression/io.h" // Path -#include "compression/shared.h" -#include "gemma/common.h" +#include "gemma/configs.h" +#include "gemma/model_store.h" #include "gemma/weights.h" -#include "ops/ops-inl.h" +#include "io/blob_store.h" +#include "io/io.h" // Path +#include "ops/matmul.h" #include "paligemma/image.h" -#include "util/threading.h" -#include "hwy/highway.h" +#include "util/threading_context.h" +#include "hwy/aligned_allocator.h" // Span +#include "hwy/base.h" +#include "hwy/timer.h" +#endif // GEMMA_CC_ONCE + +HWY_BEFORE_NAMESPACE(); namespace gcpp { +namespace HWY_NAMESPACE { -Gemma::Gemma(const Path& tokenizer_path, const Path& weights, - const ModelInfo& info, MatMulEnv& env) - : env_(env), tokenizer_(tokenizer_path) { - model_.Load(weights, info.model, info.weight, info.wrapping, - env_.parallel.Pools().Pool(0), - /*tokenizer_proto=*/nullptr); -} - -Gemma::Gemma(const Path& weights, MatMulEnv& env) : env_(env) { - std::string tokenizer_proto; - model_.Load(weights, Model::UNKNOWN, Type::kUnknown, PromptWrapping::GEMMA_IT, - env_.parallel.Pools().Pool(0), &tokenizer_proto); - tokenizer_.Deserialize(tokenizer_proto); -} - -Gemma::Gemma(GemmaTokenizer&& tokenizer, const ModelInfo& info, MatMulEnv& env) - : env_(env), tokenizer_(std::move(tokenizer)) { - HWY_ASSERT(info.weight == Type::kF32); - model_.Allocate(info.model, info.weight, env_.parallel.Pools().Pool(0)); -} - -Gemma::~Gemma() { -} - -// There are >=3 types of the inference code. To reduce compile time, -// we shard them across multiple translation units in instantiations/*.cc. -// This declares the functions defined there. We use overloading because -// explicit instantiations are still too slow to compile. -#define GEMMA_DECLARE(TWEIGHT) \ - extern void GenerateSingle(TWEIGHT, const ModelWeightsStorage& model, \ - const RuntimeConfig& runtime_config, \ - const PromptTokens& prompt, size_t pos, \ - size_t prefix_end, KVCache& kv_cache, \ - MatMulEnv* env, TimingInfo& timing_info); \ - extern void GenerateBatch( \ - TWEIGHT, const ModelWeightsStorage& model, \ - const RuntimeConfig& runtime_config, const QueriesPromptTokens& prompts, \ - const QueriesPos& queries_pos, const QueriesPos& queries_prefix_end, \ - const KVCaches& kv_caches, MatMulEnv* env, TimingInfo& timing_info); \ - extern void GenerateImageTokens(TWEIGHT, const ModelWeightsStorage& model, \ - const RuntimeConfig& runtime_config, \ - const Image& image, \ - ImageTokens& image_tokens, MatMulEnv* env); -GEMMA_DECLARE(float) -GEMMA_DECLARE(BF16) -GEMMA_DECLARE(NuqStream) -GEMMA_DECLARE(SfpStream) - -// Adapters to select from the above overloads via CallForModelWeight. 
-template <typename TConfig>
-struct GenerateSingleT {
-  void operator()(const ModelWeightsStorage& model,
-                  const RuntimeConfig& runtime_config,
-                  const PromptTokens& prompt, size_t pos, size_t prefix_end,
-                  KVCache& kv_cache, MatMulEnv* env,
-                  TimingInfo& timing_info) const {
-    GenerateSingle(TConfig(), model, runtime_config, prompt, pos, prefix_end,
-                   kv_cache, env, timing_info);
+void Attention(LayerAttentionType type, const size_t num_tokens,
+               const size_t layer_idx, const LayerWeightsPtrs& layer,
+               Activations& activations, QBatch& qbatch, MatMulEnv& env) {
+  if (type == LayerAttentionType::kGemma) {
+    GemmaAttention(num_tokens, layer_idx, layer, activations.attention, qbatch,
+                   env,
+                   /*flags=*/0);
+  } else {
+    HWY_DASSERT(type == LayerAttentionType::kGriffinRecurrentBlock);
+    // KVCache conv1d_cache and rglru_cache have one row per *Griffin* layer,
+    // so map `layer` to the Griffin layer index.
+    const size_t griffin_layer =
+        activations.attention.config.NumLayersOfTypeBefore(type, layer_idx);
+    GriffinRecurrent(num_tokens, griffin_layer, &layer, activations, qbatch,
+                     env);
   }
-};
+}
 
-template <typename TConfig>
-struct GenerateBatchT {
-  void operator()(const ModelWeightsStorage& model,
-                  const RuntimeConfig& runtime_config,
-                  const QueriesPromptTokens& queries_prompt,
-                  const QueriesPos& queries_pos,
-                  const QueriesPos& queries_prefix_end,
-                  const KVCaches& kv_caches, MatMulEnv* env,
-                  TimingInfo& timing_info) const {
-    GenerateBatch(TConfig(), model, runtime_config, queries_prompt, queries_pos,
-                  queries_prefix_end, kv_caches, env, timing_info);
-  }
-};
+static HWY_NOINLINE void TransformerLayer(const size_t num_tokens,
+                                          const size_t layer_idx,
+                                          const LayerWeightsPtrs& layer,
+                                          Activations& activations,
+                                          QBatch& qbatch, MatMulEnv& env) {
+  const LayerConfig& layer_config = layer.layer_config;
 
-template <typename TConfig>
-struct GenerateImageTokensT {
-  void operator()(const ModelWeightsStorage& model,
-                  const RuntimeConfig& runtime_config, const Image& image,
-                  ImageTokens& image_tokens, MatMulEnv* env) const {
-    GenerateImageTokens(TConfig(), model, runtime_config, image, image_tokens,
-                        env);
+  RMSNormBatched(activations.x, layer.pre_attention_norm_scale,
+                 activations.attention.pre_att_rms_out, env.ctx);
+
+  Attention(layer_config.type, num_tokens, layer_idx, layer, activations,
+            qbatch, env);
+
+  PostNorm(layer_config.post_norm, layer.post_attention_norm_scale,
+           activations.attention.att_sums, env.ctx);
+
+  ResidualConnection(activations.attention.att_sums, activations.x, layer,
+                     /*is_attention=*/true, env.ctx);
+
+  RMSNormBatched(activations.x, layer.pre_ffw_norm_scale,
+                 activations.pre_ffw_rms_out, env.ctx);
+
+  if (layer_config.type == LayerAttentionType::kVit) {
+    FFWVit(layer, activations, env);
+  } else {
+    FFWNoVit(layer, activations, env);
   }
-};
+
+  PostNorm(layer_config.post_norm, layer.post_ffw_norm_scale,
+           activations.ffw_out, env.ctx);
+
+  ResidualConnection(activations.ffw_out, activations.x, layer,
+                     /*is_attention=*/false, env.ctx);
+}
+
+// Returns the scale value to use for the embedding (basically sqrt model_dim).
+static float EmbeddingScaling(size_t model_dim) {
+  // Round to bf16 to match Gemma's Embedder, which casts before mul.
+  return hwy::ConvertScalarTo<float>(
+      hwy::ConvertScalarTo<BF16>(sqrtf(static_cast<float>(model_dim))));
+}
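As a quick sanity check of the rounding above: a standalone sketch that mimics the float -> bf16 -> float round-trip by truncating the mantissa. Highway's ConvertScalarTo rounds to nearest-even, so the last bf16 bit may differ from this stand-in; the model_dim values are illustrative.

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <math.h>

// Truncating stand-in for the ConvertScalarTo pair above: bf16 keeps the
// top 16 bits of an IEEE float.
static float RoundToBF16(float f) {
  uint32_t bits;
  memcpy(&bits, &f, sizeof(bits));
  bits &= 0xFFFF0000u;
  memcpy(&f, &bits, sizeof(f));
  return f;
}

int main() {
  const size_t dims[] = {2048, 3072};  // illustrative model_dim values
  for (size_t i = 0; i < 2; ++i) {
    const float exact = sqrtf((float)dims[i]);
    printf("model_dim=%zu  sqrt=%f  bf16-rounded=%f\n", dims[i], exact,
           RoundToBF16(exact));
  }
  return 0;
}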
+// `batch_idx` indicates which row of `x` to write to.
+// `pos` is the *token*'s position, not the start of the batch, because this is
+// called for batches of tokens in prefill, but batches of queries in decode.
+//
+// For GEMMA_VLM, image tokens are copied into the slots of `-2` placeholder
+// tokens (per the Gemma 3 spec) until we run out of image tokens. This allows
+// for a multi-image prompt if the calling application creates `-2` slots with
+// appropriate begin/end image tokens.
+// Returns the new image_token_position.
+static HWY_NOINLINE size_t
+EmbedMMToken(int token, size_t qi, size_t pos, size_t pos_in_prompt,
+             const ModelConfig& model_config, const WeightsPtrs& weights,
+             MatStorageT<float>& x, const ImageTokens* image_tokens = nullptr,
+             size_t image_token_position = 0) {
+  // Image tokens just need to be copied.
+  if (model_config.wrapping == PromptWrapping::GEMMA_VLM &&
+      image_tokens != nullptr && token == -2 &&
+      image_token_position < image_tokens->Rows()) {
+    hwy::CopyBytes(image_tokens->Row(image_token_position), x.Row(qi),
+                   x.Cols() * x.ElementBytes());
+    return image_token_position + 1;
+  }
+
+  if (model_config.wrapping == PromptWrapping::PALIGEMMA &&
+      image_tokens != nullptr && pos_in_prompt < image_tokens->Rows()) {
+    hwy::CopyBytes(image_tokens->Row(pos_in_prompt), x.Row(qi),
+                   x.Cols() * x.ElementBytes());
+    return image_token_position;
+  }
+
+  const size_t model_dim = model_config.model_dim;
+  const float emb_scaling = EmbeddingScaling(model_dim);
+  const size_t worker = 0;  // Not yet parallelized.
+
+  HWY_DASSERT(token >= 0);
+  HWY_DASSERT(token < static_cast<int>(model_config.vocab_size));
+
+  CallUpcasted(&weights.embedder_input_embedding, [&](const auto* weights_t) {
+    // Using `Stride` to compute the offset works for both NUQ (because we use
+    // an offset and NUQ is never padded) and padded, because non-NUQ types are
+    // seekable, hence the offset can also skip any padding.
+    const size_t embedding_ofs = token * weights_t->Stride();
+    HWY_ASSERT(weights_t->Cols() == model_dim);
+    const auto embedding_span =
+        MakeSpan(weights_t->Row(0), embedding_ofs + model_dim);
+    const hn::ScalableTag<float> df;
+    DecompressAndZeroPad(df, embedding_span, embedding_ofs, x.Row(qi),
+                         model_dim);
+    MulByConst(emb_scaling * weights_t->Scale(), x.Row(qi), model_dim, worker);
+  });
+
+  if (model_config.absolute_pe) {
+    AddAbsolutePositionalEmbeddings(x.Row(qi), model_dim, pos);
+  }
+  return image_token_position;
+}
+
+// Populates the KV cache for batches of tokens from one query at a time. This
+// is called if prompts are longer than the query batch size, and also in
+// prefix-LM mode (prefix_end > 0), which must see all prefix tokens in one
+// batch.
+static HWY_NOINLINE void PrefillTBatch(const ModelConfig& config,
+                                       const RuntimeConfig& runtime_config,
+                                       const WeightsPtrs& weights,
+                                       Activations& activations, QBatch& qbatch,
+                                       MatMulEnv& env,
+                                       hwy::BitSet4096<>& non_eos) {
+  PROFILER_ZONE("Gen.PrefillT");
+
+  // Batches are important for amortizing the cost of loading weights over
+  // multiple tokens. This is possible in prefill because we know all tokens
+  // beforehand, whereas decode depends on the previous output token. However,
+  // each prefill batch of a query requires that preceding batches have already
+  // written to the KV cache, hence we sequentially loop over token batches. We
+  // can reduce the number of iterations by increasing the batch size, but this
+  // also increases arithmetic intensity, and so we are eventually
+  // compute-limited. TransformerLayer uses all available threads, so we do not
+  // also parallelize over queries, but note that PrefillQBatch uses queries as
+  // the batch dimension.
+  const size_t max_tbatch_size = runtime_config.prefill_tbatch_size;
+
+  // For each query.
`qi` is within the batch, not the global query index. + for (size_t qi = 0; qi < qbatch.Size(); ++qi) { + non_eos.Set(qi); + + // One query at a time, batching will be the query's prompt tokens. + QBatch qbatch_1 = qbatch.Single(qi); + + const size_t prompt_size = qbatch_1.Prompt(0).size(); + // In autoregressive mode, we don't need to prefill the last token, so - 1. + size_t prefill_this_query = prompt_size - 1; + const size_t prefix_end_this_query = qbatch_1.PrefixEnd(0); + // We can't attend beyond the prompt_size. + HWY_ASSERT(prefix_end_this_query <= prompt_size); + // Special case: if the prefix includes the last token, we need to prefill + // the last token, too. However, we need to rewind this for the generation + // of the first token. So we need to keep track of this. + // TODO: consider implementing masking instead of this logic? + const bool attend_to_last_token = + (prefill_this_query < prefix_end_this_query); + if (attend_to_last_token) { + // The difference can be at most 1. + prefill_this_query += 1; + HWY_ASSERT(prefill_this_query == prefix_end_this_query); + } + // In prefix-LM mode, we need to look at all the tokens for the prefix in + // one iteration through the layers, so we need a large enough batch size. + HWY_ASSERT(prefix_end_this_query == 0 || + max_tbatch_size >= prefill_this_query); + + // For each batch of tokens in the query: + for (size_t tbatch_start = 0; tbatch_start < prefill_this_query; + tbatch_start += max_tbatch_size) { + const size_t tbatch_size = + HWY_MIN(max_tbatch_size, prefill_this_query - tbatch_start); + activations.SetBatchSize(tbatch_size); + + // Fill activations.x (much faster than TransformerLayer). + size_t image_token_position = 0; + for (size_t ti = 0; ti < tbatch_size; ++ti) { + const size_t pos = qbatch_1.Pos(0) + ti; + const size_t pos_in_prompt = tbatch_start + ti; + const int token = qbatch_1.Prompt(0)[pos_in_prompt]; + image_token_position = EmbedMMToken( + token, ti, pos, pos_in_prompt, config, weights, activations.x, + runtime_config.image_tokens, image_token_position); + } + + // Transformer with one batch of tokens from a single query. + for (size_t layer_idx = 0; layer_idx < config.layer_configs.size(); + ++layer_idx) { + TransformerLayer(tbatch_size, layer_idx, *weights.GetLayer(layer_idx), + activations, qbatch_1, env); + } + + // NOTE: we unconditionally call StreamToken, even if EOS. + for (size_t ti = 0; ti < tbatch_size; ++ti) { + const size_t pos = qbatch_1.Pos(0) + ti; + const size_t pos_in_prompt = tbatch_start + ti; + const int token = qbatch_1.Prompt(0)[pos_in_prompt]; + if (pos_in_prompt < prompt_size - 1) { + runtime_config.StreamToken(qbatch_1.QueryIdx(0), pos, token, 0.0f); + } else { + // The last token will be streamed later and we should only get here + // if we need to attend to the last token because it is in the prefix. + HWY_ASSERT(attend_to_last_token); + } + } + + qbatch_1.MutablePos(0) += tbatch_size; + } // for tbatch_start + if (attend_to_last_token) { + // We need to rewind the position for the last token that we only + // attended to to make sure the prefix LM sees everything. + // This means we duplicate work on the last prompt token in autoregressive + // decoding. Alternatives: (1) real masking; (2) always prefill the last + // token and only generate the next one from the already prefilled + // activations. + qbatch_1.MutablePos(0) -= 1; + } + } +} + +// Embeds PrevToken (one from each query) and calls each TransformerLayer. 
+// Called by query-batched `PrefillQBatch` and `DecodeStepT`, but not the +// token-batched `PrefillTBatch`. +static HWY_NOINLINE void Transformer(const ModelConfig& config, + const RuntimeConfig& runtime_config, + const WeightsPtrs& weights, + Activations& activations, QBatch& qbatch, + MatMulEnv& env) { + if (HWY_UNLIKELY(runtime_config.layers_output)) { + for (size_t qi = 0; qi < qbatch.Size(); ++qi) { + const float token_f = qbatch.PrevToken(qi); + runtime_config.layers_output(qbatch.QueryIdx(qi), qbatch.Pos(qi), + "tokens", -1, &token_f, 1); + } + } + + // TODO: parallelize? + for (size_t qi = 0; qi < qbatch.Size(); ++qi) { + EmbedMMToken(qbatch.PrevToken(qi), qi, qbatch.Pos(qi), + /*pos_in_prompt=*/0, config, weights, activations.x); + } + + for (size_t layer_idx = 0; layer_idx < weights.c_layers.size(); ++layer_idx) { + TransformerLayer(/*num_tokens=*/1, layer_idx, *weights.GetLayer(layer_idx), + activations, qbatch, env); + + if (HWY_UNLIKELY(runtime_config.activations_observer)) { + runtime_config.activations_observer( + QueriesPos(&qbatch.MutablePos(0), qbatch.Size()), layer_idx, + activations); + } + } +} + +// Populates KV cache for the batch queries, one token at a time. Only called +// for autoregressive (non-prefix-LM) prefill, so `queries_prefix_end` == 0. +static HWY_NOINLINE void PrefillQBatch(const size_t max_prompt_size, + const ModelConfig& config, + const RuntimeConfig& runtime_config, + const WeightsPtrs& weights, + Activations& activations, QBatch& qbatch, + MatMulEnv& env, + hwy::BitSet4096<>& non_eos) { + PROFILER_ZONE("Gen.PrefillQ"); + + for (size_t qi = 0; qi < qbatch.Size(); ++qi) { + non_eos.Set(qi); + HWY_DASSERT(qbatch.PrefixEnd(qi) == 0); + } + + // In autoregressive mode, we don't prefill the last token, hence - 1. + for (size_t pos_in_prompt = 0; pos_in_prompt < max_prompt_size - 1; + ++pos_in_prompt) { + for (size_t qi = 0; qi < qbatch.Size(); ++qi) { + int token = config.eos_id; + if (pos_in_prompt < qbatch.Prompt(qi).size() - 1) { + token = qbatch.Prompt(qi)[pos_in_prompt]; + // Ignore StreamToken return value because requesting to stop does not + // make sense during prefill. + (void)runtime_config.StreamToken(qbatch.QueryIdx(qi), pos_in_prompt, + token, 0.0f); + qbatch.MutablePos(qi) = pos_in_prompt; + } + + qbatch.PrevToken(qi) = token; + } + + // The input (PrevToken) is one token from each query in the batch. + // Do not call DecodeStepT because it computes logits for token + // probabilities, which are not required for the prompt tokens. + Transformer(config, runtime_config, weights, activations, qbatch, env); + } + + for (size_t qi = 0; qi < qbatch.Size(); ++qi) { + qbatch.MutablePos(qi) = qbatch.Prompt(qi).size() - 1; + } +} + +// Calls `StreamToken`, writes the token to `PrevToken` for use by subsequent +// `DecodeStepT`, and increments `MutablePos`. Also updates `non_eos` if the +// query is at the end of its sequence. +static void StreamAndUpdateEOS(const size_t qi, int token, const float prob, + const ModelConfig& config, + const RuntimeConfig& runtime_config, + QBatch& qbatch, hwy::BitSet4096<>& non_eos) { + HWY_DASSERT(non_eos.Get(qi)); // otherwise, should not be called. + + if (HWY_UNLIKELY(!runtime_config.StreamToken(qbatch.QueryIdx(qi), + qbatch.Pos(qi), token, prob))) { + // User decided to stop: set token to primary EOS to trigger IsEOS below. 
+ token = config.eos_id; + HWY_DASSERT(config.IsEOS(token)); + } + + qbatch.PrevToken(qi) = token; + qbatch.MutablePos(qi) += 1; + + // Primary or secondary EOS: mark query as EOS, but still increment (for + // multi-turn, we should still keep the prior EOS). + if (HWY_UNLIKELY(config.IsEOS(token))) non_eos.Clear(qi); +} + +// For a batch of queries, runs Transformer, computes logits, samples and +// streams the token. +static void DecodeStepT(const ModelConfig& config, + const RuntimeConfig& runtime_config, + const WeightsPtrs& weights, + const SampleFunc& sample_token, + Activations& activations, QBatch& qbatch, + MatMulEnv& env, hwy::BitSet4096<>& non_eos, + TimingInfo& timing_info) { + HWY_DASSERT(qbatch.Size() == activations.x.Rows()); + + Transformer(config, runtime_config, weights, activations, qbatch, env); + + RMSNormInplaceBatched(weights.final_norm_scale, activations.x, env.ctx); + + if (HWY_UNLIKELY(runtime_config.activations_observer)) { + runtime_config.activations_observer( + QueriesPos(&qbatch.MutablePos(0), qbatch.Size()), -1, activations); + } + + { + PROFILER_ZONE("Gen.EmbeddingMatmul"); + // Compute logits from last layer activations. + CallMatMul(activations.x, weights.embedder_input_embedding, + /*add=*/nullptr, env, activations.logits); + } + PROFILER_ZONE("Gen.Softcap+Sample+Stream"); + const size_t worker = 0; // TODO: parallelize + non_eos.Foreach([&](size_t qi) { + float* HWY_RESTRICT logits = activations.logits.Row(qi); + MaybeLogitsSoftCap(config.final_cap, logits, config.vocab_size, worker); + const TokenAndProb tp = sample_token(logits, config.vocab_size); + timing_info.NotifyGenerated(); + + StreamAndUpdateEOS(qi, tp.token, tp.prob, config, runtime_config, qbatch, + non_eos); + }); +} + +static HWY_INLINE SampleFunc +ChooseSampleFunc(const RuntimeConfig& runtime_config) { + // If user provided a sample_func, use it. + if (runtime_config.sample_func) return runtime_config.sample_func; + + const size_t worker = 0; // TODO: parallelize + + // Fast path for top-1 with no accept_token. + if (runtime_config.top_k == 1 && !runtime_config.accept_token) { + return [](float* logits, size_t vocab_size) HWY_ATTR -> TokenAndProb { + PROFILER_ZONE2(worker, "Gen.Sample Top1"); + return Top1OfSoftmax(logits, vocab_size); + }; + } + + // General case: Softmax with top-k sampling. + return [&runtime_config](float* logits, + size_t vocab_size) HWY_ATTR -> TokenAndProb { + PROFILER_ZONE("Gen.Sample general"); + return FusedSoftmaxAndSampleTopK( + logits, runtime_config.top_k, vocab_size, *runtime_config.gen, + runtime_config.temperature, runtime_config.accept_token, worker); + }; +} + +// Decode: generates one continuation token for each query in `qbatch`. +static void GenerateT(const ModelConfig& config, + const RuntimeConfig& runtime_config, + const WeightsPtrs& weights, Activations& activations, + QBatch& qbatch, MatMulEnv& env, TimingInfo& timing_info) { + // Griffin assumes that the recurrent block cache is zero-initialized. + for (size_t qi = 0; qi < qbatch.Size(); ++qi) { + if (qbatch.MutablePos(qi) == 0) { + qbatch.KV(qi).ZeroGriffinCache(); // No-op for non-Griffin models. + } + } + + size_t max_prompt_size = 0; + bool all_prefix_end_are_zero = true; + size_t total_prefill_tokens = 0; // only for throughput stats. 
+ const size_t seq_len = qbatch.KV(0).SeqLen(); + for (size_t qi = 0; qi < qbatch.Size(); ++qi) { + const PromptTokens& prompt = qbatch.Prompt(qi); + max_prompt_size = HWY_MAX(max_prompt_size, prompt.size()); + + // Prefill stops before size - 1 because the last prompt token is the + // first input token for generation. + total_prefill_tokens += prompt.size() - 1; + + // Sanity check: prompts should not be empty, nor start with EOS. + HWY_ASSERT(prompt.size() != 0 && prompt[0] != config.eos_id); + + all_prefix_end_are_zero &= qbatch.PrefixEnd(qi) == 0; + + // We use a single divisor, so all sequence lengths must be the same. + HWY_ASSERT(qbatch.KV(qi).SeqLen() == seq_len); + } + if (max_prompt_size >= seq_len) { + HWY_ABORT("max_prompt_size = %zu, increase --seq_len to at least that.", + max_prompt_size); + } + HWY_ASSERT(activations.attention.div_seq_len.GetDivisor() == seq_len); + + // Lacks a constructor to bulk-set, hence initialized by Prefill* which have + // qi loops anyway. + hwy::BitSet4096<> non_eos; // indexed by qi + + timing_info.prefill_start = hwy::platform::Now(); + // Batch over the larger of prompt length, or queries. + if ((qbatch.Size() > max_prompt_size) && all_prefix_end_are_zero) { + activations.SetBatchSize(qbatch.Size()); // required before PrefillQBatch + PrefillQBatch(max_prompt_size, config, runtime_config, weights, activations, + qbatch, env, non_eos); + } else { + PrefillTBatch(config, runtime_config, weights, activations, qbatch, env, + non_eos); + activations.SetBatchSize(qbatch.Size()); // Restore after PrefillTBatch. + } + HWY_DASSERT(non_eos.Count() == qbatch.Size()); + timing_info.NotifyPrefill(total_prefill_tokens); + // queries_pos have been incremented by Prefill. + + // Stream the last prompt token from each query, fill activations.gen_tokens. + for (size_t qi = 0; qi < qbatch.Size(); ++qi) { + const size_t last_pos_in_prompt = qbatch.Pos(qi) - qbatch.InitialPos(qi); + StreamAndUpdateEOS(qi, qbatch.Prompt(qi)[last_pos_in_prompt], 0.0f, config, + runtime_config, qbatch, non_eos); + } + + size_t max_gen_steps = runtime_config.max_generated_tokens; + if (max_prompt_size + max_gen_steps > seq_len) { + HWY_WARN("prefill %zu + max_gen_steps %zu > seq_len %zu, truncating.", + max_prompt_size, max_gen_steps, seq_len); + max_gen_steps = seq_len - max_prompt_size; + } + + const SampleFunc sample_token = ChooseSampleFunc(runtime_config); + + { + timing_info.generate_start = hwy::platform::Now(); + for (size_t gen = 0; gen < max_gen_steps && non_eos.Any(); ++gen) { + DecodeStepT(config, runtime_config, weights, sample_token, activations, + qbatch, env, non_eos, timing_info); + } + timing_info.NotifyGenerateDone(); + } +} + +void GenerateSingleT(const PromptTokens& prompt, size_t pos, size_t prefix_end, + const ModelConfig& config, + const RuntimeConfig& runtime_config, + const WeightsPtrs& weights, KVCache& kv_cache, + MatMulEnv& env, TimingInfo& timing_info) { + Activations activations(config, runtime_config.prefill_tbatch_size, + kv_cache.SeqLen(), env.ctx.allocator, env.row_ptrs); + + AllQueries all_queries(prompt, pos, prefix_end, + hwy::Span(&kv_cache, 1)); + QBatch qbatch(/*start=*/0, /*max_size=*/1, all_queries); + GenerateT(config, runtime_config, weights, activations, qbatch, env, + timing_info); +} + +// Splits the input into batches of at most `runtime_config.decode_qbatch_size` +// queries, and calls `GenerateT` on each batch. 
+void GenerateBatchT(const ModelConfig& config, + const RuntimeConfig& runtime_config, + const WeightsPtrs& weights, AllQueries& all_queries, + MatMulEnv& env, TimingInfo& timing_info) { + const size_t max_batch_size = HWY_MAX(runtime_config.decode_qbatch_size, + runtime_config.prefill_tbatch_size); + Activations activations(config, max_batch_size, + all_queries[0].kv_cache.SeqLen(), env.ctx.allocator, + env.row_ptrs); + + for (size_t start = 0; start < all_queries.NumQueries(); + start += runtime_config.decode_qbatch_size) { + QBatch qbatch(start, runtime_config.decode_qbatch_size, all_queries); + // Generate a batch of one token for each of `qbatch.Size()` queries. + GenerateT(config, runtime_config, weights, activations, qbatch, env, + timing_info); + } +} + +void GenerateImageTokensT(const ModelConfig& config, + const RuntimeConfig& runtime_config, size_t seq_len, + const WeightsPtrs& weights, const Image& image, + ImageTokens& image_tokens, MatMulEnv& env) { + if (config.vit_config.layer_configs.empty()) { + HWY_ABORT("Model does not support generating image tokens."); + } + RuntimeConfig prefill_runtime_config = runtime_config; + const ModelConfig vit_config = GetVitConfig(config); + const size_t num_tokens = vit_config.max_seq_len; + prefill_runtime_config.prefill_tbatch_size = + num_tokens / (vit_config.pool_dim * vit_config.pool_dim); + Activations prefill_activations(vit_config, num_tokens, num_tokens, + env.ctx.allocator, env.row_ptrs); + // Weights are for the full PaliGemma model, not just the ViT part. + PrefillVit(config, weights, prefill_runtime_config, image, image_tokens, + prefill_activations, env); +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace gcpp +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE +namespace gcpp { +HWY_EXPORT(GenerateSingleT); +HWY_EXPORT(GenerateBatchT); +HWY_EXPORT(GenerateImageTokensT); + +Gemma::Gemma(const LoaderArgs& loader, const InferenceArgs& inference, + ThreadingContext& ctx) + : reader_(loader.weights), + model_(reader_, loader.tokenizer, loader.wrapping), + weights_(model_.Config()), + chat_template_(model_.Tokenizer(), model_.Config().model), + inference_(inference) { + weight_read_mode_ = weights_.ReadFromBlobs(model_, reader_, loader, inference, + mat_owners_, ctx); + // Read everything into memory, or `weights_.mapped_` keeps the mapping alive. 
+ reader_.CloseFile(); +} + +Gemma::~Gemma() = default; + +void Gemma::Save(const Path& weights_path, NestedPools& pools) const { + BlobWriter writer(weights_path, pools.Pool()); + const std::vector serialized_mat_ptrs = + weights_.AddTensorDataToWriter(writer); + WriteSingleFile(model_.Config(), model_.Tokenizer(), serialized_mat_ptrs, + writer); +} void Gemma::Generate(const RuntimeConfig& runtime_config, const PromptTokens& prompt, size_t pos, size_t prefix_end, - KVCache& kv_cache, TimingInfo& timing_info) { - env_.parallel.Pools().MaybeStartSpinning(runtime_config.use_spinning); + KVCache& kv_cache, MatMulEnv& env, + TimingInfo& timing_info) const { + env.ctx.pools.MaybeStartSpinning(runtime_config.use_spinning); - model_.CallForModelWeight( - runtime_config, prompt, pos, prefix_end, kv_cache, &env_, timing_info); + HWY_DYNAMIC_DISPATCH(GenerateSingleT)(prompt, pos, prefix_end, + model_.Config(), runtime_config, + weights_, kv_cache, env, timing_info); - env_.parallel.Pools().MaybeStopSpinning(runtime_config.use_spinning); + env.ctx.pools.MaybeStopSpinning(runtime_config.use_spinning); } void Gemma::GenerateBatch(const RuntimeConfig& runtime_config, - const QueriesPromptTokens& queries_prompt, - const QueriesPos& queries_pos, - const QueriesPos& queries_prefix_end, - const KVCaches& kv_caches, TimingInfo& timing_info) { - // If we did not get passed prefix ends (size 0), assume 0 and pass that on. - QueriesPos mutable_queries_prefix_end = queries_prefix_end; - std::vector prefix_end_vec; - if (queries_prefix_end.size() == 0) { - prefix_end_vec.resize(queries_prompt.size(), 0); - mutable_queries_prefix_end = - QueriesPos(prefix_end_vec.data(), prefix_end_vec.size()); - } + AllQueries& all_queries, MatMulEnv& env, + TimingInfo& timing_info) const { + env.ctx.pools.MaybeStartSpinning(runtime_config.use_spinning); - env_.parallel.Pools().MaybeStartSpinning(runtime_config.use_spinning); + HWY_DYNAMIC_DISPATCH(GenerateBatchT)(model_.Config(), runtime_config, + weights_, all_queries, env, timing_info); - model_.CallForModelWeight( - runtime_config, queries_prompt, queries_pos, mutable_queries_prefix_end, - kv_caches, &env_, timing_info); - - env_.parallel.Pools().MaybeStopSpinning(runtime_config.use_spinning); + env.ctx.pools.MaybeStopSpinning(runtime_config.use_spinning); } void Gemma::GenerateImageTokens(const RuntimeConfig& runtime_config, - const Image& image, ImageTokens& image_tokens) { - env_.parallel.Pools().MaybeStartSpinning(runtime_config.use_spinning); + size_t seq_len, const Image& image, + ImageTokens& image_tokens, + MatMulEnv& env) const { + env.ctx.pools.MaybeStartSpinning(runtime_config.use_spinning); - model_.CallForModelWeight(runtime_config, image, - image_tokens, &env_); + HWY_DYNAMIC_DISPATCH(GenerateImageTokensT)(model_.Config(), runtime_config, + seq_len, weights_, image, + image_tokens, env); - env_.parallel.Pools().MaybeStopSpinning(runtime_config.use_spinning); -} - -// Non-template functions moved from gemma-inl.h to avoid ODR violations. 
-
-void RangeChecks(const ModelConfig& weights_config,
-                 size_t& max_generated_tokens, const size_t prompt_size) {
-  if (!weights_config.use_local_attention) {
-    if (max_generated_tokens > weights_config.seq_len) {
-      fprintf(stderr,
-              "WARNING: max_generated_tokens %zu > kSeqLen %u, truncating.\n",
-              max_generated_tokens, weights_config.seq_len);
-      max_generated_tokens = weights_config.seq_len;
-    }
-  }
-  HWY_ASSERT(prompt_size > 0);
+  env.ctx.pools.MaybeStopSpinning(runtime_config.use_spinning);
 }
 
 }  // namespace gcpp
+#endif  // HWY_ONCE
diff --git a/gemma/gemma.h b/gemma/gemma.h
index ccda69c..5ebd70d 100644
--- a/gemma/gemma.h
+++ b/gemma/gemma.h
@@ -16,122 +16,154 @@
 #ifndef THIRD_PARTY_GEMMA_CPP_GEMMA_GEMMA_H_
 #define THIRD_PARTY_GEMMA_CPP_GEMMA_GEMMA_H_
 
-#include
-#include
-#include
+#include
 
 #include
 
 // IWYU pragma: begin_exports
-#include "compression/io.h"  // Path
 #include "gemma/activations.h"
-#include "gemma/common.h"
 #include "gemma/configs.h"
+#include "gemma/gemma_args.h"
 #include "gemma/kv_cache.h"
-#include "gemma/tokenizer.h"
+#include "gemma/model_store.h"
 #include "gemma/weights.h"
+#include "io/blob_store.h"
+#include "io/io.h"  // Path
 #include "ops/matmul.h"  // MatMulEnv
 #include "paligemma/image.h"
-#include "util/allocator.h"  // RowVectorBatch
-#include "util/basics.h"  // TokenAndProb
+#include "util/basics.h"  // TokenAndProb
+#include "util/threading_context.h"
 #include "hwy/timer.h"
 // IWYU pragma: end_exports
-#include "hwy/aligned_allocator.h"  // Span
 
 namespace gcpp {
 
-using PromptTokens = hwy::Span<const int>;
-// Batches of independent queries have their own prompt, previous token,
-// position in the sequence, and KVCache.
-using QueriesPromptTokens = hwy::Span<const PromptTokens>;
-using QueriesToken = hwy::Span<const int>;
-using QueriesPos = hwy::Span<const size_t>;
-using KVCaches = hwy::Span<KVCache>;
+struct PerQuery {
+  PromptTokens prompt;
 
-// StreamFunc is called with (token, probability). For prompt tokens,
-// probability is 0.0f. StreamFunc should return false to stop generation and
-// true to continue generation.
-using StreamFunc = std::function<bool(int, float)>;
-// BatchStreamFunc is called with (query_idx, pos, token, probability).
-// For prompt tokens, probability is 0.0f.
-// StreamFunc should return false to stop generation and true to continue.
-using BatchStreamFunc = std::function<bool(size_t, size_t, int, float)>;
-// If not empty, AcceptFunc is called with token. It should return false for
-// tokens you don't want to generate and true for tokens you want to generate.
-using AcceptFunc = std::function<bool(int, float)>;
-// If not empty, SampleFunc is called with the logits for the next token, which
-// it may modify/overwrite, and its return value is the next generated token
-// together with its probability.
-using SampleFunc = std::function<TokenAndProb(float*, size_t)>;
-// If not empty, LayersOutputFunc is called for layer outputs, specified with:
-// - index of query within containing batch (if any); zero otherwise.
-// - position in the tokens sequence
-// - name of the data, e.g. "tokens" for token IDs
-// - layer index (or -1 for global outputs)
-// - pointer to the data array
-// - size of the data array
-using LayersOutputFunc = std::function<void(size_t, size_t, const std::string&,
-                                            int, const float*, size_t)>;
-// If not empty, ActivationsObserverFunc is invoked after each layer with:
-// - per-query position within the tokens sequence
-// - layer index (or -1 for post-norm output)
-// - activations
-using ActivationsObserverFunc =
-    std::function<void(const QueriesPos& queries_pos, int layer,
-                       const Activations& activations)>;
+  // Position in the KV cache: initially zero for the first turn, or when
+  // multi-turn is NOT desired. Incremented by prefill and `StreamAndUpdateEOS`.
+  size_t mutable_pos;
+  // Allows computing the last prefill token as `mutable_pos - initial_pos`,
+  // which might differ from `prompt.size() - 1` for prefix-LM.
+  size_t initial_pos;
+  // Zero for causal attention, or the end of the prefix for prefix-LM style
+  // attention in PaliGemma.
+  size_t prefix_end;
 
-// ImageTokens are represented as a RowVectorBatch, where each "batch" index
-// corresponds to a token for an image patch as computed by the image encoder.
-using ImageTokens = RowVectorBatch<float>;
+  KVCache& kv_cache;
 
-// RuntimeConfig holds configuration for a single generation run.
-struct RuntimeConfig {
-  // If not empty, batch_stream_token is called for each token in the batch,
-  // instead of stream_token.
-  bool StreamToken(size_t query_idx, size_t pos, int token, float prob) const {
-    if (batch_stream_token) {
-      return batch_stream_token(query_idx, pos, token, prob);
+  // Previous token generated for this query, or the last prompt token. Will be
+  // fed into the next Transformer() call.
+  int prev_token = 0;
+};
+
+// Array of `PerQuery`. Referenced by `QBatch` and passed to `GenerateBatch`.
+struct AllQueries {
+  AllQueries() = default;
+
+  // For `GenerateSingleT`: same prompt/pos, replicated for each KV cache.
+  AllQueries(const PromptTokens& prompt, size_t pos, size_t prefix_end,
+             const hwy::Span<KVCache>& kv_caches) {
+    per_query_.reserve(kv_caches.size());
+    for (size_t i = 0; i < kv_caches.size(); ++i) {
+      HWY_ASSERT(kv_caches[i].SeqLen() == kv_caches[0].SeqLen());
+      per_query_.push_back(PerQuery{
+          .prompt = prompt,
+          .mutable_pos = pos,
+          .initial_pos = pos,
+          .prefix_end = prefix_end,
+          .kv_cache = kv_caches[i],
+      });
     }
-    return stream_token(token, prob);
   }
 
-  // Limit on the number of tokens generated.
-  size_t max_generated_tokens;
+  // Batch of queries with initial position set to zero. Causal attention
+  // is requested via an empty or all-zero `prefix_end`.
+  AllQueries(
+      const hwy::Span<const PromptTokens>& prompts,
+      const hwy::Span<KVCache>& kv_caches,
+      const hwy::Span<const size_t>& prefix_end = hwy::Span<const size_t>()) {
+    HWY_ASSERT(prompts.size() == kv_caches.size());
+    HWY_ASSERT(prompts.size() == prefix_end.size() || prefix_end.size() == 0);
+    per_query_.reserve(kv_caches.size());
+    for (size_t i = 0; i < kv_caches.size(); ++i) {
+      HWY_ASSERT(kv_caches[i].SeqLen() == kv_caches[0].SeqLen());
+      per_query_.push_back(PerQuery{
+          .prompt = prompts[i],
+          .mutable_pos = 0,
+          .initial_pos = 0,
+          .prefix_end = prefix_end.size() == 0 ? 0 : prefix_end[i],
+          .kv_cache = kv_caches[i],
+      });
+    }
+  }
 
-  // These defaults are overridden by InferenceArgs::CopyTo(*this):
-  // Max tokens per batch during prefill.
-  size_t prefill_tbatch_size = 256;
-  // Max queries per batch (one token from each) during decode.
-  size_t decode_qbatch_size = 16;
+  void Reserve(size_t size) { per_query_.reserve(size); }
+  void Append(const PerQuery& query) { per_query_.push_back(query); }
 
-  // Sampling-related parameters.
-  float temperature;     // Temperature for sampling.
-  size_t top_k = kTopK;  // Top-k for sampling.
-  std::mt19937* gen;     // Random number generator used for sampling.
+  size_t NumQueries() const { return per_query_.size(); }
 
-  int verbosity;  // Controls verbosity of printed messages.
+  PerQuery& operator[](size_t query_idx) {
+    HWY_DASSERT(query_idx < NumQueries());
+    return per_query_[query_idx];
+  }
+  const PerQuery& operator[](size_t query_idx) const {
+    HWY_DASSERT(query_idx < NumQueries());
+    return per_query_[query_idx];
+  }
 
-  // Functions operating on the generated tokens.
-  StreamFunc stream_token;
-  BatchStreamFunc batch_stream_token;
-  AcceptFunc accept_token;  // if empty, accepts all tokens.
-  SampleFunc sample_func;   // if empty, uses SampleTopK.
+ private:
+  std::vector<PerQuery> per_query_;
+};
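A minimal sketch of how a frontend might build AllQueries from tokenized prompts, using the second constructor above. MakeQueries is a hypothetical helper, not part of this diff; the KV caches are assumed to have been created elsewhere, and the token vectors must outlive the result because PromptTokens is a non-owning span.

#include <vector>

#include "gemma/gemma.h"            // AllQueries, PromptTokens, KVCache
#include "hwy/aligned_allocator.h"  // hwy::Span

gcpp::AllQueries MakeQueries(const std::vector<std::vector<int>>& tokens,
                             hwy::Span<gcpp::KVCache> kv_caches) {
  std::vector<gcpp::PromptTokens> prompts;
  prompts.reserve(tokens.size());
  for (const std::vector<int>& t : tokens) {
    prompts.push_back(gcpp::PromptTokens(t.data(), t.size()));
  }
  // Empty prefix_end span: causal attention for every query.
  return gcpp::AllQueries(
      hwy::Span<const gcpp::PromptTokens>(prompts.data(), prompts.size()),
      kv_caches);
}

A QBatch view over the result is then simply QBatch(/*start=*/0, /*max_size=*/queries.NumQueries(), queries), as GenerateBatchT below does per chunk.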
-  // Observer callbacks for intermediate data.
-  LayersOutputFunc layers_output;  // if not empty, called after each layer.
-  ActivationsObserverFunc activations_observer;  // if set, called per-layer.
+// View into AllQueries: either a batch of queries, or a single query for use
+// in PrefillTBatch or GenerateSingleT. Cheap to create because it holds a
+// reference to AllQueries.
+class QBatch {
+ public:
+  QBatch(size_t start, size_t max_size, AllQueries& queries)
+      : start_(start),
+        max_size_(max_size),
+        queries_(queries),
+        size_(HWY_MIN(max_size_, queries_.NumQueries() - start_)) {
+    HWY_ASSERT(max_size_ <= 4096);  // non_eos uses `BitSet4096`.
+    HWY_DASSERT(size_ != 0);
+    HWY_DASSERT(start_ + size_ <= queries_.NumQueries());
+  }
 
-  // If not empty, these point to the image tokens and are used in the
-  // PaliGemma prefix-LM style attention.
-  const ImageTokens *image_tokens = nullptr;
+  // Returns a single-query view starting at `qi` relative to this batch.
+  QBatch Single(size_t qi) const { return QBatch(start_ + qi, 1, queries_); }
 
-  // Whether to use thread spinning to reduce barrier synchronization latency.
-  // Mutable so we can change kDefault to kTrue/kFalse during Generate, because
-  // RuntimeConfig is const there and is not passed to the Gemma ctor. This
-  // default decision is likely sufficient because it is based on whether
-  // threads are successfully pinned.
-  mutable Tristate use_spinning = Tristate::kDefault;
+  // How many queries in this batch: <= `queries_.NumQueries()` and `max_size_`.
+  size_t Size() const { return size_; }
 
-  // End-of-sequence token.
-  int eos_id = EOS_ID;
+  // Returns index for use with `AllQueries` and `BatchStreamToken`.
+  size_t QueryIdx(size_t qi) const {
+    HWY_DASSERT(qi < size_);
+    return start_ + qi;
+  }
+
+  // Accessor functions to bridge the previous SoA and current AoS layout.
+  const PromptTokens& Prompt(size_t qi) const {
+    return queries_[QueryIdx(qi)].prompt;
+  }
+  size_t Pos(size_t qi) const { return queries_[QueryIdx(qi)].mutable_pos; }
+  size_t& MutablePos(size_t qi) { return queries_[QueryIdx(qi)].mutable_pos; }
+  size_t InitialPos(size_t qi) const {
+    return queries_[QueryIdx(qi)].initial_pos;
+  }
+  size_t PrefixEnd(size_t qi) const {
+    return queries_[QueryIdx(qi)].prefix_end;
+  }
+  KVCache& KV(size_t qi) const { return queries_[QueryIdx(qi)].kv_cache; }
+  int& PrevToken(size_t qi) { return queries_[QueryIdx(qi)].prev_token; }
+
+ private:
+  size_t start_;
+  size_t max_size_;
+  AllQueries& queries_;
+  size_t size_;
 };
 
 struct TimingInfo {
@@ -193,82 +225,58 @@
   size_t tokens_generated = 0;
 };
 
+// After construction, all methods are const and thread-compatible if using
+// separate ThreadingContext for each thread.
 class Gemma {
  public:
-  // Reads old format weights file and tokenizer file.
-  // `env` must remain valid for the lifetime of this Gemma.
-  Gemma(const Path& tokenizer_path, const Path& weights, const ModelInfo& info,
-        MatMulEnv& env);
-  // Reads new format weights file that contains everything in a single file.
-  // `env` must remain valid for the lifetime of this Gemma.
-  Gemma(const Path& weights, MatMulEnv& env);
-  // Allocates weights, caller is responsible for filling them.
- Gemma(GemmaTokenizer&& tokenizer, const ModelInfo& info, MatMulEnv& env); + // Reads weights/config/tokenizer from the `BlobStore` at `loader.weights`. + // `ctx` is only used to read tensors, but it is typically also referenced + // by the `MatMulEnv` passed to the Generate* methods. + Gemma(const LoaderArgs& loader, const InferenceArgs& inference, + ThreadingContext& ctx); ~Gemma(); - const ModelConfig& GetModelConfig() const { return model_.Config(); } - ModelInfo Info() const { - return ModelInfo({.model = model_.Config().model, - .wrapping = model_.Config().wrapping, - .weight = model_.Config().weight}); - } - const GemmaTokenizer& Tokenizer() const { return tokenizer_; } - const ModelWeightsStorage& Weights() const { return model_; } - ModelWeightsStorage& MutableWeights() { return model_; } - void Save(const Path& weights, hwy::ThreadPool& pool) { - std::string tokenizer_proto = tokenizer_.Serialize(); - model_.Save(tokenizer_proto, weights, pool); - } + const ModelConfig& Config() const { return model_.Config(); } + const GemmaTokenizer& Tokenizer() const { return model_.Tokenizer(); } + const WeightsPtrs& Weights() const { return weights_; } + WeightsPtrs::Mode WeightReadMode() const { return weight_read_mode_; } + const GemmaChatTemplate& ChatTemplate() const { return chat_template_; } + const InferenceArgs& Inference() const { return inference_; } + + void Save(const Path& weights_path, NestedPools& pools) const; // `pos` is the position in the KV cache. Users are responsible for // incrementing it in the `*StreamFunc`, or setting to zero for single-turn. void Generate(const RuntimeConfig& runtime_config, const PromptTokens& prompt, - size_t pos, KVCache& kv_cache, TimingInfo& timing_info) { - Generate(runtime_config, prompt, pos, /*prefix_end=*/0, kv_cache, + size_t pos, KVCache& kv_cache, MatMulEnv& env, + TimingInfo& timing_info) const { + Generate(runtime_config, prompt, pos, /*prefix_end=*/0, kv_cache, env, timing_info); } // For prefix-LM style attention, we can pass the end of the prefix. void Generate(const RuntimeConfig& runtime_config, const PromptTokens& prompt, size_t pos, size_t prefix_end, KVCache& kv_cache, - TimingInfo& timing_info); + MatMulEnv& env, TimingInfo& timing_info) const; - // `queries_pos` are the positions in the KV cache. Users are responsible for - // incrementing them in `BatchStreamFunc`, or setting to zero for single-turn. void GenerateBatch(const RuntimeConfig& runtime_config, - const QueriesPromptTokens& queries_prompt, - const QueriesPos& queries_pos, const KVCaches& kv_caches, - TimingInfo& timing_info) { - GenerateBatch(runtime_config, queries_prompt, queries_pos, - /*queries_prefix_end=*/{}, kv_caches, timing_info); - } - // For prefix-LM style attention, we can pass the ends of the prefixes. - void GenerateBatch(const RuntimeConfig& runtime_config, - const QueriesPromptTokens& queries_prompt, - const QueriesPos& queries_pos, - const QueriesPos& queries_prefix_end, - const KVCaches& kv_caches, TimingInfo& timing_info); + AllQueries& all_queries, MatMulEnv& env, + TimingInfo& timing_info) const; // Generates the image tokens by running the image encoder ViT. - void GenerateImageTokens(const RuntimeConfig& runtime_config, - const Image& image, ImageTokens& image_tokens); + void GenerateImageTokens(const RuntimeConfig& runtime_config, size_t seq_len, + const Image& image, ImageTokens& image_tokens, + MatMulEnv& env) const; private: - MatMulEnv& env_; - - GemmaTokenizer tokenizer_; - // Type-erased so that this can be defined in the header. 
-  ModelWeightsStorage model_;
+  BlobReader reader_;
+  ModelStore model_;
+  std::vector<MatOwner> mat_owners_;
+  WeightsPtrs weights_;
+  WeightsPtrs::Mode weight_read_mode_;
+  GemmaChatTemplate chat_template_;
+  InferenceArgs inference_;
 };
 
-// Adds BOS token and possibly 'turn' annotations, which depend on `info`
-// and `pos`, the number of tokens decoded so far; returns the corresponding
-// tokens. Asserts that tokenization is successful.
-std::vector<int> WrapAndTokenize(const GemmaTokenizer& tokenizer,
-                                 const ModelInfo& info, size_t pos,
-                                 std::string& prompt);
-void RangeChecks(const ModelConfig& weights_config,
-                 size_t& max_generated_tokens, size_t prompt_size);
-
 }  // namespace gcpp
 
 #endif  // THIRD_PARTY_GEMMA_CPP_GEMMA_GEMMA_H_
diff --git a/gemma/gemma_args.h b/gemma/gemma_args.h
new file mode 100644
index 0000000..70268c7
--- /dev/null
+++ b/gemma/gemma_args.h
@@ -0,0 +1,273 @@
+// Copyright 2024 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Shared between various frontends.
+
+#ifndef THIRD_PARTY_GEMMA_CPP_GEMMA_ARGS_H_
+#define THIRD_PARTY_GEMMA_CPP_GEMMA_ARGS_H_
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <functional>
+#include <random>
+#include <string>
+
+#include "io/io.h"       // Path
+#include "ops/matmul.h"  // MMStorage::kMax*
+#include "util/args.h"
+#include "util/basics.h"  // Tristate
+#include "util/mat.h"
+#include "hwy/aligned_allocator.h"  // Span
+#include "hwy/base.h"               // HWY_ABORT
+#include "hwy/profiler.h"
+
+namespace gcpp {
+
+struct LoaderArgs : public ArgsBase<LoaderArgs> {
+  LoaderArgs(int argc, char* argv[]) { InitAndParse(argc, argv); }
+  LoaderArgs(const std::string& tokenizer_path,
+             const std::string& weights_path) {
+    Init();  // Init sets to defaults, so assignments must come after Init().
+    tokenizer.path = tokenizer_path;
+    weights.path = weights_path;
+  };
+
+  Path tokenizer;
+  Path weights;  // weights file location
+  Tristate map;
+  Tristate to_bf16;
+  Tristate wrapping;
+
+  template <class Visitor>
+  void ForEach(const Visitor& visitor) {
+    visitor(tokenizer, "tokenizer", Path(),
+            "Path name of tokenizer model; only required for pre-2025 format.");
+    visitor(weights, "weights", Path(),
+            "Path name of model weights (.sbs) file.\n    Required argument.\n");
+    visitor(map, "map", Tristate::kDefault,
+            "Enable memory-mapping? -1 = auto, 0 = no, 1 = yes.");
+    visitor(to_bf16, "to_bf16", Tristate::kDefault,
+            "Convert weights to bf16? -1 = auto, 0 = no, 1 = yes.");
+    visitor(wrapping, "wrapping", Tristate::kDefault,
+            "Enable prompt wrapping? Specify 0 for pre-2025 format PT models.");
+  }
+};
+
+using PromptTokens = hwy::Span<const int>;
+
+// Batches of independent queries have their own prompt, previous token,
+// position in the sequence, and KVCache.
+using QueriesPromptTokens = hwy::Span<const PromptTokens>;
+using QueriesToken = hwy::Span<const int>;
+using QueriesPos = hwy::Span<const size_t>;
+
+// ImageTokens are represented as a matrix, where each row corresponds
+// to a token for an image patch as computed by the image encoder.
+using ImageTokens = MatStorageT<float>;
+
+// StreamFunc is called with (token, probability). For prompt tokens,
+// probability is 0.0f. StreamFunc should return false to stop generation and
+// true to continue generation.
+using StreamFunc = std::function<bool(int, float)>;
+// BatchStreamFunc is called with (query_idx, pos, token, probability).
+// For prompt tokens, probability is 0.0f. Generation continues if this returns
+// true and stops if it returns false. Note that query_idx is absolute, not
+// relative to the batch.
+using BatchStreamFunc = std::function<bool(size_t, size_t, int, float)>;
+// If not empty, AcceptFunc is called with token. It should return false for
+// tokens you don't want to generate and true for tokens you want to generate.
+using AcceptFunc = std::function<bool(int, float)>;
+// If not empty, SampleFunc is called with the logits for the next token, which
+// it may modify/overwrite, and its return value is the next generated token
+// together with its probability.
+using SampleFunc = std::function<TokenAndProb(float*, size_t)>;
+// If not empty, LayersOutputFunc is called for layer outputs, specified with:
+// - index of query within containing batch (if any); zero otherwise.
+// - position in the tokens sequence
+// - name of the data, e.g. "tokens" for token IDs
+// - layer index (or -1 for global outputs)
+// - pointer to the data array
+// - size of the data array
+using LayersOutputFunc = std::function<void(size_t, size_t, const std::string&,
+                                            int, const float*, size_t)>;
+// If not empty, ActivationsObserverFunc is invoked after each layer with:
+// - per-query position within the tokens sequence
+// - layer index (or -1 for post-norm output)
+// - activations
+struct Activations;
+using ActivationsObserverFunc =
+    std::function<void(const QueriesPos& queries_pos, int layer,
+                       const Activations& activations)>;
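Given the SampleFunc shape above, a caller can plug in its own sampler via RuntimeConfig::sample_func (defined just below). A minimal greedy example; TokenAndProb is assumed to be the (token, prob) pair from util/basics.h per the includes above, and the probability formula exploits exp(best - best) / sum == 1 / sum.

#include <stddef.h>
#include <math.h>

// Greedy sampler matching SampleFunc; it is allowed to modify `logits`.
static gcpp::TokenAndProb GreedySample(float* logits, size_t vocab_size) {
  size_t best = 0;
  for (size_t i = 1; i < vocab_size; ++i) {
    if (logits[i] > logits[best]) best = i;
  }
  // Softmax probability of the argmax, without materializing the softmax.
  double sum = 0.0;
  for (size_t i = 0; i < vocab_size; ++i) {
    sum += exp(static_cast<double>(logits[i] - logits[best]));
  }
  return gcpp::TokenAndProb{static_cast<int>(best),
                            static_cast<float>(1.0 / sum)};
}
// Usage: runtime_config.sample_func = GreedySample;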
+  // Mutable so we can change kDefault to kTrue/kFalse during Generate, because
+  // RuntimeConfig is const there and is not passed to the Gemma ctor. This
+  // default decision is likely sufficient because it is based on whether
+  // threads are successfully pinned.
+  mutable Tristate use_spinning = Tristate::kDefault;
+};
+
+struct InferenceArgs : public ArgsBase<InferenceArgs> {
+  InferenceArgs(int argc, char* argv[]) { InitAndParse(argc, argv); }
+  InferenceArgs() { Init(); };
+
+  bool IsInteractive() const { return prompt.empty() && prompt_file.Empty(); }
+
+  int verbosity;
+
+  size_t seq_len;
+  size_t max_generated_tokens;
+
+  size_t prefill_tbatch_size;
+  size_t decode_qbatch_size;
+
+  float temperature;
+  size_t top_k;
+  bool deterministic;
+  bool multiturn;
+  Path image_file;
+
+  std::string prompt;  // Bypasses std::getline
+  // For prompts longer than the Linux terminal's 4K line edit buffer.
+  Path prompt_file;
+  std::string eot_line;
+
+  template <class Visitor>
+  void ForEach(const Visitor& visitor) {
+    visitor(verbosity, "verbosity", 1,
+            "Show verbose developer information\n    0 = only print generation "
+            "output\n    1 = standard user-facing terminal ui\n    2 = show "
+            "developer/debug info.\n    Default = 1.",
+            1);
+
+    visitor(seq_len, "seq_len", size_t{8192},
+            "Sequence length, capped by ModelConfig.max_seq_len.");
+    visitor(max_generated_tokens, "max_generated_tokens", size_t{4096},
+            "Maximum number of tokens to generate.");
+
+    visitor(prefill_tbatch_size, "prefill_tbatch", size_t{256},
+            "Prefill: max tokens per batch.");
+    visitor(decode_qbatch_size, "decode_qbatch", size_t{16},
+            "Decode: max queries per batch.");
+
+    visitor(temperature, "temperature", 1.0f, "Temperature for top-K", 2);
+    visitor(top_k, "top_k", size_t{1}, "Number of top-K tokens to sample from",
+            2);
+    visitor(deterministic, "deterministic", false,
+            "Make top-k sampling deterministic", 2);
+    visitor(multiturn, "multiturn", false,
+            "Multiturn mode\n    0 = clear KV cache after every "
+            "interaction\n    1 = continue KV cache after every interaction\n"
+            "    Default = 0 (conversation resets every turn)");
+    visitor(image_file, "image_file", Path(), "Image file to load.");
+
+    visitor(prompt, "prompt", std::string(""),
+            "Initial prompt for non-interactive mode. When specified, "
+            "generates a response and exits.",
+            1);
+    visitor(prompt_file, "prompt_file", Path(),
+            "Path to file containing the prompt for non-interactive mode. "
+            "When specified, generates a response and exits.",
+            1);
+
+    visitor(
+        eot_line, "eot_line", std::string(""),
+        "End of turn line. "
+        "When you specify this, the prompt will be all lines "
+        "before the line where only the given string appears.\n    Default = "
+        "When a newline is encountered, that signals the end of the turn.",
+        2);
+  }
+
+  void CopyTo(RuntimeConfig& runtime_config) const {
+    runtime_config.max_generated_tokens = max_generated_tokens;
+    runtime_config.prefill_tbatch_size = prefill_tbatch_size;
+    runtime_config.decode_qbatch_size = decode_qbatch_size;
+    if (prefill_tbatch_size > MMStorage::kMaxM) {
+      HWY_ABORT(
+          "prefill_tbatch_size %zu > kMaxM %zu: specify a smaller value, "
+          "or increase the constant in MMStorage.\n",
+          prefill_tbatch_size, MMStorage::kMaxM);
+    }
+    if (decode_qbatch_size > MMStorage::kMaxM) {
+      HWY_ABORT(
+          "decode_qbatch_size %zu > kMaxM %zu: specify a smaller value, "
+          "or increase the constant in MMStorage.\n",
+          decode_qbatch_size, MMStorage::kMaxM);
+    }
+
+    runtime_config.temperature = temperature;
+    runtime_config.top_k = top_k;
+  }
+};
+
+static inline ThreadingArgs UpdateArgs(const ThreadingArgs& threading_args,
+                                       const InferenceArgs& inference_args) {
+  if (inference_args.decode_qbatch_size >= 256) {
+    ThreadingArgs copy = threading_args;
+    copy.max_packages = 1;
+    return copy;
+  }
+  return threading_args;
+}
+
+}  // namespace gcpp
+
+#endif  // THIRD_PARTY_GEMMA_CPP_GEMMA_ARGS_H_
diff --git a/gemma/griffin.cc b/gemma/griffin.cc
new file mode 100644
index 0000000..35bf29a
--- /dev/null
+++ b/gemma/griffin.cc
@@ -0,0 +1,192 @@
+// Copyright 2025 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include "compression/types.h"  // GEMMA_DISABLED_TARGETS
+#ifndef HWY_DISABLED_TARGETS
+#define HWY_DISABLED_TARGETS GEMMA_DISABLED_TARGETS
+#endif  // HWY_DISABLED_TARGETS
+
+#include "gemma/activations.h"
+#include "gemma/gemma.h"
+#include "gemma/gemma_args.h"
+#include "gemma/weights.h"
+#include "hwy/contrib/thread_pool/thread_pool.h"
+#include "hwy/profiler.h"
+
+// Compiles this file for multiple architectures via "foreach_target.h", to
+// which we pass the filename via macro 'argument'.
+// clang-format off +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "gemma/griffin.cc" // NOLINT +// clang-format on +#include "hwy/foreach_target.h" // IWYU pragma: keep +#include "hwy/highway.h" +// After highway.h +#include "ops/matvec-inl.h" +#include "ops/ops-inl.h" + +HWY_BEFORE_NAMESPACE(); +namespace gcpp { +namespace HWY_NAMESPACE { + +void GriffinRecurrent(size_t num_tokens, size_t griffin_layer, + const LayerWeightsPtrs* layer_weights, + Activations& activations, QBatch& qbatch, + MatMulEnv& env) { + PROFILER_ZONE("Gen.Griffin"); + hwy::ThreadPool& pool = env.ctx.pools.Pool(0); + namespace hn = hwy::HWY_NAMESPACE; + using D = hn::ScalableTag; + const D df; + + const size_t model_dim = layer_weights->layer_config.model_dim; + HWY_DASSERT(model_dim % hn::Lanes(df) == 0); + + const size_t heads = layer_weights->layer_config.heads; + const size_t conv_1d_width = layer_weights->layer_config.conv1d_width; + HWY_ASSERT_M(conv_1d_width % 2 == 0, "Conv width must be even"); + const size_t kHeadDim = model_dim / heads; + const size_t kMatrixSize = kHeadDim * kHeadDim; + + const size_t num_interleaved = num_tokens * qbatch.Size(); + const hwy::Divisor div_qbatch(static_cast(qbatch.Size())); + GriffinActivations& griffin = activations.griffin; + + // X / Y linear layers. + // TODO: MatMul + HWY_DASSERT(griffin.griffin_y.Rows() == griffin.griffin_x.Rows()); + HWY_DASSERT(num_interleaved == griffin.griffin_y.Rows()); + CallUpcastedSame( + &layer_weights->griffin.linear_x_w, &layer_weights->griffin.linear_y_w, + [&](const auto* wx, const auto* wy) { + for (size_t r = 0; r < num_interleaved; ++r) { + float* HWY_RESTRICT y = griffin.griffin_y.Row(r); + float* HWY_RESTRICT x = griffin.griffin_x.Row(r); + TwoMatVecAdd( + *wx, *wy, 0, model_dim, model_dim, + activations.attention.pre_att_rms_out.Row(r), + /*add0=*/layer_weights->griffin.linear_x_biases.PackedScale1(), + /*add1=*/layer_weights->griffin.linear_y_biases.PackedScale1(), + /*out0=*/x, /*out1=*/y, pool); + Gelu(y, model_dim); + } + }); + + // Conv1D. + for (size_t interleaved_idx = 0; interleaved_idx < num_interleaved; + ++interleaved_idx) { + const size_t qi = div_qbatch.Remainder(interleaved_idx); + const size_t batch_idx = div_qbatch.Divide(interleaved_idx); + const size_t pos = qbatch.Pos(qi) + batch_idx; + float* HWY_RESTRICT x = griffin.griffin_x.Row(qi); + + // cache[i] = input at time t-i. 
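+    // Ring-buffer layout, illustrated for a hypothetical conv_1d_width == 4:
+    // the per-layer cache row stores conv_1d_width - 1 = 3 past inputs of
+    // model_dim floats each, addressed modulo 3 by the formula below:
+    //   cache[1] -> block (pos + 2) % 3  (input at t-1)
+    //   cache[2] -> block (pos + 1) % 3  (input at t-2)
+    //   cache[3] -> block (pos + 0) % 3  (input at t-3)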
+ float* HWY_RESTRICT cache[kMaxConv1DWidth]; + cache[0] = x; + for (size_t i = 1; i < conv_1d_width; i++) { + cache[i] = + qbatch.KV(qi).conv1d_cache.Row(griffin_layer) + + ((pos + conv_1d_width - 1 - i) % (conv_1d_width - 1)) * model_dim; + } + for (size_t i = 0; i < model_dim; i += hn::Lanes(df)) { + auto xv = hn::Load(df, x + i); + auto accum0 = + hn::Load(df, layer_weights->griffin.conv_biases.PackedScale1() + i); + auto accum1 = hn::Zero(df); + for (size_t l = 0; 2 * l < conv_1d_width; l++) { + auto wv0 = + hn::Load(df, layer_weights->griffin.conv_w.PackedScale1() + + (conv_1d_width - 1 - 2 * l) * model_dim + i); + auto wv1 = + hn::Load(df, layer_weights->griffin.conv_w.PackedScale1() + + (conv_1d_width - 2 - 2 * l) * model_dim + i); + accum0 = hn::MulAdd(wv0, hn::Load(df, cache[l * 2] + i), accum0); + accum1 = hn::MulAdd(wv1, hn::Load(df, cache[l * 2 + 1] + i), accum1); + } + hn::Store(hn::Add(accum0, accum1), df, x + i); + hn::Store(xv, df, cache[HWY_MAX(conv_1d_width, 1) - 1] + i); + } + } + + // RGLRU + for (size_t interleaved_idx = 0; interleaved_idx < num_interleaved; + ++interleaved_idx) { + const size_t qi = div_qbatch.Remainder(interleaved_idx); + const size_t batch_idx = div_qbatch.Divide(interleaved_idx); + const size_t pos = qbatch.Pos(qi) + batch_idx; + + float* HWY_RESTRICT x = griffin.griffin_x.Row(qi); + float* HWY_RESTRICT y = griffin.griffin_y.Row(qi); + float* HWY_RESTRICT gate_x = griffin.griffin_gate_x.Row(qi); + float* HWY_RESTRICT a = griffin.griffin_multiplier.Row(qi); + float* HWY_RESTRICT rnn_state = + qbatch.KV(qi).rglru_cache.Row(griffin_layer); + + pool.Run(0, heads, [&](const uint64_t head, size_t /*thread*/) HWY_ATTR { + size_t head_offset = head * kHeadDim; + CallUpcasted(&layer_weights->griffin.gate_w, [&](const auto* gate_w) { + TwoOfsMatVecAddLoop( + *gate_w, kMatrixSize * head, kMatrixSize * (heads + head), kHeadDim, + kHeadDim, x + head_offset, + /*add0=*/layer_weights->griffin.gate_biases.PackedScale1() + + head_offset, + /*add1=*/layer_weights->griffin.gate_biases.PackedScale1() + + model_dim + head_offset, + /*out0=*/gate_x + head_offset, /*out1=*/a + head_offset); + }); + Sigmoid(gate_x + head_offset, kHeadDim); + Sigmoid(a + head_offset, kHeadDim); + const auto fn_mul = [](D d, hn::Vec x, hn::Vec gate_x) + HWY_ATTR { return hn::Mul(x, gate_x); }; + hn::Transform1(D(), a + head_offset, kHeadDim, + layer_weights->griffin.a.PackedScale1() + head_offset, + fn_mul); + hn::Transform1(D(), x + head_offset, kHeadDim, gate_x + head_offset, + fn_mul); + // RNN scan + HWY_DASSERT(kHeadDim % hn::Lanes(df) == 0); + for (size_t i = 0; i < kHeadDim; i += hn::Lanes(df)) { + auto log_a = hn::Load(df, a + head_offset + i); + auto gated_x = hn::Load(df, x + head_offset + i); + auto rnn = hn::Load(df, rnn_state + head_offset + i); + auto a = hn::Exp(df, log_a); + auto x_multiplier = hn::Sqrt(hn::NegMulAdd(a, a, hn::Set(df, 1.0f))); + if (pos == 0) { + x_multiplier = hn::Set(df, 1.0f); + } + auto new_x = hn::MulAdd(x_multiplier, gated_x, hn::Mul(a, rnn)); + hn::Store(new_x, df, rnn_state + head_offset + i); + + // Join branches. + auto yv = hn::Load(df, y + head_offset + i); + auto pre_out = hn::Mul(yv, new_x); + hn::Store(pre_out, df, x + head_offset + i); + } + }); + } // interleaved_idx + + // Final linear layer. 
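+  // The scan above is the RG-LRU update; per element, roughly:
+  //   a   = exp(log_a)                         // log_a <= 0, so a in (0, 1]
+  //   h_t = a * h_{t-1} + sqrt(1 - a^2) * x_t  // multiplier is 1 at pos 0
+  // The matmul below projects the joined branches (y * h_t) back to model_dim
+  // via linear_out_w [model_dim, model_dim] plus bias, into att_sums.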
+ CallMatMul(griffin.griffin_x, layer_weights->griffin.linear_out_w, + layer_weights->griffin.linear_out_biases.PackedScale1(), env, + activations.attention.att_sums); +} // GriffinRecurrent + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace gcpp +HWY_AFTER_NAMESPACE(); diff --git a/gemma/griffin.h b/gemma/griffin.h new file mode 100644 index 0000000..0ba6a23 --- /dev/null +++ b/gemma/griffin.h @@ -0,0 +1,47 @@ +// Copyright 2025 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef THIRD_PARTY_GEMMA_CPP_GEMMA_GRIFFIN_H_ +#define THIRD_PARTY_GEMMA_CPP_GEMMA_GRIFFIN_H_ + +// Declares GriffinRecurrent for all SIMD targets. + +#include + +#include "gemma/gemma.h" +#include "hwy/highway.h" + +namespace gcpp { + +// Passed to HWY_VISIT_TARGETS; declares for one target. +#define GEMMA_DECL_GRIFFIN(TARGET, NAMESPACE) \ + namespace NAMESPACE { \ + void GriffinRecurrent(size_t num_tokens, size_t griffin_layer, \ + const LayerWeightsPtrs* layer_weights, \ + Activations& activations, QBatch& qbatch, \ + MatMulEnv& env); \ + /* NOLINTNEXTLINE(google-readability-namespace-comments) */ \ + } // namespace NAMESPACE + +// Function declarations for each SIMD target. Allows direct call from the +// per-target namespace. We may later replace this with dynamic dispatch if +// the overhead is acceptable. +HWY_VISIT_TARGETS(GEMMA_DECL_GRIFFIN) + +#undef GEMMA_DECL_GRIFFIN + +} // namespace gcpp + +#endif // THIRD_PARTY_GEMMA_CPP_GEMMA_GRIFFIN_H_ diff --git a/gemma/kv_cache.cc b/gemma/kv_cache.cc index 60ad5dd..9d107e8 100644 --- a/gemma/kv_cache.cc +++ b/gemma/kv_cache.cc @@ -15,91 +15,75 @@ #include "gemma/kv_cache.h" -#include +#include -#include "gemma/common.h" // CallForModel -#include "hwy/aligned_allocator.h" -#include "hwy/base.h" // ZeroBytes +#include "gemma/configs.h" +#include "gemma/gemma_args.h" +#include "util/mat.h" // ZeroInit +#include "hwy/base.h" // HWY_MAX namespace gcpp { void KVCache::ZeroGriffinCache() { - if (conv1d_cache_size != 0) { - hwy::ZeroBytes(conv1d_cache.get(), - conv1d_cache_size * sizeof(conv1d_cache[0])); - } - if (rglru_cache_size != 0) { - hwy::ZeroBytes(rglru_cache.get(), - rglru_cache_size * sizeof(rglru_cache[0])); - } + if (conv1d_cache.Rows() == 0) return; + ZeroInit(conv1d_cache); + ZeroInit(rglru_cache); } -// prefill_tbatch_size is the maximum number of tokens from one query to -// prefill at a time. -KVCache KVCache::Create(const ModelConfig& weights_config, - size_t prefill_tbatch_size) { - KVCache kv_cache = {}; - - const size_t size_cache_pos = weights_config.CachePosSize(); - if (size_cache_pos != 0) { - // Allocate more so that prefill can always access one batch, even if - // near the end of the sequence. 
- kv_cache.seq_len = weights_config.seq_len + prefill_tbatch_size; - kv_cache.kv_cache = - hwy::AllocateAligned(kv_cache.seq_len * size_cache_pos); - } - - const size_t num_griffin_layers = weights_config.NumLayersOfType( - LayerAttentionType::kGriffinRecurrentBlock); - // TODO(patrickms): Add query batching support for Griffin. - if (num_griffin_layers > 0) { - uint32_t conv1d_width = 0; - for (const auto& layer_config : weights_config.layer_configs) { - conv1d_width = std::max(conv1d_width, layer_config.conv1d_width); - } - const size_t conv1d_cache_size = - num_griffin_layers * (conv1d_width == 0 ? 0 : conv1d_width - 1) * - weights_config.model_dim; - kv_cache.conv1d_cache_size = conv1d_cache_size; - if (conv1d_cache_size != 0) { - kv_cache.conv1d_cache = hwy::AllocateAligned(conv1d_cache_size); - } - - const size_t rglru_cache_size = - num_griffin_layers * weights_config.model_dim; - kv_cache.rglru_cache_size = rglru_cache_size; - if (rglru_cache_size != 0) { - kv_cache.rglru_cache = hwy::AllocateAligned(rglru_cache_size); - } - } // num_griffin_layers - - return kv_cache; +static size_t GriffinLayers(const ModelConfig& config) { + return config.NumLayersOfType(LayerAttentionType::kGriffinRecurrentBlock); } -KVCache KVCache::Copy(const ModelConfig& weights_config, - size_t prefill_tbatch_size) { - KVCache kv_cache_copy = Create(weights_config, prefill_tbatch_size); +static size_t GriffinConv1dCols(const ModelConfig& config) { + size_t conv1d_width = 0; + for (const auto& layer_config : config.layer_configs) { + conv1d_width = HWY_MAX(conv1d_width, layer_config.conv1d_width); + } + // The row offset, in blocks of model_dim is computed mod (conv1d_width - 1), + // hence allocate conv1d_width * model_dim total columns. + return conv1d_width * config.model_dim; +} - const size_t size_cache_pos = weights_config.CachePosSize(); - if (size_cache_pos != 0) { - std::copy(kv_cache.get(), kv_cache.get() + size_cache_pos * seq_len, - kv_cache_copy.kv_cache.get()); +// Number of rows for KV cache. Note that both rows and cols are u32, and +// the total number of elements can exceed 2^32. 
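+// Sizing example with hypothetical shapes: seq_len 8192, 42 layers, 8 KV
+// heads and qkv_dim 256 gives 8192 rows x (42 * 8 * 256 * 2) = 172032 cols,
+// about 1.4e9 floats (~5.6 GB); each extent fits in u32 but the total byte
+// count does not.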
+static size_t CappedSeqLen(const ModelConfig& config, + const InferenceArgs& inference_args) { + if (inference_args.seq_len > config.max_seq_len) { + HWY_WARN("Capping seq_len %zu to config.max_seq_len %u.", + inference_args.seq_len, config.max_seq_len); + return config.max_seq_len; + } + return inference_args.seq_len; +} + +KVCache::KVCache(const Extents2D& conv1d_extents, + const Extents2D& rglru_extents, const Extents2D& kv_extents, + const Allocator& allocator) + : conv1d_cache("conv1d_cache", conv1d_extents, allocator, MatPadding::kOdd), + rglru_cache("rglru_cache", rglru_extents, allocator, MatPadding::kOdd), + kv_cache("kv", kv_extents, allocator, MatPadding::kOdd), + allocator_(allocator) {} + +KVCache::KVCache(const ModelConfig& config, const InferenceArgs& inference_args, + const Allocator& allocator) + : KVCache( + Extents2D(GriffinLayers(config), GriffinConv1dCols(config)), + Extents2D(GriffinLayers(config), config.model_dim), + Extents2D(CappedSeqLen(config, inference_args), config.KVCacheCols()), + allocator) {} + +KVCache KVCache::Copy() { + KVCache copy(conv1d_cache.Extents(), rglru_cache.Extents(), + kv_cache.Extents(), allocator_); + + if (conv1d_cache.Rows() != 0) { + CopyMat(conv1d_cache, copy.conv1d_cache); + CopyMat(rglru_cache, copy.rglru_cache); } - const size_t num_griffin_layers = weights_config.NumLayersOfType( - LayerAttentionType::kGriffinRecurrentBlock); - if (num_griffin_layers > 0) { - if (conv1d_cache_size != 0) { - std::copy(conv1d_cache.get(), conv1d_cache.get() + conv1d_cache_size, - kv_cache_copy.conv1d_cache.get()); - } - if (rglru_cache_size != 0) { - std::copy(rglru_cache.get(), - rglru_cache.get() + rglru_cache_size * sizeof(rglru_cache[0]), - kv_cache_copy.rglru_cache.get()); - } - } - return kv_cache_copy; + CopyMat(kv_cache, copy.kv_cache); + + return copy; } } // namespace gcpp diff --git a/gemma/kv_cache.h b/gemma/kv_cache.h index 6052d0b..7b5b88d 100644 --- a/gemma/kv_cache.h +++ b/gemma/kv_cache.h @@ -18,34 +18,41 @@ #include -#include "gemma/common.h" // Model -#include "hwy/aligned_allocator.h" +#include "gemma/configs.h" // ModelConfig +#include "gemma/gemma_args.h" // InferenceArgs +#include "util/basics.h" // BF16 +#include "util/mat.h" namespace gcpp { +using KV_t = float; + struct KVCache { - size_t seq_len = 0; // = kSeqLen + prefill_tbatch_size + KVCache(const ModelConfig& config, const InferenceArgs& inference_args, + const Allocator& allocator); - // seq_len * kGemmaLayers * kKVHeads * kQKVDim * 2 - hwy::AlignedFreeUniquePtr kv_cache; - - // (kConv1dWidth - 1) * kModelDim * kGriffinLayers - hwy::AlignedFreeUniquePtr conv1d_cache; - size_t conv1d_cache_size = 0; - - // kModelDim * kGriffinLayers - hwy::AlignedFreeUniquePtr rglru_cache; - size_t rglru_cache_size = 0; + // Returns a deep copy of the KVCache. Use explicit function instead of + // copy ctor to make the cost explicit. + KVCache Copy(); // Zero-initialize the Griffin recurrent block cache, i.e. the conv1d_cache // and rglru_cache. void ZeroGriffinCache(); - static KVCache Create(const ModelConfig& weights_config, - size_t prefill_tbatch_size); + size_t SeqLen() const { return kv_cache.Rows(); } - // Returns a deep copy of the KVCache. 
- KVCache Copy(const ModelConfig& weights_config, size_t prefill_tbatch_size); + // [griffin_layers, griffin_conv1d_cols * model_dim] + MatStorageT conv1d_cache; + MatStorageT rglru_cache; // [griffin_layers, model_dim] + + MatStorageT kv_cache; // [seq_len, layers * kv_heads * qkv_dim * 2] + + private: + const Allocator& allocator_; + + // For use by other ctor and Copy() + KVCache(const Extents2D& conv1d_extents, const Extents2D& rglru_extents, + const Extents2D& kv_extents, const Allocator& allocator); }; } // namespace gcpp diff --git a/gemma/model_store.cc b/gemma/model_store.cc new file mode 100644 index 0000000..8f6c138 --- /dev/null +++ b/gemma/model_store.cc @@ -0,0 +1,464 @@ +// Copyright 2025 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "gemma/model_store.h" + +#include +#include +#include + +#include +#include +#include +#include // strcmp +#include +#include // std::errc // NOLINT + +#include "compression/types.h" +#include "gemma/configs.h" // ModelConfig, kMaxQKVDim +#include "gemma/tensor_info.h" +#include "gemma/tokenizer.h" +#include "io/blob_store.h" +#include "io/fields.h" +#include "io/io.h" // Path +#include "util/basics.h" +#include "util/threading_context.h" +#include "hwy/base.h" + +namespace gcpp { + +// Single-file format contains blobs with these names: +static constexpr char kConfigName[] = "config"; +static constexpr char kTokenizerName[] = "tokenizer"; +static constexpr char kMatPtrsName[] = "toc"; +// Pre-2025 format has one metadata blob. 'F' denoted f32. +static constexpr char kDecoratedScalesName[] = "Fscales"; + +static void WarnIfExtra(const IFields::ReadResult& result, const char* name) { + // No warning if missing_fields > 0: those fields are default-initialized. + if (result.extra_u32) { + HWY_WARN( + "Serialized blob %s has %u extra fields the code is not aware of. " + "Consider updating to the latest code from GitHub.", + name, result.extra_u32); + } +} + +// Returns the serialized tokenizer (std::string is required for proto). +// Reads it from a blob or from a separate file if pre-2025. +static std::string ReadTokenizer(BlobReader& reader, + const Path& tokenizer_path) { + std::string tokenizer; + // Check prevents `CallWithSpan` from printing a warning. + if (reader.Find(kTokenizerName)) { + if (!reader.CallWithSpan( + kTokenizerName, [&tokenizer](const hwy::Span bytes) { + tokenizer.assign(bytes.data(), bytes.size()); + })) { + HWY_WARN( + "Reading tokenizer blob failed, please raise an issue. You can " + "instead specify a tokenizer file via --tokenizer."); + } + } + + // Read actual tokenizer from blob. + if (!tokenizer.empty() && tokenizer != kMockTokenizer) { + if (!tokenizer_path.Empty()) { + HWY_WARN("--weights has tokenizer but overriding with %s.", + tokenizer_path.path.c_str()); + return ReadFileToString(tokenizer_path); + } + + return tokenizer; + } + + // No blob but user specified path to file: read it or abort. 
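+  // e.g. for a pre-2025 weights file the tokenizer lives in a separate file
+  // (hypothetical paths):
+  //   ./gemma --weights 2b-it-sfp.sbs --tokenizer tokenizer.spm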
+ if (!tokenizer_path.Empty()) { + return ReadFileToString(tokenizer_path); + } + + HWY_WARN( + "BlobStore does not contain a tokenizer and no --tokenizer was " + "specified. Tests may continue but inference will fail."); + return kMockTokenizer; +} + +using KeyVec = std::vector; + +class TypePrefix { + public: + static Type TypeFromChar(char c) { + switch (c) { + case 'F': + return Type::kF32; + case 'B': + return Type::kBF16; + case '$': + return Type::kSFP; + case '2': + return Type::kNUQ; + default: + // The other types were not written to pre-2025 files, hence no need to + // encode and check for them here. + return Type::kUnknown; + } + } + + TypePrefix(const KeyVec& keys, const BlobReader& reader) { + for (size_t key_idx = 0; key_idx < keys.size(); ++key_idx) { + const std::string& key = keys[key_idx]; + const Type type = TypeFromChar(key[0]); + const uint64_t bytes = reader.Range(key_idx).bytes; + bytes_[static_cast(type)] += bytes; + blobs_[static_cast(type)]++; + total_bytes_ += bytes; + } + } + + // Returns true for pre-2025 format, which has type prefixes and thus the + // functions below may be used. + bool HasPrefixes() const { + return bytes_[static_cast(Type::kUnknown)] != total_bytes_; + } + + // Returns the weight type deduced from the histogram of blobs per type. + // Rationale: We expect a mix of types due to varying precision requirements + // for each tensor. The preferred weight type might not even be the most + // common, because we prioritize higher compression for the *large* tensors. + // Ignore types which only have a few blobs (might be metadata), and assume + // that there would be at least 4 of the large tensors (in particular, global + // attention layers). Hence return the smallest type with >= 4 blobs. + Type DeduceWeightType() const { + size_t min_bits = ~size_t{0}; + Type weight_type = Type::kUnknown; + for (size_t i = 0; i < kNumTypes; ++i) { + if (blobs_[i] < 4) continue; + const size_t bits = TypeBits(static_cast(i)); + if (bits < min_bits) { + min_bits = bits; + weight_type = static_cast(i); + } + } + return weight_type; + } + + // Prints statistics on the total size of tensors by type. + void PrintTypeBytes() const { + for (size_t type_idx = 0; type_idx < kNumTypes; ++type_idx) { + const Type type = static_cast(type_idx); + const uint64_t bytes = bytes_[type_idx]; + if (bytes == 0) continue; + const double percent = 100.0 * bytes / total_bytes_; + fprintf(stderr, "%12zu blob bytes (%5.2f%%) of %4s\n", + static_cast(bytes), percent, TypeName(type)); + } + } + + private: + uint64_t total_bytes_ = 0; + std::array bytes_{0}; + std::array blobs_{0}; +}; + +// Returns 0 if the blob does not seem to be a per-layer tensor, otherwise the +// layer index. +static size_t LayerIdxFromKey(const std::string& key) { + const auto parse_num = [&key](size_t begin, size_t end) -> int { + HWY_DASSERT(begin <= end); + HWY_DASSERT(end <= key.size()); + int val = 0; + auto [ptr, ec] = std::from_chars(key.data() + begin, key.data() + end, val); + return (ec == std::errc()) ? val : -1; + }; + + const size_t suffix_pos = key.rfind('_'); + // If there is no digit after the last underscore, it is not a layer name. + if (suffix_pos == std::string::npos) return 0; + if (suffix_pos == key.size() - 1) return 0; + + int layer_idx = parse_num(suffix_pos + 1, key.size()); + + HWY_ASSERT(layer_idx < 999); + return layer_idx == -1 ? 0 : static_cast(layer_idx); +} + +// Returns the number of layers based on the largest blob name suffix seen. 
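+// e.g. keys {"c_embedding", "qkv_ein_0", ..., "qkv_ein_25"} deduce 26 layers,
+// because the largest "_N" suffix is 25.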
+// This works with or without type prefixes because it searches for suffixes. +static size_t DeduceNumLayers(const KeyVec& keys) { + // Built-in self-test. + { + HWY_ASSERT(LayerIdxFromKey("gr_conv_w_2") == 2); // common case + HWY_ASSERT(LayerIdxFromKey("prefix_") == 0); // no number + HWY_ASSERT(LayerIdxFromKey("c_embedding") == 0); // per-model + HWY_ASSERT(LayerIdxFromKey("c_final_norm") == 0); // per-model, two _ + } + + size_t max_layer_idx = 0; + for (const std::string& key : keys) { + max_layer_idx = HWY_MAX(max_layer_idx, LayerIdxFromKey(key)); + } + return max_layer_idx + 1; +} + +// Looks for known tensor names associated with model families. +// This works with or without type prefixes because it searches for substrings. +static int DeduceLayerTypes(const BlobReader& reader) { + int layer_types = 0; + for (size_t key_idx = 0; key_idx < reader.Keys().size(); ++key_idx) { + const std::string& key = reader.Keys()[key_idx]; + if (key.find("gr_conv_w") != std::string::npos) { // NOLINT + return kDeducedGriffin; + } + if (key.find("qkv_ein_w") != std::string::npos) { // NOLINT + layer_types |= kDeducedViT; + } + if (key.find("img_pos_emb") != std::string::npos) { // NOLINT + // About 5.88 elements per pixel; assume at least bf16. + if (reader.Range(key_idx).bytes > 448 * 448 * 5 * sizeof(BF16)) { + layer_types |= kDeduced448; + } + } + } + return layer_types; +} + +// `wrapping_override` is forwarded from the command line. For pre-2025 files +// without `ModelConfig`, it is the only way to force PT. +static ModelConfig ReadOrDeduceConfig(BlobReader& reader, + Tristate wrapping_override) { + const TypePrefix type_prefix(reader.Keys(), reader); + Type deduced_weight = Type::kUnknown; + if (type_prefix.HasPrefixes()) { + deduced_weight = type_prefix.DeduceWeightType(); + type_prefix.PrintTypeBytes(); + } + + // Always deduce so we can verify it against the config we read. + const size_t layers = DeduceNumLayers(reader.Keys()); + const int layer_types = DeduceLayerTypes(reader); + const Model deduced_model = + DeduceModel(reader.blob_path(), layers, layer_types); + + ModelConfig config; + // Check first to prevent `CallWithSpan` from printing a warning. + if (reader.Find(kConfigName)) { + HWY_ASSERT(reader.CallWithSpan( + kConfigName, [&config](const SerializedSpan serialized) { + const IFields::ReadResult result = config.Read(serialized, 0); + WarnIfExtra(result, kConfigName); + HWY_ASSERT_M(result.pos != 0, "Error deserializing config"); + })); + + HWY_ASSERT(config.model != Model::UNKNOWN); + HWY_ASSERT(config.wrapping != PromptWrapping::kSentinel); + HWY_ASSERT(config.weight != Type::kUnknown); + for (const LayerConfig& layer_config : config.layer_configs) { + if (static_cast(layer_config.qkv_dim) > kMaxQKVDim) { + HWY_ABORT("Increase kMaxQKVDim to at least %u.", layer_config.qkv_dim); + } + } + + // We trust the deserialized config, but checking helps to validate the + // deduction, which we rely on below for pre-2025 files. + if (config.model != deduced_model) { + const std::string suffix = WrappingSuffix(config.wrapping); + HWY_WARN("Detected model %s does not match config %s.", + (std::string(ModelPrefix(deduced_model)) + suffix).c_str(), + (std::string(ModelPrefix(config.model)) + suffix).c_str()); + } + return config; + } + + // Pre-2025 format: no config, rely on deduction plus `wrapping_override`. 
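+  // Deduction example for a hypothetical pre-2025 file: any key containing
+  // "gr_conv_w" marks a Griffin model, and passing --wrapping 0 forces PT
+  // (no prompt wrapping), since such files carry no serialized ModelConfig.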
+ return ModelConfig(deduced_model, deduced_weight, + ChooseWrapping(deduced_model, wrapping_override)); +} + +static std::vector ReadScales(BlobReader& reader, + const ModelConfig& config) { + std::vector scales; + // Check first to prevent `CallWithSpan` from printing a warning. This blob is + // optional even in pre-2025 format; Griffin was the first to include it. + if (reader.Find(kDecoratedScalesName)) { + HWY_ASSERT(reader.CallWithSpan( + kDecoratedScalesName, + [&scales](const hwy::Span scales_blob) { + scales.assign(scales_blob.cbegin(), scales_blob.cend()); + })); + } + return scales; +} + +// Single-file format: reads `MatPtr` from the blob; returns false if not found. +bool ModelStore::ReadMatPtrs(BlobReader& reader) { + // Check first to prevent `CallWithSpan` from printing a warning. + if (!reader.Find(kMatPtrsName)) return false; + + // For verifying `config_.weight`. + size_t min_bits = ~size_t{0}; + Type weight_type = Type::kUnknown; + + HWY_ASSERT(reader.CallWithSpan( + kMatPtrsName, [&, this](SerializedSpan serialized) { + for (size_t pos = 0; pos < serialized.size();) { + MatPtr mat; + const IFields::ReadResult result = mat.Read(serialized, pos); + WarnIfExtra(result, mat.Name()); + if (result.pos == 0) { + HWY_ABORT("Deserializing MatPtr %s failed (pos %zu of %zu).", + mat.Name(), pos, serialized.size()); + } + pos = result.pos + result.extra_u32; + + // Retrieve actual key index because a writer may have written other + // blobs before the tensor data. + const BlobRange* range = reader.Find(mat.Name()); + HWY_ASSERT(range); + const size_t key_idx = range->key_idx; + AddMatPtr(key_idx, mat); + + const size_t bits = TypeBits(mat.GetType()); + if (bits < min_bits) { + min_bits = bits; + weight_type = mat.GetType(); + } + } + })); + + HWY_ASSERT(weight_type != Type::kUnknown); + HWY_ASSERT(weight_type == config_.weight); + + return true; +} + +// Pre-2025 format: synthesizes `MatPtr` from the blob names if `!ReadMatPtrs`. +void ModelStore::CreateMatPtrs(BlobReader& reader) { + const TensorInfoRegistry tensors(config_); + + const KeyVec& keys = reader.Keys(); + mat_ptrs_.reserve(keys.size()); + // `key_idx` is the blob index. It is not the same as the index of the + // `MatPtr` in `mat_ptrs_` because not all blobs are tensors. + for (size_t key_idx = 0; key_idx < keys.size(); ++key_idx) { + const Type type = TypePrefix::TypeFromChar(keys[key_idx][0]); + if (type == Type::kUnknown) continue; // likely not a tensor + + // Strip type prefix from the key. Still includes layer suffix. + const std::string name = keys[key_idx].substr(1); + const TensorInfo* info = tensors.Find(name); + if (HWY_UNLIKELY(!info)) { + if (name == "scales") continue; // ignore, not a tensor. + HWY_ABORT("Unknown tensor %s.", name.c_str()); + } + // Unable to set scale already because they are ordered according to + // `ForEachTensor`, which we do not know here. The initial value is 1.0f + // and we set the correct value in `FindAndUpdateMatPtr`. + AddMatPtr(key_idx, MatPtr(name.c_str(), type, ExtentsFromInfo(info))); + } + HWY_ASSERT(mat_ptrs_.size() <= keys.size()); + HWY_ASSERT(mat_ptrs_.size() == key_idx_.size()); +} + +ModelStore::ModelStore(BlobReader& reader, const Path& tokenizer_path, + Tristate wrapping) + : config_(ReadOrDeduceConfig(reader, wrapping)), + tokenizer_(ReadTokenizer(reader, tokenizer_path)) { + if (!ReadMatPtrs(reader)) { // Pre-2025 format. + CreateMatPtrs(reader); + scales_ = ReadScales(reader, config_); + // ModelConfig serialized a vector of strings. 
Unpack into a set for more + // efficient lookup. + for (const std::string& name : config_.scale_base_names) { + scale_base_names_.insert(name); + } + // If the model has scales, the config must know about it. + HWY_ASSERT(scales_.empty() || !scale_base_names_.empty()); + } + + HWY_ASSERT(key_idx_.size() == mat_ptrs_.size()); +} + +ModelStore::~ModelStore() { + // Sanity check: ensure all scales were consumed. + HWY_ASSERT(scales_consumed_ == scales_.size()); +} + +const MatPtr* ModelStore::FindMat(const char* name) const { + auto it = mat_idx_for_name_.find(name); + if (it == mat_idx_for_name_.end()) return nullptr; + const size_t mat_idx = it->second; + const MatPtr* file_mat = &mat_ptrs_[mat_idx]; + HWY_ASSERT(!strcmp(file_mat->Name(), name)); + return file_mat; +} + +bool ModelStore::FindAndUpdateMatPtr(MatPtr& mat, size_t& key_idx) const { + const MatPtr* file_mat = FindMat(mat.Name()); + if (!file_mat) return false; + if (file_mat->Rows() != mat.Rows() || file_mat->Cols() != mat.Cols()) { + HWY_ABORT("Tensor %s shape %zu %zu mismatches file %zu %zu.", mat.Name(), + mat.Rows(), mat.Cols(), file_mat->Rows(), file_mat->Cols()); + } + // `Compress()` output is always packed because it assumes a 1D array. + HWY_ASSERT(mat.IsPacked()); + // Update fields. Name already matched, otherwise we would not find it. + // For MatPtr tensors, the type will be `kUnknown`. If it was a `MatPtrT`, + // ensure the type set via code matches the file. + HWY_ASSERT_M( + mat.GetType() == Type::kUnknown || mat.GetType() == file_mat->GetType(), + mat.Name()); + mat.SetType(file_mat->GetType()); + if (scales_.empty()) { + // `file_mat->Scale()` is either read from file, or we have pre-2025 format + // without the optional scales, and it is default-initialized to 1.0f. + mat.SetScale(file_mat->Scale()); + } else { // Pre-2025 with scaling factors: set next if `mat` wants one. + if (scale_base_names_.find(StripLayerSuffix(mat.Name())) != + scale_base_names_.end()) { + HWY_ASSERT(scales_consumed_ < scales_.size()); + mat.SetScale(scales_[scales_consumed_++]); + } + } + + key_idx = key_idx_[file_mat - mat_ptrs_.data()]; + return true; +} + +static void AddBlob(const char* name, const std::vector& data, + BlobWriter& writer) { + HWY_ASSERT(!data.empty()); + writer.Add(name, data.data(), data.size() * sizeof(data[0])); +} + +void WriteSingleFile(const ModelConfig& config, const GemmaTokenizer& tokenizer, + const std::vector& serialized_mat_ptrs, + BlobWriter& writer) { + HWY_ASSERT(config.model != Model::UNKNOWN); + HWY_ASSERT(config.weight != Type::kUnknown); + HWY_ASSERT(config.wrapping != PromptWrapping::kSentinel); + const std::vector serialized_config = config.Write(); + AddBlob(kConfigName, serialized_config, writer); + + const std::string serialized_tokenizer = tokenizer.Serialize(); + HWY_ASSERT(!serialized_tokenizer.empty()); + writer.Add(kTokenizerName, serialized_tokenizer.data(), + serialized_tokenizer.size()); + + AddBlob(kMatPtrsName, serialized_mat_ptrs, writer); + + writer.WriteAll(); +} + +} // namespace gcpp diff --git a/gemma/model_store.h b/gemma/model_store.h new file mode 100644 index 0000000..42af343 --- /dev/null +++ b/gemma/model_store.h @@ -0,0 +1,111 @@ +// Copyright 2025 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Reads/writes model metadata (all but the weights) from/to a `BlobStore`. +#ifndef THIRD_PARTY_GEMMA_CPP_GEMMA_MODEL_STORE_H_ +#define THIRD_PARTY_GEMMA_CPP_GEMMA_MODEL_STORE_H_ + +#include +#include + +#include +#include +#include +#include +#include + +// IWYU pragma: begin_exports +#include "gemma/configs.h" // ModelConfig +#include "gemma/tokenizer.h" +#include "io/blob_store.h" +#include "io/io.h" // Path +#include "util/basics.h" // Tristate +#include "util/mat.h" // MatPtr +// IWYU pragma: end_exports + +#include "util/allocator.h" +#include "hwy/contrib/thread_pool/thread_pool.h" + +namespace gcpp { + +// Reads and holds the model config, tokenizer and all `MatPtr`: everything +// except the tensor data, which are read/written by `weights.cc`. +// +// As of 2025-04, the `BlobStore` format includes blobs for `ModelConfig`, +// tokenizer, and all `MatPtr` metadata. "Pre-2025" format instead stored the +// tokenizer in a separate file, encoded tensor type in a prefix of the blob +// name, and had a blob for tensor scaling factors. We still support reading +// both, but only write single-file format. +class ModelStore { + public: + // Reads from file(s) or aborts on error. The latter two arguments are only + // used for pre-2025 files. + ModelStore(BlobReader& reader, const Path& tokenizer_path = Path(), + Tristate wrapping = Tristate::kDefault); + ~ModelStore(); + + const ModelConfig& Config() const { + HWY_ASSERT(config_.model != Model::UNKNOWN); + return config_; + } + + const GemmaTokenizer& Tokenizer() const { return tokenizer_; } + + // Returns nullptr if `name` is not available for loading, otherwise the + // metadata of that tensor. + const MatPtr* FindMat(const char* name) const; + + // Returns false if `mat` is not available for loading, otherwise updates + // `mat` with metadata from the file and sets `key_idx` for use by + // `BlobReader`. Called via `ReadOrAllocate` in `weights.cc`. + bool FindAndUpdateMatPtr(MatPtr& mat, size_t& key_idx) const; + + private: + void AddMatPtr(const size_t key_idx, const MatPtr& mat) { + auto pair_ib = mat_idx_for_name_.insert({mat.Name(), mat_ptrs_.size()}); + HWY_ASSERT_M(pair_ib.second, mat.Name()); // Ensure inserted/unique. + mat_ptrs_.push_back(mat); + key_idx_.push_back(key_idx); + } + + bool ReadMatPtrs(BlobReader& reader); + void CreateMatPtrs(BlobReader& reader); // Aborts on error. + + ModelConfig config_; + GemmaTokenizer tokenizer_; + + // All `MatPtr` present in the `BlobStore`, see `ReadMatPtrs`/`CreateMatPtrs`. + std::vector mat_ptrs_; + // For each of `mat_ptrs_`, the index within `BlobReader::Keys()`. This is + // not necessarily iota because some blobs are not tensors, and callers may + // have added blobs before ours. + std::vector key_idx_; + // Index within `mat_ptrs_` and `key_idx_` for each tensor name. + std::unordered_map mat_idx_for_name_; + + // Only used if `!ReadMatPtrs` (pre-2025 format): + std::vector scales_; + std::unordered_set scale_base_names_; + mutable size_t scales_consumed_ = 0; +}; + +// Adds metadata blobs to `writer` and writes everything to `path`. 
This +// produces a single BlobStore file holding everything required for inference. +void WriteSingleFile(const ModelConfig& config, const GemmaTokenizer& tokenizer, + const std::vector& serialized_mat_ptrs, + BlobWriter& writer); + +} // namespace gcpp +#endif // THIRD_PARTY_GEMMA_CPP_GEMMA_MODEL_STORE_H_ diff --git a/gemma/run.cc b/gemma/run.cc index 254d13f..7cbc4de 100644 --- a/gemma/run.cc +++ b/gemma/run.cc @@ -15,22 +15,22 @@ // Command line text interface to gemma. +#include + #include #include #include #include #include -// Placeholder for internal header, do not modify. -#include "compression/shared.h" // PromptWrapping +#include "compression/types.h" // PromptWrapping #include "evals/benchmark_helper.h" -#include "gemma/common.h" #include "gemma/gemma.h" // Gemma -#include "ops/matmul.h" // MatMulEnv +#include "gemma/gemma_args.h" +#include "gemma/tokenizer.h" // WrapAndTokenize +#include "ops/matmul.h" // MatMulEnv #include "paligemma/image.h" -#include "util/app.h" #include "util/args.h" // HasHelp -#include "util/threading.h" #include "hwy/base.h" #include "hwy/highway.h" #include "hwy/profiler.h" @@ -55,9 +55,8 @@ static constexpr std::string_view kAsciiArtBanner = R""( |___/ |_| |_| )""; -std::string GetPrompt(std::istream& input, int verbosity, - std::string_view eot_line) { - PROFILER_ZONE("Gen.input"); +std::string GetPromptFromStream(std::istream& input, int verbosity, + std::string_view eot_line) { if (verbosity >= 1) { std::cout << "> " << std::flush; } @@ -77,36 +76,55 @@ std::string GetPrompt(std::istream& input, int verbosity, return prompt_string; } +// Get prompt either from interactive input or command line +std::string GetPrompt(const InferenceArgs& inference) { + PROFILER_ZONE("Gen.input"); + // If prompt is provided via command line, use that + if (!inference.prompt.empty()) { + return inference.prompt; + } + if (!inference.prompt_file.Empty()) { + return ReadFileToString(inference.prompt_file); + } + + return GetPromptFromStream(std::cin, inference.verbosity, inference.eot_line); +} + // The main Read-Eval-Print Loop. -void ReplGemma(Gemma& model, KVCache& kv_cache, const AppArgs& app, - const InferenceArgs& args, const AcceptFunc& accept_token, - std::string& eot_line) { +void ReplGemma(const ThreadingArgs& threading, const InferenceArgs& inference, + const Gemma& gemma, KVCache& kv_cache, MatMulEnv& env) { PROFILER_ZONE("Gen.misc"); size_t abs_pos = 0; // across turns size_t tokens_generated_this_turn = 0; // differentiates prefill from reply size_t prompt_size = 0; + const ModelConfig& config = gemma.Config(); std::mt19937 gen; - InitGenerator(args, gen); + InitGenerator(inference, gen); - const bool have_image = !args.image_file.path.empty(); + const bool have_image = !inference.image_file.path.empty(); Image image; - ImageTokens image_tokens; + const size_t pool_dim = config.vit_config.pool_dim; + ImageTokens image_tokens( + "image_tokens", + have_image ? 
Extents2D(config.vit_config.seq_len / (pool_dim * pool_dim), + config.model_dim) + : Extents2D(0, 0), + env.ctx.allocator, MatPadding::kOdd); + image_tokens.AllocateAndAttachRowPtrs(env.row_ptrs); if (have_image) { - size_t pool_dim = model.GetModelConfig().vit_config.pool_dim; - image_tokens = ImageTokens(Extents2D( - model.GetModelConfig().vit_config.seq_len / (pool_dim * pool_dim), - model.GetModelConfig().model_dim)); - HWY_ASSERT(model.Info().wrapping == PromptWrapping::PALIGEMMA || - model.Info().wrapping == PromptWrapping::GEMMA_VLM); - HWY_ASSERT(image.ReadPPM(args.image_file.path)); - const size_t image_size = model.GetModelConfig().vit_config.image_size; + HWY_ASSERT(config.wrapping == PromptWrapping::PALIGEMMA || + config.wrapping == PromptWrapping::GEMMA_VLM); + HWY_ASSERT(image.ReadPPM(inference.image_file.path)); + const size_t image_size = config.vit_config.image_size; image.Resize(image_size, image_size); - RuntimeConfig runtime_config = { - .gen = &gen, .verbosity = app.verbosity, .use_spinning = app.spin}; + RuntimeConfig runtime_config = {.gen = &gen, + .verbosity = inference.verbosity, + .use_spinning = threading.spin}; double image_tokens_start = hwy::platform::Now(); - model.GenerateImageTokens(runtime_config, image, image_tokens); - if (app.verbosity >= 1) { + gemma.GenerateImageTokens(runtime_config, kv_cache.SeqLen(), image, + image_tokens, env); + if (inference.verbosity >= 1) { double image_tokens_duration = hwy::platform::Now() - image_tokens_start; fprintf(stderr, "\n\n[ Timing info ] Image token generation took: %d ms\n", @@ -121,21 +139,21 @@ void ReplGemma(Gemma& model, KVCache& kv_cache, const AppArgs& app, const bool first_response_token = tokens_generated_this_turn == prompt_size; ++tokens_generated_this_turn; if (in_prompt) { - if (app.verbosity >= 1) { - std::cerr << "." << std::flush; + if (inference.verbosity >= 1) { + std::cout << "." << std::flush; } return true; - } else if (model.GetModelConfig().IsEOS(token)) { - if (app.verbosity >= 2) { + } else if (config.IsEOS(token)) { + if (inference.verbosity >= 2) { std::cout << "\n[ End ]\n"; } return true; } std::string token_text; - HWY_ASSERT(model.Tokenizer().Decode(std::vector{token}, &token_text)); + HWY_ASSERT(gemma.Tokenizer().Decode(std::vector{token}, &token_text)); if (first_response_token) { token_text.erase(0, token_text.find_first_not_of(" \t\n")); - if (app.verbosity >= 1) { + if (inference.verbosity >= 1) { std::cout << "\n\n"; } } @@ -146,72 +164,77 @@ void ReplGemma(Gemma& model, KVCache& kv_cache, const AppArgs& app, while (true) { // Loop until user quits. tokens_generated_this_turn = 0; - // Read prompt and handle special commands. - std::string prompt_string = GetPrompt(std::cin, app.verbosity, eot_line); - if (!std::cin) return; - // If !eot_line.empty(), we append \n, so only look at the first 2 chars. - if (prompt_string.size() >= 2 && prompt_string[0] == '%') { - if (prompt_string[1] == 'q' || prompt_string[1] == 'Q') return; - if (prompt_string[1] == 'c' || prompt_string[1] == 'C') { - abs_pos = 0; + const std::string prompt_string = GetPrompt(inference); + const bool is_interactive = inference.IsInteractive(); + if (is_interactive) { // handle special commands: + if (!std::cin) return; + + // If !eot_line.empty(), we append \n, so only look at the first 2 chars. 
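+      // Illustrative session (hypothetical): when run with --eot_line END,
+      // a prompt may span several lines and ends at a line containing only
+      // "END"; "%q" quits and "%c" resets the conversation (handled below).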
+ if (prompt_string.size() >= 2 && prompt_string[0] == '%') { + if (prompt_string[1] == 'q' || prompt_string[1] == 'Q') return; + if (prompt_string[1] == 'c' || prompt_string[1] == 'C') { + abs_pos = 0; + continue; + } + } + + if (prompt_string.empty()) { + std::cout << "Use '%q' to quit.\n"; continue; } } - if (prompt_string.empty()) { - std::cout << "Use '%q' to quit.\n"; - continue; + + // Set up runtime config. + TimingInfo timing_info = {.verbosity = inference.verbosity}; + RuntimeConfig runtime_config = {.gen = &gen, + .verbosity = inference.verbosity, + .stream_token = stream_token, + .use_spinning = threading.spin}; + inference.CopyTo(runtime_config); + std::vector prompt; + size_t prefix_end = 0; + if (have_image) { + prompt = WrapAndTokenize(gemma.Tokenizer(), gemma.ChatTemplate(), + config.wrapping, abs_pos, prompt_string, + image_tokens.Rows()); + runtime_config.image_tokens = &image_tokens; + prompt_size = prompt.size(); + if (config.wrapping == PromptWrapping::PALIGEMMA) { + // The end of the prefix for prefix-LM style attention in Paligemma. + // See Figure 2 of https://arxiv.org/abs/2407.07726. + prefix_end = prompt_size; + // We need to look at all the tokens for the prefix. + // NOTE: Online softmax is on the roadmap, after which this requirement + // can be lifted. + runtime_config.prefill_tbatch_size = prompt_size; + } + } else { + prompt = WrapAndTokenize(gemma.Tokenizer(), gemma.ChatTemplate(), + config.wrapping, abs_pos, prompt_string); + prompt_size = prompt.size(); } - // Wrap, tokenize and maybe log prompt tokens. - std::vector prompt = WrapAndTokenize( - model.Tokenizer(), model.Info(), abs_pos, prompt_string); - prompt_size = prompt.size(); if constexpr (kVerboseLogTokens) { for (int i = 0; i < prompt_size; ++i) { fprintf(stderr, "DDD TOKEN %3d: %6d\n", i, prompt[i]); } } - // Set up runtime config. - TimingInfo timing_info = {.verbosity = app.verbosity}; - RuntimeConfig runtime_config = {.gen = &gen, - .verbosity = app.verbosity, - .stream_token = stream_token, - .accept_token = accept_token, - .use_spinning = app.spin}; - args.CopyTo(runtime_config); - size_t prefix_end = 0; - if (have_image) { - runtime_config.image_tokens = &image_tokens; - if (model.Info().wrapping == PromptWrapping::PALIGEMMA) { - prompt.insert(prompt.begin(), image_tokens.BatchSize(), 0); - } else if (model.Info().wrapping == PromptWrapping::GEMMA_VLM) { - size_t seq_len = model.GetModelConfig().vit_config.seq_len; - size_t pool_dim = model.GetModelConfig().vit_config.pool_dim; - prompt = - WrapVLM(model.Tokenizer(), model.Info(), abs_pos, prompt, - image_tokens.BatchSize(), seq_len / (pool_dim * pool_dim)); - } - prompt_size = prompt.size(); - // The end of the prefix for prefix-LM style attention in Paligemma. - // See Figure 2 of https://arxiv.org/abs/2407.07726. - prefix_end = prompt_size; - // We need to look at all the tokens for the prefix. - runtime_config.prefill_tbatch_size = prompt_size; - } - // Generate until EOS or max_generated_tokens. - if (app.verbosity >= 1) { + if (inference.verbosity >= 1) { std::cerr << "\n[ Reading prompt ] " << std::flush; } - model.Generate(runtime_config, prompt, abs_pos, prefix_end, kv_cache, + gemma.Generate(runtime_config, prompt, abs_pos, prefix_end, kv_cache, env, timing_info); std::cout << "\n\n"; + // In non-interactive mode, we only process one prompt/turn. + if (!is_interactive) break; + // Prepare for the next turn. Works only for PaliGemma. 
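+    // Multiturn behavior, illustrated: with --multiturn 0 (the default),
+    // abs_pos rewinds to 0 so each turn starts a fresh conversation; with
+    // --multiturn 1, abs_pos keeps growing and earlier turns stay in the KV
+    // cache. The condition below also always resets for PaliGemma.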
- if (!args.multiturn || model.Info().wrapping == PromptWrapping::PALIGEMMA) { + if (!inference.multiturn || config.wrapping == PromptWrapping::PALIGEMMA) { abs_pos = 0; // Start a new turn at position 0. - InitGenerator(args, gen); + InitGenerator(inference, gen); } else { // The last token was either EOS, then it should be ignored because it is // never part of the dialog, see Table 5 in the Gemma-2 paper: @@ -227,20 +250,17 @@ void ReplGemma(Gemma& model, KVCache& kv_cache, const AppArgs& app, } } -void Run(LoaderArgs& loader, InferenceArgs& inference, AppArgs& app) { +void Run(const LoaderArgs& loader, const ThreadingArgs& threading, + const InferenceArgs& inference) { PROFILER_ZONE("Run.misc"); - // Note that num_threads is an upper bound; we also limit to the number of - // detected and enabled cores. - const BoundedTopology topology = CreateTopology(app); - NestedPools pools = CreatePools(topology, app); - MatMulEnv env(topology, pools); - if (app.verbosity >= 2) env.print_best = true; - Gemma model = CreateGemma(loader, env); - KVCache kv_cache = - KVCache::Create(model.GetModelConfig(), inference.prefill_tbatch_size); + ThreadingContext ctx(UpdateArgs(threading, inference)); + MatMulEnv env(ctx); + if (inference.verbosity >= 2) env.print_best = true; + const Gemma gemma(loader, inference, ctx); + KVCache kv_cache(gemma.Config(), inference, ctx.allocator); - if (app.verbosity >= 1) { + if (inference.verbosity >= 1) { std::string instructions = "*Usage*\n" " Enter an instruction and press enter (%C resets conversation, " @@ -261,46 +281,37 @@ void Run(LoaderArgs& loader, InferenceArgs& inference, AppArgs& app) { instructions += multiturn; instructions += examples; - std::cout << "\033[2J\033[1;1H" // clear screen - << kAsciiArtBanner << "\n\n"; - ShowConfig(loader, inference, app, topology, pools); - std::cout << "\n" << instructions << "\n"; + // Skip the banner and instructions in non-interactive mode + if (inference.IsInteractive()) { + std::cout << "\033[2J\033[1;1H" // clear screen + << kAsciiArtBanner << "\n\n"; + ShowConfig(loader, threading, inference, gemma.Config(), + gemma.WeightReadMode(), ctx); + std::cout << "\n" << instructions << "\n"; + } } - ReplGemma(model, kv_cache, app, inference, AcceptFunc(), app.eot_line); + ReplGemma(threading, inference, gemma, kv_cache, env); } } // namespace gcpp int main(int argc, char** argv) { + gcpp::InternalInit(); { PROFILER_ZONE("Startup.misc"); - // Placeholder for internal init, do not modify. - gcpp::LoaderArgs loader(argc, argv); + gcpp::ThreadingArgs threading(argc, argv); gcpp::InferenceArgs inference(argc, argv); - gcpp::AppArgs app(argc, argv); if (gcpp::HasHelp(argc, argv)) { std::cerr << gcpp::kAsciiArtBanner; - gcpp::ShowHelp(loader, inference, app); + gcpp::ShowHelp(loader, threading, inference); return 0; } - if (const char* error = loader.Validate()) { - std::cerr << gcpp::kAsciiArtBanner; - gcpp::ShowHelp(loader, inference, app); - HWY_ABORT("\nInvalid args: %s", error); - } - - if (const char* error = inference.Validate()) { - std::cerr << gcpp::kAsciiArtBanner; - gcpp::ShowHelp(loader, inference, app); - HWY_ABORT("\nInvalid args: %s", error); - } - - gcpp::Run(loader, inference, app); + gcpp::Run(loader, threading, inference); } PROFILER_PRINT_RESULTS(); // Must call outside the zone above. 
return 0; diff --git a/gemma/tensor_index.cc b/gemma/tensor_index.cc deleted file mode 100644 index 4308c9d..0000000 --- a/gemma/tensor_index.cc +++ /dev/null @@ -1,607 +0,0 @@ -#include "gemma/tensor_index.h" - -#include - -#include -#include -#include -#include -#include -#include - -#include "compression/shared.h" -#include "gemma/configs.h" - -namespace gcpp { -namespace { - -// Returns the non-layer tensors for the model. -std::vector ModelTensors(const ModelConfig& config) { - return { - TensorInfo{ - .name = "c_embedding", - .source_names = {"embedder/input_embedding"}, - .axes = {0, 1}, - .shape = {config.vocab_size, config.model_dim}, - .min_size = Type::kBF16, - }, - TensorInfo{ - .name = "c_final_norm", - .source_names = {"final_norm/scale"}, - .axes = {0}, - .shape = {config.model_dim}, - .min_size = Type::kBF16, - }, - TensorInfo{ - .name = "enc_norm_bias", - .source_names = {"img/Transformer/encoder_norm/bias"}, - .axes = {0}, - .shape = {config.vit_config.model_dim}, - .min_size = Type::kBF16, - }, - TensorInfo{ - .name = "enc_norm_scale", - .source_names = {"img/Transformer/encoder_norm/scale"}, - .axes = {0}, - .shape = {config.vit_config.model_dim}, - .min_size = Type::kBF16, - }, - TensorInfo{ - .name = "img_emb_bias", - .source_names = {"img/embedding/bias"}, - .axes = {0}, - .shape = {config.vit_config.model_dim}, - .min_size = Type::kF32, - }, - TensorInfo{ - .name = "img_emb_kernel", - .source_names = {"img/embedding/kernel"}, - .axes = {3, 0, 1, 2}, - .shape = {config.vit_config.model_dim, config.vit_config.patch_width, - config.vit_config.patch_width, 3}, - .min_size = Type::kBF16, - .cols_take_extra_dims = true, - }, - TensorInfo{ - .name = "img_head_bias", - .source_names = {"img/head/bias", "embedder/mm_input_projection/b"}, - .axes = {0}, - .shape = {config.model_dim}, - .min_size = Type::kF32, - }, - TensorInfo{ - .name = "img_head_kernel", - .source_names = {"img/head/kernel", "embedder/mm_input_projection/w"}, - .axes = {1, 0}, - .shape = {config.model_dim, config.vit_config.model_dim}, - .min_size = Type::kBF16, - }, - TensorInfo{ - .name = "img_pos_emb", - .source_names = {"img/pos_embedding"}, - .axes = {0, 1}, - .shape = {/*1,*/ config.vit_config.seq_len, - config.vit_config.model_dim}, - .min_size = Type::kF32, - }, - // RMS norm applied to soft tokens prior to pos embedding. - TensorInfo{ - .name = "mm_embed_norm", - .source_names = {"embedder/mm_soft_embedding_norm/scale"}, - .axes = {0}, - .shape = {config.vit_config.model_dim}, - .min_size = Type::kBF16, - }, - }; -} - -// Returns the tensors for the given image layer config. -std::vector ImageLayerTensors(const ModelConfig& config, - const LayerConfig& layer_config, - const int img_layer_idx) { - return { - // Vit layers. 
- TensorInfo{ - .name = "attn_out_w", - .source_names = {"MultiHeadDotProductAttention_0/out/kernel"}, - .axes = {2, 0, 1}, - .shape = {config.vit_config.model_dim, layer_config.heads, - layer_config.qkv_dim}, - .min_size = Type::kBF16, - .cols_take_extra_dims = true, - }, - TensorInfo{ - .name = "attn_out_b", - .source_names = {"MultiHeadDotProductAttention_0/out/bias"}, - .axes = {0}, - .shape = {config.vit_config.model_dim}, - .min_size = Type::kF32, - }, - TensorInfo{ - .name = "q_ein_w", - .source_names = {"MultiHeadDotProductAttention_0/query/kernel"}, - .axes = {1, 2, 0}, - .shape = {layer_config.heads, layer_config.qkv_dim, - config.vit_config.model_dim}, - .concat_names = {"qkv_ein_w", "k_ein_w", "v_ein_w"}, - .concat_axis = 1, - .min_size = Type::kBF16, - }, - TensorInfo{ - .name = "k_ein_w", - .source_names = {"MultiHeadDotProductAttention_0/key/kernel"}, - .axes = {1, 2, 0}, - .shape = {layer_config.heads, layer_config.qkv_dim, - config.vit_config.model_dim}, - .concat_names = {""}, - .min_size = Type::kBF16, - }, - TensorInfo{ - .name = "v_ein_w", - .source_names = {"MultiHeadDotProductAttention_0/value/kernel"}, - .axes = {1, 2, 0}, - .shape = {layer_config.heads, layer_config.qkv_dim, - config.vit_config.model_dim}, - .concat_names = {""}, - .min_size = Type::kBF16, - }, - TensorInfo{ - .name = "qkv_ein_w", - .source_names = {"MultiHeadDotProductAttention_0/qkv/kernel"}, - .axes = {1, 2, 0}, - .shape = {layer_config.heads, 3 * layer_config.qkv_dim, - config.vit_config.model_dim}, - .min_size = Type::kBF16, - }, - TensorInfo{ - .name = "q_ein_b", - .source_names = {"MultiHeadDotProductAttention_0/query/bias"}, - .axes = {0, 1}, - .shape = {layer_config.heads, layer_config.qkv_dim}, - .concat_names = {"qkv_ein_b", "k_ein_b", "v_ein_b"}, - .concat_axis = 1, - .min_size = Type::kF32, - }, - TensorInfo{ - .name = "k_ein_b", - .source_names = {"MultiHeadDotProductAttention_0/key/bias"}, - .axes = {0, 1}, - .shape = {layer_config.kv_heads, layer_config.qkv_dim}, - .concat_names = {""}, - .min_size = Type::kF32, - }, - TensorInfo{ - .name = "v_ein_b", - .source_names = {"MultiHeadDotProductAttention_0/value/bias"}, - .axes = {0, 1}, - .shape = {layer_config.kv_heads, layer_config.qkv_dim}, - .concat_names = {""}, - .min_size = Type::kF32, - }, - TensorInfo{ - .name = "qkv_ein_b", - .source_names = {"MultiHeadDotProductAttention_0/qkv/bias"}, - .axes = {0, 1}, - .shape = {layer_config.heads + layer_config.kv_heads * 2, - layer_config.qkv_dim}, - .min_size = Type::kF32, - }, - TensorInfo{ - .name = "linear_0_w", - .source_names = {"MlpBlock_0/Dense_0/kernel"}, - .axes = {1, 0}, - .shape = {layer_config.ff_hidden_dim, config.vit_config.model_dim}, - .min_size = Type::kBF16, - }, - TensorInfo{ - .name = "linear_0_b", - .source_names = {"MlpBlock_0/Dense_0/bias"}, - .axes = {0}, - .shape = {layer_config.ff_hidden_dim}, - .min_size = Type::kF32, - }, - TensorInfo{ - .name = "linear_1_w", - .source_names = {"MlpBlock_0/Dense_1/kernel"}, - .axes = {1, 0}, - .shape = {config.vit_config.model_dim, layer_config.ff_hidden_dim}, - .min_size = Type::kBF16, - }, - TensorInfo{ - .name = "linear_1_b", - .source_names = {"MlpBlock_0/Dense_1/bias"}, - .axes = {0}, - .shape = {config.vit_config.model_dim}, - .min_size = Type::kF32, - }, - TensorInfo{ - .name = "ln_0_bias", - .source_names = {"img/Transformer/encoderblock/LayerNorm_0/bias", - "img/Transformer/encoderblock_" + - std::to_string(img_layer_idx) + - "/LayerNorm_0/bias"}, - .axes = {0}, - .shape = {config.vit_config.model_dim}, - .min_size = 
Type::kBF16, - }, - TensorInfo{ - .name = "ln_0_scale", - .source_names = {"img/Transformer/encoderblock/LayerNorm_0/scale", - "img/Transformer/encoderblock_" + - std::to_string(img_layer_idx) + - "/LayerNorm_0/scale"}, - .axes = {0}, - .shape = {config.vit_config.model_dim}, - .min_size = Type::kBF16, - }, - TensorInfo{ - .name = "ln_1_bias", - .source_names = {"img/Transformer/encoderblock/LayerNorm_1/bias", - "img/Transformer/encoderblock_" + - std::to_string(img_layer_idx) + - "/LayerNorm_1/bias"}, - .axes = {0}, - .shape = {config.vit_config.model_dim}, - .min_size = Type::kBF16, - }, - TensorInfo{ - .name = "ln_1_scale", - .source_names = {"img/Transformer/encoderblock/LayerNorm_1/scale", - "img/Transformer/encoderblock_" + - std::to_string(img_layer_idx) + - "/LayerNorm_1/scale"}, - .axes = {0}, - .shape = {config.vit_config.model_dim}, - .min_size = Type::kBF16, - }, - }; -} - -// Returns the tensors for the given LLM layer config. -std::vector<TensorInfo> LLMLayerTensors(const ModelConfig& config, - const LayerConfig& layer_config, - bool reshape_att) { - std::vector<TensorInfo> tensors = { - TensorInfo{ - .name = "key_norm", - .source_names = {"attn/_key_norm/scale"}, - .axes = {0}, - .shape = {layer_config.qkv_dim}, - .min_size = Type::kBF16, - }, - TensorInfo{ - .name = "query_norm", - .source_names = {"attn/_query_norm/scale"}, - .axes = {0}, - .shape = {layer_config.qkv_dim}, - .min_size = Type::kBF16, - }, - TensorInfo{ - .name = "qkv1_w", - .source_names = {"attn/q_einsum/w"}, - .axes = {0, 2, 1}, - .shape = {layer_config.heads * layer_config.qkv_dim, - config.model_dim}, - .concat_names = {"qkv_ein", "qkv2_w"}, - }, - TensorInfo{ - .name = "qkv2_w", - .source_names = {"attn/kv_einsum/w"}, - .axes = {1, 0, 3, 2}, - .shape = {2 * layer_config.kv_heads * layer_config.qkv_dim, - config.model_dim}, - .concat_names = {""}, - }, - TensorInfo{ - .name = "q_ein", - .source_names = {"attention_block/proj_q/kernel"}, - .axes = {1, 0}, - .shape = {layer_config.model_dim, layer_config.model_dim}, - .concat_names = {"qkv_ein", "k_ein", "v_ein"}, - }, - TensorInfo{ - .name = "k_ein", - .source_names = {"attention_block/proj_k/kernel"}, - .axes = {1, 0}, - .shape = {layer_config.qkv_dim, layer_config.model_dim}, - .concat_names = {""}, - }, - TensorInfo{ - .name = "v_ein", - .source_names = {"attention_block/proj_v/kernel"}, - .axes = {1, 0}, - .shape = {layer_config.qkv_dim, layer_config.model_dim}, - .concat_names = {""}, - }, - TensorInfo{ - .name = "qkv_ein", - .source_names = {"attn/qkv_einsum/w"}, - .axes = {1, 0, 3, 2}, - .shape = {(layer_config.heads + 2 * layer_config.kv_heads) * - layer_config.qkv_dim, - config.model_dim}, - }, - TensorInfo{ - .name = "attn_ob", - .source_names = {"attention_block/proj_final/bias"}, - .axes = {0}, - .shape = {config.model_dim}, - .min_size = Type::kF32, - }, - // Griffin layers.
- TensorInfo{ - .name = "gr_lin_x_w", - .source_names = {"recurrent_block/linear_x/kernel"}, - .axes = {1, 0}, - .shape = {layer_config.griffin_dim, layer_config.griffin_dim}, - }, - TensorInfo{ - .name = "gr_lin_x_b", - .source_names = {"recurrent_block/linear_x/bias"}, - .axes = {0}, - .shape = {layer_config.griffin_dim}, - .min_size = Type::kF32, - }, - TensorInfo{ - .name = "gr_lin_y_w", - .source_names = {"recurrent_block/linear_y/kernel"}, - .axes = {1, 0}, - .shape = {layer_config.griffin_dim, layer_config.griffin_dim}, - }, - TensorInfo{ - .name = "gr_lin_y_b", - .source_names = {"recurrent_block/linear_y/bias"}, - .axes = {0}, - .shape = {layer_config.griffin_dim}, - .min_size = Type::kF32, - }, - TensorInfo{ - .name = "gr_lin_out_w", - .source_names = {"recurrent_block/linear_out/kernel"}, - .axes = {1, 0}, - .shape = {layer_config.griffin_dim, layer_config.griffin_dim}, - }, - TensorInfo{ - .name = "gr_lin_out_b", - .source_names = {"recurrent_block/linear_out/bias"}, - .axes = {0}, - .shape = {layer_config.griffin_dim}, - .min_size = Type::kF32, - }, - TensorInfo{ - .name = "gr_conv_w", - .source_names = {"recurrent_block/conv_1d/w"}, - .axes = {0, 1}, - .shape = {layer_config.conv1d_width, layer_config.griffin_dim}, - .min_size = Type::kF32, - }, - TensorInfo{ - .name = "gr_conv_b", - .source_names = {"recurrent_block/conv_1d/b"}, - .axes = {0}, - .shape = {layer_config.griffin_dim}, - .min_size = Type::kF32, - }, - TensorInfo{ - .name = "gr1_gate_w", - .source_names = {"recurrent_block/rg_lru/input_gate/w"}, - .axes = {0, 2, 1}, - .shape = {layer_config.heads, - layer_config.griffin_dim / layer_config.heads, - layer_config.griffin_dim / layer_config.heads}, - .concat_names = {"gr_gate_w", "gr2_gate_w"}, - }, - TensorInfo{ - .name = "gr2_gate_w", - .source_names = {"recurrent_block/rg_lru/a_gate/w"}, - .axes = {0, 2, 1}, - .shape = {layer_config.heads, - layer_config.griffin_dim / layer_config.heads, - layer_config.griffin_dim / layer_config.heads}, - .concat_names = {""}, - }, - TensorInfo{ - .name = "gr_gate_w", - .source_names = {"recurrent_block/rg_lru/gate/w"}, - .axes = {0, 2, 1}, - .shape = {2 * layer_config.heads, - layer_config.griffin_dim / layer_config.heads, - layer_config.griffin_dim / layer_config.heads}, - }, - TensorInfo{ - .name = "gr1_gate_b", - .source_names = {"recurrent_block/rg_lru/input_gate/b"}, - .axes = {0}, - .shape = {layer_config.griffin_dim}, - .concat_names = {"gr_gate_b", "gr2_gate_b"}, - .min_size = Type::kF32, - }, - TensorInfo{ - .name = "gr2_gate_b", - .source_names = {"recurrent_block/rg_lru/a_gate/b"}, - .axes = {0}, - .shape = {layer_config.griffin_dim}, - .concat_names = {""}, - .min_size = Type::kF32, - }, - TensorInfo{ - .name = "gr_gate_b", - .source_names = {"recurrent_block/rg_lru/input_gate/b"}, - .axes = {0, 1}, - .shape = {2 * layer_config.griffin_dim}, - .min_size = Type::kF32, - }, - TensorInfo{ - .name = "gr_a", - .source_names = {"recurrent_block/rg_lru/a_param"}, - .axes = {0}, - .shape = {layer_config.griffin_dim}, - .min_size = Type::kF32, - .scaled_softplus = true, - }, - - TensorInfo{ - .name = "gating_ein", - .source_names = {"mlp/gating_einsum/w", "mlp/gating_einsum", - "mlp_block/ffw_up/w"}, - .axes = {0, layer_config.optimized_gating ? 1u : 2u, - layer_config.optimized_gating ? 2u : 1u}, - .shape = {2, layer_config.ff_hidden_dim, config.model_dim}, - }, - TensorInfo{ - .name = "gating1_w", - .source_names = {"none"}, - .axes = {0, layer_config.optimized_gating ? 1u : 2u, - layer_config.optimized_gating ? 
2u : 1u}, - .shape = {layer_config.ff_hidden_dim, config.model_dim}, - }, - TensorInfo{ - .name = "gating2_w", - .source_names = {"none"}, - .axes = {0, layer_config.optimized_gating ? 1u : 2u, - layer_config.optimized_gating ? 2u : 1u}, - .shape = {layer_config.ff_hidden_dim, config.model_dim}, - }, - TensorInfo{ - .name = "linear_w", - .source_names = {"mlp/linear/w", "mlp/linear", - "mlp_block/ffw_down/kernel"}, - .axes = {1, 0}, - .shape = {config.model_dim, layer_config.ff_hidden_dim}, - }, - TensorInfo{ - .name = "pre_att_ns", - .source_names = {"pre_attention_norm/scale", - "temporal_pre_norm/scale"}, - .axes = {0}, - .shape = {config.model_dim}, - .min_size = Type::kBF16, - }, - TensorInfo{ - .name = "pre_ff_ns", - .source_names = {"pre_ffw_norm/scale", "channel_pre_norm/scale"}, - .axes = {0}, - .shape = {config.model_dim}, - .min_size = Type::kBF16, - }, - TensorInfo{ - .name = "post_att_ns", - .source_names = {"post_attention_norm/scale"}, - .axes = {0}, - .shape = {config.model_dim}, - .min_size = Type::kBF16, - }, - TensorInfo{ - .name = "post_ff_ns", - .source_names = {"post_ffw_norm/scale"}, - .axes = {0}, - .shape = {config.model_dim}, - .min_size = Type::kBF16, - }, - TensorInfo{ - .name = "ffw_gat_b", - .source_names = {"mlp_block/ffw_up/b"}, - .axes = {0}, - .shape = {2 * layer_config.ff_hidden_dim}, - .min_size = Type::kF32, - }, - TensorInfo{ - .name = "ffw_out_b", - .source_names = {"mlp_block/ffw_down/bias"}, - .axes = {0}, - .shape = {config.model_dim}, - .min_size = Type::kF32, - }, - }; - if (reshape_att) { - tensors.push_back(TensorInfo{ - .name = "att_w", - .source_names = {"attn/attn_vec_einsum/w", - "attention_block/proj_final/kernel"}, - .preshape = {layer_config.heads, layer_config.qkv_dim, - config.model_dim}, - .axes = {2, 0, 1}, - .shape = {config.model_dim, layer_config.heads, layer_config.qkv_dim}, - .cols_take_extra_dims = true, - }); - tensors.push_back(TensorInfo{ - .name = "att_ein", - .shape = {layer_config.heads, config.model_dim, layer_config.qkv_dim}, - }); - } else { - tensors.push_back(TensorInfo{ - .name = "att_ein", - .source_names = {"attn/attn_vec_einsum/w", - "attention_block/proj_final/kernel"}, - .preshape = {layer_config.heads, layer_config.qkv_dim, - config.model_dim}, - .axes = {0, 2, 1}, - .shape = {layer_config.heads, config.model_dim, layer_config.qkv_dim}, - }); - tensors.push_back(TensorInfo{ - .name = "att_w", - .shape = {config.model_dim, layer_config.heads, layer_config.qkv_dim}, - .cols_take_extra_dims = true, - }); - } - return tensors; -} - -} // namespace - -TensorIndex::TensorIndex(const ModelConfig& config, int llm_layer_idx, - int img_layer_idx, bool reshape_att) - : config_(config), - llm_layer_idx_(llm_layer_idx), - img_layer_idx_(img_layer_idx) { - int layer_idx = std::max(llm_layer_idx_, img_layer_idx_); - std::string suffix; - if (layer_idx >= 0) { - suffix = "_" + std::to_string(layer_idx); - } - if (llm_layer_idx < 0 && img_layer_idx < 0) { - tensors_ = ModelTensors(config); - } else if (llm_layer_idx_ < 0 && 0 <= img_layer_idx && - img_layer_idx < config.vit_config.layer_configs.size()) { - const auto& layer_config = config.vit_config.layer_configs[img_layer_idx]; - tensors_ = ImageLayerTensors(config, layer_config, img_layer_idx); - } else if (0 <= llm_layer_idx && - llm_layer_idx < config.layer_configs.size()) { - const auto& layer_config = config.layer_configs[llm_layer_idx]; - tensors_ = LLMLayerTensors(config, layer_config, reshape_att); - } - for (size_t i = 0; i < tensors_.size(); ++i) { - std::string 
key = tensors_[i].name + suffix; - name_map_.insert({key, i}); - } -} - -TensorInfo TensorIndex::TensorInfoFromSourcePath( - const std::string& path) const { - for (const auto& tensor : tensors_) { - for (const auto& source_name : tensor.source_names) { - auto pos = path.rfind(source_name); - if (pos != std::string::npos && path.size() == pos + source_name.size()) - return tensor; - } - } - return TensorInfo(); -} - -const TensorInfo* TensorIndex::FindName(const std::string& name) const { - std::string name_to_find = name; - if (!std::isdigit(name[name.size() - 1])) { - if (img_layer_idx_ >= 0 && llm_layer_idx_ < 0) { - name_to_find = name + "_" + std::to_string(img_layer_idx_); - } else if (llm_layer_idx_ >= 0) { - name_to_find = name + "_" + std::to_string(llm_layer_idx_); - } - } - auto it = name_map_.find(name_to_find); - if (it == name_map_.end()) { - return nullptr; - } - return &tensors_[it->second]; -} - -} // namespace gcpp \ No newline at end of file diff --git a/gemma/tensor_index.h b/gemma/tensor_index.h deleted file mode 100644 index dc6b86c..0000000 --- a/gemma/tensor_index.h +++ /dev/null @@ -1,101 +0,0 @@ -#ifndef THIRD_PARTY_GEMMA_CPP_GEMMA_TENSOR_INDEX_H_ -#define THIRD_PARTY_GEMMA_CPP_GEMMA_TENSOR_INDEX_H_ - -#include <stddef.h> - -#include <string> -#include <unordered_map> -#include <vector> - -#include "compression/shared.h" -#include "gemma/configs.h" - -namespace gcpp { - -// Universal tensor information. Holds enough information to construct a -// tensor in LayerWeightsPtrs/ModelWeightsPtrs, as well as to export the -// tensor from the python model with necessary transpose/reshape info. -struct TensorInfo { - // The name of the tensor in the sbs file - std::string name; - // Strings to match to the end of the name of the tensor in the python model. - std::vector<std::string> source_names; - // Initial reshape shape. Use only as a last resort when input may have - // dimensions combined that need to be split before the transpose, as it - // defeats the post-transpose shape checking. Normally empty. - std::vector<size_t> preshape; - // Transpose axes arg. If the input tensor has more dimensions than axes, - // then leading dimensions are collapsed until the number of axes matches. - std::vector<size_t> axes; - // Expected final shape of the tensor after reshape/transpose. - // Note that this is the shape of the tensor during export, - // not the shape of the tensor in the sbs file, as the sbs file - // is restricted to 2D tensors. With few exceptions, the sbs file - // tensor rows gather all the excess dimensions. See cols_take_extra_dims. - std::vector<size_t> shape; - // List of names to concatenate with, used only if multiple tensors are - // concatenated into one. The first tensor in the concatenation should have - // concat names thus: The first name is the name of the result, and the - // tensors with the remaining names are concatenated after this. - // The remaining tensors to be concatenated should have just a single - // empty string in concat_names to indicate that they have been consumed. - std::vector<std::string> concat_names; - // Axis at which to concatenate. - size_t concat_axis = 0; - // The minimum compression weight type for this tensor. The default is - // kNUQ, which provides maximum compression. Other values such as kBF16 - // or kF32 can be used to limit the compression to a specific type. - Type min_size = Type::kNUQ; - // Whether to apply scaled softplus to the data. - bool scaled_softplus = false; - // Whether the columns or the rows take any extra dimensions. - // If false, then [10, 20, 30] -> [10*20, 30] and [30] -> [1, 30].
- // If true, then [10, 20, 30] -> [10, 20*30] and [30] -> [1, 30]. - bool cols_take_extra_dims = false; -}; - -// Universal index of tensor information, which can be built for a specific -// layer_idx. -class TensorIndex { - public: - // Builds a list of TensorInfo for the given layer_idx. - // If reshape_att is true, the attn_vec_einsum tensor is reshaped. - TensorIndex(const ModelConfig& config, int llm_layer_idx, int img_layer_idx, - bool reshape_att); - ~TensorIndex() = default; - - // Returns the TensorInfo whose source_name matches the end of the given path, - // or an empty TensorInfo if not found. - // NOTE: that the returned TensorInfo is a copy, so that the source - // TensorIndex can be destroyed without affecting the returned TensorInfo. - TensorInfo TensorInfoFromSourcePath(const std::string& path) const; - - // Returns the TensorInfo whose name matches the given name, - // or an empty TensorInfo if not found. - // NOTE: that the returned TensorInfo is a copy, so that the source - // TensorIndex can be destroyed without affecting the returned TensorInfo. - TensorInfo TensorInfoFromName(const std::string& name) const { - const TensorInfo* info = FindName(name); - if (info == nullptr) return TensorInfo(); - return *info; - } - - // Returns the TensorInfo for the given tensor name, for concise construction - // of ModelWeightsPtrs/LayerWeightsPtrs. - const TensorInfo* FindName(const std::string& name) const; - - private: - // Config that was used to build the tensor index. - const ModelConfig& config_; - // Layer that this tensor index is for - either LLM or image. - int llm_layer_idx_; - int img_layer_idx_; - // List of tensor information for this layer. - std::vector<TensorInfo> tensors_; - // Map from tensor name to index in tensors_. - std::unordered_map<std::string, size_t> name_map_; -}; - -} // namespace gcpp - -#endif // THIRD_PARTY_GEMMA_CPP_GEMMA_TENSOR_INDEX_H_ diff --git a/gemma/tensor_index_test.cc b/gemma/tensor_index_test.cc deleted file mode 100644 index 50ff0b6..0000000 --- a/gemma/tensor_index_test.cc +++ /dev/null @@ -1,72 +0,0 @@ -#include "gemma/tensor_index.h" - -#include <stddef.h> -#include <stdio.h> -#include <string.h> -#include <string> -#include <vector> - -#include "gtest/gtest.h" -#include "compression/compress.h" -#include "compression/shared.h" -#include "gemma/configs.h" -#include "gemma/weights.h" -#include "util/basics.h" -#include "hwy/aligned_allocator.h" - -namespace gcpp { -namespace { - -// Tests that each tensor in the model can be found by exactly one TensorIndex, -// and that the TensorIndex returns the correct shape and name for the tensor, -// for all models. -TEST(TensorIndexTest, FindName) { - for (Model model : kAllModels) { - fprintf(stderr, "Testing model %d\n", static_cast<int>(model)); - ModelConfig config = ConfigFromModel(model); - std::vector<TensorIndex> tensor_indexes; - tensor_indexes.emplace_back(config, /*llm_layer_idx=*/-1, - /*img_layer_idx=*/-1, - /*split_and_reshape=*/false); - for (size_t llm_layer_idx = 0; llm_layer_idx < config.layer_configs.size(); - ++llm_layer_idx) { - tensor_indexes.emplace_back(config, static_cast<int>(llm_layer_idx), - /*img_layer_idx=*/-1, - /*split_and_reshape=*/false); - } - for (size_t img_layer_idx = 0; - img_layer_idx < config.vit_config.layer_configs.size(); - ++img_layer_idx) { - tensor_indexes.emplace_back(config, /*llm_layer_idx=*/-1, - static_cast<int>(img_layer_idx), - /*split_and_reshape=*/false); - } - // For each tensor in any model, exactly one TensorIndex should find it.
- ModelWeightsPtrs<float> weights(config); - ModelWeightsPtrs<float>::ForEachTensor( - {&weights}, ForEachType::kInitNoToc, - [&tensor_indexes](const char* name, hwy::Span<MatPtr*> tensors) { - int num_found = 0; - const MatPtr& tensor = *tensors[0]; - for (const auto& tensor_index : tensor_indexes) { - // Skip the type marker prefix, but we want the layer index suffix. - std::string name_to_find(name + 1, strlen(name) - 1); - const TensorInfo* info = tensor_index.FindName(name_to_find); - if (info != nullptr) { - // Test that the MatPtr can be constructed from the TensorInfo, - // and that the dimensions match. - MatPtrT<float> mat_ptr(tensor.Name(), tensor_index); - EXPECT_STREQ(tensor.Name(), mat_ptr.Name()) - << "on tensor " << name; - EXPECT_EQ(tensor.Rows(), mat_ptr.Rows()) << "on tensor " << name; - EXPECT_EQ(tensor.Cols(), mat_ptr.Cols()) << "on tensor " << name; - ++num_found; - } - } - EXPECT_EQ(num_found, 1) << " for tensor " << name; - }); - } -} - -} // namespace -} // namespace gcpp diff --git a/gemma/tensor_info.cc b/gemma/tensor_info.cc new file mode 100644 index 0000000..de93cf9 --- /dev/null +++ b/gemma/tensor_info.cc @@ -0,0 +1,593 @@ +#include "gemma/tensor_info.h" + +#include <stddef.h> +#include <stdint.h> + +#include <string> + +#include "compression/types.h" +#include "gemma/configs.h" + +namespace gcpp { + +void TensorInfoRegistry::Add(const std::string& suffix, + const TensorInfo& info) { + const size_t idx = tensors_.size(); + tensors_.push_back(info); + // Also add suffix to `concat_names`. + for (std::string& name : tensors_.back().concat_names) { + name += suffix; + } + + const std::string name = info.base_name + suffix; + // Ensure successful insertion because `suffix` ensures uniqueness for + // per-layer tensors, and per-model should only be inserted once. + HWY_ASSERT_M(idx_from_name_.insert({name, idx}).second, name.c_str()); +} + +// Non-layer tensors.
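+// Illustrative usage (editor's sketch, not an additional API): a per-layer call such as +// Add(LayerSuffix(3), {.base_name = "att_ein", ...}) registers the tensor under the key +// "att_ein_3", which Find("att_ein_3") then returns; the per-model tensors below are +// registered with an empty suffix and keep their base names, e.g. Find("c_embedding").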
+void TensorInfoRegistry::AddModelTensors(const ModelConfig& config) { + const std::string no_suffix; + Add(no_suffix, { + .base_name = "c_embedding", + .source_names = {"embedder/input_embedding"}, + .axes = {0, 1}, + .shape = {config.vocab_size, config.model_dim}, + .min_size = Type::kBF16, + }); + Add(no_suffix, { + .base_name = "c_final_norm", + .source_names = {"final_norm/scale"}, + .axes = {0}, + .shape = {config.model_dim}, + .min_size = Type::kBF16, + }); + Add(no_suffix, { + .base_name = "enc_norm_bias", + .source_names = {"img/Transformer/encoder_norm/bias"}, + .axes = {0}, + .shape = {config.vit_config.model_dim}, + .min_size = Type::kBF16, + }); + Add(no_suffix, { + .base_name = "enc_norm_scale", + .source_names = {"img/Transformer/encoder_norm/scale"}, + .axes = {0}, + .shape = {config.vit_config.model_dim}, + .min_size = Type::kBF16, + }); + Add(no_suffix, { + .base_name = "img_emb_bias", + .source_names = {"img/embedding/bias"}, + .axes = {0}, + .shape = {config.vit_config.model_dim}, + .min_size = Type::kF32, + }); + Add(no_suffix, + { + .base_name = "img_emb_kernel", + .source_names = {"img/embedding/kernel"}, + .axes = {3, 0, 1, 2}, + .shape = {config.vit_config.model_dim, config.vit_config.patch_width, + config.vit_config.patch_width, 3}, + .min_size = Type::kBF16, + .cols_take_extra_dims = true, + }); + Add(no_suffix, + { + .base_name = "img_head_bias", + .source_names = {"img/head/bias", "embedder/mm_input_projection/b"}, + .axes = {0}, + .shape = {config.model_dim}, + .min_size = Type::kF32, + }); + Add(no_suffix, + { + .base_name = "img_head_kernel", + .source_names = {"img/head/kernel", "embedder/mm_input_projection/w"}, + .axes = {1, 0}, + .shape = {config.model_dim, config.vit_config.model_dim}, + .min_size = Type::kBF16, + }); + Add(no_suffix, { + .base_name = "img_pos_emb", + .source_names = {"img/pos_embedding"}, + .axes = {0, 1}, + .shape = {/*1,*/ config.vit_config.seq_len, + config.vit_config.model_dim}, + .min_size = Type::kF32, + }); + // RMS norm applied to soft tokens prior to pos embedding. + Add(no_suffix, { + .base_name = "mm_embed_norm", + .source_names = {"embedder/mm_soft_embedding_norm/scale"}, + .axes = {0}, + .shape = {config.vit_config.model_dim}, + .min_size = Type::kBF16, + }); +} + +// Returns the tensors for the given image layer config. +void TensorInfoRegistry::AddImageLayerTensors(const ModelConfig& config, + const LayerConfig& layer_config, + const size_t img_layer_idx) { + const std::string suffix = LayerSuffix(img_layer_idx); + + // Vit layers. 
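+ // (Editor's note on the concat mechanism: q_ein_w below declares concat_names + // {"qkv_ein_w", "k_ein_w", "v_ein_w"} with concat_axis = 1, so the exporter + // concatenates the query, key and value kernels along axis 1 into a fused qkv_ein_w + // of shape {heads, 3 * qkv_dim, model_dim}; k_ein_w and v_ein_w then carry a single + // empty string in concat_names to mark them as consumed.)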
+ Add(suffix, { + .base_name = "attn_out_w", + .source_names = {"MultiHeadDotProductAttention_0/out/kernel"}, + .axes = {2, 0, 1}, + .shape = {config.vit_config.model_dim, layer_config.heads, + layer_config.qkv_dim}, + .min_size = Type::kBF16, + .cols_take_extra_dims = true, + }); + Add(suffix, { + .base_name = "attn_out_b", + .source_names = {"MultiHeadDotProductAttention_0/out/bias"}, + .axes = {0}, + .shape = {config.vit_config.model_dim}, + .min_size = Type::kF32, + }); + Add(suffix, + { + .base_name = "q_ein_w", + .source_names = {"MultiHeadDotProductAttention_0/query/kernel"}, + .axes = {1, 2, 0}, + .shape = {layer_config.heads, layer_config.qkv_dim, + config.vit_config.model_dim}, + .concat_names = {"qkv_ein_w", "k_ein_w", "v_ein_w"}, + .concat_axis = 1, + .min_size = Type::kBF16, + }); + Add(suffix, { + .base_name = "k_ein_w", + .source_names = {"MultiHeadDotProductAttention_0/key/kernel"}, + .axes = {1, 2, 0}, + .shape = {layer_config.heads, layer_config.qkv_dim, + config.vit_config.model_dim}, + .concat_names = {""}, + .min_size = Type::kBF16, + }); + Add(suffix, + { + .base_name = "v_ein_w", + .source_names = {"MultiHeadDotProductAttention_0/value/kernel"}, + .axes = {1, 2, 0}, + .shape = {layer_config.heads, layer_config.qkv_dim, + config.vit_config.model_dim}, + .concat_names = {""}, + .min_size = Type::kBF16, + }); + Add(suffix, { + .base_name = "qkv_ein_w", + .source_names = {"MultiHeadDotProductAttention_0/qkv/kernel"}, + .axes = {1, 2, 0}, + .shape = {layer_config.heads, 3 * layer_config.qkv_dim, + config.vit_config.model_dim}, + .min_size = Type::kBF16, + }); + Add(suffix, { + .base_name = "q_ein_b", + .source_names = {"MultiHeadDotProductAttention_0/query/bias"}, + .axes = {0, 1}, + .shape = {layer_config.heads, layer_config.qkv_dim}, + .concat_names = {"qkv_ein_b", "k_ein_b", "v_ein_b"}, + .concat_axis = 1, + .min_size = Type::kF32, + }); + Add(suffix, { + .base_name = "k_ein_b", + .source_names = {"MultiHeadDotProductAttention_0/key/bias"}, + .axes = {0, 1}, + .shape = {layer_config.kv_heads, layer_config.qkv_dim}, + .concat_names = {""}, + .min_size = Type::kF32, + }); + Add(suffix, { + .base_name = "v_ein_b", + .source_names = {"MultiHeadDotProductAttention_0/value/bias"}, + .axes = {0, 1}, + .shape = {layer_config.kv_heads, layer_config.qkv_dim}, + .concat_names = {""}, + .min_size = Type::kF32, + }); + Add(suffix, { + .base_name = "qkv_ein_b", + .source_names = {"MultiHeadDotProductAttention_0/qkv/bias"}, + .axes = {0, 1}, + .shape = {layer_config.heads + layer_config.kv_heads * 2, + layer_config.qkv_dim}, + .min_size = Type::kF32, + }); + Add(suffix, + { + .base_name = "linear_0_w", + .source_names = {"MlpBlock_0/Dense_0/kernel"}, + .axes = {1, 0}, + .shape = {layer_config.ff_hidden_dim, config.vit_config.model_dim}, + .min_size = Type::kBF16, + }); + Add(suffix, { + .base_name = "linear_0_b", + .source_names = {"MlpBlock_0/Dense_0/bias"}, + .axes = {0}, + .shape = {layer_config.ff_hidden_dim}, + .min_size = Type::kF32, + }); + Add(suffix, + { + .base_name = "linear_1_w", + .source_names = {"MlpBlock_0/Dense_1/kernel"}, + .axes = {1, 0}, + .shape = {config.vit_config.model_dim, layer_config.ff_hidden_dim}, + .min_size = Type::kBF16, + }); + Add(suffix, { + .base_name = "linear_1_b", + .source_names = {"MlpBlock_0/Dense_1/bias"}, + .axes = {0}, + .shape = {config.vit_config.model_dim}, + .min_size = Type::kF32, + }); + Add(suffix, + { + .base_name = "ln_0_bias", + .source_names = {"img/Transformer/encoderblock/LayerNorm_0/bias", + "img/Transformer/encoderblock_" + 
+ std::to_string(img_layer_idx) + + "/LayerNorm_0/bias"}, + .axes = {0}, + .shape = {config.vit_config.model_dim}, + .min_size = Type::kBF16, + }); + Add(suffix, + { + .base_name = "ln_0_scale", + .source_names = {"img/Transformer/encoderblock/LayerNorm_0/scale", + "img/Transformer/encoderblock_" + + std::to_string(img_layer_idx) + + "/LayerNorm_0/scale"}, + .axes = {0}, + .shape = {config.vit_config.model_dim}, + .min_size = Type::kBF16, + }); + Add(suffix, + { + .base_name = "ln_1_bias", + .source_names = {"img/Transformer/encoderblock/LayerNorm_1/bias", + "img/Transformer/encoderblock_" + + std::to_string(img_layer_idx) + + "/LayerNorm_1/bias"}, + .axes = {0}, + .shape = {config.vit_config.model_dim}, + .min_size = Type::kBF16, + }); + Add(suffix, + { + .base_name = "ln_1_scale", + .source_names = {"img/Transformer/encoderblock/LayerNorm_1/scale", + "img/Transformer/encoderblock_" + + std::to_string(img_layer_idx) + + "/LayerNorm_1/scale"}, + .axes = {0}, + .shape = {config.vit_config.model_dim}, + .min_size = Type::kBF16, + }); +} + +void TensorInfoRegistry::AddGriffinLayerTensors(const LayerConfig& layer_config, + const size_t layer_idx) { + const std::string suffix = LayerSuffix(layer_idx); + Add(suffix, { + .base_name = "gr_lin_x_w", + .source_names = {"recurrent_block/linear_x/kernel"}, + .axes = {1, 0}, + .shape = {layer_config.griffin_dim, layer_config.griffin_dim}, + }); + Add(suffix, { + .base_name = "gr_lin_x_b", + .source_names = {"recurrent_block/linear_x/bias"}, + .axes = {0}, + .shape = {layer_config.griffin_dim}, + .min_size = Type::kF32, + }); + Add(suffix, { + .base_name = "gr_lin_y_w", + .source_names = {"recurrent_block/linear_y/kernel"}, + .axes = {1, 0}, + .shape = {layer_config.griffin_dim, layer_config.griffin_dim}, + }); + Add(suffix, { + .base_name = "gr_lin_y_b", + .source_names = {"recurrent_block/linear_y/bias"}, + .axes = {0}, + .shape = {layer_config.griffin_dim}, + .min_size = Type::kF32, + }); + Add(suffix, { + .base_name = "gr_lin_out_w", + .source_names = {"recurrent_block/linear_out/kernel"}, + .axes = {1, 0}, + .shape = {layer_config.griffin_dim, layer_config.griffin_dim}, + }); + Add(suffix, { + .base_name = "gr_lin_out_b", + .source_names = {"recurrent_block/linear_out/bias"}, + .axes = {0}, + .shape = {layer_config.griffin_dim}, + .min_size = Type::kF32, + }); + Add(suffix, + { + .base_name = "gr_conv_w", + .source_names = {"recurrent_block/conv_1d/w"}, + .axes = {0, 1}, + .shape = {layer_config.conv1d_width, layer_config.griffin_dim}, + .min_size = Type::kF32, + }); + Add(suffix, { + .base_name = "gr_conv_b", + .source_names = {"recurrent_block/conv_1d/b"}, + .axes = {0}, + .shape = {layer_config.griffin_dim}, + .min_size = Type::kF32, + }); + Add(suffix, { + .base_name = "gr1_gate_w", + .source_names = {"recurrent_block/rg_lru/input_gate/w"}, + .axes = {0, 2, 1}, + .shape = {layer_config.heads, + layer_config.griffin_dim / layer_config.heads, + layer_config.griffin_dim / layer_config.heads}, + .concat_names = {"gr_gate_w", "gr2_gate_w"}, + }); + Add(suffix, { + .base_name = "gr2_gate_w", + .source_names = {"recurrent_block/rg_lru/a_gate/w"}, + .axes = {0, 2, 1}, + .shape = {layer_config.heads, + layer_config.griffin_dim / layer_config.heads, + layer_config.griffin_dim / layer_config.heads}, + .concat_names = {""}, + }); + Add(suffix, { + .base_name = "gr_gate_w", + .source_names = {"recurrent_block/rg_lru/gate/w"}, + .axes = {0, 2, 1}, + .shape = {2 * layer_config.heads, + layer_config.griffin_dim / layer_config.heads, + layer_config.griffin_dim 
/ layer_config.heads}, + }); + Add(suffix, { + .base_name = "gr1_gate_b", + .source_names = {"recurrent_block/rg_lru/input_gate/b"}, + .axes = {0}, + .shape = {layer_config.griffin_dim}, + .concat_names = {"gr_gate_b", "gr2_gate_b"}, + .min_size = Type::kF32, + }); + Add(suffix, { + .base_name = "gr2_gate_b", + .source_names = {"recurrent_block/rg_lru/a_gate/b"}, + .axes = {0}, + .shape = {layer_config.griffin_dim}, + .concat_names = {""}, + .min_size = Type::kF32, + }); + Add(suffix, { + .base_name = "gr_gate_b", + .source_names = {"recurrent_block/rg_lru/input_gate/b"}, + .axes = {0, 1}, + .shape = {2 * layer_config.griffin_dim}, + .min_size = Type::kF32, + }); + Add(suffix, { + .base_name = "gr_a", + .source_names = {"recurrent_block/rg_lru/a_param"}, + .axes = {0}, + .shape = {layer_config.griffin_dim}, + .min_size = Type::kF32, + .scaled_softplus = true, + }); +} + +void TensorInfoRegistry::AddLayerTensors(const ModelConfig& config, + const LayerConfig& layer_config, + const size_t layer_idx) { + const std::string suffix = LayerSuffix(layer_idx); + Add(suffix, { + .base_name = "key_norm", + .source_names = {"attn/_key_norm/scale"}, + .axes = {0}, + .shape = {layer_config.qkv_dim}, + .min_size = Type::kBF16, + }); + Add(suffix, { + .base_name = "query_norm", + .source_names = {"attn/_query_norm/scale"}, + .axes = {0}, + .shape = {layer_config.qkv_dim}, + .min_size = Type::kBF16, + }); + Add(suffix, { + .base_name = "qkv1_w", + .source_names = {"attn/q_einsum/w"}, + .axes = {0, 2, 1}, + .shape = {layer_config.heads * layer_config.qkv_dim, + config.model_dim}, + .concat_names = {"qkv_ein", "qkv2_w"}, + }); + Add(suffix, { + .base_name = "qkv2_w", + .source_names = {"attn/kv_einsum/w"}, + .axes = {1, 0, 3, 2}, + .shape = {2 * layer_config.kv_heads * layer_config.qkv_dim, + config.model_dim}, + .concat_names = {""}, + }); + Add(suffix, { + .base_name = "q_ein", + .source_names = {"attention_block/proj_q/kernel"}, + .axes = {1, 0}, + .shape = {layer_config.model_dim, layer_config.model_dim}, + .concat_names = {"qkv_ein", "k_ein", "v_ein"}, + }); + Add(suffix, { + .base_name = "k_ein", + .source_names = {"attention_block/proj_k/kernel"}, + .axes = {1, 0}, + .shape = {layer_config.qkv_dim, layer_config.model_dim}, + .concat_names = {""}, + }); + Add(suffix, { + .base_name = "v_ein", + .source_names = {"attention_block/proj_v/kernel"}, + .axes = {1, 0}, + .shape = {layer_config.qkv_dim, layer_config.model_dim}, + .concat_names = {""}, + }); + Add(suffix, { + .base_name = "qkv_ein", + .source_names = {"attn/qkv_einsum/w"}, + .axes = {1, 0, 3, 2}, + .shape = {(layer_config.heads + 2 * layer_config.kv_heads) * + layer_config.qkv_dim, + config.model_dim}, + }); + Add(suffix, { + .base_name = "attn_ob", + .source_names = {"attention_block/proj_final/bias"}, + .axes = {0}, + .shape = {config.model_dim}, + .min_size = Type::kF32, + }); + + Add(suffix, { + .base_name = "gating_ein", + .source_names = {"mlp/gating_einsum/w", "mlp/gating_einsum", + "mlp_block/ffw_up/w"}, + .axes = {0, layer_config.optimized_gating ? 1u : 2u, + layer_config.optimized_gating ? 2u : 1u}, + .shape = {2, layer_config.ff_hidden_dim, config.model_dim}, + }); + Add(suffix, { + .base_name = "gating1_w", + .source_names = {"none"}, + .axes = {0, layer_config.optimized_gating ? 1u : 2u, + layer_config.optimized_gating ? 2u : 1u}, + .shape = {layer_config.ff_hidden_dim, config.model_dim}, + }); + Add(suffix, { + .base_name = "gating2_w", + .source_names = {"none"}, + .axes = {0, layer_config.optimized_gating ? 
1u : 2u, + layer_config.optimized_gating ? 2u : 1u}, + .shape = {layer_config.ff_hidden_dim, config.model_dim}, + }); + Add(suffix, { + .base_name = "linear_w", + .source_names = {"mlp/linear/w", "mlp/linear", + "mlp_block/ffw_down/kernel"}, + .axes = {1, 0}, + .shape = {config.model_dim, layer_config.ff_hidden_dim}, + }); + Add(suffix, { + .base_name = "pre_att_ns", + .source_names = {"pre_attention_norm/scale", + "temporal_pre_norm/scale"}, + .axes = {0}, + .shape = {config.model_dim}, + .min_size = Type::kBF16, + }); + Add(suffix, + { + .base_name = "pre_ff_ns", + .source_names = {"pre_ffw_norm/scale", "channel_pre_norm/scale"}, + .axes = {0}, + .shape = {config.model_dim}, + .min_size = Type::kBF16, + }); + Add(suffix, { + .base_name = "post_att_ns", + .source_names = {"post_attention_norm/scale"}, + .axes = {0}, + .shape = {config.model_dim}, + .min_size = Type::kBF16, + }); + Add(suffix, { + .base_name = "post_ff_ns", + .source_names = {"post_ffw_norm/scale"}, + .axes = {0}, + .shape = {config.model_dim}, + .min_size = Type::kBF16, + }); + Add(suffix, { + .base_name = "ffw_gat_b", + .source_names = {"mlp_block/ffw_up/b"}, + .axes = {0}, + .shape = {2 * layer_config.ff_hidden_dim}, + .min_size = Type::kF32, + }); + Add(suffix, { + .base_name = "ffw_out_b", + .source_names = {"mlp_block/ffw_down/bias"}, + .axes = {0}, + .shape = {config.model_dim}, + .min_size = Type::kF32, + }); + Add(suffix, + { + .base_name = "att_ein", + .source_names = {"attn/attn_vec_einsum/w", + "attention_block/proj_final/kernel"}, + .preshape = {layer_config.heads, layer_config.qkv_dim, + config.model_dim}, + .axes = {0, 2, 1}, + .shape = {layer_config.heads, config.model_dim, layer_config.qkv_dim}, + }); + Add(suffix, + { + .base_name = "att_w", + .shape = {config.model_dim, layer_config.heads, layer_config.qkv_dim}, + .cols_take_extra_dims = true, + }); + + if (config.model == Model::GRIFFIN_2B) { + AddGriffinLayerTensors(layer_config, layer_idx); + } +} + +TensorInfoRegistry::TensorInfoRegistry(const ModelConfig& config) { + // Upper bound on the number of `Add()` calls in `Add*Tensors()`. Loose bound + // in case those are changed without updating this. Better to allocate a bit + // more than to have the vector repeatedly grow by 1.5-2x if the estimate is + // too low. + tensors_.reserve(10 + 32 * config.layer_configs.size() + + 24 * config.vit_config.layer_configs.size()); + AddModelTensors(config); + for (size_t i = 0; i < config.layer_configs.size(); ++i) { + AddLayerTensors(config, config.layer_configs[i], i); + } + for (size_t i = 0; i < config.vit_config.layer_configs.size(); ++i) { + AddImageLayerTensors(config, config.vit_config.layer_configs[i], i); + } +} + +TensorInfo TensorInfoRegistry::TensorInfoFromSourcePath(const std::string& path, + int layer_idx) const { + for (const TensorInfo& tensor : tensors_) { + for (const std::string& source_name : tensor.source_names) { + // path ends with source_name?
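+ // e.g. (illustrative path) "transformer/layer_5/attn/q_einsum/w" ends with the + // registered source name "attn/q_einsum/w"; with layer_idx = 5 this resolves to the + // suffixed name "qkv1_w_5".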
+ const size_t pos = path.rfind(source_name); + if (pos != std::string::npos && path.size() == pos + source_name.size()) { + std::string name = tensor.base_name; + if (layer_idx >= 0) name += LayerSuffix(static_cast<size_t>(layer_idx)); + return TensorInfoFromName(name); + } + } + } + return TensorInfo(); +} + +} // namespace gcpp diff --git a/gemma/tensor_info.h b/gemma/tensor_info.h new file mode 100644 index 0000000..c8252a4 --- /dev/null +++ b/gemma/tensor_info.h @@ -0,0 +1,141 @@ +#ifndef THIRD_PARTY_GEMMA_CPP_GEMMA_TENSOR_INFO_H_ +#define THIRD_PARTY_GEMMA_CPP_GEMMA_TENSOR_INFO_H_ + +#include <stddef.h> + +#include <string> +#include <unordered_map> +#include <vector> + +#include "compression/types.h" // Type +#include "gemma/configs.h" +#include "util/basics.h" // Extents2D + +namespace gcpp { + +// Tensor metadata. This is far more than required to construct the `MatPtr` in +// `LayerWeightsPtrs/WeightsPtrs`; they only use `.shape` via `ExtentsFromInfo`. +// This is also bound to Python and filled by the exporter. +struct TensorInfo { + // The base name of the tensor without a layer suffix. + std::string base_name; + // Strings to match to the end of the name of the tensor in the python model. + std::vector<std::string> source_names; + // Initial reshape shape. Use only as a last resort when input may have + // dimensions combined that need to be split before the transpose, as it + // defeats the post-transpose shape checking. Normally empty. + std::vector<size_t> preshape; + // Transpose axes arg. If the input tensor has more dimensions than axes, + // then leading dimensions are collapsed until the number of axes matches. + std::vector<size_t> axes; + // Expected final shape of the tensor after reshape/transpose. + // Note that this is the shape of the tensor during export, + // not the shape of the tensor in the sbs file, as the sbs file + // is restricted to 2D tensors. With few exceptions, the sbs file + // tensor rows gather all the excess dimensions. See cols_take_extra_dims. + std::vector<size_t> shape; + // List of names to concatenate with, used only if multiple tensors are + // concatenated into one. The first tensor in the concatenation should have + // concat names thus: The first name is the name of the result, and the + // tensors with the remaining names are concatenated after this. + // The remaining tensors to be concatenated should have just a single + // empty string in concat_names to indicate that they have been consumed. + std::vector<std::string> concat_names; + // Axis at which to concatenate. + size_t concat_axis = 0; + // The highest permissible compression for this tensor. The default is + // kNUQ, which provides maximum compression. Other values such as kBF16 + // or kF32 can be used to limit the compression to a specific type. + Type min_size = Type::kNUQ; + // Whether to apply scaled softplus to the data. + bool scaled_softplus = false; + // Whether the columns or the rows take any extra dimensions. + // If false, then [10, 20, 30] -> [10*20, 30] and [30] -> [1, 30]. + // If true, then [10, 20, 30] -> [10, 20*30] and [30] -> [1, 30]. + bool cols_take_extra_dims = false; +}; + +// Collapses/expands the tensor dims into 2D extents, which may be 0, 0 for +// not-present tensors such as ViT in a text-only model. Safely handles nullptr +// returned from `TensorInfoRegistry::Find`, hence not a member function.
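+// Worked example (editor's note, using shapes registered in tensor_info.cc): "att_w" +// has shape {model_dim, heads, qkv_dim} with cols_take_extra_dims = true, yielding +// Extents2D(model_dim, heads * qkv_dim); "att_ein" has shape {heads, model_dim, qkv_dim} +// with the default false, yielding Extents2D(heads * model_dim, qkv_dim).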
+static inline Extents2D ExtentsFromInfo(const TensorInfo* tensor) { + if (tensor == nullptr) return Extents2D(0, 0); + + size_t cols = tensor->shape.back(); + size_t rows = 1; + if (tensor->cols_take_extra_dims) { + rows = tensor->shape[0]; + for (size_t i = 1; i < tensor->shape.size() - 1; ++i) { + cols *= tensor->shape[i]; + } + } else { // rows take extra dims + for (size_t i = 0; i < tensor->shape.size() - 1; ++i) { + rows *= tensor->shape[i]; + } + } + // Sometimes only one of rows or cols is zero; set both for consistency. + if (rows == 0 || cols == 0) rows = cols = 0; + return Extents2D(rows, cols); +} + +static inline std::string LayerSuffix(size_t layer_idx) { + return std::string("_") + std::to_string(layer_idx); +} + +// Returns tensor base name without any layer suffix. +static inline std::string StripLayerSuffix(const std::string& name) { + return name.substr(0, name.rfind('_')); +} + +// Holds all `TensorInfo` for a model and retrieves them by (unique) name. +class TensorInfoRegistry { + public: + explicit TensorInfoRegistry(const ModelConfig& config); + ~TensorInfoRegistry() = default; + + // Returns nullptr if not found, otherwise the `TensorInfo` for the given + // `name`, which either lacks a suffix, or is per-layer and ends with + // `LayerSuffix(layer_idx)`. Used in `WeightsPtrs/LayerWeightsPtrs`. + const TensorInfo* Find(const std::string& name) const { + auto it = idx_from_name_.find(name); + if (it == idx_from_name_.end()) return nullptr; + return &tensors_[it->second]; + } + + // Returns a copy of the `TensorInfo` whose name matches the given name, or a + // default-constructed `TensorInfo` if not found. Destroying + // `TensorInfoRegistry` afterward will not invalidate the returned value. + TensorInfo TensorInfoFromName(const std::string& name) const { + const TensorInfo* info = Find(name); + if (info == nullptr) return TensorInfo(); + return *info; + } + + // Returns a copy of the `TensorInfo` whose source_name matches the end of the + // given path, and whose name ends with the given layer_idx, otherwise a + // default-constructed `TensorInfo`. Destroying `TensorInfoRegistry` + // afterward will not invalidate the returned value. + TensorInfo TensorInfoFromSourcePath(const std::string& path, + int layer_idx) const; + + private: + // `suffix` is empty (only) for per-model tensors, otherwise `LayerSuffix`. + void Add(const std::string& suffix, const TensorInfo& info); + void AddModelTensors(const ModelConfig& config); + void AddLayerTensors(const ModelConfig& config, + const LayerConfig& layer_config, size_t layer_idx); + void AddGriffinLayerTensors(const LayerConfig& layer_config, + size_t layer_idx); + + void AddImageLayerTensors(const ModelConfig& config, + const LayerConfig& layer_config, + size_t img_layer_idx); + + std::vector<TensorInfo> tensors_; + // Includes entries for base name *and* the suffixed name for each layer.
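+ // e.g. "c_embedding" (per-model, no suffix) alongside "key_norm_0" ... "key_norm_N" + // (one suffixed entry per layer).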
+ std::unordered_map<std::string, size_t> idx_from_name_; +}; + +} // namespace gcpp + +#endif // THIRD_PARTY_GEMMA_CPP_GEMMA_TENSOR_INFO_H_ diff --git a/gemma/tensor_info_test.cc b/gemma/tensor_info_test.cc new file mode 100644 index 0000000..8a95376 --- /dev/null +++ b/gemma/tensor_info_test.cc @@ -0,0 +1,40 @@ +#include "gemma/tensor_info.h" + +#include <stdio.h> + +#include "gtest/gtest.h" +#include "compression/types.h" // SfpStream +#include "gemma/configs.h" +#include "gemma/weights.h" +#include "util/mat.h" +#include "hwy/base.h" // HWY_ASSERT_M + +namespace gcpp { +namespace { + +// Tests for all models that each tensor in the model can be found and that the +// TensorInfoRegistry returns the correct shape and name for the tensor. +TEST(TensorInfoRegistryTest, Find) { + ForEachModel([&](Model model) { + const ModelConfig config(model, Type::kSFP, ChooseWrapping(model)); + fprintf(stderr, "Testing %s (%s)\n", config.display_name.c_str(), + config.Specifier().c_str()); + const TensorInfoRegistry tensors(config); + // Each tensor in the model should be known/found. + WeightsPtrs weights(config); + weights.ForEachTensor(nullptr, nullptr, [&tensors](const TensorArgs& t) { + const TensorInfo* info = tensors.Find(t.mat.Name()); + HWY_ASSERT_M(info, t.mat.Name()); + // Test that the `MatPtr` can be constructed from the TensorInfo, + // and that the dimensions match. + const MatPtr mat_ptr(t.mat.Name(), Type::kUnknown, + ExtentsFromInfo(tensors.Find(t.mat.Name()))); + EXPECT_STREQ(t.mat.Name(), mat_ptr.Name()) << t.mat.Name(); + EXPECT_EQ(t.mat.Rows(), mat_ptr.Rows()) << t.mat.Name(); + EXPECT_EQ(t.mat.Cols(), mat_ptr.Cols()) << t.mat.Name(); + }); + }); +} + +} // namespace +} // namespace gcpp diff --git a/gemma/tokenizer.cc b/gemma/tokenizer.cc index e48abae..6e39f27 100644 --- a/gemma/tokenizer.cc +++ b/gemma/tokenizer.cc @@ -21,9 +21,7 @@ #include <string> #include <vector> -#include "compression/io.h" // Path -#include "compression/shared.h" // PromptWrapping -#include "gemma/common.h" // Wrap +#include "gemma/configs.h" // PromptWrapping #include "hwy/base.h" // HWY_ASSERT #include "hwy/profiler.h" // copybara:import_next_line:sentencepiece @@ -37,24 +35,20 @@ constexpr bool kShowTokenization = false; class GemmaTokenizer::Impl { public: Impl() = default; - explicit Impl(const Path& tokenizer_path) { - PROFILER_ZONE("Startup.tokenizer"); - spp_ = std::make_unique<sentencepiece::SentencePieceProcessor>(); - if (!spp_->Load(tokenizer_path.path).ok()) { - HWY_ABORT("Failed to load the tokenizer file."); - } - } // Loads the tokenizer from a serialized proto. explicit Impl(const std::string& tokenizer_proto) { + if (tokenizer_proto == kMockTokenizer) return; PROFILER_ZONE("Startup.tokenizer"); spp_ = std::make_unique<sentencepiece::SentencePieceProcessor>(); if (!spp_->LoadFromSerializedProto(tokenizer_proto).ok()) { - fprintf(stderr, "serialized proto size=%zu.\n", tokenizer_proto.size()); - HWY_ABORT("Failed to load the tokenizer from serialized proto."); + HWY_ABORT("Failed to load tokenizer from %zu byte serialized proto.", + tokenizer_proto.size()); } } - std::string Serialize() const { return spp_->serialized_model_proto(); } + std::string Serialize() const { + return spp_ ?
spp_->serialized_model_proto() : kMockTokenizer; + } bool Encode(const std::string& input, std::vector<std::string>* pieces) const { @@ -82,22 +76,18 @@ class GemmaTokenizer::Impl { std::unique_ptr<sentencepiece::SentencePieceProcessor> spp_; }; -GemmaTokenizer::GemmaTokenizer(const Path& tokenizer_path) { - impl_ = std::make_unique<Impl>(tokenizer_path); +GemmaTokenizer::GemmaTokenizer(const std::string& tokenizer_proto) + : impl_(std::make_unique<Impl>(tokenizer_proto)) { + HWY_ASSERT(impl_); } // Default suffices, but they must be defined after GemmaTokenizer::Impl. -GemmaTokenizer::GemmaTokenizer() = default; GemmaTokenizer::~GemmaTokenizer() = default; GemmaTokenizer::GemmaTokenizer(GemmaTokenizer&& other) = default; GemmaTokenizer& GemmaTokenizer::operator=(GemmaTokenizer&& other) = default; std::string GemmaTokenizer::Serialize() const { return impl_->Serialize(); } -void GemmaTokenizer::Deserialize(const std::string& tokenizer_proto) { - impl_ = std::make_unique<Impl>(tokenizer_proto); -} - bool GemmaTokenizer::Encode(const std::string& input, std::vector<std::string>* pieces) const { return impl_->Encode(input, pieces); @@ -114,57 +104,109 @@ bool GemmaTokenizer::Decode(const std::vector<int>& ids, return impl_->Decode(ids, detokenized); } -std::vector<int> WrapAndTokenize(const GemmaTokenizer& tokenizer, - const ModelInfo& info, size_t pos, - std::string& prompt) { - Wrap(info, pos, prompt); +GemmaChatTemplate::GemmaChatTemplate(const GemmaTokenizer& tokenizer, + Model model) { + sot_user_.reserve(3); + if (!tokenizer.Encode("<start_of_turn>user\n", &sot_user_)) return; + sot_model_.reserve(3); + HWY_ASSERT(tokenizer.Encode("<start_of_turn>model\n", &sot_model_)); + eot_.reserve(2); + HWY_ASSERT(tokenizer.Encode("<end_of_turn>\n", &eot_)); - std::vector<int> tokens; - HWY_ASSERT(tokenizer.Encode(prompt, &tokens)); - // Both pre-trained and instruction-tuned require BOS as first token. - if (pos == 0) { - tokens.insert(tokens.begin(), BOS_ID); - } - - // PaliGemma separator. The SEP token "\n" is always tokenized separately.
- if (info.wrapping == PromptWrapping::PALIGEMMA - // || info.wrapping == PromptWrapping::GEMMA_VLM - ) { - std::vector<int> sep_tokens; - HWY_ASSERT(tokenizer.Encode("\n", &sep_tokens)); - tokens.insert(tokens.end(), sep_tokens.begin(), sep_tokens.end()); - } - - return tokens; + HWY_ASSERT(tokenizer.Encode("\n", &pali_sep_)); + vlm_soi_.reserve(2); + HWY_ASSERT(tokenizer.Encode("\n\n<start_of_image>", &vlm_soi_)); + vlm_eoi_.reserve(2); + HWY_ASSERT(tokenizer.Encode("<end_of_image>\n\n", &vlm_eoi_)); } -std::vector<int> WrapVLM(const GemmaTokenizer& tokenizer, const ModelInfo& info, - size_t pos, std::vector<int>& tokens, - size_t image_batch_size, size_t max_image_batch_size) { - HWY_ASSERT(info.wrapping == PromptWrapping::GEMMA_VLM); - size_t num_images = hwy::DivCeil(image_batch_size, max_image_batch_size); +std::vector<int> GemmaChatTemplate::Apply(size_t pos, + const std::vector<int>& ids) const { + HWY_ASSERT_M(!sot_user_.empty() && !sot_model_.empty() && !eot_.empty(), + "GemmaChatTemplate has not been initialized."); + std::vector<int> out; + out.reserve(eot_.size() + sot_user_.size() + ids.size() + eot_.size() + + sot_model_.size()); - std::vector<int> sep_tokens; - HWY_ASSERT(tokenizer.Encode("\n", &sep_tokens)); - - std::string begin_image_prompt = "\n\n<start_of_image>"; - std::vector<int> begin_image_tokens = - WrapAndTokenize(tokenizer, info, pos, begin_image_prompt); - - std::string end_image_prompt = "<end_of_image>\n\n"; - std::vector<int> end_image_tokens = - WrapAndTokenize(tokenizer, info, pos, end_image_prompt); - - for (size_t i = 0; i < num_images; ++i) { - tokens.insert(tokens.begin(), begin_image_tokens.begin(), - begin_image_tokens.end()); - tokens.insert(tokens.begin() + begin_image_tokens.size(), image_batch_size, - -2); - tokens.insert(tokens.begin() + begin_image_tokens.size() + image_batch_size, - end_image_tokens.begin(), end_image_tokens.end()); + // Start with BOS, or prepend end_of_turn if this is a continuation. + if (pos == 0) { + out.push_back(BOS_ID); + } else { + out.insert(out.cend(), eot_.cbegin(), eot_.cend()); } + // Start of user turn, user prompt, end of turn; then start of model turn.
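+ // e.g. for pos == 0 (schematic, with the control tokens reconstructed above): + // [BOS, <start_of_turn>user\n, ids..., <end_of_turn>\n, <start_of_turn>model\n]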
+ out.insert(out.cend(), sot_user_.cbegin(), sot_user_.cend()); + out.insert(out.cend(), ids.cbegin(), ids.cend()); + out.insert(out.cend(), eot_.cbegin(), eot_.cend()); + out.insert(out.cend(), sot_model_.cbegin(), sot_model_.cend()); + return out; +} - return tokens; +std::vector<int> GemmaChatTemplate::WrapPali(const std::vector<int>& text_part, + size_t image_batch_size) const { + HWY_ASSERT_M(!pali_sep_.empty(), + "GemmaChatTemplate has not been initialized."); + std::vector<int> out; + out.reserve(image_batch_size + 1 + text_part.size() + pali_sep_.size()); + out.resize(image_batch_size, 0); + out.push_back(BOS_ID); + out.insert(out.cend(), text_part.cbegin(), text_part.cend()); + out.insert(out.cend(), pali_sep_.cbegin(), pali_sep_.cend()); + return out; +} + +std::vector<int> GemmaChatTemplate::WrapVLM(const std::vector<int>& text_part, + size_t image_batch_size) const { + HWY_ASSERT_M(!vlm_soi_.empty() && !vlm_eoi_.empty(), + "GemmaChatTemplate has not been initialized."); + std::vector<int> out; + out.reserve(text_part.size() + vlm_soi_.size() + image_batch_size + + vlm_eoi_.size()); + out.insert(out.cend(), text_part.cbegin(), text_part.cend()); + out.insert(out.cend(), vlm_soi_.cbegin(), vlm_soi_.cend()); + out.insert(out.cend(), image_batch_size, -2); + out.insert(out.cend(), vlm_eoi_.cbegin(), vlm_eoi_.cend()); + return out; +} + +// Text +std::vector<int> WrapAndTokenize(const GemmaTokenizer& tokenizer, + const GemmaChatTemplate& chat_template, + const PromptWrapping wrapping, size_t pos, + const std::string& prompt) { + std::vector<int> tokens; + HWY_ASSERT(tokenizer.Encode(prompt, &tokens)); + + switch (wrapping) { + case PromptWrapping::GEMMA_IT: + case PromptWrapping::GEMMA_VLM: + return chat_template.Apply(pos, tokens); + default: + if (pos == 0) { + tokens.insert(tokens.cbegin(), BOS_ID); + } + return tokens; + } +} + +// Vision +std::vector<int> WrapAndTokenize(const GemmaTokenizer& tokenizer, + const GemmaChatTemplate& chat_template, + const PromptWrapping wrapping, size_t pos, + const std::string& prompt, + size_t image_batch_size) { + std::vector<int> text_part; + HWY_ASSERT(tokenizer.Encode(prompt, &text_part)); + switch (wrapping) { + case PromptWrapping::PALIGEMMA: + HWY_ASSERT(pos == 0); + return chat_template.WrapPali(text_part, image_batch_size); + case PromptWrapping::GEMMA_VLM: + return chat_template.Apply( + pos, chat_template.WrapVLM(text_part, image_batch_size)); + default: + HWY_ASSERT_M(false, "Current variant does not support vision prompt."); + } } } // namespace gcpp diff --git a/gemma/tokenizer.h b/gemma/tokenizer.h index 0bbd8f4..aca01f9 100644 --- a/gemma/tokenizer.h +++ b/gemma/tokenizer.h @@ -22,29 +22,30 @@ #include <string> #include <vector> -#include "compression/io.h" // Path -#include "gemma/common.h" // ModelInfo +#include "gemma/configs.h" // PromptWrapping namespace gcpp { -// The tokenizer's end of sentence and beginning of sentence token ids. -constexpr int EOS_ID = 1; -constexpr int SECONDARY_EOS_ID = 106; // for Gemma 3 -constexpr int BOS_ID = 2; +constexpr int BOS_ID = 2; // beginning of sequence + +// To avoid the complexity of storing the tokenizer into testdata/ or +// downloading from gs://, while still always writing a blob for the tokenizer, +// but also avoiding empty blobs, we store this placeholder string. +constexpr const char* kMockTokenizer = "unavailable"; class GemmaTokenizer { + // These must be defined after the definition of `Impl`.
public: - GemmaTokenizer(); - explicit GemmaTokenizer(const Path& tokenizer_path); - - // must come after definition of Impl + // If unavailable, pass `kMockTokenizer`. + explicit GemmaTokenizer(const std::string& tokenizer_proto); ~GemmaTokenizer(); GemmaTokenizer(GemmaTokenizer&& other); GemmaTokenizer& operator=(GemmaTokenizer&& other); + // Returns `kMockTokenizer` if unavailable. std::string Serialize() const; - void Deserialize(const std::string& tokenizer_proto); + // Returns false on failure or if unavailable. bool Encode(const std::string& input, std::vector<std::string>* pieces) const; bool Encode(const std::string& input, std::vector<int>* ids) const; bool Decode(const std::vector<int>& ids, std::string* detokenized) const; @@ -54,13 +55,38 @@ class GemmaTokenizer { std::unique_ptr<Impl> impl_; }; -std::vector<int> WrapAndTokenize(const GemmaTokenizer& tokenizer, - const ModelInfo& info, size_t pos, - std::string& prompt); +class GemmaChatTemplate { + public: + // No effect if `tokenizer` is unavailable, but any other method may abort. + GemmaChatTemplate(const GemmaTokenizer& tokenizer, Model model); -std::vector<int> WrapVLM(const GemmaTokenizer& tokenizer, const ModelInfo& info, - size_t pos, std::vector<int>& tokens, - size_t image_batch_size, size_t max_image_batch_size); + // Given prompt tokens, this returns the wrapped prompt including BOS and + // any "start_of_turn" structure required by the model. + std::vector<int> Apply(size_t pos, const std::vector<int>& ids) const; + std::vector<int> WrapPali(const std::vector<int>& text_part, + size_t image_batch_size) const; + std::vector<int> WrapVLM(const std::vector<int>& text_part, + size_t image_batch_size) const; + + private: + std::vector<int> sot_user_; + std::vector<int> sot_model_; + std::vector<int> eot_; + std::vector<int> pali_sep_; + std::vector<int> vlm_soi_; + std::vector<int> vlm_eoi_; +}; + +std::vector<int> WrapAndTokenize(const GemmaTokenizer& tokenizer, + const GemmaChatTemplate& chat_template, + PromptWrapping wrapping, size_t pos, + const std::string& prompt); + +std::vector<int> WrapAndTokenize(const GemmaTokenizer& tokenizer, + const GemmaChatTemplate& chat_template, + PromptWrapping wrapping, size_t pos, + const std::string& prompt, + size_t image_batch_size); } // namespace gcpp diff --git a/gemma/vit.cc b/gemma/vit.cc new file mode 100644 index 0000000..3549f85 --- /dev/null +++ b/gemma/vit.cc @@ -0,0 +1,349 @@ +// Copyright 2025 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include <math.h> // sqrtf +#include <stddef.h> +#include <stdint.h> + +#include <vector> + +#include "compression/types.h" // GEMMA_DISABLED_TARGETS +#ifndef HWY_DISABLED_TARGETS +#define HWY_DISABLED_TARGETS GEMMA_DISABLED_TARGETS +#endif // HWY_DISABLED_TARGETS + +#include "gemma/activations.h" +#include "gemma/gemma.h" +#include "gemma/gemma_args.h" +#include "gemma/weights.h" +#include "paligemma/image.h" +#include "hwy/contrib/thread_pool/thread_pool.h" +#include "hwy/profiler.h" + +// Compiles this file for multiple architectures via "foreach_target.h", to +// which we pass the filename via macro 'argument'.
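+// (Editor's note: "foreach_target.h" re-includes this translation unit once per enabled +// SIMD target, redefining HWY_NAMESPACE each time; Highway's dynamic dispatch then +// selects the best compiled variant at runtime.)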
+// clang-format off +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "gemma/vit.cc" // NOLINT +// clang-format on +#include "hwy/foreach_target.h" // IWYU pragma: keep +#include "hwy/highway.h" +// After highway.h +#include "gemma/gemma-inl.h" +#include "ops/ops-inl.h" + +HWY_BEFORE_NAMESPACE(); +namespace gcpp { +namespace HWY_NAMESPACE { + +// Wrapper class; holds arguments in member variables to shorten call sites. +// The main differences to GemmaAttention are: +// - no KV Cache necessary, attention is always all-to-all and not causal. +// - no potential wrap-around, attention always goes from 0 to kSeqLen. +// - no need for batching, as we are always computing attention for kSeqLen +// tokens. +// This results in a much simpler implementation. However, to avoid duplicating +// code, we should still consider merging the two classes. +// TODO(keysers): Refactor to share code with GemmaAttention. +class VitAttention { + // Computes Q, K, V for all heads, stored in activations_.q. + HWY_NOINLINE void ComputeQKV() { + PROFILER_ZONE("Gen.VitAttention.QKV"); + auto& qkv = activations_.attention.q; + HWY_ASSERT(qkv.Rows() == num_tokens_); + HWY_ASSERT(qkv.Cols() == layer_config_.heads * 3 * layer_config_.qkv_dim); + CallMatMul(activations_.attention.pre_att_rms_out, layer_.vit.qkv_einsum_w, + layer_.vit.qkv_einsum_b.PackedScale1(), env_, qkv); + } + + // TODO(philculliton): transition fully to MatMul. + HWY_NOINLINE void DotSoftmaxWeightedSumMatrix() { + const size_t qkv_dim = layer_config_.qkv_dim; + const size_t heads = layer_config_.heads; + HWY_ASSERT_M(heads == layer_config_.kv_heads, "Vit expects MHA"); + const size_t seq_len = + static_cast<size_t>(activations_.attention.div_seq_len.GetDivisor()); + const float query_scale = 1.0f / sqrtf(static_cast<float>(qkv_dim)); + PROFILER_ZONE("Gen.VitAttention.DotSoftmax"); + + // Shift Q, K, VT to MatStorageT. + MatStorageT<float> Q("Q2", Extents2D(num_tokens_, qkv_dim), + env_.ctx.allocator, MatPadding::kPacked); + MatStorageT<float> K("K2", Extents2D(seq_len, qkv_dim), env_.ctx.allocator, + MatPadding::kPacked); + MatStorageT<float> C("C2", Extents2D(num_tokens_, seq_len), + env_.ctx.allocator, MatPadding::kPacked); + + // Initialize att_out to zero prior to head loop.
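+ // (Editor's note: each row of q packs heads * 3 * qkv_dim floats, ordered per head + // as [q | k | v]; the head * 3 * qkv_dim, + qkv_dim and + 2 * qkv_dim offsets below + // index into this packed layout.)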
+
+    // Initialize att_out to zero prior to head loop.
+    ZeroInit(activations_.attention.att_out);
+
+    for (size_t head = 0; head < heads; ++head) {
+      pool_.Run(0, num_tokens_, [&](uint64_t task, size_t worker) HWY_ATTR {
+        const size_t token = task;
+        float* HWY_RESTRICT q =
+            activations_.attention.q.Row(token) + head * 3 * qkv_dim;
+        // TODO: shift to MatMul with A.scale once MatMul is confirmed working
+        MulByConst(query_scale, q, qkv_dim, worker);
+        hwy::CopyBytes(q, Q.Row(token), qkv_dim * sizeof(float));
+      });
+
+      pool_.Run(0, seq_len, [&](uint64_t task, size_t /*thread*/) HWY_ATTR {
+        const size_t seq_idx = task;
+        float* HWY_RESTRICT k = activations_.attention.q.Row(seq_idx) +
+                                head * 3 * qkv_dim + qkv_dim;
+        hwy::CopyBytes(k, K.Row(seq_idx), qkv_dim * sizeof(float));
+      });
+
+      // This produces C, a (num_tokens_, seq_len) matrix of dot products.
+      CallMatMul(Q, K, nullptr, env_, C);
+
+      pool_.Run(0, num_tokens_, [&](uint64_t task, size_t worker) HWY_ATTR {
+        float* HWY_RESTRICT c = C.Row(task);
+        Softmax(c, C.Cols(), worker);
+      });
+
+      pool_.Run(0, num_tokens_, [&](uint64_t task, size_t worker) HWY_ATTR {
+        size_t token = task;
+        float* HWY_RESTRICT att_out =
+            activations_.attention.att_out.Row(token) + head * qkv_dim;
+        for (size_t i = 0; i < seq_len; ++i) {
+          float* HWY_RESTRICT v = activations_.attention.q.Row(i) +
+                                  head * 3 * qkv_dim + 2 * qkv_dim;
+          MulByConstAndAdd(C.Row(token)[i], v, att_out, qkv_dim, worker);
+        }
+      });
+    }
+  }
+
+  HWY_NOINLINE void DotSoftmaxWeightedSum() {
+    const size_t qkv_dim = layer_config_.qkv_dim;
+    const size_t heads = layer_config_.heads;
+    HWY_ASSERT_M(heads == layer_config_.kv_heads, "Vit expects MHA");
+    const size_t seq_len =
+        static_cast<size_t>(activations_.attention.div_seq_len.GetDivisor());
+    const float query_scale = 1.0f / sqrtf(static_cast<float>(qkv_dim));
+    PROFILER_ZONE("Gen.VitAttention.DotSoftmax");
+
+    // Compute Q.K, softmax, and weighted V.
+    pool_.Run(0, layer_config_.heads * num_tokens_,
+              [&](uint64_t task, size_t worker) HWY_ATTR {
+                const size_t head = task % layer_config_.heads;
+                const size_t token = task / layer_config_.heads;
+                // Compute Q.K scores, which are "logits" stored in head_att.
+                float* HWY_RESTRICT q =
+                    activations_.attention.q.Row(token) + head * 3 * qkv_dim;
+                MulByConst(query_scale, q, qkv_dim, worker);
+                float* HWY_RESTRICT head_att =
+                    activations_.attention.att.Row(token) + head * seq_len;
+                for (size_t i = 0; i < seq_len; ++i) {
+                  float* HWY_RESTRICT k = activations_.attention.q.Row(i) +
+                                          head * 3 * qkv_dim + qkv_dim;
+                  head_att[i] = Dot(q, k, qkv_dim);  // score = q.k
+                }
+                // SoftMax yields "probabilities" in head_att.
+                Softmax(head_att, seq_len, worker);
+                // Compute weighted sum of v into att_out.
+                float* HWY_RESTRICT att_out =
+                    activations_.attention.att_out.Row(token) + head * qkv_dim;
+                hwy::ZeroBytes(att_out, qkv_dim * sizeof(*att_out));
+                for (size_t i = 0; i < seq_len; ++i) {
+                  float* HWY_RESTRICT v = activations_.attention.q.Row(i) +
+                                          head * 3 * qkv_dim + 2 * qkv_dim;
+                  MulByConstAndAdd(head_att[i], v, att_out, qkv_dim, worker);
+                }
+              });
+  }
+
+  // Sums encoded (`att_out`) over num_heads (`layer_config_.heads`) and
+  // head_dim (`qkv_dim`) into output (`att_sums`).
+  HWY_NOINLINE void SumHeads() {
+    PROFILER_ZONE("Gen.VitAttention.SumHeads");
+    auto* bias = layer_.vit.attn_out_b.PackedScale1();
+    // att_weights and att_out are concatenated heads, each of length
+    // qkv_dim. Thus the [num_tokens_, layer_config_.model_dim]
+    // matmul output is the sum over heads.
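To see why the single `[num_tokens_, model_dim]` MatMul below also performs the sum over heads, here is a scalar reference for one token, under an assumed row-major `[heads * qkv_dim, model_dim]` layout for the projection weights (a sketch for illustration only):

```cpp
#include <cstddef>

// out[d] = sum over heads h and lanes i of att_out[h*qkv_dim+i] * w[.., d]:
// projecting the concatenated heads in one pass is the per-head projection
// plus the sum over heads.
void SumHeadsReference(const float* att_out,  // [heads * qkv_dim]
                       const float* w,        // [heads * qkv_dim, model_dim]
                       size_t heads, size_t qkv_dim, size_t model_dim,
                       float* out) {          // [model_dim]
  for (size_t d = 0; d < model_dim; ++d) out[d] = 0.0f;
  for (size_t h = 0; h < heads; ++h) {
    for (size_t i = 0; i < qkv_dim; ++i) {
      const float a = att_out[h * qkv_dim + i];
      for (size_t d = 0; d < model_dim; ++d) {
        out[d] += a * w[(h * qkv_dim + i) * model_dim + d];
      }
    }
  }
}
```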
+ CallMatMul(activations_.attention.att_out, layer_.vit.attn_out_w, bias, + env_, activations_.attention.att_sums); + } + + public: + VitAttention(size_t num_tokens, size_t layer_idx, Activations& activations, + const LayerWeightsPtrs& layer, MatMulEnv& env) + : num_tokens_(num_tokens), + activations_(activations), + layer_(layer), + layer_config_(layer.layer_config), + env_(env), + pool_(env_.ctx.pools.Pool(0)) {} + + HWY_INLINE void operator()() { + ComputeQKV(); + if (activations_.attention.config.wrapping == PromptWrapping::GEMMA_VLM) { + DotSoftmaxWeightedSumMatrix(); + } else { + DotSoftmaxWeightedSum(); + } + SumHeads(); + } + + private: + const size_t num_tokens_; + Activations& activations_; + const LayerWeightsPtrs& layer_; + const LayerConfig& layer_config_; + MatMulEnv& env_; + hwy::ThreadPool& pool_; +}; + +// Same as FFWNoVit, but with different layer members and no second +// gating matrix. +void FFWVit(const LayerWeightsPtrs& layer, Activations& activations, + MatMulEnv& env) { + PROFILER_ZONE("Gen.FFW.ViT"); + const LayerConfig& layer_config = layer.layer_config; + + const bool add_bias = layer_config.ff_biases; + const float* bias1 = add_bias ? layer.vit.linear_0_b.PackedScale1() : nullptr; + const float* output_bias = + add_bias ? layer.vit.linear_1_b.PackedScale1() : nullptr; + + // Compute the hidden layer activations. + CallMatMul(activations.pre_ffw_rms_out, layer.vit.linear_0_w, bias1, env, + activations.C1); + + // Activation (Gelu), store in C1. + ActivationBatched(layer_config.activation, activations.C1, env.ctx.pools); + + // Hidden layer -> output layer. + CallMatMul(activations.C1, layer.vit.linear_1_w, output_bias, env, + activations.ffw_out); +} + +// Vit transformer layer. Some comments below refer to the Vit implementation in +// the Big Vision codebase. See +// github.com/google-research/big_vision/blob/main/big_vision/models/vit.py +// TODO(keysers): consider adding a wrapper for both LayerNorm with RMSNorm and +// try merging this with TransformerLayer. +void VitTransformerLayer(size_t num_tokens, const size_t layer_idx, + const LayerWeightsPtrs& layer, + Activations& activations, MatMulEnv& env) { + const size_t model_dim = activations.attention.config.model_dim; + auto type = layer.layer_config.type; + HWY_DASSERT(type == LayerAttentionType::kVit); + (void)type; + (void)model_dim; + + auto& x = activations.x; + HWY_DASSERT(x.Rows() == num_tokens); + HWY_DASSERT(x.Cols() == model_dim); + + // y = nn.LayerNorm()(x) + // y ~ pre_att_rms_out + LayerNormBatched(x, layer.vit.layer_norm_0_scale, layer.vit.layer_norm_0_bias, + activations.attention.pre_att_rms_out); + + // y = out["sa"] = nn.MultiHeadDotProductAttention(...)(y, y) + // y ~ att_sums + VitAttention(num_tokens, layer_idx, activations, layer, env)(); + + // x = out["+sa"] = x + y + AddFromBatched(activations.attention.att_sums, x, env.ctx); + + // y = nn.LayerNorm()(x) + // y ~ pre_ffw_rms_out + LayerNormBatched(x, layer.vit.layer_norm_1_scale, layer.vit.layer_norm_1_bias, + activations.pre_ffw_rms_out); + + // y = out["mlp"] = MlpBlock(...)(y) + // y ~ ffw_out + FFWVit(layer, activations, env); + + // x = out["+mlp"] = x + y + AddFromBatched(activations.ffw_out, x, env.ctx); +} + +// Gets the patches of the image and embeds them with the image embedding +// kernel. The result is stored in activations.x. 
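`VitTransformerLayer` above follows the standard pre-LayerNorm residual pattern from the Big Vision `vit.py`. Condensed into pseudocode (the `Matrix` type and helper names are stand-ins for the batched ops actually called, not real APIs):

```cpp
// Pseudocode summary of VitTransformerLayer.
void VitLayerSketch(Matrix& x) {
  Matrix y = LayerNorm0(x);  // layer_norm_0_{scale,bias} -> pre_att_rms_out
  y = Attention(y);          // VitAttention: QKV, softmax, SumHeads
  x += y;                    // first residual add (AddFromBatched)
  y = LayerNorm1(x);         // layer_norm_1_{scale,bias} -> pre_ffw_rms_out
  y = MlpBlock(y);           // FFWVit: linear_0, Gelu, linear_1
  x += y;                    // second residual add
}
```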
+static HWY_NOINLINE void EmbedImagePatches(const Image& image, + const ModelConfig& model_config, + const WeightsPtrs& weights, + Activations& activations, + MatMulEnv& env) { + const size_t model_dim = model_config.vit_config.model_dim; + const size_t patch_width = model_config.vit_config.patch_width; + const size_t num_tokens = model_config.vit_config.seq_len; + const size_t patch_size = patch_width * patch_width * 3; + HWY_DASSERT(weights.vit_img_embedding_kernel.Rows() == model_dim); + HWY_DASSERT(weights.vit_img_embedding_kernel.Cols() == patch_size); + HWY_DASSERT(activations.x.Cols() == model_dim); + (void)model_dim; + // img/embedding/kernel has original shape (14, 14, 3, 1152) + // H x W x C x D transposed to D x (H x W x C) so here (1152, 14 * 14 * 3) + // image_patches is (256, 14 * 14 * 3) + // Must be padded, see `DoDecompressA`. + MatStorageT image_patches("patches", Extents2D(num_tokens, patch_size), + env.ctx.allocator, MatPadding::kOdd); + for (size_t i = 0; i < num_tokens; ++i) { + image.GetPatch(i, image_patches.Row(i)); + } + CallMatMul(image_patches, weights.vit_img_embedding_kernel, + weights.vit_img_embedding_bias.PackedScale1(), env, activations.x); + // Add position embeddings. + CallUpcastedActivation(&weights.vit_img_pos_embedding, + [&](const auto* weights_t) { + AddFromBatched(*weights_t, activations.x, env.ctx); + }); +} + +// Prefills the image tokens with the ViT encoder. +void PrefillVit(const ModelConfig& model_config, const WeightsPtrs& weights, + const RuntimeConfig& runtime_config, const Image& image, + ImageTokens& image_tokens, Activations& activations, + MatMulEnv& env) { + PROFILER_ZONE("Gen.PrefillVit"); + const size_t num_tokens = model_config.vit_config.seq_len; + const size_t vit_model_dim = model_config.vit_config.model_dim; + HWY_ASSERT(num_tokens == activations.x.Rows()); + // Embed the image patches. + EmbedImagePatches(image, model_config, weights, activations, env); + // Go through all layers. + for (size_t layer_idx = 0; + layer_idx < model_config.vit_config.layer_configs.size(); ++layer_idx) { + VitTransformerLayer(num_tokens, layer_idx, *weights.VitLayer(layer_idx), + activations, env); + } + // Final Layernorm. + LayerNormBatched(activations.x, weights.vit_encoder_norm_scale, + weights.vit_encoder_norm_bias, activations.x); + + if (model_config.wrapping == PromptWrapping::GEMMA_VLM) { + activations.x = AvgPool4x4(activations.x, env.ctx.allocator); + + // Apply soft embedding norm before input projection. + CallUpcasted(&weights.mm_embed_norm, [&](const auto* weights_t) { + RMSNormInplace(weights_t->PackedScale1(), 0, activations.x.Row(0), + vit_model_dim, /*worker=*/0); + }); + } + + // Apply head embedding into image_tokens of size of the LLM kModelDim. + CallMatMul(activations.x, weights.vit_img_head_kernel, + weights.vit_img_head_bias.PackedScale1(), env, image_tokens); +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace gcpp +HWY_AFTER_NAMESPACE(); diff --git a/gemma/vit.h b/gemma/vit.h new file mode 100644 index 0000000..d6562f6 --- /dev/null +++ b/gemma/vit.h @@ -0,0 +1,50 @@ +// Copyright 2025 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef THIRD_PARTY_GEMMA_CPP_GEMMA_VIT_H_ +#define THIRD_PARTY_GEMMA_CPP_GEMMA_VIT_H_ + +// Declares vision transformer FFW/Prefill for all SIMD targets. + +#include + +#include "gemma/gemma.h" +#include "hwy/highway.h" + +namespace gcpp { + +// Passed to HWY_VISIT_TARGETS; declares for one target. +#define GEMMA_DECL_VIT(TARGET, NAMESPACE) \ + namespace NAMESPACE { \ + void FFWVit(const LayerWeightsPtrs& layer, Activations& activations, \ + MatMulEnv& env); \ + \ + void PrefillVit(const ModelConfig& model_config, const WeightsPtrs& weights, \ + const RuntimeConfig& runtime_config, const Image& image, \ + ImageTokens& image_tokens, Activations& activations, \ + MatMulEnv& env); \ + /* NOLINTNEXTLINE(google-readability-namespace-comments) */ \ + } // namespace NAMESPACE + +// Function declarations for each SIMD target. Allows direct call from the +// per-target namespace. We may later replace this with dynamic dispatch if +// the overhead is acceptable. +HWY_VISIT_TARGETS(GEMMA_DECL_VIT) + +#undef GEMMA_DECL_VIT + +} // namespace gcpp + +#endif // THIRD_PARTY_GEMMA_CPP_GEMMA_VIT_H_ diff --git a/gemma/weights.cc b/gemma/weights.cc index d281391..3418acf 100644 --- a/gemma/weights.cc +++ b/gemma/weights.cc @@ -15,272 +15,172 @@ #include "gemma/weights.h" -#include -#include -#include -#include +#include +#include +#include +#include + +#include // NOLINT #include #include -#include "compression/blob_store.h" -#include "compression/compress-inl.h" #include "compression/compress.h" -#include "compression/io.h" // Path -#include "compression/shared.h" -#include "gemma/common.h" +#include "compression/types.h" #include "gemma/configs.h" -#include "hwy/aligned_allocator.h" -#include "hwy/base.h" // HWY_ABORT +#include "gemma/gemma_args.h" +#include "gemma/model_store.h" +#include "io/blob_store.h" +#include "ops/matmul.h" // MMParallel +#include "util/mat.h" +#include "util/threading_context.h" +#include "hwy/base.h" #include "hwy/contrib/thread_pool/thread_pool.h" #include "hwy/highway.h" #include "hwy/profiler.h" -#include "hwy/stats.h" + +// TODO: move into foreach_target +#include "compression/compress-inl.h" namespace gcpp { -template -struct TensorLoader { - void operator()(ModelWeightsPtrs& weights, ForEachType fet, - ReadFromBlobStore& loader) { - weights.ForEachTensor( - {&weights}, fet, - [&loader](const char* name, hwy::Span tensors) { - loader(name, tensors); - }); - } -}; +// Copies att_weights from `attn_vec_einsum_w`. +void LayerWeightsPtrs::InitAttWeights(std::vector& mat_owners, + const Allocator& allocator) { + // We only use this tensor for Gemma layers. + if (layer_config.type != LayerAttentionType::kGemma) return; -BlobError ModelWeightsStorage::Load(const Path& weights, Model model_type, - Type weight_type, PromptWrapping wrapping, - hwy::ThreadPool& pool, - std::string* tokenizer_proto) { - PROFILER_ZONE("Startup.LoadModelWeightsPtrs"); - if (!weights.Exists()) { - HWY_ABORT("The model weights file '%s' does not exist.", - weights.path.c_str()); + // Files must have one or the other. 
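For reference, `HWY_VISIT_TARGETS(GEMMA_DECL_VIT)` in `vit.h` above stamps out one declaration block per compiled SIMD target. For a single target it expands to roughly the following (the target namespace name is illustrative), which is what lets callers already inside that namespace call `FFWVit` directly without dynamic dispatch:

```cpp
namespace N_AVX3 {  // one such namespace per enabled target
void FFWVit(const LayerWeightsPtrs& layer, Activations& activations,
            MatMulEnv& env);
void PrefillVit(const ModelConfig& model_config, const WeightsPtrs& weights,
                const RuntimeConfig& runtime_config, const Image& image,
                ImageTokens& image_tokens, Activations& activations,
                MatMulEnv& env);
}  // namespace N_AVX3
```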
+ HWY_ASSERT(attn_vec_einsum_w.HasPtr() ^ att_weights.HasPtr()); + // Done if we already read the transposed tensor. + if (att_weights.HasPtr() && !attn_vec_einsum_w.HasPtr()) return; + + // NUQ is handled by a specialization in weights.cc. + HWY_ASSERT(attn_vec_einsum_w.GetType() != Type::kNUQ); + + const size_t model_dim = layer_config.model_dim; + const size_t heads = layer_config.heads; + const size_t qkv_dim = layer_config.qkv_dim; + + // Reshape [heads, model_dim, qkv_dim] to [model_dim, heads * qkv_dim]. + att_weights.SetType(attn_vec_einsum_w.GetType()); + HWY_ASSERT(att_weights.Rows() == model_dim); + HWY_ASSERT(att_weights.Cols() == heads * qkv_dim); + HWY_ASSERT(attn_vec_einsum_w.Rows() == heads * model_dim); + HWY_ASSERT(attn_vec_einsum_w.Cols() == qkv_dim); + + { + static std::mutex m; + std::lock_guard lock(m); + mat_owners.push_back(MatOwner()); + mat_owners.back().AllocateFor(att_weights, allocator, MatPadding::kOdd); } - ReadFromBlobStore loader(weights); - ForEachType fet = - loader.HaveToc() ? ForEachType::kLoadWithToc : ForEachType::kLoadNoToc; - std::vector scales; - if (fet == ForEachType::kLoadWithToc) { - BlobError err = loader.LoadConfig(config_); - if (err != 0 || config_.model_dim == 0) { - fprintf(stderr, "Failed to load model config: %d\n", err); - return err; + + const size_t T_bytes = att_weights.ElementBytes(); + for (size_t m = 0; m < model_dim; ++m) { + uint8_t* HWY_RESTRICT out_row = att_weights.RowBytes(m); + for (size_t h = 0; h < heads; ++h) { + hwy::CopyBytes(attn_vec_einsum_w.RowBytes(h * model_dim + m), + out_row + h * qkv_dim * T_bytes, qkv_dim * T_bytes); } - if (tokenizer_proto != nullptr) { - err = loader.LoadTokenizer(*tokenizer_proto); - if (err != 0) { - fprintf(stderr, "Failed to load tokenizer: %d\n", err); - return err; - } - } - } else { - if (weight_type == Type::kUnknown || model_type == Model::UNKNOWN) { - fprintf(stderr, - "weight type (%d) and model type (%d) must be specified when " - "no config is present in weights file\n", - static_cast(weight_type), static_cast(model_type)); - return __LINE__; - } - // No Toc-> no config. - config_ = ConfigFromModel(model_type); - config_.weight = weight_type; - config_.wrapping = wrapping; - scales.resize(config_.num_tensor_scales + config_.vit_config.num_scales); } - CreateForType(config_.weight, pool); - CallForModelWeightT(fet, loader); - if (!scales.empty()) { - loader.LoadScales(scales.data(), scales.size()); - } - BlobError err = loader.ReadAll(pool, model_storage_); - if (err != 0) { - fprintf(stderr, "Failed to load model weights: %d\n", err); - return err; - } - if (!scales.empty()) { - GetOrApplyScales(scales); - } - if (fet == ForEachType::kLoadNoToc) { - PROFILER_ZONE("Startup.Reshape"); - AllocAndCopyWithTranspose(pool); - } - return 0; + att_weights.SetScale(attn_vec_einsum_w.Scale()); } -template -struct TensorSaver { - // Adds all the tensors to the blob writer. - void operator()(ModelWeightsPtrs& weights, ForEachType fet, - WriteToBlobStore& writer) { - weights.ForEachTensor( - {&weights}, fet, - [&writer](const char* name, hwy::Span tensors) { - tensors[0]->CallUpcasted(writer, name); - }); - } -}; +// For FFN. Fast, only updates pointers. +void LayerWeightsPtrs::SplitW1() { + // Used for Gemma and Griffin layers; FFWVit uses different tensors. 
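The byte-wise copy in `InitAttWeights` above implements the following reshape; a plain-float reference (the real code copies `ElementBytes()`-sized rows of whatever compressed type the tensor uses):

```cpp
#include <algorithm>
#include <cstddef>

// [heads, model_dim, qkv_dim] -> [model_dim, heads * qkv_dim]: each output
// row m interleaves the per-head slices for that model dimension.
void ReshapeAttWeights(const float* attn_vec_einsum_w, float* att_weights,
                       size_t heads, size_t model_dim, size_t qkv_dim) {
  for (size_t m = 0; m < model_dim; ++m) {
    for (size_t h = 0; h < heads; ++h) {
      const float* src = attn_vec_einsum_w + (h * model_dim + m) * qkv_dim;
      float* dst = att_weights + m * heads * qkv_dim + h * qkv_dim;
      std::copy(src, src + qkv_dim, dst);
    }
  }
}
```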
+ if (layer_config.type == LayerAttentionType::kVit) return; -BlobError ModelWeightsStorage::Save(const std::string& tokenizer, - const Path& weights, - hwy::ThreadPool& pool) { - WriteToBlobStore writer(pool); - ForEachType fet = ForEachType::kLoadWithToc; - CallForModelWeightT(fet, writer); - writer.AddTokenizer(tokenizer); - int err = writer.WriteAll(weights, &config_); - if (err != 0) { - fprintf(stderr, "Failed to write model weights: %d\n", err); - return err; - } - return 0; + // Files have both or neither of w1 and w2. + HWY_ASSERT(gating_einsum_w1.HasPtr() == gating_einsum_w2.HasPtr()); + // w is mutually exclusive with w1 and w2 in the file. + HWY_ASSERT(gating_einsum_w.HasPtr() ^ gating_einsum_w1.HasPtr()); + // Done if we already read split tensors. Note that they are not + // necessarily the same type. + if (gating_einsum_w1.HasPtr() && !gating_einsum_w.HasPtr()) return; + + const size_t ff_hidden_dim = layer_config.ff_hidden_dim; + HWY_ASSERT(gating_einsum_w.Rows() == 2 * ff_hidden_dim); + HWY_ASSERT(gating_einsum_w1.Rows() == ff_hidden_dim); + HWY_ASSERT(gating_einsum_w2.Rows() == ff_hidden_dim); + // Cols are the model_dim but we don't have ModelConfig here. + HWY_ASSERT(gating_einsum_w1.Cols() == gating_einsum_w.Cols()); + HWY_ASSERT(gating_einsum_w2.Cols() == gating_einsum_w.Cols()); + + const size_t stride = gating_einsum_w.Stride(); + gating_einsum_w1.SetPtr(gating_einsum_w.RowBytes(0), stride); + gating_einsum_w2.SetPtr(gating_einsum_w.RowBytes(ff_hidden_dim), stride); + gating_einsum_w1.SetType(gating_einsum_w.GetType()); + gating_einsum_w2.SetType(gating_einsum_w.GetType()); + gating_einsum_w1.SetScale(gating_einsum_w.Scale()); + gating_einsum_w2.SetScale(gating_einsum_w.Scale()); + gating_einsum_w.SetPtr(nullptr, gating_einsum_w.Cols()); } -void ModelWeightsStorage::Allocate(const ModelConfig& config, Type weight_type, - hwy::ThreadPool& pool) { - PROFILER_ZONE("Startup.AllocateModelWeightsPtrs"); - config_ = config; - config_.weight = weight_type; - CreateForType(weight_type, pool); - if (float_weights_) float_weights_->Allocate(model_storage_, pool); - if (bf16_weights_) bf16_weights_->Allocate(model_storage_, pool); - if (sfp_weights_) sfp_weights_->Allocate(model_storage_, pool); - if (nuq_weights_) nuq_weights_->Allocate(model_storage_, pool); +// For attention, which might not have a w2. Fast, only updates pointers. +void LayerWeightsPtrs::SplitAttW1() { + // We only use this tensor for Gemma layers. + if (layer_config.type != LayerAttentionType::kGemma) return; + + // w is mutually exclusive with w1 in the file. + HWY_ASSERT(qkv_einsum_w.HasPtr() ^ qkv_einsum_w1.HasPtr()); + // Done if we already read split tensors. Note that w2 does not exist for + // MHA, and otherwise might not be the same type. + if (qkv_einsum_w1.HasPtr() && !qkv_einsum_w.HasPtr()) return; + + const size_t w1_rows = layer_config.heads * layer_config.qkv_dim; + const size_t w2_rows = layer_config.kv_heads * 2 * layer_config.qkv_dim; + HWY_ASSERT(qkv_einsum_w.Rows() == w1_rows + w2_rows); + HWY_ASSERT(qkv_einsum_w1.Rows() == w1_rows); + HWY_ASSERT(qkv_einsum_w2.Rows() == w2_rows); + // Cols are the model_dim but we don't have ModelConfig here. 
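Both `SplitW1` and `SplitAttW1` avoid copying: because the stacked tensor's rows are contiguous with a fixed stride, the two halves are just views starting at row 0 and row `rows1`. A sketch with an illustrative view type (the real code goes through `MatPtr::SetPtr` and `Stride`, which counts elements):

```cpp
#include <cstddef>
#include <cstdint>

struct MatView {
  uint8_t* ptr;
  size_t rows, cols, stride;  // stride in elements, as in MatPtr
};

// Splits a [rows1 + rows2, cols] stack into two row-offset views and marks
// the stacked tensor as consumed, as Fixup does.
void SplitStacked(MatView& w, size_t rows1, size_t element_bytes,
                  MatView& w1, MatView& w2) {
  w1 = {w.ptr, rows1, w.cols, w.stride};
  w2 = {w.ptr + rows1 * w.stride * element_bytes, w.rows - rows1, w.cols,
        w.stride};
  w.ptr = nullptr;
}
```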
+ HWY_ASSERT(qkv_einsum_w1.Cols() == qkv_einsum_w.Cols()); + HWY_ASSERT(qkv_einsum_w2.Cols() == qkv_einsum_w.Cols()); + + const size_t stride = qkv_einsum_w.Stride(); + qkv_einsum_w1.SetPtr(qkv_einsum_w.RowBytes(0), stride); + qkv_einsum_w2.SetPtr(qkv_einsum_w.RowBytes(w1_rows), stride); + qkv_einsum_w1.SetType(qkv_einsum_w.GetType()); + qkv_einsum_w2.SetType(qkv_einsum_w.GetType()); + qkv_einsum_w1.SetScale(qkv_einsum_w.Scale()); + qkv_einsum_w2.SetScale(qkv_einsum_w.Scale()); + qkv_einsum_w.SetPtr(nullptr, qkv_einsum_w.Cols()); } -class WeightInitializer { - public: - WeightInitializer(std::mt19937& gen) : dist_(0.0f, 1.0f), gen_(gen) {} - - void operator()(const char* name, hwy::Span tensors) { - float* data = tensors[0]->data(); - for (size_t i = 0; i < tensors[0]->NumElements(); ++i) { - data[i] = dist_(gen_); - } - tensors[0]->set_scale(1.0f); - } - - private: - std::normal_distribution dist_; - std::mt19937& gen_; -}; - -void ModelWeightsStorage::RandInit(std::mt19937& gen) { - HWY_ASSERT(float_weights_); - WeightInitializer init(gen); - ModelWeightsPtrs::ForEachTensor({float_weights_.get()}, - ForEachType::kLoadNoToc, init); +// Must be called after reading weights via `ForEachTensor`. +// TODO: exporters should bake this into the weights already. +// WARNING: called from multiple threads; `mat_owners` requires a lock. +void LayerWeightsPtrs::Fixup(std::vector& mat_owners, + const Allocator& allocator) { + // TODO(janwas): handle NUQ + InitAttWeights(mat_owners, allocator); + SplitW1(); + SplitAttW1(); } -void ModelWeightsStorage::ZeroInit() { - if (float_weights_) float_weights_->ZeroInit(); - if (bf16_weights_) bf16_weights_->ZeroInit(); - if (sfp_weights_) sfp_weights_->ZeroInit(); - if (nuq_weights_) nuq_weights_->ZeroInit(); -} +static void HWY_MAYBE_UNUSED InitAttWeightsNUQ( + const LayerConfig& layer_config, MatPtrT& attn_vec_einsum_w, + MatPtrT& att_weights, std::vector& mat_owners) { + if (!attn_vec_einsum_w.HasPtr()) return; + HWY_ASSERT(attn_vec_einsum_w.GetType() == Type::kNUQ); -void ModelWeightsStorage::GetOrApplyScales(std::vector& scales) { - if (float_weights_) float_weights_->GetOrApplyScales(scales); - if (bf16_weights_) bf16_weights_->GetOrApplyScales(scales); - if (sfp_weights_) sfp_weights_->GetOrApplyScales(scales); - if (nuq_weights_) nuq_weights_->GetOrApplyScales(scales); -} - -void ModelWeightsStorage::AllocAndCopyWithTranspose(hwy::ThreadPool& pool) { - if (float_weights_) - float_weights_->AllocAndCopyWithTranspose(pool, model_storage_); - if (bf16_weights_) - bf16_weights_->AllocAndCopyWithTranspose(pool, model_storage_); - if (sfp_weights_) - sfp_weights_->AllocAndCopyWithTranspose(pool, model_storage_); - if (nuq_weights_) - nuq_weights_->AllocAndCopyWithTranspose(pool, model_storage_); -} - -void ModelWeightsStorage::CopyWithTranspose(hwy::ThreadPool& pool) { - if (float_weights_) float_weights_->CopyWithTranspose(pool); - if (bf16_weights_) bf16_weights_->CopyWithTranspose(pool); - if (sfp_weights_) sfp_weights_->CopyWithTranspose(pool); - if (nuq_weights_) nuq_weights_->CopyWithTranspose(pool); -} - -namespace { - -void LogVec(const char* name, const float* data, size_t len) { - hwy::Stats stats; - for (size_t i = 0; i < len; ++i) { - stats.Notify(data[i]); - } - printf("%-20s %12zu %13.10f %8.5f %13.10f\n", - name, len, stats.Min(), stats.Mean(), stats.Max()); -} - -} // namespace - -void ModelWeightsStorage::LogWeightStats() { - size_t total_weights = 0; - // Only for float weights. 
- ModelWeightsPtrs::ForEachTensor( - {float_weights_.get()}, ForEachType::kInitNoToc, - [&total_weights](const char* name, hwy::Span tensors) { - const MatPtr& tensor = *tensors[0]; - if (tensor.scale() != 1.0f) { - printf("[scale=%f] ", tensor.scale()); - } - LogVec(name, tensor.data(), tensor.NumElements()); - total_weights += tensor.NumElements(); - }); - printf("%-20s %12zu\n", "Total", total_weights); -} - -void ModelWeightsStorage::CreateForType(Type weight_type, - hwy::ThreadPool& pool) { - switch (weight_type) { - case Type::kF32: - float_weights_ = std::make_unique>(config_); - break; - case Type::kBF16: - bf16_weights_ = std::make_unique>(config_); - break; - case Type::kSFP: - sfp_weights_ = - std::make_unique>(config_); - break; - case Type::kNUQ: - nuq_weights_ = - std::make_unique>(config_); - break; - default: - HWY_ABORT("Weight type %d unsupported.", static_cast(weight_type)); - } -} - -template <> -void LayerWeightsPtrs::Reshape(MatStorage* storage) { - if (attn_vec_einsum_w.data() == nullptr) return; + HWY_ASSERT(att_weights.HasPtr()); + att_weights.SetType(Type::kNUQ); const size_t model_dim = layer_config.model_dim; const size_t heads = layer_config.heads; const size_t qkv_dim = layer_config.qkv_dim; // Reshape [kHeads, kModelDim, kQKVDim] to [kModelDim, kHeads * kQKVDim]. - if (storage != nullptr) { - storage->Allocate(); - att_weights.SetPtr(*storage); - } - - const hwy::HWY_NAMESPACE::ScalableTag df; - hwy::AlignedFreeUniquePtr attn_vec_einsum_w_tmp = hwy::AllocateAligned(model_dim * heads * qkv_dim); hwy::AlignedFreeUniquePtr att_weights_tmp = hwy::AllocateAligned(model_dim * heads * qkv_dim); - HWY_NAMESPACE::DecompressAndZeroPad( - df, MakeSpan(attn_vec_einsum_w.data(), model_dim * heads * qkv_dim), 0, - attn_vec_einsum_w_tmp.get(), model_dim * heads * qkv_dim); + const hwy::HWY_NAMESPACE::ScalableTag df; + HWY_NAMESPACE::DecompressAndZeroPad(df, attn_vec_einsum_w.Span(), 0, + attn_vec_einsum_w_tmp.get(), + model_dim * heads * qkv_dim); for (size_t m = 0; m < model_dim; ++m) { float* HWY_RESTRICT out_row = att_weights_tmp.get() + m * heads * qkv_dim; @@ -293,13 +193,360 @@ void LayerWeightsPtrs::Reshape(MatStorage* storage) { CompressWorkingSet work; hwy::ThreadPool pool(0); + HWY_NAMESPACE::Compress(att_weights_tmp.get(), model_dim * heads * qkv_dim, + work, att_weights.Span(), + /*packed_ofs=*/0, pool); - HWY_NAMESPACE::Compress( - att_weights_tmp.get(), model_dim * heads * qkv_dim, work, - MakeSpan(att_weights.data(), model_dim * heads * qkv_dim), - /*packed_ofs=*/0, pool); + att_weights.SetScale(attn_vec_einsum_w.Scale()); +} - att_weights.set_scale(attn_vec_einsum_w.scale()); +static void HWY_MAYBE_UNUSED SplitW1NUQ(const LayerConfig& layer_config) { + // TODO(janwas): implement. +} + +// Zero-initializes only the allocated tensors in `*this`. +void WeightsPtrs::ZeroInit() { + ForEachTensor(nullptr, nullptr, [](const TensorArgs& t) { + if (!t.mat.HasPtr()) return; + gcpp::ZeroInit(t.mat); + }); +} + +// Copies only the allocated tensors in `*this` from tensors in `other`. +void WeightsPtrs::CopyFrom(const WeightsPtrs& other) { + ForEachTensor(const_cast(&other), nullptr, + [](const TensorArgs& t) { + if (!t.mat.HasPtr()) return; + HWY_ASSERT(t.other_mat1 && t.other_mat1->HasPtr()); + CopyMat(*t.other_mat1, t.mat); + }); +} + +// For reshaping file tensors to the shape expected by the code. This would +// ideally already happen in the importer. Called by `ReadFromBlobs`. 
+void WeightsPtrs::Fixup(std::vector& mat_owners, + ThreadingContext& ctx) { + // TODO: use 1D parallel-for helper function + hwy::ThreadPool& pool = ctx.pools.Pool(); + pool.Run(0, c_layers.size(), [&](uint64_t layer, size_t /*thread*/) { + GetLayer(layer)->Fixup(mat_owners, ctx.allocator); + }); + + pool.Run(0, vit_layers.size(), [&](uint64_t layer, size_t /*thread*/) { + VitLayer(layer)->Fixup(mat_owners, ctx.allocator); + }); +} + +std::vector WeightsPtrs::AddTensorDataToWriter( + BlobWriter& writer) const { + std::vector serialized_mat_ptrs; + // ForEachTensor is non-const but the lambda does not modify *this. + const_cast(this)->ForEachTensor( + nullptr, nullptr, [&](const TensorArgs& t) { + if (t.flags & TensorArgs::kMaybeRead && !t.mat.HasPtr()) return; + HWY_ASSERT_M(t.mat.HasPtr(), t.mat.Name()); + writer.Add(t.mat.Name(), t.mat.Packed(), t.mat.PackedBytes()); + t.mat.AppendTo(serialized_mat_ptrs); + }); + return serialized_mat_ptrs; +} + +// Decides whether to read or map based on heuristics and user override. +static WeightsPtrs::Mode ChooseMode(uint64_t file_bytes, + const LoaderArgs& loader, + const InferenceArgs& inference, + const Allocator& allocator) { + Tristate to_bf16 = loader.to_bf16; + Tristate map = loader.map; + + // Disable mapping if not padded to the base page size. + if (file_bytes % allocator.BasePageBytes() != 0) { + if (map == Tristate::kTrue) { // Only complain if explicitly requested. + HWY_WARN("Unable to map non-padded file (%zu, %zu), reading instead.", + static_cast(file_bytes >> 10), + allocator.BasePageBytes()); + } + map = Tristate::kFalse; + } + + // Check for user override: + if (to_bf16 == Tristate::kTrue && map == Tristate::kTrue) { + HWY_WARN("Cannot have to_bf16 && map, to_bf16 takes precedence."); + } + if (to_bf16 == Tristate::kTrue) return WeightsPtrs::Mode::kReadBF16; + if (map == Tristate::kTrue) return WeightsPtrs::Mode::kMap; + + if (to_bf16 == Tristate::kDefault) { + // Heuristic: sub-bf16 compression is not helpful if compute-bound. + const size_t batch_size = + HWY_MAX(inference.prefill_tbatch_size, inference.decode_qbatch_size); + to_bf16 = (batch_size >= 128) ? Tristate::kTrue : Tristate::kFalse; + } + + if (map == Tristate::kDefault) { + // Heuristic: map if large fraction of total. Do not decide based on + // `FreeMiB` because it is generally low. + const size_t file_mib = file_bytes >> 20; + const size_t total_mib = allocator.TotalMiB(); + if (file_mib > total_mib) { + HWY_WARN("Weight file %zu MiB > detected memory %zu MiB.", + static_cast(file_mib), total_mib); + } + // Large fraction of total. + map = (file_mib >= total_mib / 3) ? Tristate::kTrue : Tristate::kFalse; + } + + // If the `map` heuristic triggers, use that for safety. + if (map == Tristate::kTrue) return WeightsPtrs::Mode::kMap; + return (to_bf16 == Tristate::kTrue) ? WeightsPtrs::Mode::kReadBF16 + : WeightsPtrs::Mode::kRead; +} + +struct TensorToRead { + MatPtr* mat; + BlobRange range; + // Some tensors opt out of padding via kPacked flags. + MatPadding padding; + + // only for kReadBF16 + bool keep_type = false; + Type prev_type; +}; + +// Allocates multiple in parallel and binds to NUMA nodes. +static void AllocateAndBindAll(std::vector& tensors, + const WeightsPtrs::Mode mode, + std::vector& owners, + ThreadingContext& ctx) { + const size_t start = owners.size(); + owners.resize(start + tensors.size()); + + MMParallel parallel(ctx); + + // Allocate in parallel because faulting in large tensors is slow. 
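The precedence in `ChooseMode` above is easy to misread; restated as a compact sketch (same logic, minus the warnings, with the page-padding check folded into `can_map`): explicit flags win, BF16 beats an explicit map request, defaults are resolved by the batch-size and memory-fraction heuristics, and the map heuristic wins over the BF16 heuristic.

```cpp
#include <cstddef>

WeightsPtrs::Mode ChooseModeSketch(bool can_map, Tristate to_bf16,
                                   Tristate map, size_t batch_size,
                                   size_t file_mib, size_t total_mib) {
  if (!can_map) map = Tristate::kFalse;  // file not padded to page size
  if (to_bf16 == Tristate::kTrue) return WeightsPtrs::Mode::kReadBF16;
  if (map == Tristate::kTrue) return WeightsPtrs::Mode::kMap;
  if (to_bf16 == Tristate::kDefault)  // compute-bound => BF16 suffices
    to_bf16 = (batch_size >= 128) ? Tristate::kTrue : Tristate::kFalse;
  if (map == Tristate::kDefault)  // large fraction of total memory => map
    map = (file_mib >= total_mib / 3) ? Tristate::kTrue : Tristate::kFalse;
  if (map == Tristate::kTrue) return WeightsPtrs::Mode::kMap;
  return (to_bf16 == Tristate::kTrue) ? WeightsPtrs::Mode::kReadBF16
                                      : WeightsPtrs::Mode::kRead;
}
```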
+ ctx.pools.Pool().Run( + 0, tensors.size(), [&](uint64_t task, size_t /*thread*/) { + TensorToRead& tensor = tensors[task]; + MatPtr& mat = *tensor.mat; + + tensor.prev_type = mat.GetType(); + // We only care about MatMul inputs; skip F32 or small tensors. + if (tensor.prev_type == Type::kF32 || mat.Rows() < 1024) { + tensor.keep_type = true; + tensor.padding = MatPadding::kPacked; // single I/O for simplicity + } else if (mode == WeightsPtrs::Mode::kReadBF16) { + mat.SetType(Type::kBF16); + } + + owners[start + task].AllocateFor(*tensor.mat, ctx.allocator, + tensor.padding); + BindB(*tensor.mat, tensor.mat->ElementBytes(), parallel); + }); +} + +// Mode == kMap +static void MapAll(const std::vector& tensors, + const MapPtr& mapped, uint64_t file_bytes) { + PROFILER_ZONE("Startup.Weights.Map"); + for (size_t i = 0; i < tensors.size(); ++i) { + // SetPtr does not change the stride, but it is expected to be packed + // because that is what Compress() writes to the file. + const size_t mat_bytes = tensors[i].mat->PackedBytes(); + // Ensure blob size matches that computed from metadata. + HWY_ASSERT_M(mat_bytes == tensors[i].range.bytes, tensors[i].mat->Name()); + // Ensure the blob lies within the file mapping. + const uint64_t offset = tensors[i].range.offset; + HWY_ASSERT_M(offset + mat_bytes <= file_bytes, tensors[i].mat->Name()); + + tensors[i].mat->SetPtr(const_cast(mapped.get() + offset), + tensors[i].mat->Stride()); + } +} + +// Mode == kReadBF16: + +template +static void DecompressToBF16(MatPtr& mat, + const hwy::AlignedFreeUniquePtr& buf) { + hwy::HWY_NAMESPACE::ScalableTag dbf; + const size_t cols = mat.Cols(); + + const size_t num_packed = CompressedArrayElements(mat.Extents().Area()); + const PackedSpan packed{HWY_RCAST_ALIGNED(T*, buf.get()), num_packed}; + + size_t packed_ofs = 0; + for (size_t r = 0; r < mat.Rows(); ++r, packed_ofs += cols) { + HWY_NAMESPACE::DecompressAndZeroPad( + dbf, packed, packed_ofs, HWY_RCAST_ALIGNED(BF16*, mat.RowBytes(r)), + cols); + } +} + +static void ReadAllToBF16(const std::vector& tensors, + const BlobReader& reader, hwy::ThreadPool& pool) { + pool.Run(0, tensors.size(), [&](uint64_t task, size_t thread) { + PROFILER_ZONE2(thread, "Startup.Weights.ReadBF16"); + const TensorToRead& tensor = tensors[task]; + MatPtr& mat = *tensor.mat; + + if (tensor.keep_type) { + HWY_ASSERT(reader.file().Read(tensor.range.offset, tensor.range.bytes, + mat.Packed())); + return; + } + + // Read to a temporary buffer. + const hwy::AlignedFreeUniquePtr buf = + hwy::AllocateAligned(tensor.range.bytes); + HWY_ASSERT( + reader.file().Read(tensor.range.offset, tensor.range.bytes, buf.get())); + + if constexpr (GEMMA_ENABLE_NUQ) { + if (tensor.prev_type == Type::kNUQ) { + return DecompressToBF16(*tensor.mat, buf); + } + } + switch (tensor.prev_type) { + case Type::kF32: + return DecompressToBF16(*tensor.mat, buf); + case Type::kBF16: + return DecompressToBF16(*tensor.mat, buf); + case Type::kSFP: + return DecompressToBF16(*tensor.mat, buf); + default: + HWY_ABORT("Unsupported type %s", TypeName(tensor.prev_type)); + } + }); +} + +// Mode == kRead: + +static std::vector MakeBatches( + const std::vector& tensors, const uint64_t file_bytes) { + PROFILER_ZONE("Startup.Weights.MakeBatches"); + // Batches must be contiguous but blobs are padded, hence at least one + // batch per tensor, and more when tensor rows exceed the batch size. 
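What `MakeBatches` and `ReadBatches` accomplish, stated as a single-threaded sketch: packed rows from a contiguous file range land in strided in-memory rows, and the per-row padding is zeroed as MatMul requires. The `RowDest` type is illustrative; the real `IOBatch` instead records destination pointers and issues one positioned read per batch.

```cpp
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <vector>

struct RowDest {
  uint8_t* ptr;
  size_t bytes;  // file_bytes_per_row (packed, no padding in the file)
};

void ScatterRows(const uint8_t* batch_data, const std::vector<RowDest>& rows,
                 size_t mem_stride_bytes) {
  size_t offset = 0;
  for (const RowDest& row : rows) {
    std::memcpy(row.ptr, batch_data + offset, row.bytes);
    // Zero the in-memory row padding between Cols() and Stride().
    std::memset(row.ptr + row.bytes, 0, mem_stride_bytes - row.bytes);
    offset += row.bytes;
  }
}
```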
+ std::vector batches; + batches.reserve(tensors.size()); + + for (size_t i = 0; i < tensors.size(); ++i) { + const BlobRange& range = tensors[i].range; + MatPtr& mat = *tensors[i].mat; + uint64_t offset = range.offset; + HWY_ASSERT(range.End() <= file_bytes); + + batches.emplace_back(offset, range.key_idx); + const size_t file_bytes_per_row = mat.Cols() * mat.ElementBytes(); + const size_t mem_stride_bytes = mat.Stride() * mat.ElementBytes(); + uint8_t* row_bytes = mat.RowBytes(0); + for (size_t r = 0; r < mat.Rows(); ++r) { + if (!batches.back().Add(row_bytes, file_bytes_per_row)) { // Full batch. + batches.emplace_back(offset, range.key_idx); + // Adding to an empty batch is always successful. + HWY_ASSERT(batches.back().Add(row_bytes, file_bytes_per_row)); + } + offset += file_bytes_per_row; + // Must zero-initialize the in-memory row padding, see MatMul. + hwy::ZeroBytes(row_bytes + file_bytes_per_row, + mem_stride_bytes - file_bytes_per_row); + row_bytes += mem_stride_bytes; + } + HWY_ASSERT(offset == range.End()); + } + + HWY_ASSERT(batches.size() >= tensors.size()); + return batches; +} + +// Parallel synchronous I/O. Note that O_DIRECT seems undesirable because we +// want to use the OS cache between consecutive runs. +static void ReadBatches(const BlobReader& reader, + const std::vector& batches, + hwy::ThreadPool& pool) { + // >5x speedup from parallel reads when cached. + pool.Run(0, batches.size(), [&](uint64_t i, size_t thread) { + PROFILER_ZONE2(thread, "Startup.Weights.Read"); + const IOBatch& batch = batches[i]; + const std::string& key = reader.Keys()[batch.KeyIdx()]; + const uint64_t bytes_read = batch.Read(reader.file()); + if (bytes_read != batch.TotalBytes()) { + HWY_ABORT("Read failed for %s from %zu, %zu bytes; got %zu.", key.c_str(), + static_cast(batch.Offset()), + static_cast(batch.TotalBytes()), + static_cast(bytes_read)); + } + }); +} + +// Aborts on error. Updates `mode` to the actual mode used. Returns mapped +// memory or nullptr if `kMap` was not used. +static MapPtr MapOrReadAll(std::vector& tensors, + BlobReader& reader, WeightsPtrs::Mode* mode, + std::vector& mat_owners, + ThreadingContext& ctx) { + if (*mode == WeightsPtrs::Mode::kMap) { + if (MapPtr mapped = reader.Map()) { + MapAll(tensors, mapped, reader.file().FileSize()); + return mapped; + } + HWY_WARN("Failed to map file (%zu KiB), reading instead.", + static_cast(reader.file_bytes() >> 10)); + // If we wanted to map but failed, memory is probably not plentiful, so + // fall through to kRead because kReadBF16 requires more memory. + *mode = WeightsPtrs::Mode::kRead; + } + + { + PROFILER_ZONE("Startup.Weights.Allocate"); + // NOTE: this changes the stride of `mats`! + AllocateAndBindAll(tensors, *mode, mat_owners, ctx); + } + + hwy::ThreadPool& pool = ctx.pools.Pool(); + + if (*mode == WeightsPtrs::Mode::kReadBF16) { + ReadAllToBF16(tensors, reader, pool); + return MapPtr(); + } + + const std::vector batches = + MakeBatches(tensors, reader.file_bytes()); + ReadBatches(reader, batches, pool); + return MapPtr(); +} + +WeightsPtrs::Mode WeightsPtrs::ReadFromBlobs(const ModelStore& model, + BlobReader& reader, + const LoaderArgs& loader, + const InferenceArgs& inference, + std::vector& mat_owners, + ThreadingContext& ctx) { + // List of tensors to read/map, and where from. + std::vector tensors; + + // Enumerate all weights (negligible cost). + ForEachTensor(nullptr, nullptr, [&](const TensorArgs& t) { + const MatPadding padding = (t.flags & TensorArgs::kPacked) + ? 
MatPadding::kPacked + : MatPadding::kOdd; + size_t key_idx; + if (model.FindAndUpdateMatPtr(t.mat, key_idx)) { + tensors.push_back( + {.mat = &t.mat, .range = reader.Range(key_idx), .padding = padding}); + return; + } + if (t.flags & TensorArgs::kMaybeRead) return; // optional and not found. + HWY_ABORT("Tensor %s is required but not found in file.", t.mat.Name()); + }); + + Mode mode = ChooseMode(reader.file_bytes(), loader, inference, ctx.allocator); + mapped_ = MapOrReadAll(tensors, reader, &mode, mat_owners, ctx); + + { + PROFILER_ZONE("Startup.Fixup"); + Fixup(mat_owners, ctx); + } + return mode; } } // namespace gcpp diff --git a/gemma/weights.h b/gemma/weights.h index 5fd544b..de3652a 100644 --- a/gemma/weights.h +++ b/gemma/weights.h @@ -17,607 +17,455 @@ #define THIRD_PARTY_GEMMA_CPP_GEMMA_WEIGHTS_H_ #include +#include -#include -#include -#include -#include #include -#include #include -#include "compression/compress.h" -#include "compression/shared.h" -#include "gemma/common.h" -#include "gemma/configs.h" -#include "gemma/tensor_index.h" -#include "hwy/aligned_allocator.h" -#include "hwy/base.h" -#include "hwy/contrib/thread_pool/thread_pool.h" +#include "compression/types.h" +#include "gemma/configs.h" // ModelConfig +#include "gemma/gemma_args.h" // InferenceArgs +#include "gemma/model_store.h" // ModelStore +#include "gemma/tensor_info.h" // TensorInfoRegistry +#include "io/blob_store.h" // BlobWriter +#include "util/mat.h" // MatPtr +#include "util/threading_context.h" namespace gcpp { -// Different tensors need to appear in a ForEachTensor, according to what is -// happening. -enum class ForEachType { - // Under normal circumstances, when not initializing or loading, we can - // include all tensors and ignore the null ones. - kIgnoreNulls, - // If there is a table of contents, we can include all tensors. - kLoadWithToc, - // There is no table of contents, so we have to be careful to only include - // tensors that are actually present. - kLoadNoToc, - // We need to initialize all tensors needed when there is no table of - // contents. This differs from kLoadNoToc in that we need to include any - // tensor that is allocated but not loaded directly from file. - kInitNoToc, +// Argument passed to the `ForEachTensor` callback. +struct TensorArgs { + // `other_mat1` and `other_mat2` can be nullptr, or tensor(s) of the same + // name/type from another `LayerWeightsPtrs` for iterating over tensor pairs + // (for copying) or triples (for `AdamUpdateMV`). Set by `TENSOR_ARGS`. + // `flags` is a combination of zero or more `Flags`. + TensorArgs(MatPtr& mat, MatPtr* other_mat1, MatPtr* other_mat2, int flags) + : mat(mat), + other_mat1(other_mat1), + other_mat2(other_mat2), + flags(flags) {} + + MatPtr& mat; + MatPtr* other_mat1; // either/both can be nullptr. + MatPtr* other_mat2; + + enum Flags { + // Default: Read the tensor from the file and abort if it is not found. + kMustRead = 0, + + // Not an error if the tensor is not present in the file. For example, + // the _w1/_w2 tensors are not always present. + kMaybeRead = 1, + + // Avoid padding tensor rows when reading. Used for some Griffin tensors + // whose index computations do not use Row() accessors. + kPacked = 2, + }; + const int flags; }; -template +// Shorthand for creating the argument to the `ForEachTensor` callback. A macro +// seems less bad than member pointer syntax. +#define TENSOR_ARGS(mat, flag) \ + TensorArgs(mat, other1 ? &other1->mat : nullptr, \ + other2 ? 
&other2->mat : nullptr, TensorArgs::flag) + +// Finds tensors by name in `TensorInfoRegistry` (constructed from +// `ModelConfig`) and constructs `MatPtr` metadata with those shapes. +class MatFinder { + public: + MatFinder(const std::string& suffix, const TensorInfoRegistry& tensors) + : suffix_(suffix), tensors_(tensors) {} + + // Retrieves shape by name via `TensorInfo` from `TensorInfoRegistry`. + MatPtr operator()(const std::string& base_name) const { + const std::string name = std::string(base_name) + suffix_; + return MatPtr(name.c_str(), Type::kUnknown, + ExtentsFromInfo(tensors_.Find(name))); + } + + private: + const std::string suffix_; + const TensorInfoRegistry& tensors_; +}; + +// Per-layer weight metadata and pointers. The tensor data is owned by +// `MatOwner`. struct LayerWeightsPtrs { - // Large data is constructed separately. - explicit LayerWeightsPtrs(const LayerConfig& config, - const TensorIndex& tensor_index) - : attn_vec_einsum_w("att_ein", tensor_index), - qkv_einsum_w("qkv_ein", tensor_index), - qkv_einsum_w1("qkv1_w", tensor_index), - qkv_einsum_w2("qkv2_w", tensor_index), - attention_output_biases("attn_ob", tensor_index), - griffin({.linear_x_w = {"gr_lin_x_w", tensor_index}, - .linear_x_biases = {"gr_lin_x_b", tensor_index}, - .linear_y_w = {"gr_lin_y_w", tensor_index}, - .linear_y_biases = {"gr_lin_y_b", tensor_index}, - .linear_out_w = {"gr_lin_out_w", tensor_index}, - .linear_out_biases = {"gr_lin_out_b", tensor_index}, - .conv_w = {"gr_conv_w", tensor_index}, - .conv_biases = {"gr_conv_b", tensor_index}, - .gate_w = {"gr_gate_w", tensor_index}, - .gate_biases = {"gr_gate_b", tensor_index}, - .a = {"gr_a", tensor_index}}), + // Initializes tensor metadata without allocating. + // NOTE: do not store layer_idx, TransformerLayer and Attention may use + // other values for purposes of the KV cache. + LayerWeightsPtrs(size_t layer_idx, const LayerConfig& config, + const TensorInfoRegistry& tensors) + : finder_(LayerSuffix(layer_idx), tensors), + qkv_einsum_w(finder_("qkv_ein")), + qkv_einsum_w1(finder_("qkv1_w")), + qkv_einsum_w2(finder_("qkv2_w")), + attention_output_biases(finder_("attn_ob")), + griffin({.linear_x_w = finder_("gr_lin_x_w"), + .linear_x_biases = finder_("gr_lin_x_b"), + .linear_y_w = finder_("gr_lin_y_w"), + .linear_y_biases = finder_("gr_lin_y_b"), + .linear_out_w = finder_("gr_lin_out_w"), + .linear_out_biases = finder_("gr_lin_out_b"), + .conv_w = finder_("gr_conv_w"), + .conv_biases = finder_("gr_conv_b"), + .gate_w = finder_("gr_gate_w"), + .gate_biases = finder_("gr_gate_b"), + .a = finder_("gr_a")}), // MultiHeadDotProductAttention. 
- vit({.attn_out_w = {"attn_out_w", tensor_index}, - .attn_out_b = {"attn_out_b", tensor_index}, - .qkv_einsum_w = {"qkv_ein_w", tensor_index}, - .qkv_einsum_b = {"qkv_ein_b", tensor_index}, - .linear_0_w = {"linear_0_w", tensor_index}, - .linear_0_b = {"linear_0_b", tensor_index}, - .linear_1_w = {"linear_1_w", tensor_index}, - .linear_1_b = {"linear_1_b", tensor_index}, - .layer_norm_0_bias = {"ln_0_bias", tensor_index}, - .layer_norm_0_scale = {"ln_0_scale", tensor_index}, - .layer_norm_1_bias = {"ln_1_bias", tensor_index}, - .layer_norm_1_scale = {"ln_1_scale", tensor_index}}), - gating_einsum_w("gating_ein", tensor_index), - gating_einsum_w1("gating1_w", tensor_index), - gating_einsum_w2("gating2_w", tensor_index), - linear_w("linear_w", tensor_index), - pre_attention_norm_scale("pre_att_ns", tensor_index), - pre_ffw_norm_scale("pre_ff_ns", tensor_index), - post_attention_norm_scale("post_att_ns", tensor_index), - post_ffw_norm_scale("post_ff_ns", tensor_index), - ffw_gating_biases("ffw_gat_b", tensor_index), - ffw_output_biases("ffw_out_b", tensor_index), - att_weights("att_w", tensor_index), - key_norm_scale("key_norm", tensor_index), - query_norm_scale("query_norm", tensor_index), - layer_config(config) {} + vit({.attn_out_w = finder_("attn_out_w"), + .attn_out_b = finder_("attn_out_b"), + .qkv_einsum_w = finder_("qkv_ein_w"), + .qkv_einsum_b = finder_("qkv_ein_b"), + .linear_0_w = finder_("linear_0_w"), + .linear_0_b = finder_("linear_0_b"), + .linear_1_w = finder_("linear_1_w"), + .linear_1_b = finder_("linear_1_b"), + .layer_norm_0_bias = finder_("ln_0_bias"), + .layer_norm_0_scale = finder_("ln_0_scale"), + .layer_norm_1_bias = finder_("ln_1_bias"), + .layer_norm_1_scale = finder_("ln_1_scale")}), + gating_einsum_w(finder_("gating_ein")), + gating_einsum_w1(finder_("gating1_w")), + gating_einsum_w2(finder_("gating2_w")), + linear_w(finder_("linear_w")), + pre_attention_norm_scale(finder_("pre_att_ns")), + pre_ffw_norm_scale(finder_("pre_ff_ns")), + post_attention_norm_scale(finder_("post_att_ns")), + post_ffw_norm_scale(finder_("post_ff_ns")), + ffw_gating_biases(finder_("ffw_gat_b")), + ffw_output_biases(finder_("ffw_out_b")), + + attn_vec_einsum_w(finder_("att_ein")), + att_weights(finder_("att_w")), + + key_norm_scale(finder_("key_norm")), + query_norm_scale(finder_("query_norm")), + + layer_config(config) { + } ~LayerWeightsPtrs() = default; - // If weights are f32, also f32; otherwise at least bf16. Useful for ops that - // do not yet support smaller compressed types, or require at least bf16. When - // weights are f32, we also want such tensors to be f32. - // If weights are complex, this is also complex. - using WeightF32OrBF16 = - hwy::If>(), std::complex, - hwy::If(), double, - hwy::If(), float, BF16>>>; + const MatFinder finder_; - template - using ArrayT = MatPtrT; - - ArrayT attn_vec_einsum_w; - // qkv_einsum_w holds 2 different matrices, which may be separated out. - // On loading, which is used depends on what is in the file. - // At inference, the one with a non-null ptr is used. - ArrayT qkv_einsum_w; - ArrayT qkv_einsum_w1; - ArrayT qkv_einsum_w2; - ArrayT attention_output_biases; + // Files either have qkv_einsum_w with 2 stacked matrices or separate + // w1/w2 tensors. Fixup ensures w1/w2 are ready for use by gemma-inl.h. 
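`MatFinder` above keeps the constructor's initializer list declarative: each base name plus the layer suffix is looked up in `TensorInfoRegistry`, yielding an unallocated `MatPtr` with the correct extents. A usage sketch (the layer index and base name are examples):

```cpp
#include <cstddef>
#include <string>

// E.g. MakeLayerTensor(tensors, 7, "qkv_ein") resolves the tensor named
// "qkv_ein" + LayerSuffix(7) and returns metadata-only MatPtr whose type
// stays kUnknown until the weights are read.
MatPtr MakeLayerTensor(const TensorInfoRegistry& tensors, size_t layer_idx,
                       const std::string& base_name) {
  const MatFinder finder(LayerSuffix(layer_idx), tensors);
  return finder(base_name);
}
```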
+ MatPtr qkv_einsum_w; + MatPtr qkv_einsum_w1; + MatPtr qkv_einsum_w2; + MatPtrT attention_output_biases; struct { - ArrayT linear_x_w; - ArrayT linear_x_biases; - ArrayT linear_y_w; - ArrayT linear_y_biases; - ArrayT linear_out_w; - ArrayT linear_out_biases; - ArrayT conv_w; - ArrayT conv_biases; - ArrayT gate_w; - ArrayT gate_biases; - ArrayT a; + MatPtr linear_x_w; + MatPtrT linear_x_biases; + MatPtr linear_y_w; + MatPtrT linear_y_biases; + MatPtr linear_out_w; + MatPtrT linear_out_biases; + MatPtrT conv_w; + MatPtrT conv_biases; + MatPtr gate_w; + MatPtrT gate_biases; + MatPtrT a; } griffin; struct { // MultiHeadDotProductAttention. - ArrayT attn_out_w; - ArrayT attn_out_b; - ArrayT qkv_einsum_w; - ArrayT qkv_einsum_b; + MatPtr attn_out_w; // at least BF16. + MatPtrT attn_out_b; + MatPtr qkv_einsum_w; // at least BF16. + MatPtrT qkv_einsum_b; // MlpBlock. - ArrayT linear_0_w; - ArrayT linear_0_b; - ArrayT linear_1_w; - ArrayT linear_1_b; + MatPtr linear_0_w; // at least BF16. + MatPtrT linear_0_b; + MatPtr linear_1_w; // at least BF16. + MatPtrT linear_1_b; // LayerNorm. - ArrayT layer_norm_0_bias; - ArrayT layer_norm_0_scale; - ArrayT layer_norm_1_bias; - ArrayT layer_norm_1_scale; + MatPtr layer_norm_0_bias; // at least BF16. + MatPtr layer_norm_0_scale; // at least BF16. + MatPtr layer_norm_1_bias; // at least BF16. + MatPtr layer_norm_1_scale; // at least BF16. } vit; - // gating_einsum_w holds 2 different matrices, which may be separated out. - // On loading, which is used depends on what is in the file. - // At inference, the one with a non-null ptr is used. - ArrayT gating_einsum_w; - ArrayT gating_einsum_w1; - ArrayT gating_einsum_w2; - ArrayT linear_w; - // We don't yet have an RMSNorm that accepts all Weight. - ArrayT pre_attention_norm_scale; - ArrayT pre_ffw_norm_scale; - ArrayT post_attention_norm_scale; - ArrayT post_ffw_norm_scale; + // Files either have gating_einsum_w with 2 stacked matrices or separate + // w1/w2 tensors. `Fixup` ensures w1/w2 are ready for use by gemma-inl.h. + MatPtr gating_einsum_w; + MatPtr gating_einsum_w1; + MatPtr gating_einsum_w2; + MatPtr linear_w; + MatPtr pre_attention_norm_scale; // at least BF16. + MatPtr pre_ffw_norm_scale; // at least BF16. + MatPtr post_attention_norm_scale; // at least BF16. + MatPtr post_ffw_norm_scale; // at least BF16. - ArrayT ffw_gating_biases; - ArrayT ffw_output_biases; + MatPtrT ffw_gating_biases; + MatPtrT ffw_output_biases; - // Reshaped attention; not loaded from disk via ForEachTensor. - ArrayT att_weights; + MatPtr attn_vec_einsum_w; // Use att_weights instead of this. + MatPtr att_weights; // Use this instead of attn_vec_einsum_w. + + MatPtr key_norm_scale; // at least BF16. + MatPtr query_norm_scale; // at least BF16. const LayerConfig& layer_config; - // Initializes att_weights from attn_vec_einsum_w, hence this must be called - // after loading weights via ForEachTensor. - // TODO: update compression/convert_weights to bake this in. - void Reshape(MatStorage* storage) { - static_assert(!hwy::IsSame()); - - if (attn_vec_einsum_w.data() == nullptr) return; - - const size_t model_dim = layer_config.model_dim; - const size_t heads = layer_config.heads; - const size_t qkv_dim = layer_config.qkv_dim; - - // Reshape [kHeads, kModelDim, kQKVDim] to [kModelDim, kHeads * kQKVDim]. 
- if (storage != nullptr) { - storage->Allocate(); - att_weights.SetPtr(*storage); - } - for (size_t m = 0; m < model_dim; ++m) { - Weight* HWY_RESTRICT out_row = att_weights.data() + m * heads * qkv_dim; - for (size_t h = 0; h < heads; ++h) { - hwy::CopyBytes( - attn_vec_einsum_w.data() + h * model_dim * qkv_dim + m * qkv_dim, - out_row + h * qkv_dim, qkv_dim * sizeof(Weight)); - } - } - att_weights.set_scale(attn_vec_einsum_w.scale()); - } - - ArrayT key_norm_scale; - ArrayT query_norm_scale; - -// Used by ForEachTensor for per-layer tensors. -#define GEMMA_CALL_FUNC(member) \ - { \ - for (int i = 0; i < ptrs.size(); ++i) { \ - tensors[i] = &ptrs[i]->member; \ - } \ - if (tensors[0]->Ptr() != nullptr || fet != ForEachType::kIgnoreNulls) { \ - func(ptrs[0]->member.CacheName(layer_idx, sep, sep_index).c_str(), \ - hwy::Span(tensors.data(), ptrs.size())); \ - } \ - } - + // Calls `func(TensorArgs)` for each tensor which is in use for the + // current `layer_config`. `other1` and `other2` are optional arguments so we + // can also iterate over pairs or triples of tensors for `AdamUpdateMV`. + // Public because also called by `WeightsPtrs`. template - static void ForEachTensor(const std::vector*>& ptrs, - int layer_idx, ForEachType fet, Func func, - char sep = ' ', int sep_index = -1) { - std::vector tensors(ptrs.size(), nullptr); - auto type = ptrs[0]->layer_config.type; - if (type == LayerAttentionType::kVit) { + void ForEachTensor(LayerWeightsPtrs* other1, LayerWeightsPtrs* other2, + Func func) { + if (layer_config.type == LayerAttentionType::kVit) { // MHA. - GEMMA_CALL_FUNC(vit.attn_out_w); - GEMMA_CALL_FUNC(vit.attn_out_b); - GEMMA_CALL_FUNC(vit.qkv_einsum_w); - GEMMA_CALL_FUNC(vit.qkv_einsum_b); + func(TENSOR_ARGS(vit.attn_out_w, kMustRead)); + func(TENSOR_ARGS(vit.attn_out_b, kMustRead)); + func(TENSOR_ARGS(vit.qkv_einsum_w, kMustRead)); + // Used as 1D MatMul bias, but has `heads + 2 * kv_heads` rows, hence + // must not be padded. + func(TENSOR_ARGS(vit.qkv_einsum_b, kMustRead | TensorArgs::kPacked)); // MlpBlock. - GEMMA_CALL_FUNC(vit.linear_0_w); - GEMMA_CALL_FUNC(vit.linear_0_b); - GEMMA_CALL_FUNC(vit.linear_1_w); - GEMMA_CALL_FUNC(vit.linear_1_b); + func(TENSOR_ARGS(vit.linear_0_w, kMustRead)); + func(TENSOR_ARGS(vit.linear_0_b, kMustRead)); + func(TENSOR_ARGS(vit.linear_1_w, kMustRead)); + func(TENSOR_ARGS(vit.linear_1_b, kMustRead)); // LayerNorm. - GEMMA_CALL_FUNC(vit.layer_norm_0_bias); - GEMMA_CALL_FUNC(vit.layer_norm_0_scale); - GEMMA_CALL_FUNC(vit.layer_norm_1_bias); - GEMMA_CALL_FUNC(vit.layer_norm_1_scale); + func(TENSOR_ARGS(vit.layer_norm_0_bias, kMustRead)); + func(TENSOR_ARGS(vit.layer_norm_0_scale, kMustRead)); + func(TENSOR_ARGS(vit.layer_norm_1_bias, kMustRead)); + func(TENSOR_ARGS(vit.layer_norm_1_scale, kMustRead)); return; } - if (type == LayerAttentionType::kGemma) { - if (fet != ForEachType::kLoadNoToc) { - GEMMA_CALL_FUNC(att_weights); - } - if (fet == ForEachType::kInitNoToc || fet == ForEachType::kLoadNoToc || - fet == ForEachType::kIgnoreNulls) { - GEMMA_CALL_FUNC(attn_vec_einsum_w); - } - GEMMA_CALL_FUNC(qkv_einsum_w); - if (fet == ForEachType::kIgnoreNulls || - fet == ForEachType::kLoadWithToc) { - // The unwanted ones will be null or not in the toc. - GEMMA_CALL_FUNC(qkv_einsum_w1); - GEMMA_CALL_FUNC(qkv_einsum_w2); - } + if (layer_config.type == LayerAttentionType::kGemma) { + // Either read from file, or allocated during Fixup(). 
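For clarity, here is one of the new call sites with the `TENSOR_ARGS` macro written out; this is the mechanical preprocessor expansion of the definition above, where `other1`/`other2` are the optional peer weights passed to `ForEachTensor`:

```cpp
// func(TENSOR_ARGS(att_weights, kMaybeRead)); expands to:
func(TensorArgs(att_weights, other1 ? &other1->att_weights : nullptr,
                other2 ? &other2->att_weights : nullptr,
                TensorArgs::kMaybeRead));
```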
+ func(TENSOR_ARGS(att_weights, kMaybeRead)); + func(TENSOR_ARGS(attn_vec_einsum_w, kMaybeRead)); + func(TENSOR_ARGS(qkv_einsum_w, kMaybeRead)); + func(TENSOR_ARGS(qkv_einsum_w1, kMaybeRead)); + func(TENSOR_ARGS(qkv_einsum_w2, kMaybeRead)); } else { - GEMMA_CALL_FUNC(griffin.linear_x_w); - GEMMA_CALL_FUNC(griffin.linear_x_biases); - GEMMA_CALL_FUNC(griffin.linear_y_w); - GEMMA_CALL_FUNC(griffin.linear_y_biases); - GEMMA_CALL_FUNC(griffin.linear_out_w); - GEMMA_CALL_FUNC(griffin.linear_out_biases); - GEMMA_CALL_FUNC(griffin.conv_w); - GEMMA_CALL_FUNC(griffin.conv_biases); - GEMMA_CALL_FUNC(griffin.gate_w); - GEMMA_CALL_FUNC(griffin.gate_biases); - GEMMA_CALL_FUNC(griffin.a); + func(TENSOR_ARGS(griffin.linear_x_w, kMustRead)); + func(TENSOR_ARGS(griffin.linear_x_biases, kMustRead)); + func(TENSOR_ARGS(griffin.linear_y_w, kMustRead)); + func(TENSOR_ARGS(griffin.linear_y_biases, kMustRead)); + func(TENSOR_ARGS(griffin.linear_out_w, kMustRead)); + func(TENSOR_ARGS(griffin.linear_out_biases, kMustRead)); + // conv_w and gate_w are not accessed via Row(), hence must not be padded. + // Note that *biases are 1D, hence packing/padding does not matter. + func(TENSOR_ARGS(griffin.conv_w, kMustRead | TensorArgs::kPacked)); + func(TENSOR_ARGS(griffin.conv_biases, kMustRead)); + func(TENSOR_ARGS(griffin.gate_w, kMustRead | TensorArgs::kPacked)); + func(TENSOR_ARGS(griffin.gate_biases, kMustRead)); + func(TENSOR_ARGS(griffin.a, kMustRead)); } - GEMMA_CALL_FUNC(gating_einsum_w); - if (fet == ForEachType::kIgnoreNulls || fet == ForEachType::kLoadWithToc) { - // The unwanted ones will be null or not in the toc. - GEMMA_CALL_FUNC(gating_einsum_w1); - GEMMA_CALL_FUNC(gating_einsum_w2); - } - GEMMA_CALL_FUNC(linear_w); - GEMMA_CALL_FUNC(pre_attention_norm_scale); - GEMMA_CALL_FUNC(pre_ffw_norm_scale); - - if (ptrs[0]->layer_config.post_norm == PostNormType::Scale) { - GEMMA_CALL_FUNC(post_attention_norm_scale); - GEMMA_CALL_FUNC(post_ffw_norm_scale); - } - if (ptrs[0]->layer_config.use_qk_norm) { - GEMMA_CALL_FUNC(key_norm_scale); - GEMMA_CALL_FUNC(query_norm_scale); + { + func(TENSOR_ARGS(gating_einsum_w, kMaybeRead)); + func(TENSOR_ARGS(gating_einsum_w1, kMaybeRead)); + func(TENSOR_ARGS(gating_einsum_w2, kMaybeRead)); + func(TENSOR_ARGS(linear_w, kMaybeRead)); + func(TENSOR_ARGS(pre_attention_norm_scale, kMustRead)); + func(TENSOR_ARGS(pre_ffw_norm_scale, kMustRead)); } - if (ptrs[0]->layer_config.ff_biases) { - GEMMA_CALL_FUNC(ffw_gating_biases); - GEMMA_CALL_FUNC(ffw_output_biases); + if (layer_config.post_norm == PostNormType::Scale) { + func(TENSOR_ARGS(post_attention_norm_scale, kMustRead)); + func(TENSOR_ARGS(post_ffw_norm_scale, kMustRead)); + } + if (layer_config.use_qk_norm) { + func(TENSOR_ARGS(key_norm_scale, kMustRead)); + func(TENSOR_ARGS(query_norm_scale, kMustRead)); } - if (ptrs[0]->layer_config.softmax_attn_output_biases && - type == LayerAttentionType::kGemma) { - GEMMA_CALL_FUNC(attention_output_biases); + if (layer_config.ff_biases) { + func(TENSOR_ARGS(ffw_gating_biases, kMustRead)); + func(TENSOR_ARGS(ffw_output_biases, kMustRead)); } - } - // Sets all the tensors in the layer to zero. Memory must have been allocated. - void ZeroInit(int layer_idx) { - ForEachTensor({this}, layer_idx, ForEachType::kIgnoreNulls, - [](const char*, hwy::Span tensors) { - tensors[0]->ZeroInit(); - }); - } - - // Allocates memory for all the tensors in the layer. - // Note that this is slow and only used for a stand-alone layer. 
-  void Allocate(std::vector<MatStorage>& layer_storage) {
-    ForEachTensor(
-        {this}, /*layer_idx=*/0, ForEachType::kInitNoToc,
-        [&layer_storage](const char* name, hwy::Span<MatPtr*> tensors) {
-          layer_storage.emplace_back(*tensors[0]);
-          layer_storage.back().Allocate();
-          tensors[0]->SetPtr(layer_storage.back());
-        });
-  }
-};
-
-template <typename Weight>
-struct ModelWeightsPtrs {
-  explicit ModelWeightsPtrs(const ModelConfig& config)
-      : ModelWeightsPtrs(
-            config,
-            TensorIndex(config, /*llm_layer_idx=*/-1, /*vit_layer_idx=*/-1,
-                        /*reshape_att=*/false)) {}
-  ModelWeightsPtrs(const ModelConfig& config, const TensorIndex& tensor_index)
-      : embedder_input_embedding("c_embedding", tensor_index),
-        final_norm_scale("c_final_norm", tensor_index),
-        vit_encoder_norm_bias("enc_norm_bias", tensor_index),
-        vit_encoder_norm_scale("enc_norm_scale", tensor_index),
-        vit_img_embedding_bias("img_emb_bias", tensor_index),
-        vit_img_embedding_kernel("img_emb_kernel", tensor_index),
-        vit_img_pos_embedding("img_pos_emb", tensor_index),
-        vit_img_head_bias("img_head_bias", tensor_index),
-        vit_img_head_kernel("img_head_kernel", tensor_index),
-        mm_embed_norm("mm_embed_norm", tensor_index),
-        scale_names(config.scale_names),
-        weights_config(config) {
-    c_layers.reserve(config.layer_configs.size());
-    for (int index = 0; index < static_cast<int>(config.layer_configs.size());
-         ++index) {
-      const auto& layer_config = config.layer_configs[index];
-      TensorIndex tensor_index(config, index, /*vit_layer_idx=*/-1,
-                               /*reshape_att=*/false);
-      c_layers.push_back(LayerWeightsPtrs<Weight>(layer_config, tensor_index));
+    if (layer_config.softmax_attn_output_biases &&
+        layer_config.type == LayerAttentionType::kGemma) {
+      func(TENSOR_ARGS(attention_output_biases, kMustRead));
     }
-    for (int index = 0;
-         index < static_cast<int>(config.vit_config.layer_configs.size());
-         ++index) {
-      const auto& layer_config = config.vit_config.layer_configs[index];
-      TensorIndex tensor_index(config, /*llm_layer_idx=*/-1, index,
-                               /*reshape_att=*/false);
-      vit_layers.push_back(
-          LayerWeightsPtrs<WeightF32OrBF16>(layer_config, tensor_index));
-    }
-  }
+  }  // `ForEachTensor`
-  ~ModelWeightsPtrs() = default;
-  using WeightF32OrBF16 = typename LayerWeightsPtrs<Weight>::WeightF32OrBF16;
-  using WeightF32OrInputT = hwy::If<hwy::IsSame<Weight, float>(),
-                                    EmbedderInputT, WeightF32OrBF16>;
-
-  MatPtrT<WeightF32OrInputT> embedder_input_embedding;
-  MatPtrT<WeightF32OrBF16> final_norm_scale;
-
-  // Vit parts.
-  MatPtrT<WeightF32OrBF16> vit_encoder_norm_bias;
-  MatPtrT<WeightF32OrBF16> vit_encoder_norm_scale;
-  MatPtrT<float> vit_img_embedding_bias;
-  MatPtrT<WeightF32OrBF16> vit_img_embedding_kernel;
-  MatPtrT<float> vit_img_pos_embedding;
-  // The head maps from VitConfig::kModelDim (Vit final layer) to
-  // kModelDim (LLM input).
-  MatPtrT<float> vit_img_head_bias;
-  MatPtrT<WeightF32OrBF16> vit_img_head_kernel;
-
-  MatPtrT<WeightF32OrBF16> mm_embed_norm;
-
-  std::unordered_set<std::string> scale_names;
-
-  const ModelConfig& weights_config;
-
-  std::vector<LayerWeightsPtrs<Weight>> c_layers;
-  std::vector<LayerWeightsPtrs<WeightF32OrBF16>> vit_layers;
-
-  // Called by weights.cc after loading, before att_weights has been
-  // allocated.
-  void AllocAndCopyWithTranspose(hwy::ThreadPool& pool,
-                                 std::vector<MatStorage>& model_storage) {
-    size_t storage_index = model_storage.size();
-    for (auto& layer : c_layers) {
-      model_storage.emplace_back(layer.att_weights);
-    }
-    pool.Run(0, c_layers.size(),
-             [this, &model_storage, storage_index](uint64_t layer,
-                                                   size_t /*thread*/) {
-               GetLayer(layer)->Reshape(&model_storage[storage_index + layer]);
-             });
-  }
-  // For when the storage has already been allocated.
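Both the deleted `Allocate` and `AllocAndCopyWithTranspose` follow the same collect-then-parallelize shape: record the work items single-threaded, then hand indices to `hwy::ThreadPool::Run` exactly as the `pool.Run` calls above do. A self-contained sketch, with `Buffer` standing in for `MatStorage`:

// Sketch of the collect-then-parallel-allocate pattern used here. `Buffer`
// is a stand-in for MatStorage; the pool API is Highway's real ThreadPool.
#include <cstdint>
#include <cstdio>
#include <memory>
#include <vector>

#include "hwy/contrib/thread_pool/thread_pool.h"

struct Buffer {
  size_t bytes = 0;
  std::unique_ptr<uint8_t[]> data;
  void Allocate() { data = std::make_unique<uint8_t[]>(bytes); }
};

int main() {
  // First pass: record what to allocate (cheap, single-threaded).
  std::vector<Buffer> storage(8);
  for (size_t i = 0; i < storage.size(); ++i) storage[i].bytes = (i + 1) << 10;

  // Second pass: allocate in parallel. Each task touches a distinct element,
  // so no synchronization is needed.
  hwy::ThreadPool pool(4);
  pool.Run(0, storage.size(), [&storage](uint64_t task, size_t /*thread*/) {
    storage[task].Allocate();
  });

  std::printf("allocated %zu buffers\n", storage.size());
  return 0;
}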
-  void CopyWithTranspose(hwy::ThreadPool& pool) {
-    pool.Run(0, c_layers.size(), [this](uint64_t layer, size_t /*thread*/) {
-      GetLayer(layer)->Reshape(nullptr);
+  // Zero-initializes all allocated tensors in the layer.
+  void ZeroInit() {
+    ForEachTensor(nullptr, nullptr, [](const TensorArgs& t) {
+      if (!t.mat.HasPtr()) return;
+      gcpp::ZeroInit(t.mat);
     });
   }
-  void ZeroInit() {
-    embedder_input_embedding.ZeroInit();
-    final_norm_scale.ZeroInit();
-    for (size_t i = 0; i < c_layers.size(); ++i) {
-      c_layers[i].ZeroInit(i);
-    }
-  }
-
-  const LayerWeightsPtrs<Weight>* GetLayer(size_t layer) const {
-    return &c_layers[layer];
-  }
-  LayerWeightsPtrs<Weight>* GetLayer(size_t layer) { return &c_layers[layer]; }
-  const LayerWeightsPtrs<WeightF32OrBF16>* GetVitLayer(size_t layer) const {
-    return &vit_layers[layer];
-  }
-  LayerWeightsPtrs<WeightF32OrBF16>* GetVitLayer(size_t layer) {
-    return &vit_layers[layer];
-  }
-
-  void Allocate(std::vector<MatStorage>& model_storage,
-                hwy::ThreadPool& pool) {
-    std::vector<MatPtr*> model_toc;
-    ForEachTensor(
-        {this}, ForEachType::kInitNoToc,
-        [&model_toc, &model_storage](const char*, hwy::Span<MatPtr*> tensors) {
-          model_toc.push_back(tensors[0]);
-          model_storage.emplace_back(*tensors[0]);
-        });
-    // Allocate in parallel using the pool.
-    pool.Run(0, model_toc.size(),
-             [&model_toc, &model_storage](uint64_t task, size_t /*thread*/) {
-               // model_storage may have had content before we started.
-               size_t idx = task + model_storage.size() - model_toc.size();
-               model_storage[idx].Allocate();
-               model_toc[task]->SetPtr(model_storage[idx]);
-             });
-  }
-
-  // Copies the data from other to *this.
-  void CopyFrom(const ModelWeightsPtrs& other) {
-    ForEachTensor({this, const_cast<ModelWeightsPtrs<Weight>*>(&other)},
-                  ForEachType::kIgnoreNulls,
-                  [](const char*, hwy::Span<MatPtr*> tensors) {
-                    hwy::CopyBytes(tensors[1]->Ptr(), tensors[0]->Ptr(),
-                                   tensors[1]->SizeBytes());
-                  });
-  }
-
-  // If scales is empty, computes and returns the scale factors for the
-  // tensors, otherwise applies the scale factors to the tensors.
-  void GetOrApplyScales(std::vector<float>& scales) {
-    int scale_pos = 0;
-    ForEachTensor(
-        {this}, ForEachType::kIgnoreNulls,
-        [&scales, &scale_pos, this](const char*, hwy::Span<MatPtr*> tensors) {
-          if (this->scale_names.count(tensors[0]->Name())) {
-            if (scale_pos < scales.size()) {
-              tensors[0]->set_scale(scales[scale_pos]);
-            } else {
-              float scale = ScaleWeights(tensors[0]->data(),
-                                         tensors[0]->NumElements());
-              scales.push_back(scale);
-            }
-            ++scale_pos;
-          }
-        });
-    HWY_ASSERT(scale_pos == weights_config.num_tensor_scales);
-  }
-
-  template <class Func>
-  static void ForEachTensor(const std::vector<ModelWeightsPtrs<Weight>*>& ptrs,
-                            ForEachType fet, Func func) {
-    std::vector<LayerWeightsPtrs<Weight>*> layers(ptrs.size());
-    std::vector<LayerWeightsPtrs<WeightF32OrBF16>*> vit_layers(ptrs.size());
-    std::vector<MatPtr*> tensors(ptrs.size(), nullptr);
-    // Variables used by GEMMA_CALL_FUNC.
-    int layer_idx = -1;
-    char sep = ' ';
-    int sep_index = -1;
-    GEMMA_CALL_FUNC(embedder_input_embedding);
-    GEMMA_CALL_FUNC(final_norm_scale);
-    if (ptrs[0]->weights_config.vit_config.layer_configs.size() > 0) {
-      // Vit parts.
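The deleted `GetOrApplyScales` is dual-purpose: given an empty vector it records one scale factor per scaled tensor, and given a filled vector it applies them instead. A sketch of that contract; this `ScaleWeights` (normalize to unit max-abs and return the factor) is an assumed stand-in, not gemma.cpp's implementation:

// Sketch of the get-or-apply contract. ScaleWeights is a hypothetical
// stand-in that normalizes in place and returns the removed factor.
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <cstdio>
#include <vector>

static float ScaleWeights(float* data, size_t n) {
  float max_abs = 0.0f;
  for (size_t i = 0; i < n; ++i) max_abs = std::max(max_abs, std::fabs(data[i]));
  if (max_abs == 0.0f) return 1.0f;
  for (size_t i = 0; i < n; ++i) data[i] /= max_abs;
  return max_abs;
}

static void GetOrApplyScales(std::vector<std::vector<float>>& tensors,
                             std::vector<float>& scales) {
  size_t scale_pos = 0;
  for (auto& t : tensors) {
    if (scale_pos < scales.size()) {
      for (float& x : t) x *= scales[scale_pos];  // Apply an existing scale.
    } else {
      scales.push_back(ScaleWeights(t.data(), t.size()));  // Compute + record.
    }
    ++scale_pos;
  }
}

int main() {
  std::vector<std::vector<float>> tensors = {{2.0f, -4.0f}, {0.5f, 0.25f}};
  std::vector<float> scales;
  GetOrApplyScales(tensors, scales);  // Empty vector: computes {4.0, 0.5}.
  GetOrApplyScales(tensors, scales);  // Filled vector: multiplies them back,
                                      // restoring the original values.
  std::printf("scales: %f %f\n", scales[0], scales[1]);
  return 0;
}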
-      GEMMA_CALL_FUNC(vit_encoder_norm_bias);
-      GEMMA_CALL_FUNC(vit_encoder_norm_scale);
-      GEMMA_CALL_FUNC(vit_img_embedding_bias);
-      GEMMA_CALL_FUNC(vit_img_embedding_kernel);
-      GEMMA_CALL_FUNC(vit_img_pos_embedding);
-      GEMMA_CALL_FUNC(vit_img_head_bias);
-      GEMMA_CALL_FUNC(vit_img_head_kernel);
-
-      if (ptrs[0]->weights_config.wrapping == PromptWrapping::GEMMA_VLM)
-        GEMMA_CALL_FUNC(mm_embed_norm);
-    }
-
-    for (int layer_idx = 0; layer_idx < ptrs[0]->c_layers.size();
-         ++layer_idx) {
-      for (int i = 0; i < ptrs.size(); ++i) {
-        layers[i] = ptrs[i]->GetLayer(layer_idx);
-      }
-      LayerWeightsPtrs<Weight>::ForEachTensor(layers, layer_idx, fet, func);
-    }
-
-    // Vit layers. Not supported for compress_weights.
-    if (ptrs[0]->weights_config.vit_config.layer_configs.size() > 0) {
-      for (int layer_idx = 0; layer_idx < ptrs[0]->vit_layers.size();
-           ++layer_idx) {
-        auto type = ptrs[0]->vit_layers[layer_idx].layer_config.type;
-        HWY_ASSERT(type == LayerAttentionType::kVit);
-        for (int i = 0; i < ptrs.size(); ++i) {
-          vit_layers[i] = ptrs[i]->GetVitLayer(layer_idx);
-        }
-        LayerWeightsPtrs<WeightF32OrBF16>::ForEachTensor(vit_layers, layer_idx,
-                                                         fet, func);
-      }
-    }
-  }
-};
-#undef GEMMA_CALL_FUNC
-
-// ----------------------------------------------------------------------------
-// Interface
-
-class ModelWeightsStorage {
- public:
-  ModelWeightsStorage() = default;
-  ~ModelWeightsStorage() = default;
-
-  // Loads the weights from a blob store file. Supports multi-file or
-  // single-file format. If the weights file contains a TOC, then it is in
-  // single-file format, and model_type, weight_type, wrapping are ignored,
-  // and tokenizer_proto is required and written to.
-  // With a multi-file format, file, model_type, weight_type, wrapping are
-  // required and tokenizer_proto is ignored.
-  BlobError Load(const Path& weights, Model model_type, Type weight_type,
-                 PromptWrapping wrapping, hwy::ThreadPool& pool,
-                 std::string* tokenizer_proto);
-  // Writes the weights to a blob store file, using the single-file format
-  // with a TOC and config included.
-  BlobError Save(const std::string& tokenizer, const Path& weights,
-                 hwy::ThreadPool& pool);
-  void Allocate(Model model_type, Type weight_type, hwy::ThreadPool& pool) {
-    Allocate(ConfigFromModel(model_type), weight_type, pool);
-  }
-  void Allocate(const ModelConfig& config, Type weight_type,
-                hwy::ThreadPool& pool);
-  void RandInit(std::mt19937& gen);
-  void ZeroInit();
-  void GetOrApplyScales(std::vector<float>& scales);
-  void AllocAndCopyWithTranspose(hwy::ThreadPool& pool);
-  void CopyWithTranspose(hwy::ThreadPool& pool);
-  void LogWeightStats();
-  const ModelConfig& Config() const { return config_; }
-
-  template <typename T>
-  ModelWeightsPtrs<T>* GetWeightsOfType() const {
-    if constexpr (IsSfpStream<T>()) {
-      return sfp_weights_.get();
-    } else if constexpr (IsF32<T>()) {
-      return float_weights_.get();
-    } else if constexpr (IsBF16<T>()) {
-      return bf16_weights_.get();
-    } else if constexpr (IsNuqStream<T>()) {
-      return nuq_weights_.get();
-    } else {
-      return HWY_ABORT("Unsupported type.");
-    }
-  }
-
-  template <typename T>
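The deleted `GetWeightsOfType` relies on `if constexpr`, so only the branch matching the requested weight type is ever instantiated and each branch may return a differently typed pointer. A stand-alone sketch of the same dispatch; `SfpTag` and the members are hypothetical stand-ins for the per-type weight holders:

// Sketch of compile-time type dispatch with if constexpr. SfpTag and the
// members are illustrative stand-ins, not gemma.cpp's types.
#include <memory>
#include <type_traits>

struct SfpTag {};

struct WeightsOwner {
  std::unique_ptr<int> sfp_weights_;    // Stand-in for compressed weights.
  std::unique_ptr<float> f32_weights_;  // Stand-in for float weights.

  template <typename T>
  auto* GetWeightsOfType() const {
    if constexpr (std::is_same_v<T, SfpTag>) {
      return sfp_weights_.get();
    } else if constexpr (std::is_same_v<T, float>) {
      return f32_weights_.get();
    } else {
      // Dependent on T, so this only fires for unsupported instantiations.
      static_assert(!sizeof(T), "Unsupported weight type.");
    }
  }
};

int main() {
  WeightsOwner owner;
  owner.f32_weights_ = std::make_unique<float>(1.0f);
  float* w = owner.GetWeightsOfType<float>();  // Picks the float branch.
  return w ? 0 : 1;
}

Because discarded branches are never compiled, each getter returns the exact member type with no virtual dispatch or runtime switch.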