mirror of https://github.com/google/gemma.cpp.git
initial commit
This commit is contained in:
commit
e29cd566cf
|
|
@ -0,0 +1,79 @@
|
|||
# Copyright 2019 Google LLC
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
cmake_minimum_required(VERSION 3.11)
|
||||
|
||||
include(FetchContent)
|
||||
|
||||
project(gemma)
|
||||
|
||||
set(CMAKE_CXX_STANDARD 17)
|
||||
set(CMAKE_CXX_STANDARD_REQUIRED ON)
|
||||
|
||||
FetchContent_Declare(highway GIT_REPOSITORY https://github.com/google/highway.git GIT_TAG da250571a45826b21eebbddc1e50d0c1137dee5f)
|
||||
FetchContent_MakeAvailable(highway)
|
||||
|
||||
## Note: absl meeds tp be installed by sentencepiece. This will only happen if
|
||||
## cmake is invoked with -DSPM_ENABLE_SHARED=OFF and -DSPM_ABSL_PROVIDER=module
|
||||
FetchContent_Declare(sentencepiece GIT_REPOSITORY https://github.com/google/sentencepiece GIT_TAG 53de76561cfc149d3c01037f0595669ad32a5e7c)
|
||||
FetchContent_MakeAvailable(sentencepiece)
|
||||
|
||||
set(SOURCES
|
||||
gemma.cc
|
||||
compression/blob_store.cc
|
||||
compression/blob_store.h
|
||||
compression/compress.h
|
||||
compression/compress-inl.h
|
||||
compression/nuq.h
|
||||
compression/nuq-inl.h
|
||||
compression/sfp.h
|
||||
compression/sfp-inl.h
|
||||
util/app.h
|
||||
util/args.h
|
||||
)
|
||||
|
||||
add_compile_options($<$<CONFIG:Release>:-O2>)
|
||||
if(NOT CMAKE_BUILD_TYPE)
|
||||
set(CMAKE_BUILD_TYPE "Release")
|
||||
endif()
|
||||
|
||||
# Allowable types for WEIGHT_TYPE:
|
||||
# float - slow, not recommended
|
||||
# hwy::bfloat16_t - bfloat16 as impemented by https://github.com/google/highway
|
||||
# SfpStream - 8-bit switched floating point (recommended)
|
||||
# NuqStream - experimental, work-in-progress
|
||||
option(WEIGHT_TYPE "Set weight type" "")
|
||||
|
||||
if (WEIGHT_TYPE)
|
||||
add_definitions(-DGEMMA_WEIGHT_T=${WEIGHT_TYPE})
|
||||
endif()
|
||||
|
||||
# Executable Target
|
||||
|
||||
add_executable(gemma run.cc)
|
||||
target_sources(gemma PRIVATE ${SOURCES})
|
||||
set_property(TARGET gemma PROPERTY CXX_STANDARD 17)
|
||||
target_link_libraries(gemma hwy hwy_contrib sentencepiece)
|
||||
target_include_directories(gemma PRIVATE ./)
|
||||
FetchContent_GetProperties(sentencepiece)
|
||||
target_include_directories(gemma PRIVATE ${sentencepiece_SOURCE_DIR})
|
||||
|
||||
## Library Target
|
||||
|
||||
add_library(libgemma ${SOURCES})
|
||||
set_property(TARGET libgemma PROPERTY CXX_STANDARD 17)
|
||||
set_target_properties(libgemma PROPERTIES PREFIX "")
|
||||
target_include_directories(libgemma PUBLIC ./)
|
||||
target_link_libraries(libgemma hwy hwy_contrib sentencepiece)
|
||||
target_include_directories(libgemma PRIVATE ${sentencepiece_SOURCE_DIR})
|
||||
|
|
@ -0,0 +1,72 @@
|
|||
# Developer Notes
|
||||
|
||||
## Motivation: A Minimalist C++ LLM Runtime for Research and Experimentation
|
||||
|
||||
In the past, neural network inference has been similar to a simple, opaque,
|
||||
stateless function function with a single input and output. By contrast,
|
||||
foundation model runtimes are better considered as systems with multiple forms
|
||||
of state, subsystems, and heterogeneous inputs and outputs. They are often
|
||||
integrated with a wide variety of other systems that have their own resources
|
||||
(e.g. RAG and tools) and potentially interact with an external environment. They
|
||||
have become compute engines to embed proximal tasks and goals within expansively
|
||||
broad, general-purpose world models.
|
||||
|
||||
With this in mind, we believe that developing an experimental runtime that is
|
||||
flexible and approachable will allow us to explore the design space of co-design
|
||||
between high level model concerns and low-level runtime computation.
|
||||
|
||||
## Design Priorities
|
||||
|
||||
Given these motivations, we propose the following priorities for
|
||||
making decisions regarding the direction and design of the codebase.
|
||||
|
||||
**Maximize Leverage with a Narrow Scope.** We focus on direct implementations of
|
||||
foundation models like Gemma. This allows us to focus effort on bottlenecks of
|
||||
specific models. We are willing to trade off generality to keep implementation
|
||||
code relatively simple and readable at all layers of the stack, achieve good
|
||||
performance, and maintain the velocity of a small team.
|
||||
|
||||
**Data Oriented Design.** Follow data oriented design principles where possible
|
||||
to minimize unnecessary performance pessimization. It's best to apply these
|
||||
optimizations during the initial design, or when refactoring a subcomponent. The
|
||||
first step is to think in terms of batches or tuples of plain old data (POD)
|
||||
types: separate arrays, instead of an array of structs. The second is to
|
||||
de-emphasize control flow (if statements, virtual functions and class
|
||||
hierarchies). The third step is to know intrinsic properties of data and bake
|
||||
that into the layout and algorithm.
|
||||
|
||||
**Prioritize Small Batch Latency** Since production serving solutions are
|
||||
available for large-scale serving powered by accelerators and optimizing for
|
||||
throughput, this project focuses on the possibilities of local, interactive use
|
||||
of foundation models. Although throughput remains important, low latency and
|
||||
small batch sizes are prioritized, other things being equal.
|
||||
|
||||
**Maintain a Portable Baseline** Our starting point is a portable CPU SIMD (via
|
||||
[highway](https://github.com/google/highway)). We expect to add accelerator and
|
||||
hybrid CPU/GPU support in the future, but the project should continue to allow
|
||||
builds using this portable baseline. This ensures that research-oriented and
|
||||
experimental runtimes and hardware platforms will have a minimum viable option
|
||||
to run Gemma even if specialized production-ready deployment paths are not
|
||||
available.
|
||||
|
||||
## Code Organization
|
||||
|
||||
The implementation code is roughly split into 4 layers, from high to low level:
|
||||
|
||||
1. Frontends (`run.cc`) - Either interactive interfaces or automation
|
||||
orchestration that interacts. Frontend code implements a use case objective
|
||||
in terms of invocations to model inference and generation (2). Projects that
|
||||
use gemma.cpp as a library are considered alternative frontends to `run.cc`.
|
||||
We will add examples of additional frontends in the future.
|
||||
|
||||
2. Models (`gemma.cc`, `gemma.h`, `configs.h`) - Implements the compute graph
|
||||
of the model including supporting functions such as loading and compressing
|
||||
weights using transformer operations provided by layer (3).
|
||||
|
||||
3. Operations (`ops.h`) - A minimal set of transformer and supporting
|
||||
mathematical operations implementations using compute backends (4). This
|
||||
code should be agnostic to the specifics of the compute graph of the model
|
||||
implementation (2).
|
||||
|
||||
4. Backend (`highway`) - Low-level hardware interface (SIMD in the case of
|
||||
highway) supporting the implementations in (3).
|
||||
|
|
@ -0,0 +1,202 @@
|
|||
|
||||
Apache License
|
||||
Version 2.0, January 2004
|
||||
http://www.apache.org/licenses/
|
||||
|
||||
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
|
||||
|
||||
1. Definitions.
|
||||
|
||||
"License" shall mean the terms and conditions for use, reproduction,
|
||||
and distribution as defined by Sections 1 through 9 of this document.
|
||||
|
||||
"Licensor" shall mean the copyright owner or entity authorized by
|
||||
the copyright owner that is granting the License.
|
||||
|
||||
"Legal Entity" shall mean the union of the acting entity and all
|
||||
other entities that control, are controlled by, or are under common
|
||||
control with that entity. For the purposes of this definition,
|
||||
"control" means (i) the power, direct or indirect, to cause the
|
||||
direction or management of such entity, whether by contract or
|
||||
otherwise, or (ii) ownership of fifty percent (50%) or more of the
|
||||
outstanding shares, or (iii) beneficial ownership of such entity.
|
||||
|
||||
"You" (or "Your") shall mean an individual or Legal Entity
|
||||
exercising permissions granted by this License.
|
||||
|
||||
"Source" form shall mean the preferred form for making modifications,
|
||||
including but not limited to software source code, documentation
|
||||
source, and configuration files.
|
||||
|
||||
"Object" form shall mean any form resulting from mechanical
|
||||
transformation or translation of a Source form, including but
|
||||
not limited to compiled object code, generated documentation,
|
||||
and conversions to other media types.
|
||||
|
||||
"Work" shall mean the work of authorship, whether in Source or
|
||||
Object form, made available under the License, as indicated by a
|
||||
copyright notice that is included in or attached to the work
|
||||
(an example is provided in the Appendix below).
|
||||
|
||||
"Derivative Works" shall mean any work, whether in Source or Object
|
||||
form, that is based on (or derived from) the Work and for which the
|
||||
editorial revisions, annotations, elaborations, or other modifications
|
||||
represent, as a whole, an original work of authorship. For the purposes
|
||||
of this License, Derivative Works shall not include works that remain
|
||||
separable from, or merely link (or bind by name) to the interfaces of,
|
||||
the Work and Derivative Works thereof.
|
||||
|
||||
"Contribution" shall mean any work of authorship, including
|
||||
the original version of the Work and any modifications or additions
|
||||
to that Work or Derivative Works thereof, that is intentionally
|
||||
submitted to Licensor for inclusion in the Work by the copyright owner
|
||||
or by an individual or Legal Entity authorized to submit on behalf of
|
||||
the copyright owner. For the purposes of this definition, "submitted"
|
||||
means any form of electronic, verbal, or written communication sent
|
||||
to the Licensor or its representatives, including but not limited to
|
||||
communication on electronic mailing lists, source code control systems,
|
||||
and issue tracking systems that are managed by, or on behalf of, the
|
||||
Licensor for the purpose of discussing and improving the Work, but
|
||||
excluding communication that is conspicuously marked or otherwise
|
||||
designated in writing by the copyright owner as "Not a Contribution."
|
||||
|
||||
"Contributor" shall mean Licensor and any individual or Legal Entity
|
||||
on behalf of whom a Contribution has been received by Licensor and
|
||||
subsequently incorporated within the Work.
|
||||
|
||||
2. Grant of Copyright License. Subject to the terms and conditions of
|
||||
this License, each Contributor hereby grants to You a perpetual,
|
||||
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
||||
copyright license to reproduce, prepare Derivative Works of,
|
||||
publicly display, publicly perform, sublicense, and distribute the
|
||||
Work and such Derivative Works in Source or Object form.
|
||||
|
||||
3. Grant of Patent License. Subject to the terms and conditions of
|
||||
this License, each Contributor hereby grants to You a perpetual,
|
||||
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
||||
(except as stated in this section) patent license to make, have made,
|
||||
use, offer to sell, sell, import, and otherwise transfer the Work,
|
||||
where such license applies only to those patent claims licensable
|
||||
by such Contributor that are necessarily infringed by their
|
||||
Contribution(s) alone or by combination of their Contribution(s)
|
||||
with the Work to which such Contribution(s) was submitted. If You
|
||||
institute patent litigation against any entity (including a
|
||||
cross-claim or counterclaim in a lawsuit) alleging that the Work
|
||||
or a Contribution incorporated within the Work constitutes direct
|
||||
or contributory patent infringement, then any patent licenses
|
||||
granted to You under this License for that Work shall terminate
|
||||
as of the date such litigation is filed.
|
||||
|
||||
4. Redistribution. You may reproduce and distribute copies of the
|
||||
Work or Derivative Works thereof in any medium, with or without
|
||||
modifications, and in Source or Object form, provided that You
|
||||
meet the following conditions:
|
||||
|
||||
(a) You must give any other recipients of the Work or
|
||||
Derivative Works a copy of this License; and
|
||||
|
||||
(b) You must cause any modified files to carry prominent notices
|
||||
stating that You changed the files; and
|
||||
|
||||
(c) You must retain, in the Source form of any Derivative Works
|
||||
that You distribute, all copyright, patent, trademark, and
|
||||
attribution notices from the Source form of the Work,
|
||||
excluding those notices that do not pertain to any part of
|
||||
the Derivative Works; and
|
||||
|
||||
(d) If the Work includes a "NOTICE" text file as part of its
|
||||
distribution, then any Derivative Works that You distribute must
|
||||
include a readable copy of the attribution notices contained
|
||||
within such NOTICE file, excluding those notices that do not
|
||||
pertain to any part of the Derivative Works, in at least one
|
||||
of the following places: within a NOTICE text file distributed
|
||||
as part of the Derivative Works; within the Source form or
|
||||
documentation, if provided along with the Derivative Works; or,
|
||||
within a display generated by the Derivative Works, if and
|
||||
wherever such third-party notices normally appear. The contents
|
||||
of the NOTICE file are for informational purposes only and
|
||||
do not modify the License. You may add Your own attribution
|
||||
notices within Derivative Works that You distribute, alongside
|
||||
or as an addendum to the NOTICE text from the Work, provided
|
||||
that such additional attribution notices cannot be construed
|
||||
as modifying the License.
|
||||
|
||||
You may add Your own copyright statement to Your modifications and
|
||||
may provide additional or different license terms and conditions
|
||||
for use, reproduction, or distribution of Your modifications, or
|
||||
for any such Derivative Works as a whole, provided Your use,
|
||||
reproduction, and distribution of the Work otherwise complies with
|
||||
the conditions stated in this License.
|
||||
|
||||
5. Submission of Contributions. Unless You explicitly state otherwise,
|
||||
any Contribution intentionally submitted for inclusion in the Work
|
||||
by You to the Licensor shall be under the terms and conditions of
|
||||
this License, without any additional terms or conditions.
|
||||
Notwithstanding the above, nothing herein shall supersede or modify
|
||||
the terms of any separate license agreement you may have executed
|
||||
with Licensor regarding such Contributions.
|
||||
|
||||
6. Trademarks. This License does not grant permission to use the trade
|
||||
names, trademarks, service marks, or product names of the Licensor,
|
||||
except as required for reasonable and customary use in describing the
|
||||
origin of the Work and reproducing the content of the NOTICE file.
|
||||
|
||||
7. Disclaimer of Warranty. Unless required by applicable law or
|
||||
agreed to in writing, Licensor provides the Work (and each
|
||||
Contributor provides its Contributions) on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
||||
implied, including, without limitation, any warranties or conditions
|
||||
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
|
||||
PARTICULAR PURPOSE. You are solely responsible for determining the
|
||||
appropriateness of using or redistributing the Work and assume any
|
||||
risks associated with Your exercise of permissions under this License.
|
||||
|
||||
8. Limitation of Liability. In no event and under no legal theory,
|
||||
whether in tort (including negligence), contract, or otherwise,
|
||||
unless required by applicable law (such as deliberate and grossly
|
||||
negligent acts) or agreed to in writing, shall any Contributor be
|
||||
liable to You for damages, including any direct, indirect, special,
|
||||
incidental, or consequential damages of any character arising as a
|
||||
result of this License or out of the use or inability to use the
|
||||
Work (including but not limited to damages for loss of goodwill,
|
||||
work stoppage, computer failure or malfunction, or any and all
|
||||
other commercial damages or losses), even if such Contributor
|
||||
has been advised of the possibility of such damages.
|
||||
|
||||
9. Accepting Warranty or Additional Liability. While redistributing
|
||||
the Work or Derivative Works thereof, You may choose to offer,
|
||||
and charge a fee for, acceptance of support, warranty, indemnity,
|
||||
or other liability obligations and/or rights consistent with this
|
||||
License. However, in accepting such obligations, You may act only
|
||||
on Your own behalf and on Your sole responsibility, not on behalf
|
||||
of any other Contributor, and only if You agree to indemnify,
|
||||
defend, and hold each Contributor harmless for any liability
|
||||
incurred by, or claims asserted against, such Contributor by reason
|
||||
of your accepting any such warranty or additional liability.
|
||||
|
||||
END OF TERMS AND CONDITIONS
|
||||
|
||||
APPENDIX: How to apply the Apache License to your work.
|
||||
|
||||
To apply the Apache License to your work, attach the following
|
||||
boilerplate notice, with the fields enclosed by brackets "[]"
|
||||
replaced with your own identifying information. (Don't include
|
||||
the brackets!) The text should be enclosed in the appropriate
|
||||
comment syntax for the file format. We also recommend that a
|
||||
file or class name and description of purpose be included on the
|
||||
same "printed page" as the copyright notice for easier
|
||||
identification within third-party archives.
|
||||
|
||||
Copyright [yyyy] [name of copyright owner]
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
|
|
@ -0,0 +1,26 @@
|
|||
Copyright (c) The gemma.cpp Project Authors. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without modification,
|
||||
are permitted provided that the following conditions are met:
|
||||
|
||||
1. Redistributions of source code must retain the above copyright notice, this
|
||||
list of conditions and the following disclaimer.
|
||||
|
||||
2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
this list of conditions and the following disclaimer in the documentation
|
||||
and/or other materials provided with the distribution.
|
||||
|
||||
3. Neither the name of the copyright holder nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
|
@ -0,0 +1,335 @@
|
|||
# gemma.cpp
|
||||
|
||||
gemma.cpp is a lightweight, standalone C++ inference engine for the Gemma
|
||||
foundation models from Google.
|
||||
|
||||
For additional information about Gemma, see
|
||||
[ai.google.dev/gemma](https://ai.google.dev/gemma). Model weights, including gemma.cpp
|
||||
specific artifacts, are [available on
|
||||
kaggle](https://www.kaggle.com/models/google/gemma).
|
||||
|
||||
## Who is this project for?
|
||||
|
||||
Modern LLM inference engines are sophisticated systems, often with bespoke
|
||||
capabilities extending beyond traditional neural network runtimes. With this
|
||||
comes opportunities for research and innovation through co-design of high level
|
||||
algorithms and low-level computation. However, there is a gap between
|
||||
deployment-oriented C++ inference runtimes, which are not designed for
|
||||
experimentation, and Python-centric ML research frameworks, which abstract away
|
||||
low-level computation through compilation.
|
||||
|
||||
gemma.cpp provides a minimalist implementation of Gemma 2B and 7B models,
|
||||
focusing on simplicity and directness rather than full generality. This is
|
||||
inspired by vertically-integrated model implementations such as
|
||||
[ggml](https://github.com/ggerganov/ggml),
|
||||
[llama.c](https://github.com/karpathy/llama2.c), and
|
||||
[llama.rs](https://github.com/srush/llama2.rs).
|
||||
|
||||
gemma.cpp targets experimentation and research use cases. It is intended to be
|
||||
straightforward to embed in other projects with minimal dependencies and also
|
||||
easily modifiable with a small ~2K LoC core implementation (along with ~4K LoC
|
||||
of supporting utilities). We use the [Google
|
||||
Highway](https://github.com/google/highway) Library to take advantage of
|
||||
portable SIMD for CPU inference.
|
||||
|
||||
For production-oriented edge deployments we recommend standard deployment
|
||||
pathways using Python frameworks like JAX, Keras, PyTorch, and Transformers
|
||||
([all model variations here](https://www.kaggle.com/models/google/gemma)).
|
||||
|
||||
Community contributions large and small are welcome. This project follows
|
||||
[Google's Open Source Community
|
||||
Guidelines](https://opensource.google.com/conduct/).
|
||||
|
||||
## Quick Start
|
||||
|
||||
### System requirements
|
||||
|
||||
Before starting, you should have installed:
|
||||
|
||||
- [CMake](https://cmake.org/)
|
||||
- [Clang C++ compiler](https://clang.llvm.org/get_started.html), supporting at
|
||||
least C++17.
|
||||
- `tar` for extracting archives from Kaggle.
|
||||
|
||||
### Step 1: Obtain model weights and tokenizer from Kaggle
|
||||
|
||||
Visit [the Gemma model page on
|
||||
Kaggle](https://www.kaggle.com/models/google/gemma) and select `Model Variations
|
||||
|> Gemma C++`. On this tab, the `Variation` dropdown includes the options below.
|
||||
Note bfloat16 weights are higher fidelity, while 8-bit switched floating point
|
||||
weights enable faster inference.
|
||||
|
||||
2B instruction-tuned (`it`) and pre-trained (`pt`) models:
|
||||
|
||||
| Model name | Description |
|
||||
| ----------- | ----------- |
|
||||
| `2b-it` | 2 billion parameter instruction-tuned model, bfloat16 |
|
||||
| `2b-it-sfp` | 2 billion parameter instruction-tuned model, 8-bit switched floating point |
|
||||
| `2b-pt` | 2 billion parameter pre-trained model, bfloat16 |
|
||||
| `2b-pt-sfp` | 2 billion parameter pre-trained model, 8-bit switched floating point |
|
||||
|
||||
7B instruction-tuned (`it`) and pre-trained (`pt`) models:
|
||||
|
||||
| Model name | Description |
|
||||
| ----------- | ----------- |
|
||||
| `7b-it` | 7 billion parameter instruction-tuned model, bfloat16 |
|
||||
| `7b-it-sfp` | 7 billion parameter instruction-tuned model, 8-bit switched floating point |
|
||||
| `7b-pt` | 7 billion parameter pre-trained model, bfloat16 |
|
||||
| `7b-pt-sfp` | 7 billion parameter pre-trained model, 8-bit switched floating point |
|
||||
|
||||
> [!NOTE]
|
||||
> We *recommend starting with `2b-it-sfp`* to get up and running.
|
||||
|
||||
### Step 2: Extract Files
|
||||
|
||||
After filling out the consent form, the download should proceed to retrieve a
|
||||
tar archive file `archive.tar.gz`. Extract files from `archive.tar.gz` (this can
|
||||
take a few minutes):
|
||||
|
||||
```
|
||||
tar -xf archive.tar.gz
|
||||
```
|
||||
|
||||
This should produce a file containing model weights such as `2b-it-sfp.sbs` and
|
||||
a tokenizer file (`tokenizer.spm`). You may want to move these files to a
|
||||
convenient directory location (e.g. the `build/` directory in this repo).
|
||||
|
||||
### Step 3: Build
|
||||
|
||||
The build system uses [CMake](https://cmake.org/). To build the gemma inference
|
||||
runtime, create a build directory and generate the build files using `cmake`
|
||||
from the top-level project directory:
|
||||
|
||||
```sh
|
||||
(cd build && cmake ..)
|
||||
```
|
||||
|
||||
Then run `make` to build the `./gemma` executable:
|
||||
|
||||
```sh
|
||||
cd build
|
||||
make -j [number of parallel threads to use] gemma
|
||||
```
|
||||
|
||||
For example, `make -j 8 gemma`. If this is successful, you should now have a
|
||||
`gemma` executable in the `build/` directory.
|
||||
|
||||
> [!NOTE]
|
||||
> On Windows Subsystem for Linux (WSL) users should set the number of
|
||||
> parallel threads to 1. Using a larger number may result in errors.
|
||||
|
||||
### Step 4: Run
|
||||
|
||||
You can now run `gemma` from inside the `build/` directory.
|
||||
|
||||
`gemma` has the following required arguments:
|
||||
|
||||
| Argument | Description | Example value |
|
||||
| -------- | ----------- | ------------- |
|
||||
| `--model` | The model type. | `2b-it`, `2b-pt`, `7b-it`, `7b-pt`, ... (see above) |
|
||||
| `--compressed_weights` | The compressed weights file. | `2b-it-sfp.sbs`, ... (see above) |
|
||||
| `--tokenizer` | The tokenizer file. | `tokenizer.spm` |
|
||||
|
||||
|
||||
`gemma` is invoked as:
|
||||
|
||||
```sh
|
||||
./gemma \
|
||||
--tokenizer [tokenizer file] \
|
||||
--compressed_weights [compressed weights file] \
|
||||
--model [2b-it or 2b-pt or 7b-it or 7b-pt or ...]
|
||||
```
|
||||
|
||||
Example invocation for the following configuration:
|
||||
|
||||
- Compressed weights file `2b-it-sfp.sbs` (2B instruction-tuned model, 8-bit
|
||||
switched floating point).
|
||||
- Tokenizer file `tokenizer.spm`.
|
||||
|
||||
```sh
|
||||
./gemma \
|
||||
--tokenizer tokenizer.spm \
|
||||
--compressed_weights 2b-it-sfp.sbs \
|
||||
--model 2b-it
|
||||
```
|
||||
|
||||
## Usage
|
||||
|
||||
`gemma` has different usage modes, controlled by the verbosity flag.
|
||||
|
||||
All usage modes are currently interactive, triggering text generation upon
|
||||
newline input.
|
||||
|
||||
| Verbosity | Usage mode | Details |
|
||||
| --------------- | ---------- | --------------------------------------------- |
|
||||
| `--verbosity 0` | Minimal | Only prints generation output. Suitable as a CLI tool. |
|
||||
| `--verbosity 1` | Default | Standard user-facing terminal UI. |
|
||||
| `--verbosity 2` | Detailed | Shows additional developer and debug info. |
|
||||
|
||||
### Interactive Terminal App
|
||||
|
||||
By default, verbosity is set to 1, bringing up a terminal-based interactive
|
||||
interface when `gemma` is invoked:
|
||||
|
||||
```console
|
||||
$ ./gemma [...]
|
||||
__ _ ___ _ __ ___ _ __ ___ __ _ ___ _ __ _ __
|
||||
/ _` |/ _ \ '_ ` _ \| '_ ` _ \ / _` | / __| '_ \| '_ \
|
||||
| (_| | __/ | | | | | | | | | | (_| || (__| |_) | |_) |
|
||||
\__, |\___|_| |_| |_|_| |_| |_|\__,_(_)___| .__/| .__/
|
||||
__/ | | | | |
|
||||
|___/ |_| |_|
|
||||
|
||||
tokenizer : tokenizer.spm
|
||||
compressed_weights : 2b-it-sfp.sbs
|
||||
model : 2b-it
|
||||
weights : [no path specified]
|
||||
max_tokens : 3072
|
||||
max_generated_tokens : 2048
|
||||
|
||||
*Usage*
|
||||
Enter an instruction and press enter (%Q quits).
|
||||
|
||||
*Examples*
|
||||
- Write an email to grandma thanking her for the cookies.
|
||||
- What are some historical attractions to visit around Massachusetts?
|
||||
- Compute the nth fibonacci number in javascript.
|
||||
- Write a standup comedy bit about WebGPU programming.
|
||||
|
||||
> What are some outdoorsy places to visit around Boston?
|
||||
|
||||
[ Reading prompt ] .....................
|
||||
|
||||
|
||||
**Boston Harbor and Islands:**
|
||||
|
||||
* **Boston Harbor Islands National and State Park:** Explore pristine beaches, wildlife, and maritime history.
|
||||
* **Charles River Esplanade:** Enjoy scenic views of the harbor and city skyline.
|
||||
* **Boston Harbor Cruise Company:** Take a relaxing harbor cruise and admire the city from a different perspective.
|
||||
* **Seaport Village:** Visit a charming waterfront area with shops, restaurants, and a seaport museum.
|
||||
|
||||
**Forest and Nature:**
|
||||
|
||||
* **Forest Park:** Hike through a scenic forest with diverse wildlife.
|
||||
* **Quabbin Reservoir:** Enjoy boating, fishing, and hiking in a scenic setting.
|
||||
* **Mount Forest:** Explore a mountain with breathtaking views of the city and surrounding landscape.
|
||||
|
||||
...
|
||||
```
|
||||
|
||||
### Usage as a Command Line Tool
|
||||
|
||||
For using the `gemma` executable as a command line tool, it may be useful to
|
||||
create an alias for gemma.cpp with arguments fully specified:
|
||||
|
||||
```sh
|
||||
alias gemma2b="~/gemma.cpp/build/gemma -- --tokenizer ~/gemma.cpp/build/tokenizer.spm --compressed_weights ~/gemma.cpp/build/2b-it-sfp.sbs --model 2b-it --verbosity 0"
|
||||
```
|
||||
|
||||
Replace the above paths with your own paths to the model and tokenizer paths
|
||||
from the download.
|
||||
|
||||
Here is an example of prompting `gemma` with a truncated input
|
||||
file (using a `gemma2b` alias like defined above):
|
||||
|
||||
```sh
|
||||
cat configs.h | tail -35 | tr '\n' ' ' | xargs -0 echo "What does this C++ code do: " | gemma2b
|
||||
```
|
||||
|
||||
> [!NOTE]
|
||||
> CLI usage of gemma.cpp is experimental and should take context length
|
||||
> limitations into account.
|
||||
|
||||
The output of the above command should look like:
|
||||
|
||||
```console
|
||||
$ cat configs.h | tail -35 | tr '\n' ' ' | xargs -0 echo "What does this C++ code do: " | gemma2b
|
||||
[ Reading prompt ] ......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................
|
||||
The code defines two C++ structs, `ConfigGemma7B` and `ConfigGemma2B`, which are used for configuring a deep learning model.
|
||||
|
||||
**ConfigGemma7B**:
|
||||
|
||||
* `seq_len`: Stores the length of the sequence to be processed. It's set to 7168.
|
||||
* `vocab_size`: Stores the size of the vocabulary, which is 256128.
|
||||
* `n_layers`: Number of layers in the deep learning model. It's set to 28.
|
||||
* `dim_model`: Dimension of the model's internal representation. It's set to 3072.
|
||||
* `dim_ffw_hidden`: Dimension of the feedforward and recurrent layers' hidden representations. It's set to 16 * 3072 / 2.
|
||||
|
||||
**ConfigGemma2B**:
|
||||
|
||||
* `seq_len`: Stores the length of the sequence to be processed. It's also set to 7168.
|
||||
* `vocab_size`: Size of the vocabulary, which is 256128.
|
||||
* `n_layers`: Number of layers in the deep learning model. It's set to 18.
|
||||
* `dim_model`: Dimension of the model's internal representation. It's set to 2048.
|
||||
* `dim_ffw_hidden`: Dimension of the feedforward and recurrent layers' hidden representations. It's set to 16 * 2048 / 2.
|
||||
|
||||
These structs are used to configure a deep learning model with specific parameters for either Gemma7B or Gemma2B architecture.
|
||||
```
|
||||
|
||||
### Incorporating gemma.cpp as a Library in your Project
|
||||
|
||||
The easiest way to incorporate gemma.cpp in your own project is to pull in
|
||||
gemma.cpp and dependencies using `FetchContent`. You can add the following to your
|
||||
CMakeLists.txt:
|
||||
|
||||
```
|
||||
include(FetchContent)
|
||||
|
||||
FetchContent_Declare(sentencepiece GIT_REPOSITORY https://github.com/google/sentencepiece GIT_TAG 53de76561cfc149d3c01037f0595669ad32a5e7c)
|
||||
FetchContent_MakeAvailable(sentencepiece)
|
||||
|
||||
FetchContent_Declare(gemma GIT_REPOSITORY https://github.com/google/gemma.cpp GIT_TAG origin/main)
|
||||
FetchContent_MakeAvailable(gemma)
|
||||
|
||||
FetchContent_Declare(highway GIT_REPOSITORY https://github.com/google/highway.git GIT_TAG da250571a45826b21eebbddc1e50d0c1137dee5f)
|
||||
FetchContent_MakeAvailable(highway)
|
||||
```
|
||||
|
||||
Note for the gemma.cpp `GIT_TAG`, you may replace `origin/main` for a specific
|
||||
commit hash if you would like to pin the library version.
|
||||
|
||||
After your executable is defined (substitute your executable name for
|
||||
`[Executable Name]` below):
|
||||
|
||||
```
|
||||
target_link_libraries([Executable Name] libgemma hwy hwy_contrib sentencepiece)
|
||||
FetchContent_GetProperties(gemma)
|
||||
FetchContent_GetProperties(sentencepiece)
|
||||
target_include_directories([Executable Name] PRIVATE ${gemma_SOURCE_DIR})
|
||||
target_include_directories([Executable Name] PRIVATE ${sentencepiece_SOURCE_DIR})
|
||||
```
|
||||
|
||||
### Building gemma.cpp as a Library
|
||||
|
||||
gemma.cpp can also be used as a library dependency in your own project. The
|
||||
shared library artifact can be built by modifying the make invocation to build
|
||||
the `libgemma` target instead of `gemma`.
|
||||
|
||||
> [!NOTE]
|
||||
> If you are using gemma.cpp in your own project with the `FetchContent` steps
|
||||
> in the previous section, building the library is done automatically by `cmake`
|
||||
> and this section can be skipped.
|
||||
|
||||
First, run `cmake`:
|
||||
|
||||
```sh
|
||||
(cd build && cmake ..)
|
||||
```
|
||||
|
||||
Then, run `make` with the `libgemma` target:
|
||||
|
||||
```sh
|
||||
cd build
|
||||
make -j [number of parallel threads to use] libgemma
|
||||
```
|
||||
|
||||
If this is successful, you should now have a
|
||||
`libgemma` library file in the `build/` directory. On linux the filename is `libgemma.a`.
|
||||
|
||||
## Acknowledgements and Contacts
|
||||
|
||||
gemma.cpp was started in fall 2023 by [Austin Huang](austinvhuang@google.com)
|
||||
and [Jan Wassenberg](janwas@google.com), and subsequently released February 2024
|
||||
thanks to contributions from Phil Culliton, Paul Chang, and Dan Zheng.
|
||||
|
||||
This is not an officially supported Google product.
|
||||
|
|
@ -0,0 +1,3 @@
|
|||
*
|
||||
!.gitignore
|
||||
!.hgignore
|
||||
|
|
@ -0,0 +1,244 @@
|
|||
// Copyright 2024 Google LLC
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// https://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
// Normal include guard to placate lint.
|
||||
#ifndef THIRD_PARTY_GEMMA_CPP_COMPRESSION_ANALYZE_H_
|
||||
#define THIRD_PARTY_GEMMA_CPP_COMPRESSION_ANALYZE_H_
|
||||
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
#include <stdio.h>
|
||||
#include <string.h> // memcpy
|
||||
|
||||
#include <cmath> // std::signbit
|
||||
#include <cstdlib> // std::abs
|
||||
#include <vector>
|
||||
|
||||
// copybara:import_next_line:gemma_cpp
|
||||
#include "compression/distortion.h"
|
||||
// copybara:import_next_line:gemma_cpp
|
||||
#include "compression/nuq.h"
|
||||
// copybara:import_next_line:gemma_cpp
|
||||
#include "compression/stats.h"
|
||||
#include "hwy/base.h"
|
||||
#include "hwy/contrib/thread_pool/thread_pool.h"
|
||||
#include "hwy/timer.h"
|
||||
|
||||
#endif // THIRD_PARTY_GEMMA_CPP_COMPRESSION_ANALYZE_H_
|
||||
|
||||
// Actual per-target include guard.
|
||||
#if defined(THIRD_PARTY_GEMMA_CPP_ANALYZE_TOGGLE) == defined(HWY_TARGET_TOGGLE)
|
||||
#ifdef THIRD_PARTY_GEMMA_CPP_ANALYZE_TOGGLE
|
||||
#undef THIRD_PARTY_GEMMA_CPP_ANALYZE_TOGGLE
|
||||
#else
|
||||
#define THIRD_PARTY_GEMMA_CPP_ANALYZE_TOGGLE
|
||||
#endif
|
||||
|
||||
// copybara:import_next_line:gemma_cpp
|
||||
#include "compression/nuq-inl.h"
|
||||
// copybara:import_next_line:gemma_cpp
|
||||
#include "compression/sfp-inl.h"
|
||||
#include "hwy/contrib/sort/vqsort-inl.h"
|
||||
#include "hwy/highway.h"
|
||||
|
||||
HWY_BEFORE_NAMESPACE();
|
||||
namespace gcpp {
|
||||
namespace HWY_NAMESPACE {
|
||||
|
||||
class PerThread {
|
||||
public:
|
||||
void NotifyGroup(const float* group) {
|
||||
Stats s_group;
|
||||
for (size_t i = 0; i < kGroupSize; ++i) {
|
||||
// Skip zero so we can see the lowest actual magnitude
|
||||
if (group[i] == 0.0f || group[i] == -0.0f) continue;
|
||||
s_all_.Notify(group[i]);
|
||||
s_group.Notify(group[i]);
|
||||
|
||||
num_tiny_ += std::abs(group[i]) < 1e-3f;
|
||||
|
||||
// b_magn100_.Notify(group[i] * 40.0f + 20.0f);
|
||||
const uint32_t binary32 =
|
||||
hwy::BitCastScalar<uint32_t>(std::abs(group[i]));
|
||||
|
||||
// const int32_t exp = (binary32 >> 23) - 127;
|
||||
b_exp256_.Notify(binary32 >> 23);
|
||||
const uint32_t m4 = (binary32 & 0x7FFFFF) >> (23 - 4);
|
||||
b_m4_.Notify(m4);
|
||||
}
|
||||
s_group_ranges_.Notify(s_group.Max() - s_group.Min());
|
||||
s_group_mins_.Notify(s_group.Min());
|
||||
s_group_maxs_.Notify(s_group.Max());
|
||||
|
||||
float desc[kGroupSize];
|
||||
memcpy(desc, group, kGroupSize * sizeof(group[0]));
|
||||
hn::VQSortStatic(desc, kGroupSize, hwy::SortDescending());
|
||||
|
||||
// Find largest |max/min| (dynamic range)
|
||||
float max_ratio = 0.0f;
|
||||
for (size_t i = 0; i < kGroupSize; ++i) {
|
||||
if (desc[i] != 0.0f && desc[i] != -0.0f) {
|
||||
max_ratio = std::max(max_ratio, std::abs(desc[0] / desc[i]));
|
||||
}
|
||||
}
|
||||
s_group_max_vs_min_.Notify(max_ratio);
|
||||
|
||||
// Relative errors
|
||||
float diffs[kGroupSize];
|
||||
for (size_t i = 0; i < kGroupSize - 1; ++i) {
|
||||
// was in descending order. Avoid div by 0. Ignore sign changes.
|
||||
diffs[i] = std::abs(desc[i]) < 1e-5
|
||||
? 0
|
||||
: std::abs((desc[i] - desc[i + 1]) / desc[i]);
|
||||
}
|
||||
hn::VQSortStatic(diffs, kGroupSize, hwy::SortDescending());
|
||||
s_cut15_.Notify(diffs[15]);
|
||||
}
|
||||
|
||||
void Assimilate(const PerThread& other) {
|
||||
num_tiny_ += other.num_tiny_;
|
||||
s_all_.Assimilate(other.s_all_);
|
||||
s_group_ranges_.Assimilate(other.s_group_ranges_);
|
||||
s_group_mins_.Assimilate(other.s_group_mins_);
|
||||
s_group_maxs_.Assimilate(other.s_group_maxs_);
|
||||
s_group_max_vs_min_.Assimilate(other.s_group_max_vs_min_);
|
||||
s_erange_.Assimilate(other.s_erange_);
|
||||
s_km_1_.Assimilate(other.s_km_1_);
|
||||
s_km_2_.Assimilate(other.s_km_2_);
|
||||
s_cut15_.Assimilate(other.s_cut15_);
|
||||
b_magn100_.Assimilate(other.b_magn100_);
|
||||
b_exp256_.Assimilate(other.b_exp256_);
|
||||
b_m4_.Assimilate(other.b_m4_);
|
||||
}
|
||||
|
||||
void PrintAll() {
|
||||
const int skip = Stats::kNoGeomean;
|
||||
fprintf(stderr, "num tiny %zu\n", num_tiny_);
|
||||
fprintf(stderr, "weights %s\n", s_all_.ToString(skip).c_str());
|
||||
fprintf(stderr, " ranges %s\n", s_group_ranges_.ToString(skip).c_str());
|
||||
fprintf(stderr, " mins %s\n", s_group_mins_.ToString(skip).c_str());
|
||||
fprintf(stderr, " maxs %s\n", s_group_maxs_.ToString(skip).c_str());
|
||||
fprintf(stderr, " Mvm %s\n", s_group_max_vs_min_.ToString(skip).c_str());
|
||||
fprintf(stderr, " cut15 %s\n", s_cut15_.ToString(skip).c_str());
|
||||
fprintf(stderr, " erange %s\n", s_erange_.ToString(skip).c_str());
|
||||
fprintf(stderr, " km1 %s\n", s_km_1_.ToString(skip).c_str());
|
||||
fprintf(stderr, " km2 %s\n", s_km_2_.ToString(skip).c_str());
|
||||
|
||||
// b_magn100_.Print("magn100");
|
||||
// b_exp256_.Print("exp");
|
||||
// b_m4_.Print("mantissa bits4");
|
||||
|
||||
fprintf(stderr, "\n");
|
||||
}
|
||||
|
||||
private:
|
||||
size_t num_tiny_ = 0;
|
||||
Stats s_all_;
|
||||
Stats s_group_ranges_;
|
||||
Stats s_group_mins_;
|
||||
Stats s_group_maxs_;
|
||||
Stats s_group_max_vs_min_;
|
||||
Stats s_erange_;
|
||||
Stats s_km_1_;
|
||||
Stats s_km_2_;
|
||||
Stats s_cut15_;
|
||||
Bins<100> b_magn100_;
|
||||
Bins<256> b_exp256_;
|
||||
Bins<16> b_m4_;
|
||||
uint8_t padding_[64]; // prevent false sharing
|
||||
};
|
||||
|
||||
class PerLayer {
|
||||
public:
|
||||
void NotifyGroup(const float* group) {
|
||||
for (size_t i = 0; i < kGroupSize; ++i) {
|
||||
s_layer_.Notify(group[i]);
|
||||
}
|
||||
}
|
||||
|
||||
void UpdateOutliers(const float* layer, size_t weights_per_layer) {
|
||||
const float layer_mean = s_layer_.Mean();
|
||||
const float layer_sd = s_layer_.StandardDeviation();
|
||||
for (size_t i = 0; i < weights_per_layer; ++i) {
|
||||
num_outliers_ +=
|
||||
std::abs(std::abs(layer[i]) - layer_mean) >= 3.0f * layer_sd;
|
||||
}
|
||||
}
|
||||
|
||||
const Stats& GetStats() const { return s_layer_; }
|
||||
size_t Outliers() const { return num_outliers_; }
|
||||
|
||||
private:
|
||||
Stats s_layer_;
|
||||
size_t num_outliers_ = 0;
|
||||
uint8_t padding[64]; // prevent false sharing
|
||||
};
|
||||
|
||||
static HWY_NOINLINE void Analyze(const char* caption, float* mat, size_t layers,
|
||||
size_t weights_per_layer,
|
||||
hwy::ThreadPool& pool) {
|
||||
std::vector<PerThread> tls;
|
||||
std::vector<PerLayer> per_layer(layers);
|
||||
const auto init = [&](size_t num_threads) {
|
||||
tls.resize(num_threads);
|
||||
return true;
|
||||
};
|
||||
|
||||
pool.Run(0, static_cast<uint32_t>(layers), init,
|
||||
[&](uint32_t idx_layer, size_t idx_thread) {
|
||||
PerThread& self = tls[idx_thread];
|
||||
const float* layer = &mat[idx_layer * weights_per_layer];
|
||||
// For each whole group in the layer
|
||||
for (size_t group_start = 0;
|
||||
group_start + kGroupSize <= weights_per_layer;
|
||||
group_start += kGroupSize) {
|
||||
const float* group = layer + group_start;
|
||||
per_layer[idx_layer].NotifyGroup(group);
|
||||
self.NotifyGroup(group);
|
||||
}
|
||||
|
||||
per_layer[idx_layer].UpdateOutliers(layer, weights_per_layer);
|
||||
});
|
||||
|
||||
const int skip = Stats::kNoGeomean;
|
||||
fprintf(stderr, "\n------------%s\n", caption);
|
||||
|
||||
for (size_t i = 1; i < pool.NumThreads(); ++i) {
|
||||
tls[0].Assimilate(tls[i]);
|
||||
}
|
||||
tls[0].PrintAll();
|
||||
|
||||
Stats s_layer_ranges;
|
||||
Stats s_layer_outliers;
|
||||
for (size_t i = 0; i < layers; ++i) {
|
||||
fprintf(stderr, " %02zu %s\n", i,
|
||||
per_layer[i].GetStats().ToString(skip).c_str());
|
||||
const float range =
|
||||
per_layer[i].GetStats().Max() - per_layer[i].GetStats().Min();
|
||||
s_layer_ranges.Notify(range);
|
||||
s_layer_outliers.Notify((100.0 * per_layer[i].Outliers()) /
|
||||
weights_per_layer);
|
||||
}
|
||||
fprintf(stderr, "layer outliers%% %s\n",
|
||||
s_layer_outliers.ToString(skip).c_str());
|
||||
fprintf(stderr, "layer ranges %s\n", s_layer_ranges.ToString(skip).c_str());
|
||||
}
|
||||
|
||||
// NOLINTNEXTLINE(google-readability-namespace-comments)
|
||||
} // namespace HWY_NAMESPACE
|
||||
} // namespace gcpp
|
||||
HWY_AFTER_NAMESPACE();
|
||||
|
||||
#endif // THIRD_PARTY_GEMMA_CPP_COMPRESSION_ANALYZE_H_
|
||||
|
|
@ -0,0 +1,348 @@
|
|||
// Copyright 2024 Google LLC
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// https://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
// copybara:import_next_line:gemma_cpp
|
||||
#include "compression/blob_store.h"
|
||||
|
||||
#include <fcntl.h> // open
|
||||
#include <stdint.h>
|
||||
#include <stdio.h> // SEEK_END - unistd isn't enough for IDE.
|
||||
#include <sys/stat.h> // O_RDONLY
|
||||
#include <unistd.h> // read, close
|
||||
|
||||
#include <atomic>
|
||||
#include <vector>
|
||||
|
||||
#include "hwy/aligned_allocator.h"
|
||||
#include "hwy/base.h"
|
||||
#include "hwy/contrib/thread_pool/thread_pool.h"
|
||||
#include "hwy/detect_compiler_arch.h"
|
||||
|
||||
namespace gcpp {
|
||||
|
||||
hwy::uint128_t MakeKey(const char* string) {
|
||||
size_t length = 0;
|
||||
for (size_t i = 0; string[i] != '\0'; ++i) {
|
||||
++length;
|
||||
}
|
||||
if (length > 16) {
|
||||
HWY_ABORT("Key %s is too long, please truncate to 16 chars.", string);
|
||||
}
|
||||
|
||||
hwy::uint128_t ret;
|
||||
hwy::ZeroBytes<sizeof(ret)>(&ret);
|
||||
hwy::CopyBytes(string, &ret, length);
|
||||
return ret;
|
||||
}
|
||||
|
||||
static void EnqueueChunkRequests(uint64_t offset, uint64_t size, uint8_t* data,
|
||||
std::vector<BlobIO>& requests) {
|
||||
// Split into chunks for load-balancing even if blob sizes vary.
|
||||
constexpr size_t kChunkSize = 4 * 1024 * 1024;
|
||||
|
||||
// Split into whole chunks and possibly one remainder.
|
||||
uint64_t pos = 0;
|
||||
if (size >= kChunkSize) {
|
||||
for (; pos <= size - kChunkSize; pos += kChunkSize) {
|
||||
requests.emplace_back(offset + pos, kChunkSize, data + pos, 0);
|
||||
}
|
||||
}
|
||||
if (pos != size) {
|
||||
requests.emplace_back(offset + pos, size - pos, data + pos, 0);
|
||||
}
|
||||
}
|
||||
|
||||
struct IO {
|
||||
// Returns size in bytes or 0.
|
||||
static uint64_t FileSize(const char* filename) {
|
||||
int fd = open(filename, O_RDONLY);
|
||||
if (fd >= 0) {
|
||||
const off_t size = lseek(fd, 0, SEEK_END);
|
||||
HWY_ASSERT(close(fd) != -1);
|
||||
if (size != static_cast<off_t>(-1)) {
|
||||
return static_cast<uint64_t>(size);
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static bool Read(int fd, uint64_t offset, uint64_t size, void* to) {
|
||||
uint8_t* bytes = reinterpret_cast<uint8_t*>(to);
|
||||
uint64_t pos = 0;
|
||||
for (;;) {
|
||||
// pread seems to be faster than lseek + read when parallelized.
|
||||
const auto bytes_read = pread(fd, bytes + pos, size - pos, offset + pos);
|
||||
if (bytes_read <= 0) break;
|
||||
pos += bytes_read;
|
||||
HWY_ASSERT(pos <= size);
|
||||
if (pos == size) break;
|
||||
}
|
||||
return pos == size; // success if managed to read desired size
|
||||
}
|
||||
|
||||
static bool Write(const void* from, uint64_t size, uint64_t offset, int fd) {
|
||||
const uint8_t* bytes = reinterpret_cast<const uint8_t*>(from);
|
||||
uint64_t pos = 0;
|
||||
for (;;) {
|
||||
const auto bytes_written =
|
||||
pwrite(fd, bytes + pos, size - pos, offset + pos);
|
||||
if (bytes_written <= 0) break;
|
||||
pos += bytes_written;
|
||||
HWY_ASSERT(pos <= size);
|
||||
if (pos == size) break;
|
||||
}
|
||||
return pos == size; // success if managed to write desired size
|
||||
}
|
||||
}; // IO
|
||||
|
||||
static_assert(HWY_IS_LITTLE_ENDIAN, "Assumes little endian");
|
||||
|
||||
// On-disk representation (little-endian).
|
||||
//
|
||||
// Deliberately omits a version number because this file format is unchanging.
|
||||
// Additional data may be added only inside new blobs. Changes to the blob
|
||||
// contents or type should be handled by renaming keys.
|
||||
#pragma pack(push, 1)
|
||||
class BlobStore {
|
||||
static constexpr uint32_t kMagic = 0x0A534253; // SBS\n
|
||||
|
||||
// Blob offsets on disk and memory addresses are a multiple of this, because
|
||||
// we pad the header and each blob's size. This matches CUDA alignment and the
|
||||
// maximum SVE vector size, and exceeds typical x86 cache line sizes (64 or
|
||||
// 128), which can help performance.
|
||||
static constexpr size_t kAlign = 256;
|
||||
|
||||
public:
|
||||
// NOT including padding, so that we can also use ZeroFillPadding after
|
||||
// copying the header.
|
||||
static constexpr size_t HeaderSize(size_t num_blobs) {
|
||||
// 16-byte fixed fields plus per-blob: 16-byte key, 16-byte offset/size.
|
||||
return 16 + 32 * num_blobs;
|
||||
}
|
||||
|
||||
// Returns how many bytes to allocate for the header without the subsequent
|
||||
// blobs. Requires num_blobs_ to already be set, typically by reading
|
||||
// sizeof(BlobStore) bytes from disk.
|
||||
size_t PaddedHeaderSize() const {
|
||||
return hwy::RoundUpTo(HeaderSize(num_blobs_), kAlign);
|
||||
}
|
||||
|
||||
// Returns aligned offset and zero-fills between that and `offset`.
|
||||
uint64_t ZeroFillPadding(uint64_t offset) {
|
||||
uint8_t* const bytes = reinterpret_cast<uint8_t*>(this);
|
||||
const uint64_t padded = hwy::RoundUpTo(offset, kAlign);
|
||||
hwy::ZeroBytes(bytes + offset, padded - offset);
|
||||
return padded;
|
||||
}
|
||||
|
||||
BlobError CheckValidity(const uint64_t file_size) {
|
||||
if (magic_ != kMagic) return __LINE__;
|
||||
if (num_blobs_ == 0) return __LINE__;
|
||||
if (file_size_ != file_size) return __LINE__;
|
||||
|
||||
// Ensure blobs are back to back, and zero-pad.
|
||||
uint64_t offset = ZeroFillPadding(HeaderSize(num_blobs_));
|
||||
for (size_t i = 0; i < num_blobs_; ++i) {
|
||||
const hwy::uint128_t val = keys_[num_blobs_ + i];
|
||||
if (val.lo != offset) return __LINE__;
|
||||
offset = ZeroFillPadding(offset + val.hi);
|
||||
}
|
||||
|
||||
if (offset != file_size_) return __LINE__;
|
||||
|
||||
return 0; // all OK
|
||||
}
|
||||
|
||||
static BlobStorePtr Allocate(uint64_t total_size) {
|
||||
uint8_t* bytes =
|
||||
static_cast<uint8_t*>(hwy::AllocateAlignedBytes(total_size));
|
||||
if (!bytes) return BlobStorePtr();
|
||||
return BlobStorePtr(new (bytes) BlobStore(), hwy::AlignedFreer());
|
||||
}
|
||||
|
||||
static std::vector<BlobIO> PrepareWriteRequests(
|
||||
const hwy::uint128_t keys[], const hwy::Span<uint8_t> blobs[],
|
||||
size_t num_blobs) {
|
||||
// Sanity check and ensure the cast below is safe.
|
||||
HWY_ASSERT(num_blobs < (1ULL << 20));
|
||||
|
||||
// Allocate var-length header.
|
||||
const size_t header_size = HeaderSize(num_blobs);
|
||||
const size_t padded_header_size = hwy::RoundUpTo(header_size, kAlign);
|
||||
BlobStorePtr bs = Allocate(padded_header_size);
|
||||
const uint64_t padded_header_end = bs->ZeroFillPadding(header_size);
|
||||
HWY_ASSERT(padded_header_end == padded_header_size);
|
||||
|
||||
// All-zero buffer used to write padding to the file without copying the
|
||||
// input blobs.
|
||||
static uint8_t zeros[kAlign] = {0};
|
||||
|
||||
// Total file size will be the header plus all padded blobs.
|
||||
uint64_t payload = 0;
|
||||
for (size_t i = 0; i < num_blobs; ++i) {
|
||||
payload += hwy::RoundUpTo(blobs[i].size(), kAlign);
|
||||
}
|
||||
const size_t total_size = padded_header_size + payload;
|
||||
|
||||
// Fill header.
|
||||
bs->magic_ = kMagic;
|
||||
bs->num_blobs_ = static_cast<uint32_t>(num_blobs);
|
||||
bs->file_size_ = total_size;
|
||||
hwy::CopyBytes(keys, bs->keys_, num_blobs * sizeof(keys[0]));
|
||||
|
||||
// First IO request is for the header (not yet filled!).
|
||||
std::vector<BlobIO> requests;
|
||||
requests.reserve(1 + 2 * num_blobs);
|
||||
requests.emplace_back(/*offset=*/0, padded_header_size,
|
||||
reinterpret_cast<uint8_t*>(bs.get()), 0);
|
||||
|
||||
// Fill second half of keys_ with offset/size and prepare IO requests.
|
||||
uint64_t offset = padded_header_end;
|
||||
for (size_t i = 0; i < num_blobs; ++i) {
|
||||
bs->keys_[num_blobs + i].lo = offset;
|
||||
bs->keys_[num_blobs + i].hi = blobs[i].size();
|
||||
|
||||
EnqueueChunkRequests(offset, blobs[i].size(), blobs[i].data(), requests);
|
||||
offset += blobs[i].size();
|
||||
const size_t padded_size = hwy::RoundUpTo(blobs[i].size(), kAlign);
|
||||
if (padded_size != blobs[i].size()) {
|
||||
const size_t padding = padded_size - blobs[i].size();
|
||||
HWY_ASSERT(padding <= kAlign);
|
||||
requests.emplace_back(offset, padding, zeros, 0);
|
||||
offset += padding;
|
||||
}
|
||||
}
|
||||
|
||||
HWY_ASSERT(offset == total_size);
|
||||
return requests;
|
||||
}
|
||||
|
||||
bool FindKey(const hwy::uint128_t key, uint64_t& offset, size_t& size) const {
|
||||
for (size_t i = 0; i < num_blobs_; ++i) {
|
||||
if (keys_[i] == key) {
|
||||
const hwy::uint128_t val = keys_[num_blobs_ + i];
|
||||
offset = val.lo;
|
||||
size = val.hi;
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
private:
|
||||
uint32_t magic_;
|
||||
uint32_t num_blobs_; // never 0
|
||||
uint64_t file_size_; // must match actual size of file
|
||||
hwy::uint128_t keys_[1]; // length: 2 * num_blobs
|
||||
// Padding, then the blob identified by keys[0], then padding etc.
|
||||
};
|
||||
#pragma pack(pop)
|
||||
|
||||
BlobError BlobReader::Open(const char* filename) {
|
||||
fd_ = open(filename, O_RDONLY);
|
||||
if (fd_ < 0) return __LINE__;
|
||||
|
||||
#if _POSIX_C_SOURCE >= 200112L
|
||||
// Doubles the readahead window, which seems slightly faster when cached.
|
||||
(void)posix_fadvise(fd_, 0, 0, POSIX_FADV_SEQUENTIAL);
|
||||
#endif
|
||||
|
||||
// Read first part of header to get actual size.
|
||||
BlobStore bs;
|
||||
if (!IO::Read(fd_, 0, sizeof(bs), &bs)) return __LINE__;
|
||||
const size_t padded_size = bs.PaddedHeaderSize();
|
||||
HWY_ASSERT(padded_size >= sizeof(bs));
|
||||
|
||||
// Allocate full header.
|
||||
blob_store_ = BlobStore::Allocate(padded_size);
|
||||
if (!blob_store_) return __LINE__;
|
||||
|
||||
// Copy what we already read (more efficient than seek + re-read).
|
||||
hwy::CopySameSize(&bs, blob_store_.get());
|
||||
// Read the rest of the header, but not the full file.
|
||||
uint8_t* bytes = reinterpret_cast<uint8_t*>(blob_store_.get());
|
||||
if (!IO::Read(fd_, sizeof(bs), padded_size - sizeof(bs),
|
||||
bytes + sizeof(bs))) {
|
||||
return __LINE__;
|
||||
}
|
||||
|
||||
return blob_store_->CheckValidity(IO::FileSize(filename));
|
||||
}
|
||||
|
||||
BlobReader::~BlobReader() {
|
||||
if (fd_ >= 0) {
|
||||
HWY_ASSERT(close(fd_) != -1);
|
||||
}
|
||||
}
|
||||
|
||||
BlobError BlobReader::Enqueue(hwy::uint128_t key, void* data, size_t size) {
|
||||
uint64_t offset;
|
||||
size_t actual_size;
|
||||
if (!blob_store_->FindKey(key, offset, actual_size)) return __LINE__;
|
||||
if (actual_size != size) return __LINE__;
|
||||
|
||||
EnqueueChunkRequests(offset, actual_size, reinterpret_cast<uint8_t*>(data),
|
||||
requests_);
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Parallel synchronous I/O. Alternatives considered:
|
||||
// - readv is limited to 0x7FFFF000 bytes on Linux (even 64-bit). Note that
|
||||
// pread calls preadv with a single iovec.
|
||||
// - O_DIRECT seems undesirable because we do want to use the OS cache
|
||||
// between consecutive runs.
|
||||
// - memory-mapped I/O is less predictable and adds noise to measurements.
|
||||
BlobError BlobReader::ReadAll(hwy::ThreadPool& pool) {
|
||||
const int fd = fd_;
|
||||
const auto& requests = requests_;
|
||||
std::atomic_flag err = ATOMIC_FLAG_INIT;
|
||||
// >5x speedup from parallel reads when cached.
|
||||
pool.Run(0, requests.size(),
|
||||
[fd, &requests, &err](uint64_t i, size_t /*thread*/) {
|
||||
if (!IO::Read(fd, requests[i].offset, requests[i].size,
|
||||
requests[i].data)) {
|
||||
err.test_and_set();
|
||||
}
|
||||
});
|
||||
if (err.test_and_set()) return __LINE__;
|
||||
return 0;
|
||||
}
|
||||
|
||||
BlobError BlobWriter::WriteAll(hwy::ThreadPool& pool,
|
||||
const char* filename) const {
|
||||
HWY_ASSERT(keys_.size() == blobs_.size());
|
||||
|
||||
// Concatenate blobs in memory.
|
||||
std::vector<BlobIO> requests = BlobStore::PrepareWriteRequests(
|
||||
keys_.data(), blobs_.data(), keys_.size());
|
||||
|
||||
// Create/replace existing file.
|
||||
const int fd = open(filename, O_CREAT | O_RDWR | O_TRUNC, 0644);
|
||||
if (fd < 0) return __LINE__;
|
||||
|
||||
std::atomic_flag err = ATOMIC_FLAG_INIT;
|
||||
pool.Run(0, requests.size(),
|
||||
[fd, &requests, &err](uint64_t i, size_t /*thread*/) {
|
||||
if (!IO::Write(requests[i].data, requests[i].size,
|
||||
requests[i].offset, fd)) {
|
||||
err.test_and_set();
|
||||
}
|
||||
});
|
||||
if (err.test_and_set()) return __LINE__;
|
||||
return 0;
|
||||
}
|
||||
|
||||
} // namespace gcpp
|
||||
|
|
@@ -0,0 +1,90 @@
|
|||
// Copyright 2024 Google LLC
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// https://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#ifndef THIRD_PARTY_GEMMA_CPP_COMPRESSION_BLOB_STORE_H_
|
||||
#define THIRD_PARTY_GEMMA_CPP_COMPRESSION_BLOB_STORE_H_
|
||||
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
|
||||
#include <vector>
|
||||
|
||||
#include "hwy/aligned_allocator.h"
|
||||
#include "hwy/base.h" // hwy::uint128_t
|
||||
#include "hwy/contrib/thread_pool/thread_pool.h"
|
||||
|
||||
namespace gcpp {
|
||||
|
||||
// Convenient way to construct a key from a string (<= 16 chars).
|
||||
hwy::uint128_t MakeKey(const char* string);
|
||||
|
||||
// Ordered list of opaque blobs (~hundreds), identified by unique opaque
|
||||
// 128-bit keys.
|
||||
class BlobStore;
|
||||
|
||||
// Incomplete type, so dtor will not be called.
|
||||
using BlobStorePtr = hwy::AlignedFreeUniquePtr<BlobStore>;
|
||||
|
||||
// 0 if successful, otherwise the line number of the failing check.
|
||||
using BlobError = int;
|
||||
|
||||
struct BlobIO {
|
||||
BlobIO(uint64_t offset, size_t size, void* data, uint64_t padding)
|
||||
: offset(offset), size(size), data(data), padding(padding) {}
|
||||
|
||||
uint64_t offset;
|
||||
size_t size;
|
||||
void* data;
|
||||
uint64_t padding;
|
||||
};
|
||||
|
||||
class BlobReader {
|
||||
public:
|
||||
BlobReader() { requests_.reserve(500); }
|
||||
~BlobReader();
|
||||
|
||||
// Opens `filename` and reads its header.
|
||||
BlobError Open(const char* filename);
|
||||
|
||||
// Enqueues read requests if `key` is found and its size matches `size`.
|
||||
BlobError Enqueue(hwy::uint128_t key, void* data, size_t size);
|
||||
|
||||
// Reads all enqueued requests.
|
||||
BlobError ReadAll(hwy::ThreadPool& pool);
|
||||
|
||||
private:
|
||||
BlobStorePtr blob_store_; // holds header, not the entire file
|
||||
std::vector<BlobIO> requests_;
|
||||
int fd_ = 0;
|
||||
};
|
||||
|
||||
class BlobWriter {
|
||||
public:
|
||||
void Add(hwy::uint128_t key, void* data, size_t size) {
|
||||
keys_.push_back(key);
|
||||
blobs_.emplace_back(static_cast<uint8_t*>(data), size);
|
||||
}
|
||||
|
||||
// Stores all blobs to disk in the given order with padding for alignment.
|
||||
BlobError WriteAll(hwy::ThreadPool& pool, const char* filename) const;
|
||||
|
||||
private:
|
||||
std::vector<hwy::uint128_t> keys_;
|
||||
std::vector<hwy::Span<uint8_t>> blobs_;
|
||||
};
|
||||
|
||||
} // namespace gcpp
|
||||
|
||||
#endif // THIRD_PARTY_GEMMA_CPP_COMPRESSION_BLOB_STORE_H_
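// Minimal round-trip sketch using the API above (illustrative only; the key
// string, file name, sizes and error handling are placeholders, not library
// defaults):
//
//   hwy::ThreadPool pool(4);
//   std::vector<float> weights(4096, 0.5f);
//
//   BlobWriter writer;
//   writer.Add(MakeKey("example"), weights.data(),
//              weights.size() * sizeof(float));
//   HWY_ASSERT(writer.WriteAll(pool, "/tmp/example.sbs") == 0);
//
//   std::vector<float> loaded(weights.size());
//   BlobReader reader;
//   HWY_ASSERT(reader.Open("/tmp/example.sbs") == 0);
//   HWY_ASSERT(reader.Enqueue(MakeKey("example"), loaded.data(),
//                             loaded.size() * sizeof(float)) == 0);
//   HWY_ASSERT(reader.ReadAll(pool) == 0);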
|
||||
|
|
@@ -0,0 +1,467 @@
|
|||
// Copyright 2024 Google LLC
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// https://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
// Include guard for headers.
|
||||
#ifndef THIRD_PARTY_GEMMA_CPP_COMPRESSION_COMPRESS_INL_H_
|
||||
#define THIRD_PARTY_GEMMA_CPP_COMPRESSION_COMPRESS_INL_H_
|
||||
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
#include <stdio.h>
|
||||
|
||||
#include <array>
|
||||
|
||||
// copybara:import_next_line:gemma_cpp
|
||||
#include "compression/blob_store.h"
|
||||
// copybara:import_next_line:gemma_cpp
|
||||
#include "compression/compress.h"
|
||||
// copybara:import_next_line:gemma_cpp
|
||||
#include "compression/distortion.h"
|
||||
#include "hwy/aligned_allocator.h"
|
||||
#include "hwy/base.h"
|
||||
#include "hwy/contrib/thread_pool/thread_pool.h"
|
||||
#include "hwy/timer.h"
|
||||
|
||||
#endif // THIRD_PARTY_GEMMA_CPP_COMPRESSION_COMPRESS_INL_H_
|
||||
|
||||
// Include guard for (potentially) SIMD code.
|
||||
#if defined(THIRD_PARTY_GEMMA_CPP_COMPRESS_TOGGLE) == defined(HWY_TARGET_TOGGLE)
|
||||
#ifdef THIRD_PARTY_GEMMA_CPP_COMPRESS_TOGGLE
|
||||
#undef THIRD_PARTY_GEMMA_CPP_COMPRESS_TOGGLE
|
||||
#else
|
||||
#define THIRD_PARTY_GEMMA_CPP_COMPRESS_TOGGLE
|
||||
#endif
|
||||
|
||||
// copybara:import_next_line:gemma_cpp
|
||||
#include "compression/nuq-inl.h"
|
||||
// copybara:import_next_line:gemma_cpp
|
||||
#include "compression/sfp-inl.h"
|
||||
#include "hwy/contrib/dot/dot-inl.h"
|
||||
#include "hwy/highway.h"
|
||||
|
||||
HWY_BEFORE_NAMESPACE();
|
||||
namespace gcpp {
|
||||
namespace HWY_NAMESPACE {
|
||||
namespace hn = hwy::HWY_NAMESPACE;
|
||||
|
||||
// Enables generic code independent of compression type.
|
||||
template <typename T> // primary, must specialize
|
||||
struct CompressTraits {};
|
||||
|
||||
template <>
|
||||
struct CompressTraits<float> {
|
||||
using MatT = float;
|
||||
|
||||
template <class DF, HWY_IF_F32_D(DF)>
|
||||
static HWY_INLINE void Compress(DF df, const float* HWY_RESTRICT in,
|
||||
size_t num, CompressPerThread& tls,
|
||||
size_t /*out_capacity*/,
|
||||
MatT* HWY_RESTRICT out, size_t out_ofs) {
|
||||
using VF = hn::Vec<decltype(df)>;
|
||||
const size_t N = hn::Lanes(df);
|
||||
HWY_DASSERT(num >= 2 * N && num % (2 * N) == 0);
|
||||
|
||||
for (size_t i = 0; i < num; i += 2 * N) {
|
||||
const VF in0 = hn::LoadU(df, in + i);
|
||||
const VF in1 = hn::LoadU(df, in + i + N);
|
||||
hn::StoreU(in0, df, out + out_ofs + i);
|
||||
hn::StoreU(in1, df, out + out_ofs + i + N);
|
||||
}
|
||||
}
|
||||
|
||||
template <class DF, HWY_IF_F32_D(DF)>
|
||||
static HWY_INLINE void Decompress(DF df, size_t /*in_capacity*/,
|
||||
const MatT* HWY_RESTRICT in, size_t in_ofs,
|
||||
float* HWY_RESTRICT out, size_t num) {
|
||||
using VF = hn::Vec<decltype(df)>;
|
||||
const size_t N = hn::Lanes(df);
|
||||
HWY_DASSERT(num >= 2 * N && num % (2 * N) == 0);
|
||||
|
||||
for (size_t i = 0; i < num; i += 2 * N) {
|
||||
const VF in0 = hn::LoadU(df, in + in_ofs + i);
|
||||
const VF in1 = hn::LoadU(df, in + in_ofs + i + N);
|
||||
hn::StoreU(in0, df, out + i);
|
||||
hn::StoreU(in1, df, out + i + N);
|
||||
}
|
||||
}
|
||||
|
||||
// VecT can be float or hwy::bfloat16_t.
|
||||
template <class DF, typename VecT, HWY_IF_F32_D(DF)>
|
||||
static HWY_INLINE float Dot(DF df, size_t /*in_capacity*/,
|
||||
const MatT* HWY_RESTRICT in, size_t in_ofs,
|
||||
const VecT* HWY_RESTRICT vec_aligned,
|
||||
size_t num) {
|
||||
HWY_DASSERT(num >= hn::Lanes(df) && (num % hn::Lanes(df)) == 0);
|
||||
HWY_DASSERT(hn::IsAligned(df, vec_aligned));
|
||||
constexpr int kAssumptions =
|
||||
hn::Dot::kAtLeastOneVector | hn::Dot::kMultipleOfVector;
|
||||
// vec_aligned must be the second argument because hn::Dot supports f32*bf16
|
||||
// and f32*f32.
|
||||
return hn::Dot::Compute<kAssumptions>(df, in + in_ofs, vec_aligned, num);
|
||||
}
|
||||
};
|
||||
|
||||
template <>
|
||||
struct CompressTraits<hwy::bfloat16_t> {
|
||||
using MatT = hwy::bfloat16_t;
|
||||
|
||||
template <class DF, HWY_IF_F32_D(DF)>
|
||||
static HWY_INLINE void Compress(DF df, const float* HWY_RESTRICT in,
|
||||
size_t num, CompressPerThread& tls,
|
||||
size_t /*out_capacity*/,
|
||||
MatT* HWY_RESTRICT out, size_t out_ofs) {
|
||||
const hn::RebindToUnsigned<decltype(df)> du;
|
||||
const hn::Repartition<hwy::bfloat16_t, decltype(df)> dbf;
|
||||
using VF = hn::Vec<decltype(df)>;
|
||||
const size_t N = hn::Lanes(df);
|
||||
|
||||
hn::Vec<decltype(du)> or_sum = hn::Zero(du);
|
||||
|
||||
size_t i = 0;
|
||||
if (num >= 2 * N) {
|
||||
for (; i <= num - 2 * N; i += 2 * N) {
|
||||
const VF in0 = hn::LoadU(df, in + i);
|
||||
const VF in1 = hn::LoadU(df, in + i + N);
|
||||
|
||||
// Sticky bits so we can warn if any lower bits were set.
|
||||
or_sum = hn::Or3(or_sum, hn::BitCast(du, in0), hn::BitCast(du, in1));
|
||||
hn::StoreU(hn::OrderedDemote2To(dbf, in0, in1), dbf, out + out_ofs + i);
|
||||
|
||||
if (COMPRESS_STATS) {
|
||||
DistortionStats stats;
|
||||
for (size_t j = 0; j < 2 * N; ++j) {
|
||||
stats.Notify(in[i + j], hwy::F32FromBF16(out[out_ofs + i + j]));
|
||||
}
|
||||
tls.stats.Notify(stats);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
size_t remaining = num - i;
|
||||
if (remaining != 0) {
|
||||
const VF in0 = hn::LoadN(df, in + i, remaining);
|
||||
// Avoid wraparound, potentially load nothing beyond in0's N lanes.
const size_t remaining1 = remaining - HWY_MIN(remaining, N);
|
||||
const VF in1 = hn::LoadN(df, in + i + N, remaining1);
|
||||
|
||||
// Sticky bits so we can warn if any lower bits were set.
|
||||
or_sum = hn::Or3(or_sum, hn::BitCast(du, in0), hn::BitCast(du, in1));
|
||||
hn::StoreU(hn::OrderedDemote2To(dbf, in0, in1), dbf, out + out_ofs + i);
|
||||
|
||||
if (COMPRESS_STATS) {
|
||||
DistortionStats stats;
|
||||
for (size_t j = 0; j < remaining; ++j) {
|
||||
stats.Notify(in[i + j], hwy::F32FromBF16(out[out_ofs + i + j]));
|
||||
}
|
||||
tls.stats.Notify(stats);
|
||||
}
|
||||
}
|
||||
|
||||
// If the lower 16 bits are not zero, we should implement rounding.
|
||||
or_sum = hn::And(or_sum, hn::Set(du, 0xFFFF));
|
||||
if (!hn::AllTrue(du, hn::Eq(or_sum, hn::Zero(du)))) {
|
||||
// fprintf(stderr, "Warning: Lossy truncation.");
|
||||
}
|
||||
}
|
||||
|
||||
template <class DF, HWY_IF_F32_D(DF)>
|
||||
static HWY_INLINE void Decompress(DF df, size_t /*in_capacity*/,
|
||||
const MatT* HWY_RESTRICT in, size_t in_ofs,
|
||||
float* HWY_RESTRICT out, size_t num) {
|
||||
const hn::Repartition<hwy::bfloat16_t, decltype(df)> dbf;
|
||||
using VBF = hn::Vec<decltype(dbf)>;
|
||||
using VF = hn::Vec<decltype(df)>;
|
||||
const size_t N16 = hn::Lanes(dbf);
|
||||
|
||||
size_t i = 0;
|
||||
if (num >= N16) {
|
||||
for (i = 0; i <= num - N16; i += N16) {
|
||||
const VBF in16 = hn::LoadU(dbf, in + in_ofs + i);
|
||||
const VF in0 = hn::PromoteLowerTo(df, in16);
|
||||
const VF in1 = hn::PromoteUpperTo(df, in16);
|
||||
hn::StoreU(in0, df, out + i);
|
||||
hn::StoreU(in1, df, out + i + N16 / 2);
|
||||
}
|
||||
}
|
||||
|
||||
size_t remaining = num - i;
|
||||
if (remaining != 0) {
|
||||
const VBF in16 = hn::LoadN(dbf, in + in_ofs + i, remaining);
|
||||
const VF in0 = hn::PromoteLowerTo(df, in16);
|
||||
const VF in1 = hn::PromoteUpperTo(df, in16);
|
||||
hn::StoreN(in0, df, out + i, remaining);
|
||||
// Avoid wraparound, potentially store nothing.
|
||||
const size_t remaining1 = remaining - HWY_MIN(remaining, N16 / 2);
|
||||
hn::StoreN(in1, df, out + i + N16 / 2, remaining1);
|
||||
}
|
||||
}
|
||||
|
||||
// VecT can be float or hwy::bfloat16_t.
|
||||
template <class DF, typename VecT, HWY_IF_F32_D(DF)>
|
||||
static HWY_INLINE float Dot(DF df, size_t /*in_capacity*/,
|
||||
const MatT* HWY_RESTRICT in, size_t in_ofs,
|
||||
const VecT* HWY_RESTRICT vec_aligned,
|
||||
size_t num) {
|
||||
HWY_DASSERT(num >= hn::Lanes(df) && (num % hn::Lanes(df)) == 0);
|
||||
HWY_DASSERT(hn::IsAligned(df, vec_aligned));
|
||||
|
||||
const hn::Repartition<VecT, decltype(df)> d_vec;
|
||||
|
||||
constexpr int kAssumptions =
|
||||
hn::Dot::kAtLeastOneVector | hn::Dot::kMultipleOfVector;
|
||||
// vec_aligned must be first argument because hn::Dot supports f32*bf16 and
|
||||
// bf16*bf16.
|
||||
return hn::Dot::Compute<kAssumptions>(d_vec, vec_aligned, in + in_ofs, num);
|
||||
}
|
||||
};
|
||||
|
||||
template <>
|
||||
struct CompressTraits<SfpStream> {
|
||||
using MatT = SfpStream;
|
||||
|
||||
template <class DF, HWY_IF_F32_D(DF)>
|
||||
static HWY_INLINE void Compress(DF df, const float* in, size_t num,
|
||||
CompressPerThread& tls,
|
||||
size_t /*out_capacity*/, MatT* out,
|
||||
size_t out_ofs) {
|
||||
SfpCodec::Enc(df, in, num, out + out_ofs);
|
||||
|
||||
if (COMPRESS_STATS) {
|
||||
const hn::Repartition<hwy::bfloat16_t, DF> dbf;
|
||||
auto distorted = hwy::AllocateAligned<hwy::bfloat16_t>(num);
|
||||
SfpCodec::Dec(dbf, out + out_ofs, num, distorted.get());
|
||||
DistortionStats stats;
|
||||
for (size_t i = 0; i < num; ++i) {
|
||||
stats.Notify(in[i], hwy::F32FromBF16(distorted[i]));
|
||||
}
|
||||
tls.stats.Notify(stats);
|
||||
}
|
||||
}
|
||||
|
||||
template <class D, typename OutT>
|
||||
static HWY_INLINE void Decompress(D d, size_t /*in_capacity*/, const MatT* in,
|
||||
size_t in_ofs, OutT* out, size_t num) {
|
||||
SfpCodec::Dec(d, in + in_ofs, num, out);
|
||||
}
|
||||
|
||||
template <class DF, typename VecT, HWY_IF_F32_D(DF)>
|
||||
static HWY_INLINE float Dot(DF df, size_t /*in_capacity*/, const MatT* in,
|
||||
size_t in_ofs, const VecT* vec_aligned,
|
||||
size_t num) {
|
||||
using VF = hn::Vec<decltype(df)>;
|
||||
VF sum0 = hn::Zero(df);
|
||||
VF sum1 = hn::Zero(df);
|
||||
VF sum2 = hn::Zero(df);
|
||||
VF sum3 = hn::Zero(df);
|
||||
|
||||
SfpCodec::Dot(df, in + in_ofs, num, vec_aligned, sum0, sum1, sum2, sum3);
|
||||
|
||||
// Reduction tree: sum of all accumulators, then their lanes
|
||||
sum0 = hn::Add(sum0, sum1);
|
||||
sum2 = hn::Add(sum2, sum3);
|
||||
sum0 = hn::Add(sum0, sum2);
|
||||
return hn::ReduceSum(df, sum0);
|
||||
}
|
||||
};
|
||||
|
||||
template <>
|
||||
struct CompressTraits<NuqStream> {
|
||||
using MatT = NuqStream;
|
||||
|
||||
template <class DF, HWY_IF_F32_D(DF)>
|
||||
static HWY_INLINE void Compress(DF df, const float* in, size_t num,
|
||||
CompressPerThread& tls, size_t out_capacity,
|
||||
MatT* out, size_t out_ofs) {
|
||||
NuqCodec::Enc(df, in, num, tls.buf, out_capacity, out, out_ofs);
|
||||
|
||||
if (COMPRESS_STATS) {
|
||||
for (size_t i = 0; i < num; ++i) {
|
||||
tls.stats.NotifyIn(in[i] * 100 + 500);
|
||||
}
|
||||
|
||||
const hn::Repartition<hwy::bfloat16_t, DF> dbf;
|
||||
auto distorted = hwy::AllocateAligned<hwy::bfloat16_t>(num);
|
||||
NuqCodec::Dec(dbf, out_capacity, out, out_ofs, distorted.get(), num);
|
||||
DistortionStats stats;
|
||||
for (size_t i = 0; i < num; ++i) {
|
||||
stats.Notify(in[i], hwy::F32FromBF16(distorted[i]));
|
||||
}
|
||||
tls.stats.Notify(stats);
|
||||
}
|
||||
}
|
||||
|
||||
template <class D, typename OutT>
|
||||
static HWY_INLINE void Decompress(D d, size_t in_capacity, const MatT* in,
|
||||
size_t in_ofs, OutT* out, size_t num) {
|
||||
NuqCodec::Dec(d, in_capacity, in, in_ofs, out, num);
|
||||
}
|
||||
|
||||
template <class DF, typename VecT, HWY_IF_F32_D(DF)>
|
||||
static HWY_INLINE float Dot(DF df, size_t in_capacity, const MatT* in,
|
||||
size_t in_ofs,
|
||||
const VecT* HWY_RESTRICT vec_aligned,
|
||||
size_t num) {
|
||||
using VF = hn::Vec<decltype(df)>;
|
||||
VF sum0 = hn::Zero(df);
|
||||
VF sum1 = hn::Zero(df);
|
||||
VF sum2 = hn::Zero(df);
|
||||
VF sum3 = hn::Zero(df);
|
||||
|
||||
NuqCodec::Dot(df, in_capacity, in, in_ofs, vec_aligned, num, sum0, sum1,
|
||||
sum2, sum3);
|
||||
|
||||
// Reduction tree: sum of all accumulators, then their lanes
|
||||
sum0 = hn::Add(hn::Add(sum0, sum1), hn::Add(sum2, sum3));
|
||||
return hn::ReduceSum(df, sum0);
|
||||
}
|
||||
};
|
||||
|
||||
// Compresses `num` inputs to `out` starting at `out_ofs`. This can be used for
|
||||
// compressing sub-regions of an array.
|
||||
template <typename MatT>
|
||||
HWY_NOINLINE void Compress(const float* in, size_t num,
|
||||
CompressWorkingSet& work, size_t out_capacity,
|
||||
MatT* out, size_t out_ofs, hwy::ThreadPool& pool) {
|
||||
HWY_DASSERT(out_ofs + num <= out_capacity);
|
||||
work.tls.resize(pool.NumThreads());
|
||||
if (COMPRESS_STATS) {
|
||||
for (auto& tls : work.tls) {
|
||||
tls.stats.Reset();
|
||||
}
|
||||
}
|
||||
|
||||
const double t0 = hwy::platform::Now();
|
||||
|
||||
using Traits = CompressTraits<MatT>;
|
||||
constexpr size_t kBatch = 8192;
|
||||
const size_t num_batches = hwy::DivCeil(num, kBatch);
|
||||
pool.Run(0, num_batches,
|
||||
[&](const uint32_t idx_batch, size_t thread) HWY_ATTR {
|
||||
const hn::ScalableTag<float> df;
|
||||
|
||||
const size_t in_ofs = idx_batch * kBatch;
|
||||
const size_t my_num =
|
||||
idx_batch == num_batches - 1 ? (num - in_ofs) : kBatch;
|
||||
Traits::Compress(df, in + in_ofs, my_num, work.tls[thread],
|
||||
out_capacity, out, out_ofs + in_ofs);
|
||||
});
|
||||
|
||||
const double t1 = hwy::platform::Now();
|
||||
const double mb = num * sizeof(in[0]) * 1E-6;
|
||||
const double mbps = mb / (t1 - t0);
|
||||
fprintf(stderr, "Compress %.1f MB/s\n", mbps);
|
||||
|
||||
if (COMPRESS_STATS) {
|
||||
for (size_t i = 1; i < work.tls.size(); ++i) {
|
||||
work.tls[0].stats.Assimilate(work.tls[i].stats);
|
||||
}
|
||||
work.tls[0].stats.PrintAll();
|
||||
}
|
||||
}
|
||||
|
||||
// Compresses an entire std::array into `out`, which is assumed to have exactly
|
||||
// that much capacity.
|
||||
template <size_t kCapacity, typename MatT>
|
||||
HWY_INLINE void Compress(const std::array<float, kCapacity>& in,
|
||||
CompressWorkingSet& work,
|
||||
CompressedArray<MatT, kCapacity>& compressed,
|
||||
hwy::ThreadPool& pool) {
|
||||
Compress(in.data(), kCapacity, work, kCapacity, compressed.data(), 0, pool);
|
||||
}
|
||||
|
||||
// Decompresses `num` values from `compressed` starting at `compressed_ofs`.
|
||||
template <typename MatT, size_t kCapacity, typename OutT>
|
||||
HWY_NOINLINE void Decompress(const CompressedArray<MatT, kCapacity>& compressed,
|
||||
size_t compressed_ofs, OutT* out, size_t num) {
|
||||
HWY_DASSERT(compressed_ofs + num <= compressed.NumElements());
|
||||
const hn::ScalableTag<OutT> d;
|
||||
using Traits = CompressTraits<MatT>;
|
||||
Traits::Decompress(d, kCapacity, compressed.data(), compressed_ofs, out, num);
|
||||
}
|
||||
|
||||
// As above, but with threading and benchmarking.
|
||||
template <typename MatT, size_t kCapacity, typename OutT>
|
||||
HWY_INLINE void Decompress(const CompressedArray<MatT, kCapacity>& compressed,
|
||||
size_t compressed_ofs, OutT* out, size_t num,
|
||||
hwy::ThreadPool& pool) {
|
||||
HWY_DASSERT(compressed_ofs + num <= compressed.NumElements());
|
||||
const double t0 = hwy::platform::Now();
|
||||
|
||||
using Traits = CompressTraits<MatT>;
|
||||
constexpr size_t kBatch = 8192;
|
||||
const size_t num_batches = hwy::DivCeil(num, kBatch);
|
||||
pool.Run(
|
||||
0, num_batches, [&](const uint32_t idx_batch, size_t thread) HWY_ATTR {
|
||||
const hn::ScalableTag<OutT> d;
|
||||
|
||||
const size_t ofs = idx_batch * kBatch;
|
||||
const size_t my_num = idx_batch == num_batches - 1 ? (num - ofs) : kBatch;
|
||||
Traits::Decompress(d, compressed.NumElements(), compressed.data(),
|
||||
compressed_ofs + ofs, out + ofs, my_num);
|
||||
});
|
||||
|
||||
const double t1 = hwy::platform::Now();
|
||||
const double mb = num * sizeof(MatT) * 1E-6;
|
||||
const double mbps = mb / (t1 - t0);
|
||||
fprintf(stderr, "Decompress %.1f MB/s\n", mbps);
|
||||
}
|
||||
|
||||
// Returns dot product with `vec_aligned` of length `num`.
|
||||
template <class DF, typename MatT, size_t kCapacity, typename VecT>
|
||||
HWY_INLINE float Dot(DF df, const CompressedArray<MatT, kCapacity>& compressed,
|
||||
size_t compressed_ofs, const VecT* vec_aligned,
|
||||
size_t num) {
|
||||
HWY_DASSERT(compressed_ofs + num <= compressed.NumElements());
|
||||
HWY_DASSERT(hn::IsAligned(df, vec_aligned));
|
||||
using Traits = CompressTraits<MatT>;
|
||||
return Traits::Dot(df, kCapacity, compressed.data(), compressed_ofs,
|
||||
vec_aligned, num);
|
||||
}
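// Sketch of how the three entry points above fit together (illustrative; the
// tensor size 4096 and the pool size are placeholders):
//
//   hwy::ThreadPool pool(4);
//   CompressWorkingSet work;
//   std::array<float, 4096> weights;            // filled by the caller
//   CompressedArray<SfpStream, 4096> packed;
//   Compress(weights, work, packed, pool);      // f32 -> 8-bit SFP
//
//   HWY_ALIGN float vec[4096];                  // activations, aligned
//   const hn::ScalableTag<float> df;
//   const float dot = Dot(df, packed, /*compressed_ofs=*/0, vec, 4096);
//
//   float roundtrip[4096];
//   Decompress(packed, /*compressed_ofs=*/0, roundtrip, 4096, pool);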
|
||||
|
||||
// Callback used by ForeachTensor.
|
||||
class Compressor {
|
||||
public:
|
||||
explicit Compressor(hwy::ThreadPool& pool) : pool_(pool) {}
|
||||
|
||||
// Called for each tensor; compresses it and stores to the cache.
|
||||
template <typename MatT, size_t kCapacity>
|
||||
void operator()(const char* name, const float* weights,
|
||||
CompressedArray<MatT, kCapacity>& compressed) {
|
||||
fprintf(stderr, "Regenerating %s (%zuM), please wait\n", name,
|
||||
kCapacity / (1000 * 1000));
|
||||
Compress(weights, kCapacity, work_, kCapacity, compressed.data(), 0, pool_);
|
||||
writer_.Add(CacheKey<MatT>(name), compressed.data(),
|
||||
compressed.CompressedSize());
|
||||
}
|
||||
|
||||
void WriteAll(hwy::ThreadPool& pool, const char* blob_filename) {
|
||||
const BlobError err = writer_.WriteAll(pool, blob_filename);
|
||||
if (err != 0) {
|
||||
fprintf(stderr, "Failed to write blobs to %s (error %d)\n", blob_filename,
|
||||
err);
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
CompressWorkingSet work_;
|
||||
hwy::ThreadPool& pool_;
|
||||
BlobWriter writer_;
|
||||
};
|
||||
|
||||
// NOLINTNEXTLINE(google-readability-namespace-comments)
|
||||
} // namespace HWY_NAMESPACE
|
||||
} // namespace gcpp
|
||||
HWY_AFTER_NAMESPACE();
|
||||
|
||||
#endif // NOLINT
|
||||
|
|
@@ -0,0 +1,215 @@
|
|||
// Copyright 2023 Google LLC
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
// Target-independent definitions.
|
||||
#ifndef THIRD_PARTY_GEMMA_CPP_COMPRESSION_COMPRESS_H_
|
||||
#define THIRD_PARTY_GEMMA_CPP_COMPRESSION_COMPRESS_H_
|
||||
|
||||
#define COMPRESS_STATS 0
|
||||
|
||||
#include <stddef.h>
|
||||
#include <stdio.h>
|
||||
|
||||
#include <array>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
// IWYU pragma: begin_exports
|
||||
// copybara:import_next_line:gemma_cpp
|
||||
#include "compression/blob_store.h"
|
||||
// copybara:import_next_line:gemma_cpp
|
||||
#include "compression/nuq.h"
|
||||
// copybara:import_next_line:gemma_cpp
|
||||
#include "compression/sfp.h"
|
||||
// IWYU pragma: end_exports
|
||||
// copybara:import_next_line:gemma_cpp
|
||||
#include "compression/distortion.h"
|
||||
#include "hwy/base.h" // hwy::bfloat16_t
|
||||
#include "hwy/contrib/thread_pool/thread_pool.h"
|
||||
#if COMPRESS_STATS
|
||||
// copybara:import_next_line:gemma_cpp
|
||||
#include "compression/stats.h"
|
||||
#endif
|
||||
|
||||
namespace gcpp {
|
||||
|
||||
static inline const char* TypeName(float) { return "f32"; }
|
||||
static inline const char* TypeName(hwy::bfloat16_t) { return "b16"; }
|
||||
|
||||
namespace detail {
|
||||
// How many MatT are required to store `capacity` weights. For all but
|
||||
// NuqStream, this is the same as `capacity`. For use by CompressedArray.
|
||||
template <typename MatT>
|
||||
constexpr size_t CompressedArrayLen(size_t capacity) {
|
||||
return capacity;
|
||||
}
|
||||
template <>
|
||||
constexpr size_t CompressedArrayLen<NuqStream>(size_t capacity) {
|
||||
return NuqStream::PackedEnd(capacity);
|
||||
}
|
||||
} // namespace detail
|
||||
|
||||
// Compressed representation of floating-point elements. The array length may
|
||||
// differ from the number of elements. Associated operations such as Dot are
|
||||
// implemented in SIMD code and are thus non-member functions.
|
||||
template <typename MatT, size_t kCapacity>
|
||||
class CompressedArray {
|
||||
static constexpr size_t NumCompressed() {
|
||||
return detail::CompressedArrayLen<MatT>(kCapacity);
|
||||
}
|
||||
|
||||
public:
|
||||
MatT* data() { return data_.data(); }
|
||||
const MatT* data() const { return data_.data(); }
|
||||
|
||||
constexpr size_t NumElements() const { return kCapacity; }
|
||||
|
||||
constexpr size_t CompressedSize() const {
|
||||
return NumCompressed() * sizeof(MatT);
|
||||
}
|
||||
|
||||
private:
|
||||
std::array<MatT, NumCompressed()> data_;
|
||||
};
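// Worked example of the two sizes (SfpStream and NuqStream are 1-byte types;
// NuqStream::PackedEnd is defined in nuq.h):
//   CompressedArray<float, 4096>            NumElements() 4096, 16384 bytes
//   CompressedArray<hwy::bfloat16_t, 4096>  NumElements() 4096,  8192 bytes
//   CompressedArray<SfpStream, 4096>        NumElements() 4096,  4096 bytes
//   CompressedArray<NuqStream, 4096>        NumElements() 4096,
//       CompressedSize() = NuqStream::PackedEnd(4096) bytes (per-group
//       tables plus packed 4-bit indices, i.e. roughly half a byte each).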
|
||||
|
||||
#if COMPRESS_STATS
|
||||
class CompressStats {
|
||||
public:
|
||||
void Notify(const DistortionStats& stats) {
|
||||
const float pnorm = stats.PNorm();
|
||||
const float snr = stats.GeomeanValueDivL1();
|
||||
num_exact_ += stats.NumExact();
|
||||
s_pnorm_.Notify(pnorm);
|
||||
// No loss - skip to avoid dragging down the average.
|
||||
if (snr != 0.0f) {
|
||||
s_snr_.Notify(snr);
|
||||
}
|
||||
}
|
||||
|
||||
void NotifyIn(int sfp) { hist_weights_.Notify(sfp); }
|
||||
|
||||
void Assimilate(const CompressStats& other) {
|
||||
s_pnorm_.Assimilate(other.s_pnorm_);
|
||||
s_snr_.Assimilate(other.s_snr_);
|
||||
num_exact_ += other.num_exact_;
|
||||
hist_weights_.Assimilate(other.hist_weights_);
|
||||
}
|
||||
|
||||
void PrintAll() {
|
||||
const int skip = Stats::kNoGeomean;
|
||||
fprintf(stderr, " pnorm %s\n", s_pnorm_.ToString(skip).c_str());
|
||||
fprintf(stderr, " SNR %s\n", s_snr_.ToString(skip).c_str());
|
||||
fprintf(stderr, " #exact %.3E\n", static_cast<double>(num_exact_));
|
||||
// hist_weights_.Print("indices");
|
||||
}
|
||||
|
||||
void Reset() {
|
||||
s_pnorm_.Reset();
|
||||
s_snr_.Reset();
|
||||
num_exact_ = 0;
|
||||
hist_weights_.Reset();
|
||||
}
|
||||
|
||||
private:
|
||||
Stats s_pnorm_;
|
||||
Stats s_snr_;
|
||||
size_t num_exact_ = 0;
|
||||
Bins<1000> hist_weights_;
|
||||
char padding_[64]; // prevent false sharing
|
||||
};
|
||||
#else
|
||||
struct CompressStats {
|
||||
void Notify(const DistortionStats&) {}
|
||||
void NotifyIn(int) {}
|
||||
void Assimilate(const CompressStats&) {}
|
||||
void PrintAll() {}
|
||||
void Reset() {}
|
||||
};
|
||||
#endif // COMPRESS_STATS
|
||||
|
||||
struct CompressPerThread {
|
||||
CompressStats stats;
|
||||
ClusterBuf buf;
|
||||
};
|
||||
|
||||
struct CompressWorkingSet {
|
||||
std::vector<CompressPerThread> tls;
|
||||
};
|
||||
|
||||
// Returns key for the given tensor name. Also encodes the type, so that
|
||||
// changing the representation automatically invalidates prior cached files
|
||||
// (the new blob name will not be found).
|
||||
template <typename MatT>
|
||||
hwy::uint128_t CacheKey(const char* name) {
|
||||
// Already used/retired: s, S, n, 1
|
||||
const char prefix = hwy::IsSame<MatT, float>() ? 'F'
|
||||
: hwy::IsSame<MatT, hwy::bfloat16_t>() ? 'B'
|
||||
: hwy::IsSame<MatT, SfpStream>() ? '$'
|
||||
: hwy::IsSame<MatT, NuqStream>() ? '2'
|
||||
: '?';
|
||||
|
||||
return MakeKey((std::string(1, prefix) + name).c_str());
|
||||
}
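// Example: CacheKey<SfpStream>("c_embedding") equals MakeKey("$c_embedding"),
// whereas CacheKey<hwy::bfloat16_t>("c_embedding") equals
// MakeKey("Bc_embedding"), so switching the weight type simply misses the old
// cache entry and triggers recompression. (The tensor name is illustrative.)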
|
||||
|
||||
class CacheLoader {
|
||||
public:
|
||||
explicit CacheLoader(const char* blob_filename) {
|
||||
err_ = reader_.Open(blob_filename);
|
||||
if (err_ != 0) {
|
||||
fprintf(stderr,
|
||||
"Cached compressed weights does not exist yet (code %d), "
|
||||
"compressing weights and creating file: %s.\n",
|
||||
err_, blob_filename);
|
||||
}
|
||||
}
|
||||
|
||||
// Called for each tensor, enqueues read requests.
|
||||
template <typename MatT, size_t kCapacity>
|
||||
void operator()(const char* name, const float* null,
|
||||
CompressedArray<MatT, kCapacity>& compressed) {
|
||||
HWY_DASSERT(null == nullptr);
|
||||
|
||||
// Skip if reader_ is invalid or any load failed: we will regenerate
|
||||
// everything because it's rare to update only a few tensors.
|
||||
if (err_ != 0) return;
|
||||
|
||||
err_ = reader_.Enqueue(CacheKey<MatT>(name), compressed.data(),
|
||||
compressed.CompressedSize());
|
||||
if (err_ != 0) {
|
||||
fprintf(stderr, "Failed to read cache %s (error %d)\n", name, err_);
|
||||
}
|
||||
}
|
||||
|
||||
// Returns whether all tensors are successfully loaded from cache.
|
||||
bool ReadAll(hwy::ThreadPool& pool) {
|
||||
// reader_ invalid or any Enqueue failed
|
||||
if (err_ != 0) return false;
|
||||
|
||||
err_ = reader_.ReadAll(pool);
|
||||
if (err_ != 0) {
|
||||
fprintf(stderr, "Failed to read all tensors (error %d)\n", err_);
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
private:
|
||||
BlobReader reader_;
|
||||
BlobError err_ = 0;
|
||||
};
|
||||
|
||||
} // namespace gcpp
|
||||
#endif // THIRD_PARTY_GEMMA_CPP_COMPRESSION_COMPRESS_H_
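// Typical flow combining CacheLoader (above) with the Compressor defined in
// compress-inl.h (a sketch; the tensor name, file path and `weights`/`raw`
// structs are placeholders for the caller's ForeachTensor-style traversal):
//
//   hwy::ThreadPool pool(4);
//   CacheLoader loader("/path/to/weights.sbs");
//   loader("c_embedding", nullptr, weights.c_embedding);   // enqueue read
//   // ... one call per tensor ...
//   if (!loader.ReadAll(pool)) {
//     // Cache absent or stale: recompress from float weights and store it.
//     Compressor compressor(pool);
//     compressor("c_embedding", raw.c_embedding, weights.c_embedding);
//     // ... one call per tensor ...
//     compressor.WriteAll(pool, "/path/to/weights.sbs");
//   }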
|
||||
|
|
@@ -0,0 +1,99 @@
|
|||
// Copyright 2023 Google LLC
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#ifndef THIRD_PARTY_GEMMA_CPP_COMPRESSION_DISTORTION_H_
|
||||
#define THIRD_PARTY_GEMMA_CPP_COMPRESSION_DISTORTION_H_
|
||||
#include <math.h> // pow
|
||||
#include <stddef.h>
|
||||
|
||||
#include "hwy/base.h" // ScalarAbs
|
||||
|
||||
namespace gcpp {
|
||||
|
||||
class DistortionStats {
|
||||
public:
|
||||
void Notify(float original, float distorted) {
|
||||
const double l1 = hwy::ScalarAbs(original - distorted);
|
||||
|
||||
if (l1 > max_l1_) {
|
||||
max_l1_ = l1;
|
||||
max_idx_ = n_;
|
||||
}
|
||||
|
||||
const double pow3 = l1 * l1 * l1;
|
||||
sum_pow3_ += pow3;
|
||||
sum_pow6_ += pow3 * pow3;
|
||||
n_ += 1;
|
||||
|
||||
// Avoid division by zero, which happens when there is no error. NumExact()
|
||||
// reports the number of times this happens.
|
||||
if (l1 != 0.0) {
|
||||
const double rel = 1.0 + hwy::ScalarAbs(original) / l1;
|
||||
// Logarithm is required to prevent overflow. A hierarchical geomean
|
||||
// could also work, but that is more complex and not necessarily better.
|
||||
sum_log_rel_ += log(rel);
|
||||
num_rel_ += 1;
|
||||
}
|
||||
}
|
||||
|
||||
void Assimilate(const DistortionStats& other) {
|
||||
if (other.max_l1_ > max_l1_) {
|
||||
max_l1_ = other.max_l1_;
|
||||
max_idx_ = other.max_idx_;
|
||||
}
|
||||
|
||||
sum_pow3_ += other.sum_pow3_;
|
||||
sum_pow6_ += other.sum_pow6_;
|
||||
n_ += other.n_;
|
||||
|
||||
sum_log_rel_ += other.sum_log_rel_;
|
||||
num_rel_ += other.num_rel_;
|
||||
}
|
||||
|
||||
size_t NumExact() const { return n_ - num_rel_; }
|
||||
|
||||
double GeomeanValueDivL1() const {
|
||||
if (num_rel_ == 0) return 0.0;
|
||||
return exp(sum_log_rel_ / num_rel_);
|
||||
}
|
||||
|
||||
double PNorm() const {
|
||||
// p-norms are a compromise between max-norm (penalizes the largest error
|
||||
// without dilution, but does not notice any other errors) and L1 (all
|
||||
// errors contribute, but large errors are diluted by smaller ones).
|
||||
const double norm3 = pow(sum_pow3_ / n_, 1.0 / 3);
|
||||
const double norm6 = pow(sum_pow6_ / n_, 1.0 / 6);
|
||||
return 0.5 * (norm3 + norm6);
|
||||
}
|
||||
|
||||
size_t MaxIndex() const { return max_idx_; }
|
||||
double MaxL1() const { return max_l1_; }
|
||||
|
||||
private:
|
||||
size_t n_ = 0;
|
||||
size_t max_idx_ = 0; // index that had l1 = max_l1_.
|
||||
double max_l1_ = -1.0;
|
||||
|
||||
double sum_pow3_ = 0.0;
|
||||
double sum_pow6_ = 0.0;
|
||||
|
||||
double sum_log_rel_ = 0.0;
|
||||
size_t num_rel_ = 0;
|
||||
double padding_; // prevents false sharing
|
||||
};
|
||||
|
||||
} // namespace gcpp
|
||||
|
||||
#endif // THIRD_PARTY_GEMMA_CPP_COMPRESSION_DISTORTION_H_
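// Usage sketch (array names and length are placeholders): feed each
// (original, decoded) pair to Notify, then read the summary metrics.
//
//   DistortionStats stats;
//   for (size_t i = 0; i < num; ++i) {
//     stats.Notify(original[i], decoded[i]);
//   }
//   // PNorm(): blend of 3- and 6-norms of the absolute errors;
//   // GeomeanValueDivL1(): SNR-like ratio, larger is better;
//   // NumExact(): pairs reproduced without any error.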
|
||||
|
|
@@ -0,0 +1,730 @@
|
|||
// Copyright 2023 Google LLC
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
// Normal include guard.
|
||||
#ifndef THIRD_PARTY_GEMMA_CPP_COMPRESSION_NUQ_INL_H_
|
||||
#define THIRD_PARTY_GEMMA_CPP_COMPRESSION_NUQ_INL_H_
|
||||
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
|
||||
// copybara:import_next_line:gemma_cpp
|
||||
#include "compression/nuq.h"
|
||||
// copybara:import_next_line:gemma_cpp
|
||||
#include "compression/sfp.h"
|
||||
#include "hwy/base.h"
|
||||
|
||||
#endif // THIRD_PARTY_GEMMA_CPP_COMPRESSION_NUQ_INL_H_
|
||||
|
||||
// Actual per-target include guard.
|
||||
#if defined(THIRD_PARTY_GEMMA_CPP_COMPRESSION_NUQ_INL_TOGGLE) == \
|
||||
defined(HWY_TARGET_TOGGLE)
|
||||
#ifdef THIRD_PARTY_GEMMA_CPP_COMPRESSION_NUQ_INL_TOGGLE
|
||||
#undef THIRD_PARTY_GEMMA_CPP_COMPRESSION_NUQ_INL_TOGGLE
|
||||
#else
|
||||
#define THIRD_PARTY_GEMMA_CPP_COMPRESSION_NUQ_INL_TOGGLE
|
||||
#endif
|
||||
|
||||
// copybara:import_next_line:gemma_cpp
|
||||
#include "compression/sfp-inl.h"
|
||||
#include "hwy/contrib/sort/vqsort-inl.h"
|
||||
#include "hwy/highway.h"
|
||||
|
||||
HWY_BEFORE_NAMESPACE();
|
||||
namespace gcpp {
|
||||
namespace HWY_NAMESPACE {
|
||||
namespace hn = hwy::HWY_NAMESPACE;
|
||||
|
||||
// For internal use by NuqCodec.
|
||||
class NuqClustering {
|
||||
// To go from sorted order back to the original order in O(1), we store the
|
||||
// original index in the lower bits of the float32 mantissa, which means they
|
||||
// are sorted alongside the value.
|
||||
struct FloatPayload {
|
||||
// Resets payload to zero; useful for displaying the actual value.
|
||||
static HWY_INLINE float Clear(float f) {
|
||||
const uint32_t binary32 = hwy::BitCastScalar<uint32_t>(f);
|
||||
return hwy::BitCastScalar<float>(binary32 &
|
||||
~static_cast<uint32_t>(kGroupSize - 1));
|
||||
}
|
||||
|
||||
// Sets payload to `bits`.
|
||||
static HWY_INLINE float Set(float f, size_t bits) {
|
||||
HWY_DASSERT(bits < kGroupSize);
|
||||
const uint32_t binary32 = hwy::BitCastScalar<uint32_t>(Clear(f));
|
||||
return hwy::BitCastScalar<float>(static_cast<uint32_t>(binary32 | bits));
|
||||
}
|
||||
|
||||
// Obtains the payload (index) previously set by `Set`.
|
||||
static HWY_INLINE size_t Get(float f) {
|
||||
return hwy::BitCastScalar<uint32_t>(f) &
|
||||
static_cast<uint32_t>(kGroupSize - 1);
|
||||
}
|
||||
};
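// Example: with kGroupSize = 128 (see nuq.h), the payload mask is
// kGroupSize - 1 = 0x7F, i.e. the low 7 mantissa bits. Set(x, 5) perturbs x
// only in those bits, so sorting is (almost) unaffected, Get() recovers 5,
// and Clear() restores a clean value before it is used in arithmetic.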
|
||||
|
||||
// Cumulative sums for O(1) mean and interval sums.
|
||||
class ClusterCost {
|
||||
public:
|
||||
explicit ClusterCost(const float* sorted) {
|
||||
cumsum_[0] = cumsum2_[0] = 0.0;
|
||||
for (size_t i = 0; i < kGroupSize; ++i) {
|
||||
const float x = FloatPayload::Clear(sorted[i]);
|
||||
cumsum_[1 + i] = x + cumsum_[i];
|
||||
cumsum2_[1 + i] = x * x + cumsum2_[i];
|
||||
}
|
||||
|
||||
inv_len_[0] = 0.0f; // unused
|
||||
for (size_t i = 0; i <= kGroupSize; ++i) {
|
||||
inv_len_[i] = 1.0f / i;
|
||||
}
|
||||
}
|
||||
|
||||
float SumOfSorted(size_t first, size_t last) const {
|
||||
return cumsum_[last + 1] - cumsum_[first];
|
||||
}
|
||||
|
||||
// Returns the cost of clustering first..last with their mean, i.e. the sum of
|
||||
// squared distances, for a vector of consecutive `last` values (one per
|
||||
// lane). O(1) thanks to cumulative sums, which works for Lp norms with p > 1;
|
||||
// we choose p=2 for simplicity (fewer terms).
|
||||
template <class DF>
|
||||
hn::Vec<DF> operator()(DF df, size_t first, size_t last) const {
|
||||
// Callers are responsible for ignoring lanes where last < first.
|
||||
HWY_DASSERT(first < kGroupSize);
|
||||
HWY_DASSERT(last < kGroupSize);
|
||||
const size_t len = last - first + 1;
|
||||
const hn::Vec<DF> vlen =
|
||||
hn::Iota(df, static_cast<float>(static_cast<int>(len)));
|
||||
|
||||
const hn::Vec<DF> u_lo = hn::Set(df, cumsum_[first]);
|
||||
const hn::Vec<DF> u_lo2 = hn::Set(df, cumsum2_[first]);
|
||||
const hn::Vec<DF> hi = hn::LoadU(df, cumsum_ + last + 1);
|
||||
const hn::Vec<DF> hi2 = hn::LoadU(df, cumsum2_ + last + 1);
|
||||
const hn::Vec<DF> sum = hn::Sub(hi, u_lo);
|
||||
const hn::Vec<DF> sum2 = hn::Sub(hi2, u_lo2);
|
||||
|
||||
// Compute mean: table lookup is faster than division.
|
||||
const hn::Vec<DF> mu = hn::Mul(sum, hn::LoadU(df, inv_len_ + len));
|
||||
|
||||
// (x - mu)^2 = sum2 - 2mu*sum + mu^2
|
||||
const hn::Vec<DF> mu2 = hn::Mul(mu, mu);
|
||||
const hn::Vec<DF> two_mu = hn::Add(mu, mu);
|
||||
return hn::NegMulAdd(two_mu, sum, hn::MulAdd(vlen, mu2, sum2));
|
||||
}
|
||||
|
||||
private:
|
||||
// Float has enough precision for our relatively small kGroupSize (128).
|
||||
float cumsum_[kGroupSize + 1];
|
||||
float cumsum2_[kGroupSize + 1];
|
||||
float inv_len_[kGroupSize + 1];
|
||||
};
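// In scalar terms, the cost returned by operator() for one lane is the sum of
// squared distances from the cluster mean, expanded so only the cumulative
// sums are needed:
//   sum_i (x_i - mu)^2 = sum2 - 2*mu*sum + len*mu^2,
// with sum = cumsum_[last+1] - cumsum_[first] (likewise sum2 from cumsum2_),
// len = last - first + 1 and mu = sum / len (via the inv_len_ table).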
|
||||
|
||||
// Cost of clustering 0..last, where the rightmost cluster is j..last. This is
|
||||
// called in a loop over j, and we return the vector of costs for a batch of
|
||||
// last = [last, last + N).
|
||||
template <class DF>
|
||||
static HWY_INLINE hn::Vec<DF> ClusterDynProg(
|
||||
DF df, const AlignedMatrix<float>& D, const ClusterCost& cc,
|
||||
const size_t num_clusters, const size_t last, const size_t j) {
|
||||
HWY_DASSERT(last < kGroupSize);
|
||||
HWY_DASSERT(0 != j && j < kGroupSize);
|
||||
|
||||
const hn::RebindToSigned<decltype(df)> di;
|
||||
using VF = hn::Vec<decltype(df)>;
|
||||
using VI = hn::Vec<decltype(di)>;
|
||||
using MI = hn::Mask<decltype(di)>;
|
||||
|
||||
const VI vlast = hn::Iota(di, static_cast<int32_t>(last));
|
||||
|
||||
// We have a non-empty rightmost cluster if j <= last <==> j-1 < last.
|
||||
const MI valid = hn::Lt(hn::Set(di, static_cast<int32_t>(j) - 1), vlast);
|
||||
// If not valid, return an arbitrary high cost, which will not be the min.
|
||||
const VF max = hn::Set(df, 1E38f);
|
||||
// Cost of clustering 0..j-1 with one fewer cluster than now.
|
||||
const VF vd = hn::Set(df, D(num_clusters - 1, j - 1));
|
||||
// Eq2: add to that the cost of another cluster from j..last.
|
||||
return hn::MaskedAddOr(max, RebindMask(df, valid), vd, cc(df, j, last));
|
||||
}
|
||||
|
||||
public:
|
||||
// Clusters `kGroupSize` values in `x`, which need not be sorted already nor
|
||||
// aligned, by choosing and filling `centers` (size `kClusters`, ascending
|
||||
// order, not necessarily equal to one of the `x`). Fills `indices` with the
|
||||
// index of the cluster to which each `x` belongs (16-bit for bit-packing).
|
||||
// `buf` is per-thread.
|
||||
//
|
||||
// Returns the number of unused clusters, i.e., the starting index within
|
||||
// `centers`; prior centers are zero-initialized.
|
||||
//
|
||||
// O(kClusters * kGroupSize * kGroupSize), but the constant factors are so low
|
||||
// that this is about 10 times as fast as the O(kClusters * kGroupSize) SMAWK
|
||||
// as implemented in FAISS, for our kGroupSize <= 128.
|
||||
template <class DF>
|
||||
static HWY_NOINLINE size_t ClusterExactL2(DF df, const float* x,
|
||||
ClusterBuf& buf,
|
||||
float* HWY_RESTRICT centers,
|
||||
uint16_t* HWY_RESTRICT indices) {
|
||||
const hn::RebindToSigned<decltype(df)> di;
|
||||
using VF = hn::Vec<decltype(df)>;
|
||||
using VI = hn::Vec<decltype(di)>;
|
||||
const VI k1 = hn::Set(di, 1);
|
||||
const size_t N = hn::Lanes(df);
|
||||
|
||||
HWY_ALIGN float sorted_and_i[kGroupSize];
|
||||
for (size_t i = 0; i < kGroupSize; ++i) {
|
||||
sorted_and_i[i] = FloatPayload::Set(x[i], i);
|
||||
}
|
||||
hn::VQSortStatic(sorted_and_i, kGroupSize, hwy::SortAscending());
|
||||
ClusterCost cc(sorted_and_i);
|
||||
|
||||
// Reference: https://arxiv.org/abs/1701.07204
|
||||
// D[k-1][m] is the lowest cost of clustering x1..m into k clusters.
|
||||
AlignedMatrix<float>& D = buf.d;
|
||||
// T[k][m] is the starting index within sorted_and_i[] of the k-th cluster.
|
||||
AlignedMatrix<int32_t>& T = buf.t;
|
||||
|
||||
// Initialize the first rows for a single cluster.
|
||||
for (size_t last = 0; last < kGroupSize; last += N) {
|
||||
hn::Store(cc(df, 0, last), df, &D(0, last)); // Cost of 0..last
|
||||
hn::Store(Zero(di), di, &T(0, last)); // Cluster index = 0
|
||||
}
|
||||
|
||||
for (size_t num_clusters = 1; num_clusters < kClusters; ++num_clusters) {
|
||||
// For each batch starting at `last`, one per lane:
|
||||
for (size_t last = 0; last < kGroupSize; last += N) {
|
||||
VF min = cc(df, 0, last);
|
||||
VI arg = hn::Zero(di);
|
||||
// For each j (start of rightmost cluster):
|
||||
VI vj = k1;
|
||||
for (size_t j = 1; j < last + N; ++j, vj = Add(vj, k1)) {
|
||||
const VF c = ClusterDynProg(df, D, cc, num_clusters, last, j);
|
||||
|
||||
// Retain the min cost and the j index that caused it.
|
||||
const auto less = hn::Lt(c, min);
|
||||
min = hn::IfThenElse(less, c, min);
|
||||
arg = hn::IfThenElse(RebindMask(di, less), vj, arg);
|
||||
}
|
||||
hn::Store(min, df, &D(num_clusters, last));
|
||||
hn::Store(arg, di, &T(num_clusters, last));
|
||||
}
|
||||
}
|
||||
|
||||
// Backtrack to find centers. Clusters are [T(k, last), last].
|
||||
size_t last = kGroupSize - 1;
|
||||
size_t unused_clusters = 0;
|
||||
for (size_t k = kClusters - 1; k < kClusters; --k) {
|
||||
const size_t start = static_cast<size_t>(T(k, last));
|
||||
// Center = mean, O(1) thanks to cumulative sums.
|
||||
const float sum = cc.SumOfSorted(start, last);
|
||||
const int size = static_cast<int>(last) - static_cast<int>(start) + 1;
|
||||
HWY_DASSERT(0 < size && size <= kGroupSize);
|
||||
centers[k] = sum / size;
|
||||
|
||||
// We know the range inside sorted_and_i[]; translate to original indices,
|
||||
// which are stored inside each of the sorted_and_i mantissas.
|
||||
for (size_t i = start; i <= last; ++i) {
|
||||
const size_t idx_x = FloatPayload::Get(sorted_and_i[i]);
|
||||
HWY_DASSERT(idx_x < kGroupSize);
|
||||
indices[idx_x] = static_cast<uint16_t>(k);
|
||||
}
|
||||
|
||||
// Not using all clusters. Avoid out of bounds accesses by stopping early.
|
||||
if (start == 0) {
|
||||
unused_clusters = k;
|
||||
for (size_t cluster = 0; cluster < unused_clusters; ++cluster) {
|
||||
centers[cluster] = 0.0f;
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
last = start - 1;
|
||||
HWY_DASSERT(last < kGroupSize);
|
||||
}
|
||||
|
||||
if (HWY_IS_DEBUG_BUILD) {
|
||||
// Centers are in ascending order.
|
||||
for (size_t i = unused_clusters + 1; i < kClusters; ++i) {
|
||||
HWY_DASSERT(centers[i] >= centers[i - 1]);
|
||||
}
|
||||
}
|
||||
return unused_clusters;
|
||||
}
|
||||
}; // NuqClustering
|
||||
|
||||
// Bit-packing 4-bit values is trivial if we have 2 or 4 independent vectors:
|
||||
// simply shift+OR them together into a full vector of 8 or 16-bit lanes.
|
||||
// However, the order then depends on the vector length, which is unacceptable
|
||||
// because we may store the encoding to disk and decode on another CPU.
|
||||
//
|
||||
// The dependency on vector length could be removed by introducing fixed-size
|
||||
// packets and loading the next vector from a fixed offset known to be at
|
||||
// least the vector length. However, this may require packets that are larger
|
||||
// than the seek granularity of the application (e.g. matrix rows).
|
||||
//
|
||||
// We instead choose a continuous stream layout, which seems to entail the
|
||||
// nibbles being stored and decoded in-order. This involves nontrivial shuffle
|
||||
// operations which benefit from special-casing for target and vector length.
|
||||
class NibbleCodec {
|
||||
public:
|
||||
// Packs four u16 vectors' lanes to nibbles within one vector, in order, and
|
||||
// stores that vector to `out`.
|
||||
template <class D16, class V16 = hn::Vec<D16>>
|
||||
static HWY_INLINE void OrderedPackU16(D16 d16, V16 in0, V16 in1, V16 in2,
|
||||
V16 in3, uint8_t* HWY_RESTRICT out) {
|
||||
const hn::Repartition<uint8_t, D16> d8;
|
||||
const hn::Repartition<uint32_t, D16> d32;
|
||||
const hn::Repartition<uint64_t, D16> d64;
|
||||
using V8 = hn::Vec<decltype(d8)>;
|
||||
|
||||
// Pairwise compaction of a single vector so nibbles are packed in-order.
|
||||
// v16 lanes hold a 4-bit value; OR together adjacent pairs into the lower
|
||||
// byte of *even* u16.
|
||||
const auto combine_u16_pair_to_8 = [d16, d32](V16 v16) HWY_ATTR {
|
||||
return hn::Xor(
|
||||
v16, hn::BitCast(d16, hn::ShiftRight<12>(hn::BitCast(d32, v16))));
|
||||
};
|
||||
|
||||
const V16 u8_0 = combine_u16_pair_to_8(in0);
|
||||
const V16 u8_1 = combine_u16_pair_to_8(in1);
|
||||
const V16 u8_2 = combine_u16_pair_to_8(in2);
|
||||
const V16 u8_3 = combine_u16_pair_to_8(in3);
|
||||
V8 packed;
|
||||
if (HWY_TARGET <= HWY_AVX3_DL || !HWY_ARCH_X86) {
|
||||
// 8-bit ConcatEven is efficient. Let digits denote eight u8 lanes
|
||||
// of u8_1/0: ?d?3 ?c?2 / ?b?1 ?a?0. 8-bit ConcatEven = d3c2 b1a0, and
|
||||
// again with the second x2_1 gives 7654 3210.
|
||||
const V8 x2_0 = hn::ConcatEven(d8, BitCast(d8, u8_1), BitCast(d8, u8_0));
|
||||
const V8 x2_1 = hn::ConcatEven(d8, BitCast(d8, u8_3), BitCast(d8, u8_2));
|
||||
packed = hn::ConcatEven(d8, x2_1, x2_0);
|
||||
} else {
|
||||
// To avoid expensive 8-bit ConcatEven, compact pairs of u32 into the
|
||||
// lower 16 bits in each u64, with other bits undefined.
|
||||
const auto combine_u32_pair_to_16 = [d16, d64](V16 v16) HWY_ATTR {
|
||||
return hn::Xor(
|
||||
v16, hn::BitCast(d16, hn::ShiftRight<24>(hn::BitCast(d64, v16))));
|
||||
};
|
||||
const V16 u16_0 = combine_u32_pair_to_16(u8_0);
|
||||
const V16 u16_1 = combine_u32_pair_to_16(u8_1);
|
||||
const V16 u16_2 = combine_u32_pair_to_16(u8_2);
|
||||
const V16 u16_3 = combine_u32_pair_to_16(u8_3);
|
||||
// In-order compaction of four vectors into one, keeping only the low
|
||||
// u16 of every u64. This is the same as above but with 16-bit Concat.
|
||||
const V16 x2_0 = hn::ConcatEven(d16, u16_1, u16_0);
|
||||
const V16 x2_1 = hn::ConcatEven(d16, u16_3, u16_2);
|
||||
packed = hn::BitCast(d8, hn::ConcatEven(d16, x2_1, x2_0));
|
||||
}
|
||||
hn::StoreU(packed, d8, out);
|
||||
}
|
||||
|
||||
// Unpacks `Lanes(d16)` nibbles to u16 lanes. The first comes from the low
|
||||
// nibble of packed[0], then its high nibble, then the next low nibble, etc.
|
||||
template <class D16, class V16 = hn::Vec<D16>>
|
||||
static HWY_INLINE V16 OrderedUnpackU16(D16 d16, const uint8_t* packed) {
|
||||
const hn::Repartition<uint8_t, D16> d8;
|
||||
using V8 = hn::Vec<decltype(d8)>;
|
||||
const hn::CappedTag<uint8_t, d16.MaxBytes() / 4> d_load;
|
||||
|
||||
// We replicate each byte 4x, so that its two nibbles propagate to both
|
||||
// u16 lanes that they will initialize. The only performance-portable op to
|
||||
// replicate bytes is TableLookupBytes, which shuffles 128-bit blocks
|
||||
// independently. Thus each block receives 4 packed bytes, replicates them
|
||||
// 4x, shifts/masks, and casts to 8 u16 lanes.
|
||||
//
|
||||
// Loading 16 bytes via LoadDup128 only works on AVX3; for smaller vectors,
|
||||
// it may trigger asan errors from overrunning the end. We thus special-case
|
||||
// vector lengths, handling any non-constexpr, and constexpr <= 512 bit.
|
||||
V8 rep4;
|
||||
if (HWY_HAVE_SCALABLE) {
|
||||
// Non constexpr length: 4 per whole block equals size/4.
|
||||
const size_t num_bytes = HWY_MAX(1, hn::Lanes(d8) / 4);
|
||||
const V8 bytes = hn::LoadN(d8, packed, num_bytes);
|
||||
// Replicate bytes 4x: lowest 4 = 0, next 4 = 1 etc.
|
||||
const V8 idx = hn::And(hn::Iota(d8, 0), hn::Set(d8, 0xFCu));
|
||||
rep4 = hn::TableLookupBytes(bytes, idx);
|
||||
} else if (hn::MaxLanes(d16) <= 8) { // <= 128-bit
|
||||
const V8 bytes = hn::ResizeBitCast(d8, hn::LoadU(d_load, packed));
|
||||
alignas(16) static constexpr uint8_t kRep4[16] = {
|
||||
HWY_REP4(0), HWY_REP4(1), HWY_REP4(2), HWY_REP4(3)};
|
||||
rep4 = hn::TableLookupBytes(bytes, hn::Load(d8, kRep4));
|
||||
} else if (HWY_TARGET <= HWY_AVX3_DL || !HWY_ARCH_X86) {
|
||||
// Plain load, can do 256..512-bit permute across blocks.
|
||||
const V8 bytes = hn::ResizeBitCast(d8, hn::LoadU(d_load, packed));
|
||||
alignas(64) static constexpr uint8_t kRep4[64] = {
|
||||
HWY_REP4(0), HWY_REP4(1), HWY_REP4(2), HWY_REP4(3),
|
||||
HWY_REP4(4), HWY_REP4(5), HWY_REP4(6), HWY_REP4(7),
|
||||
HWY_REP4(8), HWY_REP4(9), HWY_REP4(10), HWY_REP4(11),
|
||||
HWY_REP4(12), HWY_REP4(13), HWY_REP4(14), HWY_REP4(15)};
|
||||
rep4 = hn::TableLookupLanes(bytes, hn::SetTableIndices(d8, kRep4));
|
||||
} else if (hn::MaxLanes(d16) == 16) { // 256-bit
|
||||
const V8 bytes = hn::ResizeBitCast(d8, hn::LoadU(d_load, packed));
|
||||
// First copy to upper block for TableLookupBytes. This is slightly
|
||||
// faster than 64-bit BroadcastLane.
|
||||
const V8 bcast = hn::ConcatLowerLower(d8, bytes, bytes);
|
||||
alignas(32) static constexpr uint8_t kRep4[32] = {
|
||||
HWY_REP4(0), HWY_REP4(1), HWY_REP4(2), HWY_REP4(3),
|
||||
HWY_REP4(4), HWY_REP4(5), HWY_REP4(6), HWY_REP4(7)};
|
||||
rep4 = hn::TableLookupBytes(bcast, hn::Load(d8, kRep4));
|
||||
} else if (hn::MaxLanes(d16) == 32) { // 512-bit
|
||||
const V8 bytes = hn::LoadDup128(d8, packed);
|
||||
alignas(64) static constexpr uint8_t kRep4[64] = {
|
||||
HWY_REP4(0), HWY_REP4(1), HWY_REP4(2), HWY_REP4(3),
|
||||
HWY_REP4(4), HWY_REP4(5), HWY_REP4(6), HWY_REP4(7),
|
||||
HWY_REP4(8), HWY_REP4(9), HWY_REP4(10), HWY_REP4(11),
|
||||
HWY_REP4(12), HWY_REP4(13), HWY_REP4(14), HWY_REP4(15)};
|
||||
rep4 = hn::TableLookupBytes(bytes, hn::Load(d8, kRep4));
|
||||
} else {
|
||||
HWY_DASSERT(false);
|
||||
}
|
||||
|
||||
const V16 mask4 = hn::Set(d16, 0xF);
|
||||
const V16 u16 = BitCast(d16, rep4);
|
||||
// In-order unpack. Right-shift odd u16 by 4. Example with two packed
|
||||
// bytes, one digit representing a nibble:
|
||||
// 32 32 32 32 | 10 10 10 10 u16
|
||||
// z3 23 32 32 | z1 01 10 10 OddEven+ShiftRight
|
||||
// zz z3 zz z2 | zz z1 zz z0 And (unpacked result)
|
||||
return hn::And(mask4, hn::OddEven(hn::ShiftRight<4>(u16), u16));
|
||||
}
|
||||
};
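// Worked example of the stream order: packing the sixteen 4-bit indices
// 0,1,2,...,15 produces the bytes 0x10 0x32 0x54 0x76 0x98 0xBA 0xDC 0xFE,
// i.e. each byte holds the earlier index in its low nibble and the later one
// in its high nibble. OrderedUnpackU16 restores exactly this order.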
|
||||
|
||||
// Encode/decode functions.
|
||||
class NuqCodec {
|
||||
// 256-bit vectors can hold 16 bf16, otherwise we require 2x128-bit.
|
||||
template <class DU>
|
||||
static constexpr size_t NumTables(DU du) {
|
||||
return (!HWY_HAVE_SCALABLE && du.MaxBytes() >= 32) ? 1 : 2;
|
||||
}
|
||||
|
||||
// Unpacks `centers` from SFP into bf16 and loads them into one or two vectors
|
||||
// for use by [Two]TableLookups. Returns as u16 because TableLookupLanes might
|
||||
// not be available for bf16.
|
||||
template <class DU, HWY_IF_U16_D(DU)>
|
||||
static HWY_INLINE hn::Vec<DU> LoadTable(DU du, const uint8_t* centers,
|
||||
hn::Vec<DU>* HWY_RESTRICT tbl1) {
|
||||
// Cap to the table size (kClusters) for decoding SFP - sufficient, and may
|
||||
// be faster than a large vector.
|
||||
const hn::CappedTag<hwy::bfloat16_t, kClusters> d_table;
|
||||
// We ResizeCast tables to DU: if DU is bigger, table lookups will only
|
||||
// access lanes < kClusters. If DU is smaller (128-bit), we have 2 tables.
|
||||
HWY_DASSERT(hn::Lanes(du) >= hn::Lanes(d_table) || NumTables(du) == 2);
|
||||
|
||||
HWY_ALIGN hwy::bfloat16_t table[kClusters];
|
||||
SfpCodec::Dec(d_table, reinterpret_cast<const SfpStream*>(centers),
|
||||
kClusters, table);
|
||||
|
||||
// If we assume >= 128-bit vectors, we can use [Two]TableLookupLanes
|
||||
// instead of TableLookupBytes, which requires extra interleaving of lo/hi.
|
||||
HWY_DASSERT(hn::Lanes(du) >= 8);
|
||||
|
||||
if (NumTables(du) == 2) {
|
||||
// Reduce cap for second half to avoid loading past the end of the table.
|
||||
const hn::CappedTag<hwy::bfloat16_t, kClusters / 2> d_table2;
|
||||
*tbl1 = hn::ResizeBitCast(du, hn::LoadU(d_table2, table + kClusters / 2));
|
||||
}
|
||||
return hn::ResizeBitCast(du, hn::Load(d_table, table));
|
||||
}
|
||||
|
||||
// Unpacks per-weight indices and sets c0/c1 to the corresponding centers.
|
||||
template <class DU>
|
||||
static HWY_INLINE void TableLookups(DU du, hn::Vec<DU> tbl0, hn::Vec<DU> tbl1,
|
||||
const uint8_t* packed, hn::Vec<DU>& c0,
|
||||
hn::Vec<DU>& c1) {
|
||||
using V16 = hn::Vec<decltype(du)>;
|
||||
const size_t N16 = hn::Lanes(du);
|
||||
|
||||
const V16 idx0 = NibbleCodec::OrderedUnpackU16(du, packed);
|
||||
const V16 idx1 = NibbleCodec::OrderedUnpackU16(du, packed + N16 / 2);
|
||||
|
||||
const auto indices0 = hn::IndicesFromVec(du, idx0);
|
||||
const auto indices1 = hn::IndicesFromVec(du, idx1);
|
||||
|
||||
if (NumTables(du) == 1) {
|
||||
(void)tbl1;
|
||||
c0 = hn::TableLookupLanes(tbl0, indices0);
|
||||
c1 = hn::TableLookupLanes(tbl0, indices1);
|
||||
} else {
|
||||
c0 = hn::TwoTablesLookupLanes(du, tbl0, tbl1, indices0);
|
||||
c1 = hn::TwoTablesLookupLanes(du, tbl0, tbl1, indices1);
|
||||
}
|
||||
}
|
||||
|
||||
public:
|
||||
// Encodes `num` floats starting from `in`. `out` points to compressed
|
||||
// storage for `out_capacity` values and `out_ofs` indicates the destination
|
||||
// offset within it, in units of float values, for parallel encoding by
|
||||
// multiple threads. `num`, `out_capacity`, and `out_ofs` must all be
|
||||
// multiples of `kGroupSize`. Returns the total number of unused clusters,
|
||||
// which is expected to be zero.
|
||||
template <class DF, HWY_IF_F32_D(DF)>
|
||||
static HWY_INLINE size_t Enc(DF df, const float* const in, const size_t num,
|
||||
ClusterBuf& buf, const size_t out_capacity,
|
||||
NuqStream* const out, const size_t out_ofs) {
|
||||
const hn::Repartition<uint8_t, DF> d8;
|
||||
const hn::Repartition<uint16_t, DF> d16;
|
||||
using V8 = hn::Vec<decltype(d8)>;
|
||||
using V16 = hn::Vec<decltype(d16)>;
|
||||
|
||||
const size_t N16 = hn::Lanes(d16);
|
||||
HWY_ASSERT(kGroupSize >= 4 * N16);
|
||||
|
||||
HWY_ASSERT(out_ofs + num <= out_capacity);
|
||||
buf.Resize(num);
|
||||
HWY_ASSERT(num % kGroupSize == 0);
|
||||
HWY_ASSERT(out_capacity % kGroupSize == 0);
|
||||
HWY_ASSERT(out_ofs % kGroupSize == 0);
|
||||
const size_t num_groups = num / kGroupSize;
|
||||
const size_t ofs_groups = out_ofs / kGroupSize;
|
||||
|
||||
size_t unused_clusters = 0;
|
||||
for (size_t g = 0; g < num_groups; ++g) {
|
||||
const float* HWY_RESTRICT g_in = in + g * kGroupSize;
|
||||
float* HWY_RESTRICT g_centers = buf.centers.get() + g * kClusters;
|
||||
uint16_t* HWY_RESTRICT g_idx = buf.idx.get() + g * kGroupSize;
|
||||
unused_clusters +=
|
||||
NuqClustering::ClusterExactL2(df, g_in, buf, g_centers, g_idx);
|
||||
}
|
||||
|
||||
uint8_t* centers = &out->byte + ofs_groups * kClusters;
|
||||
SfpCodec::Enc(df, buf.centers.get(), num_groups * kClusters,
|
||||
reinterpret_cast<SfpStream*>(centers));
|
||||
uint8_t* packed_start = &out->byte + NuqStream::PackedStart(out_capacity) +
|
||||
ofs_groups * kGroupSize / 2;
|
||||
|
||||
HWY_UNROLL(1)
|
||||
for (size_t g = 0; g < num_groups; ++g) {
|
||||
const uint16_t* HWY_RESTRICT g_idx = buf.idx.get() + g * kGroupSize;
|
||||
uint8_t* HWY_RESTRICT g_packed = packed_start + g * kGroupSize / 2;
|
||||
|
||||
HWY_UNROLL(1)
|
||||
for (size_t i = 0; i < kGroupSize; i += 4 * N16) {
|
||||
const V16 idx0 = hn::LoadU(d16, g_idx + i + N16 * 0);
|
||||
const V16 idx1 = hn::LoadU(d16, g_idx + i + N16 * 1);
|
||||
const V16 idx2 = hn::LoadU(d16, g_idx + i + N16 * 2);
|
||||
const V16 idx3 = hn::LoadU(d16, g_idx + i + N16 * 3);
|
||||
NibbleCodec::OrderedPackU16(d16, idx0, idx1, idx2, idx3,
|
||||
g_packed + i / 2);
|
||||
}
|
||||
}
|
||||
|
||||
return unused_clusters;
|
||||
}
|
||||
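// Example call (an illustrative sketch mirroring nuq_test.cc; `in` and `nuq`
// are placeholder pointers to `num` floats and NuqStream::PackedEnd(num)
// bytes respectively):
//   ClusterBuf buf;
//   const hn::ScalableTag<float> df;
//   const size_t unused_clusters =
//       NuqCodec::Enc(df, in, num, buf, /*out_capacity=*/num, nuq, /*out_ofs=*/0);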
|
||||
// Decodes `num` values from the stream `in`, starting at the offset `in_ofs`
|
||||
// (in units of values), to bf16 in `out`. `in_capacity`, `in_ofs` and `num`
|
||||
// must all be multiples of `kGroupSize`.
|
||||
template <class DF, HWY_IF_BF16_D(DF)>
|
||||
static HWY_INLINE void Dec(DF dbf, const size_t in_capacity,
|
||||
const NuqStream* const in, const size_t in_ofs,
|
||||
hwy::bfloat16_t* const out, const size_t num) {
|
||||
const hn::RebindToUnsigned<decltype(dbf)> d16;
|
||||
using V16 = hn::Vec<decltype(d16)>;
|
||||
|
||||
const size_t N16 = hn::Lanes(d16);
|
||||
HWY_DASSERT(kGroupSize >= 4 * N16);
|
||||
|
||||
HWY_DASSERT(in_ofs + num <= in_capacity);
|
||||
HWY_DASSERT(in_capacity % kGroupSize == 0);
|
||||
HWY_DASSERT(in_ofs % kGroupSize == 0);
|
||||
HWY_DASSERT(num % kGroupSize == 0);
|
||||
const size_t num_groups = num / kGroupSize;
|
||||
const size_t ofs_groups = in_ofs / kGroupSize;
|
||||
const uint8_t* tables = &in->byte + ofs_groups * kClusters;
|
||||
const uint8_t* packed_start = &in->byte +
|
||||
NuqStream::PackedStart(in_capacity) +
|
||||
ofs_groups * kGroupSize / 2;
|
||||
|
||||
HWY_UNROLL(1)
|
||||
for (size_t g = 0; g < num_groups; ++g) {
|
||||
const uint8_t* g_centers = tables + g * kClusters;
|
||||
const uint8_t* HWY_RESTRICT g_packed = packed_start + g * kGroupSize / 2;
|
||||
hwy::bfloat16_t* HWY_RESTRICT g_out = out + g * kGroupSize;
|
||||
|
||||
V16 tbl1 = Zero(d16);
|
||||
const V16 tbl0 = LoadTable(d16, g_centers, &tbl1);
|
||||
|
||||
HWY_UNROLL(1)
|
||||
for (size_t i = 0; i < kGroupSize; i += 2 * N16) {
|
||||
V16 c0, c1;
|
||||
TableLookups(d16, tbl0, tbl1, g_packed + i / 2, c0, c1);
|
||||
hn::StoreU(BitCast(dbf, c0), dbf, g_out + i + N16 * 0);
|
||||
hn::StoreU(BitCast(dbf, c1), dbf, g_out + i + N16 * 1);
|
||||
}
|
||||
}
|
||||
}
|
||||
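// Example call (illustrative, as in nuq_test.cc): decode the whole stream,
// starting at offset 0, back into a bf16 buffer `out_bf` of `num` elements.
//   const hn::ScalableTag<hwy::bfloat16_t> dbf;
//   NuqCodec::Dec(dbf, /*in_capacity=*/num, nuq, /*in_ofs=*/0, out_bf, num);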
|
||||
// Decodes `num` values from the stream `in`, starting at the offset
|
||||
// `in_ofs` (in units of values), to f32 in `out`. `in_capacity`,
|
||||
// `in_ofs` and `num` must all be multiples of `kGroupSize`.
|
||||
template <class DF, HWY_IF_F32_D(DF)>
|
||||
static HWY_INLINE void Dec(DF df, const size_t in_capacity,
|
||||
const NuqStream* const in, const size_t in_ofs,
|
||||
float* const out, const size_t num) {
|
||||
const hn::Repartition<hwy::bfloat16_t, DF> dbf;
|
||||
const hn::RebindToUnsigned<decltype(dbf)> d16;
|
||||
using V16 = hn::Vec<decltype(d16)>;
|
||||
using VF = hn::Vec<DF>;
|
||||
|
||||
const size_t NF = hn::Lanes(df);
|
||||
HWY_DASSERT(kGroupSize >= 4 * NF);
|
||||
|
||||
HWY_DASSERT(in_ofs + num <= in_capacity);
|
||||
HWY_DASSERT(in_capacity % kGroupSize == 0);
|
||||
HWY_DASSERT(in_ofs % kGroupSize == 0);
|
||||
HWY_DASSERT(num % kGroupSize == 0);
|
||||
const size_t ofs_groups = in_ofs / kGroupSize;
|
||||
const size_t num_groups = num / kGroupSize;
|
||||
const uint8_t* tables = &in->byte + ofs_groups * kClusters;
|
||||
const uint8_t* packed_start = &in->byte +
|
||||
NuqStream::PackedStart(in_capacity) +
|
||||
ofs_groups * kGroupSize / 2;
|
||||
|
||||
HWY_UNROLL(1)
|
||||
for (size_t g = 0; g < num_groups; ++g) {
|
||||
const uint8_t* g_centers = tables + g * kClusters;
|
||||
const uint8_t* HWY_RESTRICT g_packed = packed_start + g * kGroupSize / 2;
|
||||
float* HWY_RESTRICT g_out = out + g * kGroupSize;
|
||||
|
||||
V16 tbl1 = Zero(d16);
|
||||
const V16 tbl0 = LoadTable(d16, g_centers, &tbl1);
|
||||
|
||||
HWY_UNROLL(1)
|
||||
for (size_t i = 0; i < kGroupSize; i += 4 * NF) {
|
||||
V16 c0, c1;
|
||||
TableLookups(d16, tbl0, tbl1, g_packed + i / 2, c0, c1);
|
||||
const VF f0 = hn::PromoteLowerTo(df, BitCast(dbf, c0));
|
||||
const VF f1 = hn::PromoteUpperTo(df, BitCast(dbf, c0));
|
||||
const VF f2 = hn::PromoteLowerTo(df, BitCast(dbf, c1));
|
||||
const VF f3 = hn::PromoteUpperTo(df, BitCast(dbf, c1));
|
||||
hn::StoreU(f0, df, g_out + i + NF * 0);
|
||||
hn::StoreU(f1, df, g_out + i + NF * 1);
|
||||
hn::StoreU(f2, df, g_out + i + NF * 2);
|
||||
hn::StoreU(f3, df, g_out + i + NF * 3);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Accumulates into `sum0..3` dot products of decoded values with `num` bf16
|
||||
// from `vec_aligned`. DF is f32 because sum0..3 are also f32. `in_capacity`,
|
||||
// `in_ofs` and `num` must all be multiples of `kGroupSize`.
|
||||
template <class DF, HWY_IF_F32_D(DF)>
|
||||
static HWY_INLINE void Dot(DF df, const size_t in_capacity,
|
||||
const NuqStream* const in, const size_t in_ofs,
|
||||
const hwy::bfloat16_t* const vec_aligned,
|
||||
const size_t num, hn::Vec<DF>& sum0,
|
||||
hn::Vec<DF>& sum1, hn::Vec<DF>& sum2,
|
||||
hn::Vec<DF>& sum3) {
|
||||
const hn::Repartition<hwy::bfloat16_t, DF> dbf;
|
||||
const hn::RebindToUnsigned<decltype(dbf)> d16;
|
||||
using VBF = hn::Vec<decltype(dbf)>;
|
||||
using V16 = hn::Vec<decltype(d16)>;
|
||||
const size_t N16 = hn::Lanes(d16);
|
||||
HWY_DASSERT(kGroupSize >= 4 * N16);
|
||||
|
||||
HWY_DASSERT(in_ofs + num <= in_capacity);
|
||||
HWY_DASSERT(in_capacity % kGroupSize == 0);
|
||||
HWY_DASSERT(in_ofs % kGroupSize == 0);
|
||||
HWY_DASSERT(num % kGroupSize == 0);
|
||||
const size_t ofs_groups = in_ofs / kGroupSize;
|
||||
const size_t num_groups = num / kGroupSize;
|
||||
const uint8_t* tables = &in->byte + ofs_groups * kClusters;
|
||||
const uint8_t* packed_start = &in->byte +
|
||||
NuqStream::PackedStart(in_capacity) +
|
||||
ofs_groups * kGroupSize / 2;
|
||||
|
||||
HWY_UNROLL(1)
|
||||
for (size_t g = 0; g < num_groups; ++g) {
|
||||
const uint8_t* g_centers = tables + g * kClusters;
|
||||
const uint8_t* HWY_RESTRICT g_packed = packed_start + g * kGroupSize / 2;
|
||||
const hwy::bfloat16_t* HWY_RESTRICT g_in = vec_aligned + g * kGroupSize;
|
||||
|
||||
V16 tbl1 = Zero(d16);
|
||||
const V16 tbl0 = LoadTable(d16, g_centers, &tbl1);
|
||||
|
||||
HWY_UNROLL(1)
|
||||
for (size_t i = 0; i < kGroupSize; i += 2 * N16) {
|
||||
V16 c0, c1;
|
||||
TableLookups(d16, tbl0, tbl1, g_packed + i / 2, c0, c1);
|
||||
const VBF in0 = hn::Load(dbf, g_in + i + N16 * 0);
|
||||
const VBF in1 = hn::Load(dbf, g_in + i + N16 * 1);
|
||||
sum0 = hn::ReorderWidenMulAccumulate(df, in0, BitCast(dbf, c0), sum0,
|
||||
sum1);
|
||||
sum2 = hn::ReorderWidenMulAccumulate(df, in1, BitCast(dbf, c1), sum2,
|
||||
sum3);
|
||||
}
|
||||
}
|
||||
}
|
||||
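// Example (an illustrative sketch based on nuq_test.cc): accumulate partial
// sums, then reduce them to a single dot product. `nuq` and `vec_bf16` are
// placeholder pointers.
//   const hn::ScalableTag<float> df;
//   hn::Vec<decltype(df)> sum0 = hn::Zero(df), sum1 = sum0, sum2 = sum0, sum3 = sum0;
//   NuqCodec::Dot(df, num, nuq, 0, vec_bf16, num, sum0, sum1, sum2, sum3);
//   const float dot =
//       hn::ReduceSum(df, hn::Add(hn::Add(sum0, sum1), hn::Add(sum2, sum3)));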
|
||||
// Accumulates into `sum0..3` dot products of decoded values with `num` f32
|
||||
// from `vec_aligned`. `in_capacity`, `in_ofs` and `num` must all be
|
||||
// multiples of `kGroupSize`.
|
||||
template <class DF, HWY_IF_F32_D(DF)>
|
||||
static HWY_INLINE void Dot(DF df, const size_t in_capacity,
|
||||
const NuqStream* const in, const size_t in_ofs,
|
||||
const float* const vec_aligned, const size_t num,
|
||||
hn::Vec<DF>& sum0, hn::Vec<DF>& sum1,
|
||||
hn::Vec<DF>& sum2, hn::Vec<DF>& sum3) {
|
||||
const hn::Repartition<hwy::bfloat16_t, DF> dbf;
|
||||
const hn::RebindToUnsigned<decltype(dbf)> d16;
|
||||
using VF = hn::Vec<decltype(df)>;
|
||||
using V16 = hn::Vec<decltype(d16)>;
|
||||
const size_t NF = hn::Lanes(df);
|
||||
HWY_DASSERT(kGroupSize >= 4 * NF);
|
||||
|
||||
HWY_DASSERT(in_ofs + num <= in_capacity);
|
||||
HWY_DASSERT(in_capacity % kGroupSize == 0);
|
||||
HWY_DASSERT(in_ofs % kGroupSize == 0);
|
||||
HWY_DASSERT(num % kGroupSize == 0);
|
||||
const size_t ofs_groups = in_ofs / kGroupSize;
|
||||
const size_t num_groups = num / kGroupSize;
|
||||
const uint8_t* tables = &in->byte + ofs_groups * kClusters;
|
||||
const uint8_t* packed_start = &in->byte +
|
||||
NuqStream::PackedStart(in_capacity) +
|
||||
ofs_groups * kGroupSize / 2;
|
||||
|
||||
HWY_UNROLL(1)
|
||||
for (size_t g = 0; g < num_groups; ++g) {
|
||||
const uint8_t* g_centers = tables + g * kClusters;
|
||||
const uint8_t* HWY_RESTRICT g_packed = packed_start + g * kGroupSize / 2;
|
||||
const float* HWY_RESTRICT g_in = vec_aligned + g * kGroupSize;
|
||||
|
||||
V16 tbl1 = Zero(d16);
|
||||
const V16 tbl0 = LoadTable(d16, g_centers, &tbl1);
|
||||
|
||||
HWY_UNROLL(1)
|
||||
for (size_t i = 0; i < kGroupSize; i += 4 * NF) {
|
||||
V16 c0, c1;
|
||||
TableLookups(d16, tbl0, tbl1, g_packed + i / 2, c0, c1);
|
||||
const VF in0 = hn::LoadU(df, g_in + i + NF * 0);
|
||||
const VF in1 = hn::LoadU(df, g_in + i + NF * 1);
|
||||
const VF in2 = hn::LoadU(df, g_in + i + NF * 2);
|
||||
const VF in3 = hn::LoadU(df, g_in + i + NF * 3);
|
||||
const VF f0 = hn::PromoteLowerTo(df, BitCast(dbf, c0));
|
||||
const VF f1 = hn::PromoteUpperTo(df, BitCast(dbf, c0));
|
||||
const VF f2 = hn::PromoteLowerTo(df, BitCast(dbf, c1));
|
||||
const VF f3 = hn::PromoteUpperTo(df, BitCast(dbf, c1));
|
||||
sum0 = hn::MulAdd(in0, f0, sum0);
|
||||
sum1 = hn::MulAdd(in1, f1, sum1);
|
||||
sum2 = hn::MulAdd(in2, f2, sum2);
|
||||
sum3 = hn::MulAdd(in3, f3, sum3);
|
||||
}
|
||||
}
|
||||
}
|
||||
}; // NuqCodec
|
||||
|
||||
// NOLINTNEXTLINE(google-readability-namespace-comments)
|
||||
} // namespace HWY_NAMESPACE
|
||||
} // namespace gcpp
|
||||
HWY_AFTER_NAMESPACE();
|
||||
|
||||
#endif // THIRD_PARTY_GEMMA_CPP_COMPRESSION_NUQ_INL_H_
|
||||
|
|
@ -0,0 +1,116 @@
|
|||
// Copyright 2023 Google LLC
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#ifndef THIRD_PARTY_GEMMA_CPP_COMPRESSION_NUQ_H_
|
||||
#define THIRD_PARTY_GEMMA_CPP_COMPRESSION_NUQ_H_
|
||||
|
||||
// Non-uniform quantization: a compressed representation of f32 inputs that
|
||||
// supports seeking at a granularity of kGroupSize, decoding to bf16/f32, and a
|
||||
// fused decode/dot product with bf16/f32 vectors.
|
||||
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
|
||||
#include "hwy/aligned_allocator.h"
|
||||
#include "hwy/base.h" // HWY_INLINE
|
||||
|
||||
namespace gcpp {
|
||||
|
||||
// 4-bit indices are a sweet spot in terms of quality per size.
|
||||
static constexpr size_t kClusters = 16;
|
||||
|
||||
// Number of weights that share a table. Larger = slower encode, higher error,
|
||||
// smaller size (table amortized over more weights). This is the minimum
|
||||
// granularity for seeking/decoding in the stream, and must be at least four
|
||||
// times the number of bf16 elements per vector.
|
||||
static constexpr size_t kGroupSize = 256;
|
||||
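// Rough size estimate (illustrative; ignores the cache-line padding applied by
// NuqStream::PackedStart below): each group stores kClusters = 16 one-byte SFP
// table entries plus kGroupSize / 2 = 128 bytes of packed 4-bit indices, i.e.
// 144 bytes per 256 weights, or about 4.5 bits per weight.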
|
||||
// Points to the *start* of a NUQ stream. Aligning the allocation (see
|
||||
// aligned_allocator.h) may speed up decoding but is not required.
|
||||
//
|
||||
// See go/streaming-weight-decode for background and design. Layout: first one
|
||||
// table of kClusters entries per group, in ascending order of group index,
|
||||
// then two packed indices per byte.
|
||||
//
|
||||
// Indices are stored in-order to enable vector-length agnostic decode, because
|
||||
// streams may be persisted to disk and used by other CPUs.
|
||||
//
|
||||
// To enable parallel encoding and decoding, Enc/Dec have `offset` parameters
|
||||
// which refer to the stream, NOT the raw from/to pointers, which point directly
|
||||
// to the source/destination. Offsets are in units of values, NOT compressed
|
||||
// bytes within the stream.
|
||||
#pragma pack(push, 1)
|
||||
struct NuqStream {
|
||||
// Returns offset of packed indices from the start of the stream. This matches
|
||||
// the (padded) total table size because table entries are bytes. `capacity`
|
||||
// is already a multiple of `kGroupSize`.
|
||||
static constexpr size_t PackedStart(size_t capacity) {
|
||||
// Round up to avoid cache-line splits when loading indices. No effect on
|
||||
// size as long as capacity / kGroupSize is a multiple of 4.
|
||||
return hwy::RoundUpTo((capacity / kGroupSize) * kClusters, 64);
|
||||
}
|
||||
|
||||
// Returns the number of NuqStream elements to allocate for the stream, which
|
||||
// matches its size in bytes. `capacity` is already a multiple of `kGroupSize`.
|
||||
static constexpr size_t PackedEnd(size_t capacity) {
|
||||
return PackedStart(capacity) + capacity / 2; // two 4-bit indices per byte.
|
||||
}
|
||||
|
||||
uint8_t byte;
|
||||
};
|
||||
#pragma pack(pop)
|
||||
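// Example (an illustrative sketch mirroring the allocations in nuq_test.cc):
// sizing and allocating storage for a stream of `num` values, where `num` is a
// multiple of kGroupSize. For num = 512 (two groups), PackedStart(512) =
// RoundUpTo(2 * 16, 64) = 64 bytes of tables and PackedEnd(512) = 64 + 256 =
// 320 bytes in total.
//
//   const size_t num = 512;
//   hwy::AlignedFreeUniquePtr<NuqStream[]> nuq =
//       hwy::AllocateAligned<NuqStream>(NuqStream::PackedEnd(num));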
|
||||
static inline const char* TypeName(NuqStream) { return "NUQ"; }
|
||||
|
||||
// Storage for dynamic programming. There are two matrices; we use separate
|
||||
// allocations to avoid type punning.
|
||||
template <class T>
|
||||
class AlignedMatrix {
|
||||
public:
|
||||
AlignedMatrix() : mem_(hwy::AllocateAligned<T>(kClusters * kGroupSize)) {}
|
||||
|
||||
HWY_INLINE const T& operator()(size_t row, size_t col) const {
|
||||
return mem_[row * kGroupSize + col];
|
||||
}
|
||||
|
||||
HWY_INLINE T& operator()(size_t row, size_t col) {
|
||||
return mem_[row * kGroupSize + col];
|
||||
}
|
||||
|
||||
private:
|
||||
hwy::AlignedFreeUniquePtr<T[]> mem_;
|
||||
};
|
||||
|
||||
// Reuse memory across calls to Enc to avoid per-call allocations.
|
||||
struct ClusterBuf {
|
||||
void Resize(size_t new_num) {
|
||||
if (new_num < num) return;
|
||||
|
||||
num = new_num;
|
||||
const size_t num_groups = hwy::DivCeil(num, kGroupSize);
|
||||
centers = hwy::AllocateAligned<float>(num_groups * kClusters);
|
||||
idx = hwy::AllocateAligned<uint16_t>(num);
|
||||
}
|
||||
|
||||
AlignedMatrix<float> d;
|
||||
AlignedMatrix<int32_t> t;
|
||||
|
||||
size_t num = 0;
|
||||
hwy::AlignedFreeUniquePtr<float[]> centers;
|
||||
hwy::AlignedFreeUniquePtr<uint16_t[]> idx;
|
||||
};
|
||||
|
||||
} // namespace gcpp
|
||||
#endif // THIRD_PARTY_GEMMA_CPP_COMPRESSION_NUQ_H_
|
||||
|
|
@ -0,0 +1,428 @@
|
|||
// Copyright 2023 Google LLC
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
#include <stdio.h>
|
||||
|
||||
#include <algorithm> // std::shuffle
|
||||
#include <random>
|
||||
|
||||
#include "hwy/aligned_allocator.h"
|
||||
#include "hwy/base.h"
|
||||
|
||||
#undef HWY_TARGET_INCLUDE
|
||||
#define HWY_TARGET_INCLUDE \
|
||||
"third_party/gemma_cpp/compression/nuq_test.cc" // NOLINT
|
||||
#include "hwy/foreach_target.h" // IWYU pragma: keep
|
||||
// Other headers that include Highway must come after foreach_target.h
|
||||
// copybara:import_next_line:gemma_cpp
|
||||
#include "compression/distortion.h"
|
||||
// copybara:import_next_line:gemma_cpp
|
||||
#include "compression/nuq-inl.h"
|
||||
// copybara:import_next_line:gemma_cpp
|
||||
#include "compression/nuq.h"
|
||||
#include "hwy/highway.h"
|
||||
#include "hwy/tests/hwy_gtest.h"
|
||||
#include "hwy/tests/test_util-inl.h"
|
||||
#include "hwy/timer.h"
|
||||
|
||||
HWY_BEFORE_NAMESPACE();
|
||||
namespace gcpp {
|
||||
namespace HWY_NAMESPACE {
|
||||
|
||||
// All-equal inputs: only one cluster
|
||||
struct TestFlat {
|
||||
template <typename T, class DF>
|
||||
HWY_INLINE void operator()(T /*unused*/, DF df) {
|
||||
// Run this simple test only once to save time/debug output.
|
||||
if (!(HWY_ONCE && hn::Lanes(df) == hn::Lanes(hn::ScalableTag<float>()))) {
|
||||
return;
|
||||
}
|
||||
|
||||
auto in = hwy::AllocateAligned<float>(kGroupSize);
|
||||
HWY_ASSERT(in);
|
||||
for (size_t i = 0; i < kGroupSize; ++i) {
|
||||
in[i] = 0.5f;
|
||||
}
|
||||
ClusterBuf buf;
|
||||
float centers[kClusters];
|
||||
uint16_t indices[kGroupSize];
|
||||
const size_t unused_clusters =
|
||||
NuqClustering::ClusterExactL2(df, in.get(), buf, centers, indices);
|
||||
HWY_ASSERT(unused_clusters == kClusters - 1);
|
||||
|
||||
for (size_t i = 0; i < unused_clusters; ++i) {
|
||||
HWY_ASSERT(centers[i] == 0.0f);
|
||||
}
|
||||
HWY_ASSERT(centers[unused_clusters] == 0.5f);
|
||||
for (size_t i = 0; i < kGroupSize; ++i) {
|
||||
HWY_ASSERT(indices[i] == unused_clusters);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
void TestAllFlat() { hn::ForGEVectors<64, TestFlat>()(float()); }
|
||||
|
||||
// Generate shuffled plateaus, one per cluster
|
||||
struct TestPlateaus {
|
||||
template <typename T, class DF>
|
||||
HWY_INLINE void operator()(T /*unused*/, DF df) {
|
||||
// Run this simple test only once to save time/debug output.
|
||||
if (!(HWY_ONCE && hn::Lanes(df) == hn::Lanes(hn::ScalableTag<float>()))) {
|
||||
return;
|
||||
}
|
||||
|
||||
auto in = hwy::AllocateAligned<float>(kGroupSize);
|
||||
HWY_ASSERT(in);
|
||||
|
||||
for (size_t i = 0; i < kGroupSize; ++i) {
|
||||
const size_t idx_cluster = i / (kGroupSize / kClusters);
|
||||
HWY_ASSERT(idx_cluster < kClusters);
|
||||
in[i] = (1.0f * idx_cluster / kClusters) - 0.5f;
|
||||
HWY_ASSERT(-0.5f <= in[i] && in[i] < 0.5f);
|
||||
}
|
||||
|
||||
std::random_device rd;
|
||||
std::mt19937 rng(rd());
|
||||
std::shuffle(in.get(), in.get() + kGroupSize, rng);
|
||||
|
||||
ClusterBuf buf;
|
||||
float centers[kClusters];
|
||||
uint16_t indices[kGroupSize];
|
||||
const size_t unused_clusters =
|
||||
NuqClustering::ClusterExactL2(df, in.get(), buf, centers, indices);
|
||||
HWY_ASSERT(unused_clusters == 0);
|
||||
|
||||
DistortionStats stats;
|
||||
for (size_t i = 0; i < kGroupSize; ++i) {
|
||||
HWY_ASSERT(indices[i] < kClusters);
|
||||
stats.Notify(in[i], centers[indices[i]]);
|
||||
}
|
||||
const float pnorm = stats.PNorm();
|
||||
const float snr = stats.GeomeanValueDivL1();
|
||||
fprintf(stderr, "p-norm %.3E snr %.2f @%zu = %.4E\n", pnorm, snr,
|
||||
stats.MaxIndex(), stats.MaxL1());
|
||||
HWY_ASSERT(pnorm == 0.0f);
|
||||
HWY_ASSERT(snr == 0.0f);
|
||||
}
|
||||
};
|
||||
|
||||
void TestAllPlateaus() { hn::ForGEVectors<64, TestPlateaus>()(float()); }
|
||||
|
||||
struct TestRamp {
|
||||
template <typename T, class DF>
|
||||
HWY_INLINE void operator()(T /*unused*/, DF df) {
|
||||
// Run this simple test only once to save time/debug output.
|
||||
if (!(HWY_ONCE && hn::Lanes(df) == hn::Lanes(hn::ScalableTag<float>()))) {
|
||||
return;
|
||||
}
|
||||
|
||||
auto in = hwy::AllocateAligned<float>(kGroupSize);
|
||||
HWY_ASSERT(in);
|
||||
|
||||
for (size_t i = 0; i < kGroupSize; ++i) {
|
||||
in[i] = (1.0f * i / kGroupSize) - 0.45f; // slightly asymmetric
|
||||
HWY_ASSERT(-0.45f <= in[i] && in[i] < 0.55f);
|
||||
}
|
||||
|
||||
std::random_device rd;
|
||||
std::mt19937 rng(rd());
|
||||
std::shuffle(in.get(), in.get() + kGroupSize, rng);
|
||||
|
||||
ClusterBuf buf;
|
||||
float centers[kClusters];
|
||||
uint16_t indices[kGroupSize];
|
||||
const size_t unused_clusters =
|
||||
NuqClustering::ClusterExactL2(df, in.get(), buf, centers, indices);
|
||||
HWY_ASSERT(unused_clusters == 0);
|
||||
|
||||
DistortionStats stats;
|
||||
for (size_t i = 0; i < kGroupSize; ++i) {
|
||||
HWY_ASSERT(indices[i] < kClusters);
|
||||
stats.Notify(in[i], centers[indices[i]]);
|
||||
}
|
||||
const float pnorm = stats.PNorm();
|
||||
const float snr = stats.GeomeanValueDivL1();
|
||||
fprintf(stderr, "p-norm %.3E snr %.2f @%zu = %.4E\n", pnorm, snr,
|
||||
stats.MaxIndex(), stats.MaxL1());
|
||||
static_assert(kGroupSize == 128 || kGroupSize == 256, "Update expected");
|
||||
|
||||
const float expected_pnorm = kGroupSize == 128 ? 2.08E-2f : 2.1E-2f;
|
||||
const float expected_snr = kGroupSize == 128 ? 16.9f : 17.6f;
|
||||
HWY_ASSERT(expected_pnorm <= pnorm && pnorm < 1.02f * expected_pnorm);
|
||||
HWY_ASSERT(expected_snr <= snr && snr < 1.01f * expected_snr);
|
||||
}
|
||||
};
|
||||
|
||||
void TestAllRamp() { hn::ForGEVectors<64, TestRamp>()(float()); }
|
||||
|
||||
struct TestNormal {
|
||||
template <typename T, class DF>
|
||||
HWY_INLINE void operator()(T /*unused*/, DF df) {
|
||||
auto in = hwy::AllocateAligned<float>(kGroupSize);
|
||||
HWY_ASSERT(in);
|
||||
|
||||
std::mt19937 rng(123);
|
||||
std::normal_distribution<float> dist{0.001f, 0.3f};
|
||||
for (size_t i = 0; i < kGroupSize; ++i) {
|
||||
in[i] = dist(rng);
|
||||
}
|
||||
std::shuffle(in.get(), in.get() + kGroupSize, rng);
|
||||
|
||||
ClusterBuf buf;
|
||||
float centers[kClusters];
|
||||
uint16_t indices[kGroupSize];
|
||||
double elapsed = hwy::HighestValue<double>();
|
||||
for (size_t rep = 0; rep < 100; ++rep) {
|
||||
const double t0 = hwy::platform::Now();
|
||||
const size_t unused_clusters =
|
||||
NuqClustering::ClusterExactL2(df, in.get(), buf, centers, indices);
|
||||
HWY_ASSERT(unused_clusters == 0);
|
||||
const double t1 = hwy::platform::Now();
|
||||
elapsed = HWY_MIN(elapsed, t1 - t0);
|
||||
}
|
||||
fprintf(stderr, "Vec %zu Enc %.2f MB/s\n", Lanes(df) * 4,
|
||||
kGroupSize * sizeof(float) * 1E-6 / elapsed);
|
||||
|
||||
DistortionStats stats;
|
||||
for (size_t i = 0; i < kGroupSize; ++i) {
|
||||
HWY_ASSERT(indices[i] < kClusters);
|
||||
stats.Notify(in[i], centers[indices[i]]);
|
||||
}
|
||||
const float pnorm = stats.PNorm();
|
||||
const float snr = stats.GeomeanValueDivL1();
|
||||
fprintf(stderr, "p-norm %.3E snr %.2f @%zu = %.4E\n", pnorm, snr,
|
||||
stats.MaxIndex(), stats.MaxL1());
|
||||
static_assert(kGroupSize == 128 || kGroupSize == 256, "Update expected");
|
||||
const float expected_pnorm = kGroupSize == 128 ? 3E-2f : 3.4E-2f;
|
||||
const float expected_snr = kGroupSize == 128 ? 17.4f : 13.1f;
|
||||
HWY_ASSERT(expected_pnorm <= pnorm && pnorm < 1.02f * expected_pnorm);
|
||||
HWY_ASSERT(expected_snr <= snr && snr < 1.01f * expected_snr);
|
||||
}
|
||||
};
|
||||
|
||||
void TestAllNormal() { hn::ForGEVectors<64, TestNormal>()(float()); }
|
||||
|
||||
// Can encode and decode sub-regions.
|
||||
struct TestOffset {
|
||||
template <typename T, class D>
|
||||
HWY_INLINE void operator()(T /*unused*/, D d) {
|
||||
const hn::Repartition<float, D> df;
|
||||
const size_t total = 10 * kGroupSize;
|
||||
const size_t kMidLen = 2 * kGroupSize; // length of middle piece
|
||||
|
||||
auto in = hwy::AllocateAligned<float>(total); // Enc() requires f32
|
||||
auto dec1 = hwy::AllocateAligned<T>(total);
|
||||
auto dec2 = hwy::AllocateAligned<T>(kMidLen);
|
||||
auto nuq = hwy::AllocateAligned<NuqStream>(NuqStream::PackedEnd(total));
|
||||
HWY_ASSERT(in && dec1 && dec2 && nuq);
|
||||
|
||||
std::mt19937 rng(123);
|
||||
std::normal_distribution<float> dist{0.001f, 0.3f};
|
||||
for (size_t i = 0; i < total; ++i) {
|
||||
in[i] = dist(rng);
|
||||
}
|
||||
|
||||
// Encode + decode everything
|
||||
ClusterBuf buf;
|
||||
(void)NuqCodec::Enc(df, in.get(), total, buf, total, nuq.get(), 0);
|
||||
NuqCodec::Dec(d, total, nuq.get(), 0, dec1.get(), total);
|
||||
|
||||
// Overwrite middle with first inputs
|
||||
const size_t offset = 5 * kGroupSize;
|
||||
(void)NuqCodec::Enc(df, in.get(), kMidLen, buf, total, nuq.get(), offset);
|
||||
|
||||
// Decoded middle now matches previously decoded first
|
||||
NuqCodec::Dec(d, total, nuq.get(), offset, dec2.get(), kMidLen);
|
||||
for (size_t i = 0; i < kMidLen; ++i) {
|
||||
HWY_ASSERT(dec1[i] == dec2[i]);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
void TestAllOffsetF32() {
|
||||
const hn::ForGEVectors<128, TestOffset> test;
|
||||
test(float());
|
||||
}
|
||||
|
||||
void TestAllOffsetBF16() {
|
||||
const hn::ForGEVectors<128, TestOffset> test;
|
||||
test(hwy::bfloat16_t());
|
||||
}
|
||||
|
||||
struct TestStream {
|
||||
template <typename T, class D>
|
||||
HWY_INLINE void operator()(T /*unused*/, D d) {
|
||||
const hn::Repartition<float, D> df;
|
||||
const size_t num = 4 * kGroupSize;
|
||||
auto in = hwy::AllocateAligned<float>(num); // Enc() requires f32
|
||||
auto out = hwy::AllocateAligned<T>(num);
|
||||
auto nuq = hwy::AllocateAligned<NuqStream>(NuqStream::PackedEnd(num));
|
||||
HWY_ASSERT(in && out && nuq);
|
||||
|
||||
std::mt19937 rng(123);
|
||||
std::normal_distribution<float> dist{0.001f, 0.3f};
|
||||
for (size_t i = 0; i < num; ++i) {
|
||||
in[i] = dist(rng);
|
||||
}
|
||||
|
||||
ClusterBuf buf;
|
||||
double elapsed = hwy::HighestValue<double>();
|
||||
for (size_t rep = 0; rep < 100; ++rep) {
|
||||
const double t0 = hwy::platform::Now();
|
||||
const size_t unused_clusters =
|
||||
NuqCodec::Enc(df, in.get(), num, buf, num, nuq.get(), 0);
|
||||
HWY_ASSERT(unused_clusters == 0);
|
||||
const double t1 = hwy::platform::Now();
|
||||
elapsed = HWY_MIN(elapsed, t1 - t0);
|
||||
}
|
||||
fprintf(stderr, "Vec %zu Enc %.2f MB/s\n", Lanes(d) * sizeof(T),
|
||||
num * sizeof(float) * 1E-6 / elapsed);
|
||||
|
||||
elapsed = hwy::HighestValue<double>();
|
||||
for (size_t rep = 0; rep < 100; ++rep) {
|
||||
const double t0 = hwy::platform::Now();
|
||||
NuqCodec::Dec(d, num, nuq.get(), 0, out.get(), num);
|
||||
const double t1 = hwy::platform::Now();
|
||||
elapsed = HWY_MIN(elapsed, t1 - t0);
|
||||
}
|
||||
fprintf(stderr, "Vec %zu Dec %.2f MB/s\n", Lanes(d) * sizeof(T),
|
||||
num * sizeof(T) * 1E-6 / elapsed);
|
||||
|
||||
DistortionStats stats;
|
||||
for (size_t i = 0; i < num; ++i) {
|
||||
stats.Notify(in[i], hwy::ConvertScalarTo<float>(out[i]));
|
||||
}
|
||||
const float pnorm = stats.PNorm();
|
||||
const float snr = stats.GeomeanValueDivL1();
|
||||
fprintf(stderr, "p-norm %.3E snr %.2f @%zu = %.4E\n", pnorm, snr,
|
||||
stats.MaxIndex(), stats.MaxL1());
|
||||
static_assert(kGroupSize == 128 || kGroupSize == 256, "Update expected");
|
||||
const float expected_pnorm = kGroupSize == 128 ? 3.44E-2f : 3.88E-2f;
|
||||
const float expected_snr = kGroupSize == 128 ? 15.0f : 13.3f;
|
||||
HWY_ASSERT(expected_pnorm <= pnorm && pnorm < 1.02f * expected_pnorm);
|
||||
HWY_ASSERT(expected_snr <= snr && snr < 1.01f * expected_snr);
|
||||
}
|
||||
};
|
||||
|
||||
void TestAllStreamF32() {
|
||||
const hn::ForGEVectors<128, TestStream> test;
|
||||
test(float());
|
||||
}
|
||||
|
||||
void TestAllStreamBF16() {
|
||||
const hn::ForGEVectors<128, TestStream> test;
|
||||
test(hwy::bfloat16_t());
|
||||
}
|
||||
|
||||
struct TestDot {
|
||||
template <typename T, class D>
|
||||
HWY_INLINE void operator()(T /*unused*/, D d) {
|
||||
const hn::Repartition<float, D> df;
|
||||
const size_t num = 4 * kGroupSize;
|
||||
auto in = hwy::AllocateAligned<float>(num);
|
||||
auto dec = hwy::AllocateAligned<float>(num);
|
||||
auto vec = hwy::AllocateAligned<T>(num);
|
||||
auto nuq = hwy::AllocateAligned<NuqStream>(NuqStream::PackedEnd(num));
|
||||
HWY_ASSERT(in && dec && vec && nuq);
|
||||
|
||||
std::mt19937 rng(123);
|
||||
std::normal_distribution<float> dist{0.001f, 0.3f};
|
||||
for (size_t i = 0; i < num; ++i) {
|
||||
in[i] = dist(rng);
|
||||
vec[i] = hwy::ConvertScalarTo<T>(dist(rng));
|
||||
}
|
||||
// This changes the correlation between in and vec, which considerably
|
||||
// affects the error of the result.
|
||||
std::shuffle(in.get(), in.get() + num, rng);
|
||||
|
||||
ClusterBuf buf;
|
||||
const size_t unused_clusters =
|
||||
NuqCodec::Enc(df, in.get(), num, buf, num, nuq.get(), 0);
|
||||
HWY_ASSERT(unused_clusters == 0);
|
||||
|
||||
double actual = 0.0;
|
||||
double elapsed = hwy::HighestValue<double>();
|
||||
for (size_t rep = 0; rep < 20; ++rep) {
|
||||
hn::Vec<decltype(df)> sum0 = hn::Zero(df);
|
||||
hn::Vec<decltype(df)> sum1 = hn::Zero(df);
|
||||
hn::Vec<decltype(df)> sum2 = hn::Zero(df);
|
||||
hn::Vec<decltype(df)> sum3 = hn::Zero(df);
|
||||
const double t0 = hwy::platform::Now();
|
||||
NuqCodec::Dot(df, num, nuq.get(), 0, vec.get(), num, sum0, sum1, sum2,
|
||||
sum3);
|
||||
const double t1 = hwy::platform::Now();
|
||||
elapsed = HWY_MIN(elapsed, t1 - t0);
|
||||
sum0 = hn::Add(hn::Add(sum0, sum1), hn::Add(sum2, sum3));
|
||||
actual = hn::ReduceSum(df, sum0);
|
||||
}
|
||||
|
||||
NuqCodec::Dec(df, num, nuq.get(), 0, dec.get(), num);
|
||||
fprintf(stderr, "Vec %zu Dec %.2f MB/s\n", Lanes(d) * sizeof(T),
|
||||
num * sizeof(in[0]) * 1E-6 / elapsed);
|
||||
|
||||
double expected = 0.0; // using original input
|
||||
double expected2 = 0.0; // using decoded NUQ
|
||||
for (size_t i = 0; i < num; ++i) {
|
||||
expected += in[i] * hwy::ConvertScalarTo<double>(vec[i]);
|
||||
expected2 += dec[i] * hwy::ConvertScalarTo<double>(vec[i]);
|
||||
}
|
||||
const double l1 = hwy::ScalarAbs(expected - actual);
|
||||
const double snr = 1.0 + hwy::ScalarAbs(expected) / l1;
|
||||
fprintf(stderr, "expected %.3f e2 %.4f actual %.4f l1 %E snr %.2f\n",
|
||||
expected, expected2, actual, l1, snr);
|
||||
HWY_ASSERT(hwy::ScalarAbs(expected2 - actual) < 1E-4);
|
||||
static_assert(kGroupSize == 128 || kGroupSize == 256, "Update expected");
|
||||
const double expected_l1 = kGroupSize == 128 ? 7.3E-2 : 4.34E-2;
|
||||
const double expected_snr = kGroupSize == 128 ? 9.7f
|
||||
: sizeof(T) == 2 ? 14.5f
|
||||
: 14.9f;
|
||||
HWY_ASSERT(expected_l1 <= l1 && l1 < 1.02f * expected_l1);
|
||||
HWY_ASSERT(expected_snr <= snr && snr < 1.01f * expected_snr);
|
||||
}
|
||||
};
|
||||
|
||||
void TestAllDotF32() {
|
||||
const hn::ForGEVectors<128, TestDot> test;
|
||||
test(float());
|
||||
}
|
||||
void TestAllDotBF16() {
|
||||
const hn::ForGEVectors<128, TestDot> test;
|
||||
test(hwy::bfloat16_t());
|
||||
}
|
||||
|
||||
// NOLINTNEXTLINE(google-readability-namespace-comments)
|
||||
} // namespace HWY_NAMESPACE
|
||||
} // namespace gcpp
|
||||
HWY_AFTER_NAMESPACE();
|
||||
|
||||
#if HWY_ONCE
|
||||
|
||||
namespace gcpp {
|
||||
HWY_BEFORE_TEST(NuqTest);
|
||||
HWY_EXPORT_AND_TEST_P(NuqTest, TestAllFlat);
|
||||
HWY_EXPORT_AND_TEST_P(NuqTest, TestAllPlateaus);
|
||||
HWY_EXPORT_AND_TEST_P(NuqTest, TestAllRamp);
|
||||
HWY_EXPORT_AND_TEST_P(NuqTest, TestAllNormal);
|
||||
HWY_EXPORT_AND_TEST_P(NuqTest, TestAllOffsetF32);
|
||||
HWY_EXPORT_AND_TEST_P(NuqTest, TestAllOffsetBF16);
|
||||
HWY_EXPORT_AND_TEST_P(NuqTest, TestAllStreamF32);
|
||||
HWY_EXPORT_AND_TEST_P(NuqTest, TestAllStreamBF16);
|
||||
HWY_EXPORT_AND_TEST_P(NuqTest, TestAllDotF32);
|
||||
HWY_EXPORT_AND_TEST_P(NuqTest, TestAllDotBF16);
|
||||
} // namespace gcpp
|
||||
|
||||
#endif
|
||||
|
|
@ -0,0 +1,515 @@
|
|||
// Copyright 2023 Google LLC
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
// Normal include guard to placate lint.
|
||||
#ifndef THIRD_PARTY_GEMMA_CPP_COMPRESSION_SFP_INL_H_
|
||||
#define THIRD_PARTY_GEMMA_CPP_COMPRESSION_SFP_INL_H_
|
||||
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
|
||||
// copybara:import_next_line:gemma_cpp
|
||||
#include "compression/sfp.h"
|
||||
#include "hwy/base.h"
|
||||
|
||||
#endif // THIRD_PARTY_GEMMA_CPP_COMPRESSION_SFP_INL_H_
|
||||
|
||||
// Actual per-target include guard.
|
||||
#if defined(THIRD_PARTY_GEMMA_CPP_SFP_INL_TOGGLE) == defined(HWY_TARGET_TOGGLE)
|
||||
#ifdef THIRD_PARTY_GEMMA_CPP_SFP_INL_TOGGLE
|
||||
#undef THIRD_PARTY_GEMMA_CPP_SFP_INL_TOGGLE
|
||||
#else
|
||||
#define THIRD_PARTY_GEMMA_CPP_SFP_INL_TOGGLE
|
||||
#endif
|
||||
|
||||
#include "hwy/highway.h"
|
||||
|
||||
HWY_BEFORE_NAMESPACE();
|
||||
namespace gcpp {
|
||||
namespace HWY_NAMESPACE {
|
||||
namespace hn = hwy::HWY_NAMESPACE;
|
||||
|
||||
// For unsigned numbers with MSB zero, signed comparison is faster on x86.
|
||||
template <class DU>
|
||||
HWY_INLINE hn::Mask<DU> SignedGt(DU du, hn::Vec<DU> a, hn::Vec<DU> b) {
|
||||
const hn::RebindToSigned<DU> di;
|
||||
return hn::RebindMask(du, hn::Gt(BitCast(di, a), hn::BitCast(di, b)));
|
||||
}
|
||||
template <class DU>
|
||||
HWY_INLINE hn::Mask<DU> SignedLt(DU du, hn::Vec<DU> a, hn::Vec<DU> b) {
|
||||
return SignedGt(du, b, a);
|
||||
}
|
||||
|
||||
// Encode/decode functions.
|
||||
class SfpCodec {
|
||||
public:
|
||||
// Returns 8-bit packed representation of `lo` and `hi` bytes of bf16. 31 ops.
|
||||
// Implementation detail, public because called by test.
|
||||
template <class D, HWY_IF_U8_D(D)>
|
||||
static HWY_INLINE hn::Vec<D> EncBytes(D d, const hn::Vec<D> lo,
|
||||
const hn::Vec<D> hi) {
|
||||
const hn::Vec<D> k1 = hn::Set(d, 1u);
|
||||
const hn::Vec<D> k80 = hn::Set(d, 0x80u);
|
||||
|
||||
// Copy sign for later insertion.
|
||||
const hn::Vec<D> sign_in_msb = hi;
|
||||
// Biased exponent = lower 7 bits of hi and MSB of lo. Modified below.
|
||||
hn::Vec<D> biased_e = hn::Or(hn::Add(hi, hi), hn::ShiftRight<7>(lo));
|
||||
HWY_ASSERT(hn::AllTrue(d, hn::Lt(biased_e, k80))); // <= 2^0
|
||||
|
||||
// Clear MSB to isolate the mantissa and enable signed comparisons, then
|
||||
// shift right by *one* (plus 1 to undo the prior add/left-shift) to leave
|
||||
// headroom for overflow during rounding.
|
||||
const hn::Vec<D> m6 = hn::ShiftRight<2>(hn::Add(lo, lo));
|
||||
|
||||
// The place to round depends on whether the exponent is large (>= -7) - if
|
||||
// so, we retain three mantissa bits, otherwise two. However, rounding can
|
||||
// also cause the exponent to increase. We first choose a threshold that
|
||||
// rounds up to 1.0*2^-7 for both two and three bit mantissas:
|
||||
// >= 1.1111 * 2^-8 (0.007568359375). This entails the exponent being
|
||||
// greater, or equal with the mantissa > (1111000 >> 1) - 1 = 0x3B.
|
||||
const hn::Vec<D> kMinLargeE = hn::Set(d, 127 - 8);
|
||||
const hn::Mask<D> is_large_before_round = hn::Or(
|
||||
SignedGt(d, biased_e, kMinLargeE),
|
||||
hn::And(hn::Eq(biased_e, kMinLargeE), SignedGt(d, m6, Set(d, 0x3B))));
|
||||
|
||||
// To retain the most-significant 3 or 2 mantissa bits, we will right-shift
|
||||
// by is_large_before_round ? 3 : 4. Variable Shr is expensive for 8-bit
|
||||
// elements, so (<< 1) if is_large_before_round, then always (>> 4).
|
||||
const hn::Vec<D> m_shl4 =
|
||||
hn::MaskedAddOr(m6, is_large_before_round, m6, m6);
|
||||
|
||||
// Before shifting (truncation), round to nearest even to reduce bias. If
|
||||
// the lowest remaining mantissa bit is odd, increase the offset. Example
|
||||
// with the lowest remaining bit (left) and next lower two bits; the
|
||||
// latter, plus two more, will be truncated.
|
||||
// 0[00] + 1 = 0[01]
|
||||
// 0[01] + 1 = 0[10]
|
||||
// 0[10] + 1 = 0[11] (round down toward even)
|
||||
// 0[11] + 1 = 1[00] (round up)
|
||||
// 1[00] + 10 = 1[10]
|
||||
// 1[01] + 10 = 1[11]
|
||||
// 1[10] + 10 = C0[00] (round up toward even with C=1 carry out)
|
||||
// 1[11] + 10 = C0[01] (round up toward even with C=1 carry out)
|
||||
const hn::Vec<D> odd_bit = hn::And(hn::ShiftRight<4>(m_shl4), k1);
|
||||
const hn::Vec<D> rounded = hn::Add(m_shl4, hn::Add(odd_bit, Set(d, 7)));
|
||||
// Update the exponent if rounding overflowed.
|
||||
const hn::Vec<D> carry_bit =
|
||||
hn::IfThenElse(is_large_before_round, k80, hn::Set(d, 0x40u));
|
||||
const hn::Vec<D> carry_clear = hn::AndNot(carry_bit, rounded);
|
||||
HWY_DASSERT(hn::AllTrue(d, hn::Lt(carry_clear, carry_bit)));
|
||||
const hn::Mask<D> is_overflow = hn::Ne(carry_clear, rounded);
|
||||
biased_e = hn::MaskedAddOr(biased_e, is_overflow, biased_e, k1);
|
||||
HWY_DASSERT(hn::AllTrue(d, hn::Lt(biased_e, Set(d, 128))));
|
||||
|
||||
// Detect if zero or the min exponent.
|
||||
const hn::Vec<D> kMinNormal = hn::Set(d, 127 - 23);
|
||||
const hn::Mask<D> is_zero = SignedLt(d, biased_e, kMinNormal);
|
||||
const hn::Mask<D> is_min = hn::Eq(biased_e, kMinNormal);
|
||||
|
||||
// 1.1110xxx * 2^-8 was considered small above, and thus rounded up to 2^-7,
|
||||
// which the decoder will consider large, and expect 3 mantissa bits. If we
|
||||
// set the threshold above to 1.111, then it does NOT round up. Thus we
|
||||
// check exponent >= -7 *after* rounding.
|
||||
const hn::Mask<D> is_large = SignedGt(d, biased_e, hn::Set(d, 127 - 8));
|
||||
|
||||
// To extract and pack the mantissa, only is_large matters. Either it
|
||||
// matches is_large_before_round, or the rounding resulted in mantissa=0, so
|
||||
// we either extract two or three bits by shifting out the lower 5..6 bits.
|
||||
// is_large_before  is_large  rounded     want
|
||||
//        0             0      0Cmm????    mm
|
||||
//        0             1      0100????    000
|
||||
//        1             0      impossible  -
|
||||
//        1             1      Cmmm???0    mmm
|
||||
hn::Vec<D> m = hn::ShiftRight<4>(carry_clear);
|
||||
HWY_DASSERT(hn::AllTrue(
|
||||
d, SignedLt(d, m,
|
||||
hn::IfThenElse(is_large, hn::Set(d, 8), hn::Set(d, 4)))));
|
||||
|
||||
// 1.0 * 2^-23 has the same encoding as zero, so round it up to 1.01.
|
||||
m = hn::MaskedMaxOr(m, is_min, m, k1);
|
||||
|
||||
const hn::Vec<D> e_bias = hn::IfThenElse(
|
||||
is_large,
|
||||
hn::Set(d, hwy::BitCastScalar<uint8_t>(static_cast<int8_t>(15 - 127))),
|
||||
hn::Set(d, hwy::BitCastScalar<uint8_t>(static_cast<int8_t>(23 - 127))));
|
||||
const hn::Vec<D> e = hn::Add(biased_e, e_bias);
|
||||
HWY_DASSERT(
|
||||
hn::AllTrue(d, hn::Lt(hn::IfThenZeroElse(is_zero, e), hn::Set(d, 16))));
|
||||
|
||||
// Shift exponent left 2 or 3 bits to make space for `m`.
|
||||
const hn::Vec<D> em =
|
||||
hn::Or(m, hn::ShiftLeft<2>(hn::MaskedAddOr(e, is_large, e, e)));
|
||||
HWY_DASSERT(hn::AllTrue(d, hn::Lt(hn::IfThenZeroElse(is_zero, em), k80)));
|
||||
const hn::Vec<D> encoded = hn::BitwiseIfThenElse(k80, sign_in_msb, em);
|
||||
// Doing this last ensures -0 is replaced with 0.
|
||||
return hn::IfThenZeroElse(is_zero, encoded);
|
||||
}
|
||||
|
||||
// Decodes u8 `encoded` into `lo` and `hi` bytes of bf16. 12 ops.
|
||||
// Implementation detail, public because called by test.
|
||||
template <class D, HWY_IF_U8_D(D)>
|
||||
static HWY_INLINE void DecBytes(D d, hn::Vec<D> encoded, hn::Vec<D>& lo,
|
||||
hn::Vec<D>& hi) {
|
||||
const hn::Vec<D> k0 = hn::Zero(d);
|
||||
const hn::Vec<D> k80 = hn::Set(d, 0x80u);
|
||||
|
||||
HWY_DASSERT(hn::AllTrue(d, hn::Ne(encoded, k80))); // -0 is reserved
|
||||
// Copy sign for later insertion via BitwiseIfThenElse.
|
||||
const hn::Vec<D> sign_in_msb = encoded;
|
||||
encoded = hn::AndNot(k80, encoded);
|
||||
|
||||
// Special-case zero, negated so we can use MaskedAddOr. Signed comparison
|
||||
// is fine because we have cleared the sign bit.
|
||||
const hn::Mask<D> is_nonzero = SignedGt(d, encoded, k0);
|
||||
// If MSB is clear, we have two mantissa bits, otherwise three.
|
||||
const hn::Mask<D> is_small_e = SignedLt(d, encoded, hn::Set(d, 64));
|
||||
// If is_small_e, add/left-shift 0xxxx.mm to 0xxxx.mm0; else keep 1xxx.mmm.
|
||||
const hn::Vec<D> e4m3 =
|
||||
hn::MaskedAddOr(encoded, is_small_e, encoded, encoded);
|
||||
HWY_DASSERT(hn::AllTrue(d, hn::Lt(e4m3, k80)));
|
||||
const hn::Vec<D> e = hn::ShiftRight<3>(e4m3); // 4-bit exponent only
|
||||
HWY_DASSERT(hn::AllTrue(d, hn::Lt(e, Set(d, 16u))));
|
||||
// The encoded exponent for 2^0 is 15, so subtract 15. Add 127 for the
|
||||
// binary32/bf16 bias. Subtract another 8 if is_small_e because its lowest
|
||||
// encoded value (0) should be less than the lowest 'large' exponent 2^-7.
|
||||
const hn::Vec<D> e_bias = hn::IfThenElse(
|
||||
is_small_e, hn::Set(d, 127u - 15u - 8u), hn::Set(d, 127u - 15u));
|
||||
// Special-case zero or add e_bias. If encoded=0, e and e4m3 are zero, but
|
||||
// we must zero e_bias to get the desired all-zero bf16.
|
||||
const hn::Vec<D> biased_e = hn::MaskedAddOr(k0, is_nonzero, e_bias, e);
|
||||
// The decoded binary32 exponent should be at most 2^0.
|
||||
HWY_DASSERT(hn::AllTrue(d, hn::Lt(biased_e, k80)));
|
||||
|
||||
// Shift the MSB of e4m3's mantissa into the MSB of the bf16 mantissa.
|
||||
const hn::Vec<D> m7 = hn::ShiftLeft<4>(e4m3);
|
||||
// Lower byte of bf16 = exponent LSB || mantissa.
|
||||
lo = hn::BitwiseIfThenElse(k80, hn::ShiftLeft<7>(biased_e), m7);
|
||||
// Upper byte of bf16 = sign || lower 7 bits of exponent.
|
||||
hi = hn::BitwiseIfThenElse(k80, sign_in_msb, hn::ShiftRight<1>(biased_e));
|
||||
}
|
||||
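// Note: sfp_test.cc defines a scalar decoder for this format (F32FromSFP8),
// which may be easier to follow when reasoning about the bit manipulation
// above.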
|
||||
// Encodes `num` bf16 values from `in_bf` to `out_packed`.
|
||||
template <class DBF, HWY_IF_BF16_D(DBF)>
|
||||
static HWY_INLINE void Enc(DBF dbf, const hwy::bfloat16_t* HWY_RESTRICT in_bf,
|
||||
size_t num, SfpStream* HWY_RESTRICT out_packed) {
|
||||
const hn::Repartition<uint8_t, DBF> d8;
|
||||
using V8 = hn::Vec<decltype(d8)>;
|
||||
const size_t N16 = hn::Lanes(dbf);
|
||||
|
||||
size_t i = 0;
|
||||
if (num >= 2 * N16) {
|
||||
HWY_UNROLL(1)
|
||||
for (; i <= num - 2 * N16; i += 2 * N16) {
|
||||
const V8 packed = Enc2B(dbf, in_bf + i);
|
||||
hn::StoreU(packed, d8, &out_packed->byte + i);
|
||||
}
|
||||
}
|
||||
|
||||
const size_t remaining = num - i;
|
||||
HWY_DASSERT(remaining < 2 * N16);
|
||||
if (remaining != 0) {
|
||||
HWY_ALIGN hwy::bfloat16_t padded[2 * hn::MaxLanes(dbf)];
|
||||
hwy::ZeroBytes(padded, sizeof(padded));
|
||||
hwy::CopyBytes(in_bf + i, padded, remaining * sizeof(padded[0]));
|
||||
const V8 packed = Enc2B(dbf, padded);
|
||||
hn::StoreN(packed, d8, &out_packed->byte + i, remaining);
|
||||
}
|
||||
}
|
||||
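// Example (illustrative; `weights_bf16` and `sfp` are placeholder pointers to
// `num` bf16 values and at least `num` SfpStream bytes):
//   const hn::ScalableTag<hwy::bfloat16_t> dbf;
//   SfpCodec::Enc(dbf, weights_bf16, num, sfp);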
|
||||
// Encodes `num` f32 values from `in_f` to `packed`.
|
||||
template <class DF, HWY_IF_F32_D(DF)>
|
||||
static HWY_INLINE void Enc(DF df, const float* HWY_RESTRICT in_f, size_t num,
|
||||
SfpStream* HWY_RESTRICT out_packed) {
|
||||
const hn::Repartition<uint8_t, DF> d8;
|
||||
using V8 = hn::Vec<decltype(d8)>;
|
||||
const size_t NF = hn::Lanes(df);
|
||||
|
||||
size_t i = 0;
|
||||
if (num >= 4 * NF) {
|
||||
HWY_UNROLL(1)
|
||||
for (; i <= num - 4 * NF; i += 4 * NF) {
|
||||
const V8 packed = Enc4F(df, in_f + i);
|
||||
hn::StoreU(packed, d8, &out_packed->byte + i);
|
||||
}
|
||||
}
|
||||
|
||||
const size_t remaining = num - i;
|
||||
HWY_DASSERT(remaining < 4 * NF);
|
||||
if (remaining != 0) {
|
||||
HWY_ALIGN float padded[4 * hn::MaxLanes(df)];
|
||||
hwy::ZeroBytes(padded, sizeof(padded));
|
||||
hwy::CopyBytes(in_f + i, padded, remaining * sizeof(padded[0]));
|
||||
const V8 packed = Enc4F(df, padded);
|
||||
hn::StoreN(packed, d8, &out_packed->byte + i, remaining);
|
||||
}
|
||||
}
|
||||
|
||||
// Decodes `num` values from `in_packed` to `out_bf`.
|
||||
template <class DBF, HWY_IF_BF16_D(DBF)>
|
||||
static HWY_INLINE void Dec(DBF dbf, const SfpStream* HWY_RESTRICT in_packed,
|
||||
size_t num, hwy::bfloat16_t* HWY_RESTRICT out_bf) {
|
||||
const hn::Repartition<uint8_t, DBF> d8;
|
||||
using V8 = hn::Vec<decltype(d8)>;
|
||||
using VBF = hn::Vec<decltype(dbf)>;
|
||||
const size_t N16 = hn::Lanes(dbf);
|
||||
|
||||
size_t i = 0;
|
||||
if (num >= 2 * N16) {
|
||||
HWY_UNROLL(1)
|
||||
for (; i <= num - 2 * N16; i += 2 * N16) {
|
||||
const V8 packed = hn::LoadU(d8, &in_packed->byte + i);
|
||||
VBF bf0, bf1;
|
||||
Dec2B(dbf, packed, bf0, bf1);
|
||||
hn::StoreU(bf0, dbf, out_bf + i);
|
||||
hn::StoreU(bf1, dbf, out_bf + i + N16);
|
||||
}
|
||||
}
|
||||
|
||||
const size_t remaining = num - i;
|
||||
HWY_DASSERT(remaining < 2 * N16);
|
||||
if (remaining != 0) {
|
||||
const V8 packed = hn::LoadN(d8, &in_packed->byte + i, remaining);
|
||||
HWY_ALIGN hwy::bfloat16_t padded[2 * hn::MaxLanes(dbf)];
|
||||
VBF bf0, bf1;
|
||||
Dec2B(dbf, packed, bf0, bf1);
|
||||
hn::StoreU(bf0, dbf, padded);
|
||||
hn::StoreU(bf1, dbf, padded + N16);
|
||||
hwy::CopyBytes(padded, out_bf + i, remaining * sizeof(padded[0]));
|
||||
}
|
||||
}
|
||||
|
||||
// Decodes `num` values from `in_packed` to `out_f`.
|
||||
template <class DF, HWY_IF_F32_D(DF)>
|
||||
static HWY_INLINE void Dec(DF df, const SfpStream* HWY_RESTRICT in_packed,
|
||||
size_t num, float* HWY_RESTRICT out_f) {
|
||||
const hn::Repartition<uint8_t, DF> d8;
|
||||
using V8 = hn::Vec<decltype(d8)>;
|
||||
using VF = hn::Vec<decltype(df)>;
|
||||
const size_t NF = hn::Lanes(df);
|
||||
|
||||
size_t i = 0;
|
||||
if (num >= 4 * NF) {
|
||||
HWY_UNROLL(1)
|
||||
for (; i <= num - 4 * NF; i += 4 * NF) {
|
||||
const V8 packed = hn::LoadU(d8, &in_packed->byte + i);
|
||||
VF f0, f1, f2, f3;
|
||||
Dec4F(df, packed, f0, f1, f2, f3);
|
||||
hn::StoreU(f0, df, out_f + i + NF * 0);
|
||||
hn::StoreU(f1, df, out_f + i + NF * 1);
|
||||
hn::StoreU(f2, df, out_f + i + NF * 2);
|
||||
hn::StoreU(f3, df, out_f + i + NF * 3);
|
||||
}
|
||||
}
|
||||
|
||||
const size_t remaining = num - i;
|
||||
HWY_DASSERT(remaining < 4 * NF);
|
||||
if (remaining != 0) {
|
||||
const V8 packed = hn::LoadN(d8, &in_packed->byte + i, remaining);
|
||||
HWY_ALIGN float padded[4 * hn::MaxLanes(df)];
|
||||
VF f0, f1, f2, f3;
|
||||
Dec4F(df, packed, f0, f1, f2, f3);
|
||||
hn::StoreU(f0, df, padded + NF * 0);
|
||||
hn::StoreU(f1, df, padded + NF * 1);
|
||||
hn::StoreU(f2, df, padded + NF * 2);
|
||||
hn::StoreU(f3, df, padded + NF * 3);
|
||||
hwy::CopyBytes(padded, out_f + i, remaining * sizeof(padded[0]));
|
||||
}
|
||||
}
|
||||
|
||||
// Fused decode and dot product with bf16 into four output accumulators.
|
||||
template <class DF, HWY_IF_F32_D(DF)>
|
||||
static HWY_INLINE void Dot(DF df, const SfpStream* HWY_RESTRICT in_packed,
|
||||
size_t num,
|
||||
const hwy::bfloat16_t* HWY_RESTRICT vec_aligned,
|
||||
hn::Vec<DF>& sum0, hn::Vec<DF>& sum1,
|
||||
hn::Vec<DF>& sum2, hn::Vec<DF>& sum3) {
|
||||
const hn::Repartition<uint8_t, DF> d8;
|
||||
const hn::Repartition<hwy::bfloat16_t, DF> dbf;
|
||||
using V8 = hn::Vec<decltype(d8)>;
|
||||
using VBF = hn::Vec<decltype(dbf)>;
|
||||
const size_t N16 = hn::Lanes(dbf);
|
||||
|
||||
size_t i = 0;
|
||||
if (num >= 2 * N16) {
|
||||
HWY_UNROLL(1)
|
||||
for (; i <= num - 2 * N16; i += 2 * N16) {
|
||||
const V8 packed = hn::LoadU(d8, &in_packed->byte + i);
|
||||
const VBF v0 = hn::LoadU(dbf, vec_aligned + i);
|
||||
const VBF v1 = hn::LoadU(dbf, vec_aligned + i + N16);
|
||||
VBF bf0, bf1;
|
||||
Dec2B(dbf, packed, bf0, bf1);
|
||||
sum0 = hn::ReorderWidenMulAccumulate(df, bf0, v0, sum0, sum1);
|
||||
sum2 = hn::ReorderWidenMulAccumulate(df, bf1, v1, sum2, sum3);
|
||||
}
|
||||
}
|
||||
|
||||
const size_t remaining = num - i;
|
||||
if (remaining != 0) {
|
||||
const V8 packed = hn::LoadN(d8, &in_packed->byte + i, remaining);
|
||||
HWY_ALIGN hwy::bfloat16_t padded[2 * hn::MaxLanes(dbf)];
|
||||
hwy::ZeroBytes(padded, sizeof(padded));
|
||||
hwy::CopyBytes(vec_aligned + i, padded, remaining * sizeof(padded[0]));
|
||||
const VBF v0 = hn::LoadU(dbf, padded);
|
||||
const VBF v1 = hn::LoadU(dbf, padded + N16);
|
||||
VBF bf0, bf1;
|
||||
Dec2B(dbf, packed, bf0, bf1);
|
||||
sum0 = hn::ReorderWidenMulAccumulate(df, bf0, v0, sum0, sum1);
|
||||
sum2 = hn::ReorderWidenMulAccumulate(df, bf1, v1, sum2, sum3);
|
||||
}
|
||||
}
|
||||
|
||||
// Fused decode and dot product with f32 into four output accumulators.
|
||||
template <class DF, HWY_IF_F32_D(DF)>
|
||||
static HWY_INLINE void Dot(DF df, const SfpStream* HWY_RESTRICT in_packed,
|
||||
size_t num, const float* HWY_RESTRICT vec_aligned,
|
||||
hn::Vec<DF>& sum0, hn::Vec<DF>& sum1,
|
||||
hn::Vec<DF>& sum2, hn::Vec<DF>& sum3) {
|
||||
const hn::Repartition<uint8_t, DF> d8;
|
||||
using V8 = hn::Vec<decltype(d8)>;
|
||||
using VF = hn::Vec<decltype(df)>;
|
||||
const size_t NF = hn::Lanes(df);
|
||||
|
||||
size_t i = 0;
|
||||
if (num >= 4 * NF) {
|
||||
HWY_UNROLL(1)
|
||||
for (; i <= num - 4 * NF; i += 4 * NF) {
|
||||
const V8 packed = hn::LoadU(d8, &in_packed->byte + i);
|
||||
const VF v0 = hn::LoadU(df, vec_aligned + i + NF * 0);
|
||||
const VF v1 = hn::LoadU(df, vec_aligned + i + NF * 1);
|
||||
const VF v2 = hn::LoadU(df, vec_aligned + i + NF * 2);
|
||||
const VF v3 = hn::LoadU(df, vec_aligned + i + NF * 3);
|
||||
VF f0, f1, f2, f3;
|
||||
Dec4F(df, packed, f0, f1, f2, f3);
|
||||
sum0 = hn::MulAdd(f0, v0, sum0);
|
||||
sum1 = hn::MulAdd(f1, v1, sum1);
|
||||
sum2 = hn::MulAdd(f2, v2, sum2);
|
||||
sum3 = hn::MulAdd(f3, v3, sum3);
|
||||
}
|
||||
}
|
||||
|
||||
const size_t remaining = num - i;
|
||||
if (remaining != 0) {
|
||||
const V8 packed = hn::LoadN(d8, &in_packed->byte + i, remaining);
|
||||
HWY_ALIGN float padded[4 * hn::MaxLanes(df)];
|
||||
hwy::ZeroBytes(padded, sizeof(padded));
|
||||
hwy::CopyBytes(vec_aligned + i, padded, remaining * sizeof(padded[0]));
|
||||
const VF v0 = hn::LoadU(df, padded + NF * 0);
|
||||
const VF v1 = hn::LoadU(df, padded + NF * 1);
|
||||
const VF v2 = hn::LoadU(df, padded + NF * 2);
|
||||
const VF v3 = hn::LoadU(df, padded + NF * 3);
|
||||
VF f0, f1, f2, f3;
|
||||
Dec4F(df, packed, f0, f1, f2, f3);
|
||||
sum0 = hn::MulAdd(f0, v0, sum0);
|
||||
sum1 = hn::MulAdd(f1, v1, sum1);
|
||||
sum2 = hn::MulAdd(f2, v2, sum2);
|
||||
sum3 = hn::MulAdd(f3, v3, sum3);
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
// Wrappers to avoid code duplication across float/bf16 input types and
|
||||
// the main loop/remainder.
|
||||
|
||||
// Returns vector of packed bytes for callers to StoreU or StoreN.
|
||||
template <class D16, HWY_IF_U16_D(D16),
|
||||
class V8 = hn::Vec<hn::Repartition<uint8_t, D16>>>
|
||||
static HWY_INLINE V8 Enc2U(D16 d16, const hn::Vec<D16> w0,
|
||||
const hn::Vec<D16> w1) {
|
||||
const hn::Repartition<uint8_t, D16> d8;
|
||||
|
||||
// Although more expensive on AVX3, in-order packing enables streaming
|
||||
// decompression without fixed-size packets.
|
||||
const V8 lo = hn::ConcatEven(d8, hn::BitCast(d8, w1), hn::BitCast(d8, w0));
|
||||
const V8 hi = hn::ConcatOdd(d8, hn::BitCast(d8, w1), hn::BitCast(d8, w0));
|
||||
return EncBytes(d8, lo, hi);
|
||||
}
|
||||
|
||||
template <class DBF, HWY_IF_BF16_D(DBF),
|
||||
class V8 = hn::Vec<hn::Repartition<uint8_t, DBF>>>
|
||||
static HWY_INLINE V8 Enc2B(DBF dbf, const hwy::bfloat16_t* HWY_RESTRICT in) {
|
||||
const hn::Repartition<uint16_t, DBF> d16;
|
||||
const size_t N16 = hn::Lanes(d16);
|
||||
using V16 = hn::Vec<decltype(d16)>;
|
||||
|
||||
const V16 w0 = hn::BitCast(d16, hn::LoadU(dbf, in));
|
||||
const V16 w1 = hn::BitCast(d16, hn::LoadU(dbf, in + N16));
|
||||
return Enc2U(d16, w0, w1);
|
||||
}
|
||||
|
||||
template <class DF, HWY_IF_F32_D(DF),
|
||||
class V8 = hn::Vec<hn::Repartition<uint8_t, DF>>>
|
||||
static HWY_INLINE V8 Enc4F(DF df, const float* HWY_RESTRICT in) {
|
||||
const hn::Repartition<uint16_t, DF> d16;
|
||||
const hn::Repartition<hwy::bfloat16_t, DF> dbf;
|
||||
using VF = hn::Vec<decltype(df)>;
|
||||
using V16 = hn::Vec<decltype(d16)>;
|
||||
const size_t NF = hn::Lanes(df);
|
||||
|
||||
const VF f0 = hn::LoadU(df, in + NF * 0);
|
||||
const VF f1 = hn::LoadU(df, in + NF * 1);
|
||||
const VF f2 = hn::LoadU(df, in + NF * 2);
|
||||
const VF f3 = hn::LoadU(df, in + NF * 3);
|
||||
// Chop off the lower 16 bits; EncBytes still rounds properly.
|
||||
const V16 w0 = hn::BitCast(d16, hn::OrderedDemote2To(dbf, f0, f1));
|
||||
const V16 w1 = hn::BitCast(d16, hn::OrderedDemote2To(dbf, f2, f3));
|
||||
return Enc2U(d16, w0, w1);
|
||||
}
|
||||
|
||||
template <class D16, HWY_IF_U16_D(D16),
|
||||
class V8 = hn::Vec<hn::Repartition<uint8_t, D16>>>
|
||||
static HWY_INLINE void Dec2U(D16 d16, V8 packed, hn::Vec<D16>& w0,
|
||||
hn::Vec<D16>& w1) {
|
||||
const hn::Repartition<uint8_t, D16> d8;
|
||||
V8 lo, hi;
|
||||
DecBytes(d8, packed, lo, hi);
|
||||
w0 = hn::BitCast(d16, hn::InterleaveWholeLower(d8, lo, hi));
|
||||
w1 = hn::BitCast(d16, hn::InterleaveWholeUpper(d8, lo, hi));
|
||||
}
|
||||
|
||||
template <class DBF, HWY_IF_BF16_D(DBF),
|
||||
class V8 = hn::Vec<hn::Repartition<uint8_t, DBF>>>
|
||||
static HWY_INLINE void Dec2B(DBF dbf, V8 packed, hn::Vec<DBF>& bf0,
|
||||
hn::Vec<DBF>& bf1) {
|
||||
const hn::Repartition<uint16_t, DBF> d16;
|
||||
using V16 = hn::Vec<decltype(d16)>;
|
||||
V16 w0, w1;
|
||||
Dec2U(d16, packed, w0, w1);
|
||||
bf0 = hn::BitCast(dbf, w0);
|
||||
bf1 = hn::BitCast(dbf, w1);
|
||||
}
|
||||
|
||||
template <class DF, HWY_IF_F32_D(DF),
|
||||
class V8 = hn::Vec<hn::Repartition<uint8_t, DF>>>
|
||||
static HWY_INLINE void Dec4F(DF df, V8 packed, hn::Vec<DF>& f0,
|
||||
hn::Vec<DF>& f1, hn::Vec<DF>& f2,
|
||||
hn::Vec<DF>& f3) {
|
||||
const hn::Repartition<hwy::bfloat16_t, DF> dbf;
|
||||
using VBF = hn::Vec<decltype(dbf)>;
|
||||
VBF bf0, bf1;
|
||||
Dec2B(dbf, packed, bf0, bf1);
|
||||
f0 = hn::PromoteLowerTo(df, bf0);
|
||||
f1 = hn::PromoteUpperTo(df, bf0);
|
||||
f2 = hn::PromoteLowerTo(df, bf1);
|
||||
f3 = hn::PromoteUpperTo(df, bf1);
|
||||
}
|
||||
}; // SfpCodec
|
||||
|
||||
// NOLINTNEXTLINE(google-readability-namespace-comments)
|
||||
} // namespace HWY_NAMESPACE
|
||||
} // namespace gcpp
|
||||
HWY_AFTER_NAMESPACE();
|
||||
|
||||
#endif // THIRD_PARTY_GEMMA_CPP_COMPRESSION_SFP_INL_H_
|
||||
|
|
@ -0,0 +1,51 @@
|
|||
// Copyright 2023 Google LLC
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#ifndef THIRD_PARTY_GEMMA_CPP_COMPRESSION_SFP_H_
|
||||
#define THIRD_PARTY_GEMMA_CPP_COMPRESSION_SFP_H_
|
||||
|
||||
// Switching Floating Point: a hybrid 8-bit float representation of bf16/f32
|
||||
// inputs that combines the advantages of e4m3 and e5m2 into a single format.
|
||||
// It supports seeking at a granularity of 1, decoding to bf16/f32, and a
|
||||
// fused decode/dot product with bf16/f32 vectors.
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
namespace gcpp {
|
||||
|
||||
// Points to the *start* of an SFP stream. Values are stored in-order to enable
|
||||
// vector-length agnostic seeking, because streams may be written to disk for
|
||||
// loading on other CPUs.
|
||||
//
|
||||
// Characteristics:
|
||||
// - 24-bit dynamic range, with max exponent 2^0.
|
||||
// - 3 bit mantissa for values >= 2^-7, otherwise 2.
|
||||
//
|
||||
// This is faster to decode than a straightforward implementation of eXmY, in
|
||||
// part because SFP does not require subnormals. Unlike OCP MX, it also does not
|
||||
// require side information (shared exponents).
|
||||
//
|
||||
// Although the representation could probably be shrunk to 6-7 bits, more
|
||||
// savings can be had by non-uniform clustering - see nuq.h.
|
||||
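// Illustrative bit layout (a sketch derived from SfpCodec and the scalar
// decoder F32FromSFP8 in sfp_test.cc, not a normative spec): bit 7 is the
// sign; the remaining 7 bits hold the exponent and mantissa. If those 7 bits
// are >= 64, the value is "large" (>= 2^-7) and carries a 3-bit mantissa,
// otherwise a 2-bit mantissa. For example, 0x7F encodes +1.875 (the maximum
// magnitude), 0x40 encodes 2^-7, 0x00 encodes zero, and 0x80 (-0) is reserved.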
#pragma pack(push, 1)
|
||||
struct SfpStream {
|
||||
uint8_t byte;
|
||||
};
|
||||
#pragma pack(pop)
|
||||
|
||||
static inline const char* TypeName(SfpStream) { return "SFP"; }
|
||||
|
||||
} // namespace gcpp
|
||||
#endif // THIRD_PARTY_GEMMA_CPP_COMPRESSION_SFP_H_
|
||||
|
|
@ -0,0 +1,440 @@
|
|||
// Copyright 2023 Google LLC
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
// copybara:import_next_line:gemma_cpp
|
||||
#include "compression/sfp.h"
|
||||
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
#include <stdio.h>
|
||||
|
||||
#include <algorithm>
|
||||
#include <random>
|
||||
#include <set>
|
||||
|
||||
#include "hwy/aligned_allocator.h"
|
||||
#include "hwy/base.h"
|
||||
|
||||
#undef HWY_TARGET_INCLUDE
|
||||
#define HWY_TARGET_INCLUDE \
|
||||
"third_party/gemma_cpp/compression/sfp_test.cc" // NOLINT
|
||||
#include "hwy/foreach_target.h" // IWYU pragma: keep
|
||||
// Any highway.h must come after foreach_target.h
|
||||
// copybara:import_next_line:gemma_cpp
|
||||
#include "compression/distortion.h"
|
||||
// copybara:import_next_line:gemma_cpp
|
||||
#include "compression/sfp-inl.h"
|
||||
#include "hwy/highway.h"
|
||||
#include "hwy/tests/hwy_gtest.h"
|
||||
#include "hwy/tests/test_util-inl.h"
|
||||
#include "hwy/timer.h"
|
||||
|
||||
HWY_BEFORE_NAMESPACE();
|
||||
namespace gcpp {
|
||||
namespace HWY_NAMESPACE {
|
||||
|
||||
// Decode
|
||||
float F32FromSFP8(uint32_t sfp) {
|
||||
HWY_ASSERT(sfp < 256);
|
||||
HWY_ASSERT(sfp != 0x80); // -0 is reserved
|
||||
|
||||
const uint32_t sign32 = (sfp & 0x80) << 24;
|
||||
sfp &= 0x7F;
|
||||
const bool large_e = sfp >= 64;
|
||||
const size_t m_bits = large_e ? 3 : 2;
|
||||
uint32_t m = sfp & ((1u << m_bits) - 1u);
|
||||
size_t e = sfp >> m_bits;
|
||||
if (sfp == 0) return 0.0f;
|
||||
const uint32_t e_bias = large_e ? 15 : 23;
|
||||
const uint32_t exp32 = static_cast<uint32_t>(127 + e - e_bias) << 23;
|
||||
const uint32_t mnt32 = m << (23 - m_bits);
|
||||
const uint32_t binary32 = sign32 | exp32 | mnt32;
|
||||
float result;
|
||||
hwy::CopySameSize(&binary32, &result);
|
||||
return result;
|
||||
}
|
||||
|
||||
void TestAllUnique() {
|
||||
std::set<float> unique;
|
||||
for (uint32_t sfp = 0; sfp < 256; ++sfp) {
|
||||
if (sfp == 0x80) continue; // -0 is reserved
|
||||
unique.insert(F32FromSFP8(sfp));
|
||||
}
|
||||
HWY_ASSERT_EQ(size_t{255}, unique.size());
|
||||
if (false) {
|
||||
for (float f : unique) {
|
||||
fprintf(stderr, "%e\n", f);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// ------------------------------ Foreach compressed representation
|
||||
|
||||
// Encode
|
||||
HWY_INLINE uint32_t SFP8FromF32(float f) {
|
||||
HWY_ASSERT(-1.875f <= f && f <= 1.875f);
|
||||
|
||||
constexpr uint32_t kMaskM = hwy::MantissaMask<float>();
|
||||
uint32_t binary32;
|
||||
hwy::CopySameSize(&f, &binary32);
|
||||
const uint32_t s = (binary32 & hwy::SignMask<float>()) >> 24;
|
||||
binary32 &= ~hwy::SignMask<float>();
|
||||
f = hwy::ScalarAbs(f);
|
||||
|
||||
// >= 1.1111 * 2^-8 rounds up to 1.0*2^-7.
|
||||
bool large_e = (f >= 0.007568359375f);
|
||||
|
||||
const uint32_t org_binary32 = binary32;
|
||||
const uint32_t m32 = binary32 & kMaskM;
|
||||
binary32 = (binary32 & ~kMaskM) | m32;
|
||||
size_t m_bits = large_e ? 3 : 2;
|
||||
const uint32_t is_odd = (m32 >> (23 - m_bits)) & 1;
|
||||
const uint32_t round = is_odd + (1u << (23 - m_bits - 1)) - 1;
|
||||
const uint32_t rounded = binary32 + round;
|
||||
|
||||
// >= 1.111 also rounds up, but only if it was considered !large_e before.
|
||||
if (f >= 0.00732421875f) {
|
||||
large_e = true;
|
||||
m_bits = 3;
|
||||
}
|
||||
|
||||
uint32_t m = (kMaskM & rounded) >> (23 - m_bits);
|
||||
int32_t e = (rounded >> 23) - 127;
|
||||
|
||||
if (e <= -23) {
|
||||
// 2^-23 is the smallest normal exponent. Zero has e = -127. Do not set the
|
||||
// SFP sign bit because the encoding for -0 is reserved.
|
||||
if (e < -23) return 0;
|
||||
// e = 2^-23: round up mantissa because m=0 encodes 0.0f.
|
||||
if (m == 0) m = 1;
|
||||
}
|
||||
|
||||
if (false) {
|
||||
fprintf(stderr, "in %x round %x rounded %x e %d m %x large_e %d\n",
|
||||
org_binary32, round, rounded, e, m, large_e);
|
||||
}
|
||||
uint32_t e_sfp = e + (large_e ? 15 : 23);
|
||||
HWY_ASSERT(e_sfp < 16);
|
||||
|
||||
const uint32_t encoded = (e_sfp << m_bits) | m | s;
|
||||
HWY_ASSERT(encoded < 256);
|
||||
return encoded;
|
||||
}
|
||||
|
||||
// For every possible encoding: ensure re-encoding the decoded value matches it.
|
||||
struct TestDecEnc {
|
||||
template <class T, class D>
|
||||
HWY_INLINE void operator()(T /*unused*/, D d) {
|
||||
const hn::RepartitionToWide<D> d16;
|
||||
const hn::Rebind<hwy::bfloat16_t, decltype(d16)> dbf;
|
||||
const hn::Repartition<float, D> df;
|
||||
for (uint32_t encoded = 0; encoded < 256; ++encoded) {
|
||||
if (encoded == 0x80) continue; // -0 is reserved
|
||||
const float decoded = F32FromSFP8(encoded);
|
||||
const uint32_t encoded2 = SFP8FromF32(decoded);
|
||||
|
||||
hn::Vec<D> dec_lo, dec_hi;
|
||||
SfpCodec::DecBytes(d, hn::Set(d, encoded), dec_lo, dec_hi);
|
||||
const hn::Vec<decltype(dbf)> dec =
|
||||
hn::BitCast(dbf, hn::ZipLower(d16, dec_lo, dec_hi));
|
||||
const float vdecoded = hn::GetLane(hn::PromoteLowerTo(df, dec));
|
||||
const uint32_t vencoded2 =
|
||||
hn::GetLane(SfpCodec::EncBytes(d, dec_lo, dec_hi));
|
||||
|
||||
if (decoded != vdecoded || encoded2 != vencoded2 || encoded != encoded2) {
|
||||
HWY_ABORT("enc %u -> dec %E=%x=%E -> enc %u %u\n", encoded, decoded,
|
||||
hwy::BitCastScalar<uint32_t>(decoded), vdecoded, encoded2,
|
||||
vencoded2);
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
void TestAllDecEnc() { hn::ForGEVectors<32, TestDecEnc>()(uint8_t()); }
|
||||
|
||||
// ------------------------------ Golden (known values)
|
||||
|
||||
// Encode known input values, decode, and compare against the expected golden values.
|
||||
struct TestGolden {
|
||||
template <class T, class D>
|
||||
HWY_INLINE void operator()(T /*unused*/, D d) {
|
||||
const hn::Repartition<float, D> df;
|
||||
const hn::Repartition<hwy::bfloat16_t, D> dbf;
|
||||
const hn::RebindToUnsigned<decltype(dbf)> d16;
|
||||
|
||||
struct Golden {
|
||||
float in;
|
||||
float out;
|
||||
};
|
||||
const Golden golden[] = {
|
||||
// All mantissa bits set, all discarded zero (no rounding)
|
||||
{0.46875f, 0.46875f},
|
||||
{0.9375f, 0.9375f},
|
||||
// All mantissa bits set, one below it set (round up to pow2)
|
||||
{0.484375f, 0.5f},
|
||||
{0.96875f, 1.0f},
|
||||
// Lowest mantissa bit set, all discarded zero (no rounding)
|
||||
{0.28125f, 0.28125f},
|
||||
{0.5625f, 0.5625f},
|
||||
// Lowest mantissa bit set, one below it set (round up to even)
|
||||
{0.296875f, 0.3125f},
|
||||
{0.59375f, 0.625f},
|
||||
// All mantissa zero, all discarded set (round up)
|
||||
{0.279296875f, 0.28125f},
|
||||
{0.55859375f, 0.5625f},
|
||||
// All mantissa zero, one below it set (round DOWN to pow2)
|
||||
{0.265625f, 0.25f},
|
||||
{0.53125f, 0.5f},
|
||||
|
||||
// At inflection point: 1.max*2^-8 rounds up to 1.0*2^-7
|
||||
{0.0068359375f, 0.0068359375f}, // 1.11 -> 1.11
|
||||
{0.00732421875f, 0.0078125f}, // 1.111 -> 1.11[1] -> 1.0
|
||||
{0.007568359375f, 0.0078125f}, // 1.1111 -> 1.0
|
||||
|
||||
// Above 1.0: no longer special-cased.
|
||||
{1.0f, 1.0f},
|
||||
{1.0625f, 1.0f}, // 1.000100
|
||||
|
||||
// Smallest normal exponents - we no longer use subnormals.
|
||||
{2.384185791015625E-7f, 2.384185791015625E-7f}, // 1.00p-22
|
||||
{1.49011611938E-07f, 1.49011611938E-07f}, // 1.01p-23
|
||||
{1.19209289551E-07f, 1.49011611938E-07f}, // 1.00p-23 -> 1.01p-23
|
||||
{5.96046447754E-08f, 0.0f}, // 1.00p-24 -> 0
|
||||
{8.94069671631E-08f, 0.0f}, // 1.10p-24 -> 0
|
||||
{1.11758708954E-07f, 1.49011611938E-07f}, // 1.111p-24-> 1.01p-23
|
||||
|
||||
// 1100_010 * 2^-7 rounds down to 110
|
||||
{0.013841f, 0.013671875f},
|
||||
};
|
||||
constexpr size_t kNumGolden = sizeof(golden) / sizeof(Golden);
|
||||
for (uint32_t s : {0, 1}) {
|
||||
for (size_t i = 0; i < kNumGolden; ++i) {
|
||||
const float in = s ? -golden[i].in : golden[i].in;
|
||||
const float out = s ? -golden[i].out : golden[i].out;
|
||||
const hn::Vec<decltype(dbf)> in_bf =
|
||||
hn::OrderedDemote2To(dbf, hn::Set(df, in), hn::Set(df, in));
|
||||
const uint32_t encoded = SFP8FromF32(in);
|
||||
const uint32_t vencoded = hn::GetLane(SfpCodec::EncBytes(
|
||||
d, hn::BitCast(d, in_bf),
|
||||
hn::BitCast(d, hn::ShiftRight<8>(hn::BitCast(d16, in_bf)))));
|
||||
const float decoded = F32FromSFP8(encoded);
|
||||
hn::Vec<D> dec_lo, dec_hi;
|
||||
SfpCodec::DecBytes(d, hn::Set(d, encoded), dec_lo, dec_hi);
|
||||
const hn::Vec<decltype(dbf)> dec =
|
||||
hn::BitCast(dbf, hn::ZipLower(d16, dec_lo, dec_hi));
|
||||
const float vdecoded = hn::GetLane(hn::PromoteLowerTo(df, dec));
|
||||
|
||||
if (decoded != vdecoded || decoded != out || encoded != vencoded) {
|
||||
HWY_ABORT("@%zu in %E dec %E %E golden %E\n", i, in, decoded,
|
||||
vdecoded, golden[i].out);
|
||||
}
|
||||
} // i
|
||||
} // s
|
||||
}
|
||||
};
|
||||
|
||||
void TestAllGolden() {
|
||||
// Full vectors only, other tests cover partial vectors.
|
||||
TestGolden()(uint8_t(), hn::ScalableTag<uint8_t>());
|
||||
}
|
||||
|
||||
// ------------------------------ Foreach bf16 input
|
||||
|
||||
// Generate all values, encode, decode back.
|
||||
struct TestEncDec {
|
||||
template <class T, class DBF>
|
||||
HWY_INLINE void operator()(T /*unused*/, DBF dbf) {
|
||||
const hn::Repartition<uint8_t, DBF> du8;
|
||||
|
||||
// We only use the upper 4 of 7 bf16 mantissa bits, so force the lower three
|
||||
// bits to zero to reduce the number of inputs.
|
||||
constexpr size_t kStep = 8;
|
||||
const size_t max = 0x8000 / 8;
|
||||
|
||||
auto in = hwy::AllocateAligned<T>(max);
|
||||
auto packed = hwy::AllocateAligned<SfpStream>(max);
|
||||
auto dec = hwy::AllocateAligned<T>(max);
|
||||
HWY_ASSERT(in && packed && dec);
|
||||
size_t num = 0;
|
||||
for (size_t i = 0; i < max; ++i) {
|
||||
const uint16_t bits = i * kStep;
|
||||
const float f = hwy::F32FromBF16(hwy::BitCastScalar<T>(bits));
|
||||
// Keep if within range
|
||||
if (hwy::ScalarIsFinite(f) && f <= 1.875f) {
|
||||
in[num] = hwy::BF16FromF32(f);
|
||||
in[num + 1] = hwy::BF16FromF32(-f);
|
||||
num += 2;
|
||||
}
|
||||
}
|
||||
|
||||
double enc_elapsed = hwy::HighestValue<double>();
|
||||
double dec_elapsed = hwy::HighestValue<double>();
|
||||
for (size_t rep = 0; rep < 100; ++rep) {
|
||||
const double t0 = hwy::platform::Now();
|
||||
SfpCodec::Enc(dbf, in.get(), num, packed.get());
|
||||
const double t1 = hwy::platform::Now();
|
||||
SfpCodec::Dec(dbf, packed.get(), num, dec.get());
|
||||
const double t2 = hwy::platform::Now();
|
||||
enc_elapsed = HWY_MIN(enc_elapsed, t1 - t0);
|
||||
dec_elapsed = HWY_MIN(dec_elapsed, t2 - t1);
|
||||
}
|
||||
const double enc_mbs = num * sizeof(T) * 1E-6 / enc_elapsed;
|
||||
const double dec_mbs = num * sizeof(T) * 1E-6 / dec_elapsed;
|
||||
fprintf(stderr, "Vec size %zu Enc %.2f MB/s Dec %.2f MB/s\n", Lanes(du8),
|
||||
enc_mbs, dec_mbs);
|
||||
|
||||
{
|
||||
double sum = 0.0;
|
||||
DistortionStats stats;
|
||||
for (size_t i = 0; i < num; ++i) {
|
||||
const float out = hwy::F32FromBF16(dec[i]);
|
||||
sum += hwy::ConvertScalarTo<double>(hwy::ScalarAbs(in[i]));
|
||||
stats.Notify(in[i], out);
|
||||
}
|
||||
const double avg = sum / num;
|
||||
fprintf(stderr, "Avg magnitude %.3E, p-norm %.3E snr %.2f @%zu = %.4E\n",
|
||||
avg, stats.PNorm(), stats.GeomeanValueDivL1(), stats.MaxIndex(),
|
||||
stats.MaxL1());
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
void TestAllEncDec() { hn::ForGEVectors<32, TestEncDec>()(hwy::bfloat16_t()); }
|
||||
|
||||
// ------------------------------ Order
|
||||
|
||||
// Store 8-bit iota, decode, encode, check iota == packed. This ensures
|
||||
// Enc/Dec preserve element order independently of the vector length.
|
||||
struct TestOrder {
|
||||
template <class T, class DBF>
|
||||
HWY_INLINE void operator()(T /*unused*/, DBF dbf) {
|
||||
const hn::Repartition<uint8_t, DBF> du8;
|
||||
|
||||
const size_t num = 10 * hn::Lanes(du8) / 3;
|
||||
|
||||
auto iota = hwy::AllocateAligned<SfpStream>(num);
|
||||
auto packed = hwy::AllocateAligned<SfpStream>(num);
|
||||
auto bf = hwy::AllocateAligned<hwy::bfloat16_t>(num);
|
||||
HWY_ASSERT(iota && packed && bf);
|
||||
for (size_t i = 0; i < num; ++i) {
|
||||
// Clear sign bit so we can also check that bf is in ascending order.
|
||||
iota[i].byte = i & 127;
|
||||
}
|
||||
|
||||
SfpCodec::Dec(dbf, iota.get(), num, bf.get());
|
||||
SfpCodec::Enc(dbf, bf.get(), num, packed.get());
|
||||
|
||||
for (size_t i = 0; i < num; ++i) {
|
||||
if (iota[i].byte != packed[i].byte) {
|
||||
HWY_ABORT("@%zu: %d %d\n", i, iota[i].byte, packed[i].byte);
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
void TestAllOrder() { hn::ForGEVectors<32, TestOrder>()(hwy::bfloat16_t()); }
|
||||
|
||||
// ------------------------------ Dot
|
||||
|
||||
struct TestDot {
|
||||
template <typename T, class D>
|
||||
HWY_INLINE void operator()(T /*unused*/, D d) {
|
||||
const hn::Repartition<float, D> df;
|
||||
const size_t num = 384;
|
||||
auto in = hwy::AllocateAligned<T>(num);
|
||||
auto dec = hwy::AllocateAligned<T>(num);
|
||||
auto vec = hwy::AllocateAligned<T>(num);
|
||||
auto sfp = hwy::AllocateAligned<SfpStream>(num);
|
||||
HWY_ASSERT(in && dec && vec && sfp);
|
||||
|
||||
std::mt19937 rng(123);
|
||||
std::normal_distribution<float> dist{0.001f, 0.3f};
|
||||
for (size_t i = 0; i < num; ++i) {
|
||||
in[i] = hwy::ConvertScalarTo<T>(dist(rng));
|
||||
vec[i] = hwy::ConvertScalarTo<T>(dist(rng));
|
||||
}
|
||||
// This changes the correlation between in and vec, which considerably
|
||||
// affects the error of the result.
|
||||
std::shuffle(in.get(), in.get() + num, rng);
|
||||
|
||||
SfpCodec::Enc(d, in.get(), num, sfp.get());
|
||||
|
||||
double actual = 0.0;
|
||||
double elapsed = hwy::HighestValue<double>();
|
||||
for (size_t rep = 0; rep < 200; ++rep) {
|
||||
hn::Vec<decltype(df)> sum0 = hn::Zero(df);
|
||||
hn::Vec<decltype(df)> sum1 = hn::Zero(df);
|
||||
hn::Vec<decltype(df)> sum2 = hn::Zero(df);
|
||||
hn::Vec<decltype(df)> sum3 = hn::Zero(df);
|
||||
const double t0 = hwy::platform::Now();
|
||||
SfpCodec::Dot(df, sfp.get(), num, vec.get(), sum0, sum1, sum2, sum3);
|
||||
const double t1 = hwy::platform::Now();
|
||||
elapsed = HWY_MIN(elapsed, t1 - t0);
|
||||
sum0 = hn::Add(hn::Add(sum0, sum1), hn::Add(sum2, sum3));
|
||||
actual = hn::ReduceSum(df, sum0);
|
||||
}
|
||||
|
||||
SfpCodec::Dec(d, sfp.get(), num, dec.get());
|
||||
fprintf(stderr, "Vec %zu Dot %.2f MB/s\n", Lanes(d) * sizeof(T),
|
||||
num * sizeof(T) * 1E-6 / elapsed);
|
||||
|
||||
double expected = 0.0; // using original input
|
||||
double expected2 = 0.0; // using decoded SFP
|
||||
for (size_t i = 0; i < num; ++i) {
|
||||
expected += hwy::ConvertScalarTo<double>(in[i]) *
|
||||
hwy::ConvertScalarTo<double>(vec[i]);
|
||||
expected2 += hwy::ConvertScalarTo<double>(dec[i]) *
|
||||
hwy::ConvertScalarTo<double>(vec[i]);
|
||||
}
|
||||
const double l1 = hwy::ScalarAbs(expected - actual);
|
||||
const double snr = 1.0 + hwy::ScalarAbs(expected) / l1;
|
||||
fprintf(stderr, "expected %.3f e2 %.4f actual %.4f l1 %E snr %.2f\n",
|
||||
expected, expected2, actual, l1, snr);
|
||||
HWY_ASSERT(hwy::ScalarAbs(expected2 - actual) < 1E-4);
|
||||
const double expected_l1 = sizeof(T) == 2 ? 1.52E-2 : 1.15E-2;
|
||||
const double expected_snr = sizeof(T) == 2 ? 80.1f : 104.9f;
|
||||
HWY_ASSERT(expected_l1 <= l1 && l1 < 1.02f * expected_l1);
|
||||
HWY_ASSERT(expected_snr <= snr && snr < 1.01f * expected_snr);
|
||||
}
|
||||
};
|
||||
|
||||
void TestAllDotF32() {
|
||||
const hn::ForGEVectors<128, TestDot> test;
|
||||
test(float());
|
||||
}
|
||||
void TestAllDotBF16() {
|
||||
const hn::ForGEVectors<128, TestDot> test;
|
||||
test(hwy::bfloat16_t());
|
||||
}
|
||||
|
||||
// NOLINTNEXTLINE(google-readability-namespace-comments)
|
||||
} // namespace HWY_NAMESPACE
|
||||
} // namespace gcpp
|
||||
HWY_AFTER_NAMESPACE();
|
||||
|
||||
#if HWY_ONCE
|
||||
|
||||
namespace gcpp {
|
||||
HWY_BEFORE_TEST(SfpTest);
|
||||
HWY_EXPORT_AND_TEST_P(SfpTest, TestAllUnique);
|
||||
HWY_EXPORT_AND_TEST_P(SfpTest, TestAllDecEnc);
|
||||
HWY_EXPORT_AND_TEST_P(SfpTest, TestAllGolden);
|
||||
HWY_EXPORT_AND_TEST_P(SfpTest, TestAllEncDec);
|
||||
HWY_EXPORT_AND_TEST_P(SfpTest, TestAllOrder);
|
||||
HWY_EXPORT_AND_TEST_P(SfpTest, TestAllDotF32);
|
||||
HWY_EXPORT_AND_TEST_P(SfpTest, TestAllDotBF16);
|
||||
} // namespace gcpp
|
||||
|
||||
#endif
|
||||
|
|
@@ -0,0 +1,117 @@
|
|||
// Copyright 2024 Google LLC
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// https://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
// copybara:import_next_line:gemma_cpp
|
||||
#include "compression/stats.h"
|
||||
|
||||
#include <stdio.h>
|
||||
|
||||
#include <algorithm> // std::min
|
||||
#include <string>
|
||||
|
||||
#include "hwy/base.h" // HWY_ASSERT
|
||||
|
||||
void Stats::Assimilate(const Stats& other) {
|
||||
const int64_t total_n = n_ + other.n_;
|
||||
if (total_n == 0) return; // Nothing to do; prevents div by zero.
|
||||
|
||||
min_ = std::min(min_, other.min_);
|
||||
max_ = std::max(max_, other.max_);
|
||||
|
||||
product_ *= other.product_;
|
||||
|
||||
const double product_n = n_ * other.n_;
|
||||
const double n2 = n_ * n_;
|
||||
const double other_n2 = other.n_ * other.n_;
|
||||
const int64_t total_n2 = total_n * total_n;
|
||||
const double total_n3 = static_cast<double>(total_n2) * total_n;
|
||||
// Precompute reciprocal for speed - used at least twice.
|
||||
const double inv_total_n = 1.0 / total_n;
|
||||
const double inv_total_n2 = 1.0 / total_n2;
|
||||
|
||||
const double delta = other.m1_ - m1_;
|
||||
const double delta2 = delta * delta;
|
||||
const double delta3 = delta * delta2;
|
||||
const double delta4 = delta2 * delta2;
|
||||
|
||||
m1_ = (n_ * m1_ + other.n_ * other.m1_) * inv_total_n;
|
||||
|
||||
const double new_m2 = m2_ + other.m2_ + delta2 * product_n * inv_total_n;
|
||||
|
||||
const double new_m3 =
|
||||
m3_ + other.m3_ + delta3 * product_n * (n_ - other.n_) * inv_total_n2 +
|
||||
3.0 * delta * (n_ * other.m2_ - other.n_ * m2_) * inv_total_n;
|
||||
|
||||
m4_ += other.m4_ +
|
||||
delta4 * product_n * (n2 - product_n + other_n2) / total_n3 +
|
||||
6.0 * delta2 * (n2 * other.m2_ + other_n2 * m2_) * inv_total_n2 +
|
||||
4.0 * delta * (n_ * other.m3_ - other.n_ * m3_) * inv_total_n;
|
||||
|
||||
m2_ = new_m2;
|
||||
m3_ = new_m3;
|
||||
n_ = total_n;
|
||||
}
|
||||
|
||||
std::string Stats::ToString(int exclude) const {
|
||||
if (Count() == 0) return std::string("(none)");
|
||||
|
||||
char buf[300];
|
||||
int pos = 0;
|
||||
int ret; // snprintf - bytes written or negative for error.
|
||||
|
||||
if ((exclude & kNoCount) == 0) {
|
||||
ret = snprintf(buf + pos, sizeof(buf) - pos, "Count=%9zu ",
|
||||
static_cast<size_t>(Count()));
|
||||
HWY_ASSERT(ret > 0);
|
||||
pos += ret;
|
||||
}
|
||||
|
||||
if ((exclude & kNoMeanSD) == 0) {
|
||||
const float sd = StandardDeviation();
|
||||
if (sd > 100) {
|
||||
ret = snprintf(buf + pos, sizeof(buf) - pos, "Mean=%8.2E SD=%7.1E ",
|
||||
Mean(), sd);
|
||||
} else {
|
||||
ret = snprintf(buf + pos, sizeof(buf) - pos, "Mean=%8.6f SD=%7.5f ",
|
||||
Mean(), sd);
|
||||
}
|
||||
HWY_ASSERT(ret > 0);
|
||||
pos += ret;
|
||||
}
|
||||
|
||||
if ((exclude & kNoMinMax) == 0) {
|
||||
ret = snprintf(buf + pos, sizeof(buf) - pos, "Min=%8.5e Max=%8.5e ", Min(),
|
||||
Max());
|
||||
HWY_ASSERT(ret > 0);
|
||||
pos += ret;
|
||||
}
|
||||
|
||||
if ((exclude & kNoSkewKurt) == 0) {
|
||||
ret = snprintf(buf + pos, sizeof(buf) - pos, "Skew=%5.2f Kurt=%7.2f ",
|
||||
Skewness(), Kurtosis());
|
||||
HWY_ASSERT(ret > 0);
|
||||
pos += ret;
|
||||
}
|
||||
|
||||
if ((exclude & kNoGeomean) == 0) {
|
||||
ret = snprintf(buf + pos, sizeof(buf) - pos, "GeoMean=%9.6f ",
|
||||
GeometricMean());
|
||||
HWY_ASSERT(ret > 0);
|
||||
pos += ret;
|
||||
}
|
||||
|
||||
HWY_ASSERT(static_cast<size_t>(pos) < sizeof(buf));
|
||||
return buf;
|
||||
}
|
||||
|
|
@@ -0,0 +1,190 @@
|
|||
// Copyright 2024 Google LLC
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// https://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#ifndef THIRD_PARTY_GEMMA_CPP_COMPRESSION_STATS_H_
|
||||
#define THIRD_PARTY_GEMMA_CPP_COMPRESSION_STATS_H_
|
||||
|
||||
#include <stdint.h>
|
||||
#include <stdio.h>
|
||||
|
||||
#include <algorithm>
|
||||
#include <cmath>
|
||||
#include <string>
|
||||
|
||||
#include "hwy/base.h" // HWY_ASSERT
|
||||
|
||||
// Thread-compatible.
|
||||
template <size_t N>
|
||||
class Bins {
|
||||
public:
|
||||
Bins() { Reset(); }
|
||||
|
||||
template <typename T>
|
||||
void Notify(T bin) {
|
||||
HWY_ASSERT(T{0} <= bin && bin < static_cast<T>(N));
|
||||
counts_[static_cast<int32_t>(bin)]++;
|
||||
}
|
||||
|
||||
void Assimilate(const Bins<N>& other) {
|
||||
for (size_t i = 0; i < N; ++i) {
|
||||
counts_[i] += other.counts_[i];
|
||||
}
|
||||
}
|
||||
|
||||
void Print(const char* caption) {
|
||||
fprintf(stderr, "\n%s [%zu]\n", caption, N);
|
||||
size_t last_nonzero = 0;
|
||||
// Scan backwards for the last non-zero bin; unsigned wraparound ends the loop.
for (size_t i = N - 1; i < N; --i) {
|
||||
if (counts_[i] != 0) {
|
||||
last_nonzero = i;
|
||||
break;
|
||||
}
|
||||
}
|
||||
for (size_t i = 0; i <= last_nonzero; ++i) {
|
||||
fprintf(stderr, " %zu\n", counts_[i]);
|
||||
}
|
||||
}
|
||||
|
||||
void Reset() {
|
||||
for (size_t i = 0; i < N; ++i) {
|
||||
counts_[i] = 0;
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
size_t counts_[N];
|
||||
};
|
||||
|
||||
// Descriptive statistics of a variable (4 moments). Thread-compatible.
|
||||
class Stats {
|
||||
public:
|
||||
Stats() { Reset(); }
|
||||
|
||||
void Notify(const float x) {
|
||||
++n_;
|
||||
|
||||
min_ = std::min(min_, x);
|
||||
max_ = std::max(max_, x);
|
||||
|
||||
product_ *= x;
|
||||
|
||||
// Online moments. Reference: https://goo.gl/9ha694
|
||||
const double d = x - m1_;
|
||||
const double d_div_n = d / n_;
|
||||
const double d2n1_div_n = d * (n_ - 1) * d_div_n;
|
||||
const int64_t n_poly = n_ * n_ - 3 * n_ + 3;
|
||||
m1_ += d_div_n;
|
||||
m4_ += d_div_n * (d_div_n * (d2n1_div_n * n_poly + 6.0 * m2_) - 4.0 * m3_);
|
||||
m3_ += d_div_n * (d2n1_div_n * (n_ - 2) - 3.0 * m2_);
|
||||
m2_ += d2n1_div_n;
|
||||
}
|
||||
|
||||
void Assimilate(const Stats& other);
|
||||
|
||||
int64_t Count() const { return n_; }
|
||||
|
||||
float Min() const { return min_; }
|
||||
float Max() const { return max_; }
|
||||
|
||||
double GeometricMean() const {
|
||||
return n_ == 0 ? 0.0 : pow(product_, 1.0 / n_);
|
||||
}
|
||||
|
||||
double Mean() const { return m1_; }
|
||||
// Same as Mu2. Assumes n_ is large.
|
||||
double SampleVariance() const {
|
||||
return n_ == 0 ? 0.0 : m2_ / static_cast<int>(n_);
|
||||
}
|
||||
// Unbiased estimator for population variance even for smaller n_.
|
||||
double Variance() const {
|
||||
if (n_ == 0) return 0.0;
|
||||
if (n_ == 1) return m2_;
|
||||
return m2_ / static_cast<int>(n_ - 1);
|
||||
}
|
||||
double StandardDeviation() const { return std::sqrt(Variance()); }
|
||||
// Near zero for normal distributions; if positive on a unimodal distribution,
|
||||
// the right tail is fatter. Assumes n_ is large.
|
||||
double SampleSkewness() const {
|
||||
if (std::abs(m2_) < 1E-7) return 0.0;
|
||||
return m3_ * std::sqrt(static_cast<double>(n_)) / std::pow(m2_, 1.5);
|
||||
}
|
||||
// Corrected for bias (same as Wikipedia and Minitab but not Excel).
|
||||
double Skewness() const {
|
||||
if (n_ == 0) return 0.0;
|
||||
const double biased = SampleSkewness();
|
||||
const double r = (n_ - 1.0) / n_;
|
||||
return biased * std::pow(r, 1.5);
|
||||
}
|
||||
// Near zero for normal distributions; smaller values indicate fewer/smaller
|
||||
// outliers and larger values indicate more/larger outliers. Assumes n_ is large.
|
||||
double SampleKurtosis() const {
|
||||
if (std::abs(m2_) < 1E-7) return 0.0;
|
||||
return m4_ * n_ / (m2_ * m2_);
|
||||
}
|
||||
// Corrected for bias (same as Wikipedia and Minitab but not Excel).
|
||||
double Kurtosis() const {
|
||||
if (n_ == 0) return 0.0;
|
||||
const double biased = SampleKurtosis();
|
||||
const double r = (n_ - 1.0) / n_;
|
||||
return biased * r * r;
|
||||
}
|
||||
|
||||
// Central moments, useful for "method of moments"-based parameter estimation
|
||||
// of a mixture of two Gaussians. Assumes Count() != 0.
|
||||
double Mu1() const { return m1_; }
|
||||
double Mu2() const { return m2_ / static_cast<int>(n_); }
|
||||
double Mu3() const { return m3_ / static_cast<int>(n_); }
|
||||
double Mu4() const { return m4_ / static_cast<int>(n_); }
|
||||
|
||||
// Which statistics to EXCLUDE in ToString
|
||||
enum {
|
||||
kNoCount = 1,
|
||||
kNoMeanSD = 2,
|
||||
kNoMinMax = 4,
|
||||
kNoSkewKurt = 8,
|
||||
kNoGeomean = 16
|
||||
};
|
||||
std::string ToString(int exclude = 0) const;
|
||||
|
||||
void Reset() {
|
||||
n_ = 0;
|
||||
|
||||
min_ = hwy::HighestValue<float>();
|
||||
max_ = hwy::LowestValue<float>();
|
||||
|
||||
product_ = 1.0;
|
||||
|
||||
m1_ = 0.0;
|
||||
m2_ = 0.0;
|
||||
m3_ = 0.0;
|
||||
m4_ = 0.0;
|
||||
}
|
||||
|
||||
private:
|
||||
int64_t n_; // signed for faster conversion + safe subtraction
|
||||
|
||||
float min_;
|
||||
float max_;
|
||||
|
||||
double product_; // for geomean
|
||||
|
||||
// Moments
|
||||
double m1_;
|
||||
double m2_;
|
||||
double m3_;
|
||||
double m4_;
|
||||
};
|
||||
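// Added usage sketch, not part of the original header: how Bins and Stats are
// typically driven together. It uses only the interfaces declared above; the
// function name and bin count are illustrative.
inline void ExampleStatsUsage(const float* values, size_t num) {
  Stats stats;
  Bins<16> bins;
  for (size_t i = 0; i < num; ++i) {
    stats.Notify(values[i]);
    // Clamp into the valid bin range [0, 16) before notifying the histogram.
    const int bin = std::min(15, std::max(0, static_cast<int>(values[i])));
    bins.Notify(bin);
  }
  fprintf(stderr, "%s\n", stats.ToString(Stats::kNoGeomean).c_str());
  bins.Print("example bins");
}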
|
||||
#endif // THIRD_PARTY_GEMMA_CPP_COMPRESSION_STATS_H_
|
||||
|
|
@@ -0,0 +1,57 @@
|
|||
// Copyright 2024 Google LLC
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// https://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
// Model configurations
|
||||
|
||||
#ifndef THIRD_PARTY_GEMMA_CPP_CONFIGS_H_
|
||||
#define THIRD_PARTY_GEMMA_CPP_CONFIGS_H_
|
||||
|
||||
#include <cstddef>
|
||||
|
||||
namespace gcpp {
|
||||
|
||||
static constexpr size_t kSeqLen = 7168;
|
||||
|
||||
struct ConfigGemma7B {
|
||||
// NOLINTBEGIN(google3-readability-class-member-naming)
|
||||
static constexpr int seq_len = kSeqLen;
|
||||
static constexpr int vocab_size = 256128;
|
||||
static constexpr int n_layers = 28;
|
||||
static constexpr int dim_model = 3072;
|
||||
static constexpr int dim_ffw_hidden = 16 * 3072 / 2; // = 24576
|
||||
static constexpr int n_heads = 16;
|
||||
static constexpr int n_kv_heads = 16; // standard MHA, no GQA or MQA
|
||||
static constexpr int dim_qkv = 256; // query size == key size == value size
|
||||
static constexpr int top_k = 1;
|
||||
// NOLINTEND(google3-readability-class-member-naming)
|
||||
};
|
||||
|
||||
struct ConfigGemma2B {
|
||||
// NOLINTBEGIN(google3-readability-class-member-naming)
|
||||
static constexpr int seq_len = kSeqLen;
|
||||
static constexpr int vocab_size = 256128;
|
||||
static constexpr int n_layers = 18;
|
||||
static constexpr int dim_model = 2048;
|
||||
static constexpr int dim_ffw_hidden = 16 * 2048 / 2; // = 16384
|
||||
static constexpr int n_heads = 8;
|
||||
static constexpr int n_kv_heads = 8; // TODO(austinvhuang): add MQA support
|
||||
static constexpr int dim_qkv = 256; // query size == key size == value size
|
||||
static constexpr int top_k = 1;
|
||||
// NOLINTEND(google3-readability-class-member-naming)
|
||||
};
|
||||
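// Added sanity checks, not part of the original header: spell out the derived
// sizes stated in the comments above so that an inconsistent edit to a config
// constant fails at compile time.
static_assert(ConfigGemma7B::dim_ffw_hidden == 24576,
              "7B FFW hidden dim should match the comment above");
static_assert(ConfigGemma2B::dim_ffw_hidden == 16384,
              "2B FFW hidden dim should match the comment above");
static_assert(ConfigGemma7B::n_heads * ConfigGemma7B::dim_qkv == 4096,
              "7B per-token attention width is n_heads * dim_qkv");
static_assert(ConfigGemma2B::n_heads * ConfigGemma2B::dim_qkv ==
                  ConfigGemma2B::dim_model,
              "2B attention width happens to equal dim_model");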
|
||||
} // namespace gcpp
|
||||
|
||||
#endif // THIRD_PARTY_GEMMA_CPP_CONFIGS_H_
|
||||
|
|
@@ -0,0 +1,32 @@
|
|||
# How to Contribute
|
||||
|
||||
We would love to accept your patches and contributions to this project.
|
||||
|
||||
## Before you begin
|
||||
|
||||
### Sign our Contributor License Agreement
|
||||
|
||||
Contributions to this project must be accompanied by a
|
||||
[Contributor License Agreement](https://cla.developers.google.com/about) (CLA).
|
||||
You (or your employer) retain the copyright to your contribution; this simply
|
||||
gives us permission to use and redistribute your contributions as part of the
|
||||
project.
|
||||
|
||||
If you or your current employer have already signed the Google CLA (even if it
|
||||
was for a different project), you probably don't need to do it again.
|
||||
|
||||
Visit <https://cla.developers.google.com/> to see your current agreements or to
|
||||
sign a new one.
|
||||
|
||||
### Review our Community Guidelines
|
||||
|
||||
This project follows [Google's Open Source Community
|
||||
Guidelines](https://opensource.google/conduct/).
|
||||
|
||||
## Contribution process
|
||||
|
||||
### Code Reviews
|
||||
|
||||
All submissions, including submissions by project members, require review. We
|
||||
use [GitHub pull requests](https://docs.github.com/articles/about-pull-requests)
|
||||
for this purpose.
|
||||
|
|
@@ -0,0 +1,811 @@
|
|||
// Copyright 2024 Google LLC
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// https://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
// Lightweight C++ implementation of the gemma model.
|
||||
|
||||
// This file is compiled for multiple architectures via "foreach_target.h", to
|
||||
// which we pass the filename via the HWY_TARGET_INCLUDE macro below.
|
||||
#undef HWY_TARGET_INCLUDE
|
||||
#define HWY_TARGET_INCLUDE "gemma.cc" // NOLINT
|
||||
#include "hwy/foreach_target.h" // IWYU pragma: keep
|
||||
// Must come after foreach_target.h to avoid redefinition errors.
|
||||
// copybara:import_next_line:gemma_cpp
|
||||
#include "compression/compress-inl.h"
|
||||
// copybara:import_next_line:gemma_cpp
|
||||
#include "ops.h"
|
||||
// copybara:import_next_line:gemma_cpp
|
||||
#include "util/args.h" // Path
|
||||
#include "hwy/contrib/matvec/matvec-inl.h"
|
||||
#include "hwy/highway.h"
|
||||
#include "hwy/profiler.h"
|
||||
#include "hwy/timer.h"
|
||||
|
||||
// Non-SIMD includes and types. Note that HWY_ONCE is only true on the last
|
||||
// compile pass, whereas we want this defined in the first.
|
||||
#ifndef GEMMA_ONCE
|
||||
#define GEMMA_ONCE
|
||||
|
||||
#include <stddef.h>
|
||||
#include <stdio.h>
|
||||
|
||||
#include <algorithm>
|
||||
#include <array>
|
||||
#include <cmath>
|
||||
#include <cstdlib>
|
||||
#include <filesystem> // NOLINT
|
||||
#include <iostream>
|
||||
#include <memory>
|
||||
#include <random>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
// copybara:import_next_line:gemma_cpp
|
||||
#include "compression/compress.h"
|
||||
// copybara:import_next_line:gemma_cpp
|
||||
#include "configs.h"
|
||||
// copybara:import_next_line:gemma_cpp
|
||||
#include "gemma.h"
|
||||
#include "hwy/aligned_allocator.h"
|
||||
#include "hwy/base.h"
|
||||
#include "hwy/contrib/thread_pool/thread_pool.h"
|
||||
// copybara:import_next_line:sentencepiece
|
||||
#include "src/sentencepiece_processor.h"
|
||||
// #include "third_party/sentencepiece/src/util.h"
|
||||
|
||||
namespace gcpp {
|
||||
|
||||
template <class TConfig>
|
||||
struct Layer {
|
||||
Layer() = default;
|
||||
// NOLINTBEGIN(google3-readability-class-member-naming)
|
||||
static constexpr size_t n_heads = TConfig::n_heads;
|
||||
static constexpr size_t dim_model = TConfig::dim_model;
|
||||
static constexpr size_t dim_qkv = TConfig::dim_qkv;
|
||||
static constexpr size_t dim_ffw_hidden = TConfig::dim_ffw_hidden;
|
||||
static constexpr size_t size_attn_vec_einsum_w =
|
||||
n_heads * dim_qkv * dim_model;
|
||||
// 3x for (query, key, value)
|
||||
static constexpr size_t size_qkv_einsum_w = 3 * n_heads * dim_qkv * dim_model;
|
||||
// 2x for (gelu gating vector, gated vector)
|
||||
static constexpr size_t size_gating_einsum_w = 2 * dim_ffw_hidden * dim_model;
|
||||
static constexpr size_t size_linear_w = dim_model * dim_ffw_hidden;
|
||||
std::array<float, size_attn_vec_einsum_w> attn_vec_einsum_w;
|
||||
std::array<float, size_qkv_einsum_w> qkv_einsum_w;
|
||||
std::array<float, size_gating_einsum_w> gating_einsum_w;
|
||||
std::array<float, size_linear_w> linear_w;
|
||||
std::array<float, dim_model> pre_attention_norm_scale;
|
||||
std::array<float, dim_model> pre_ffw_norm_scale;
|
||||
// NOLINTEND(google3-readability-class-member-naming)
|
||||
};
|
||||
|
||||
template <class TConfig>
|
||||
struct Weights {
|
||||
Weights() = default;
|
||||
|
||||
hwy::AlignedUniquePtr<Layer<TConfig>[]> layers; // n_layers
|
||||
|
||||
std::array<float, TConfig::vocab_size * TConfig::dim_model>
|
||||
embedder_input_embedding;
|
||||
|
||||
std::array<float, TConfig::dim_model> final_norm_scale;
|
||||
};
|
||||
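// Added note, not in the original source: LoadWeights below implies that the
// uncompressed checkpoint is a raw dump of float tensors in this fixed order,
// with no header or padding:
//   embedder_input_embedding   [vocab_size * dim_model]
//   final_norm_scale           [dim_model]
//   then for each layer 0..n_layers-1:
//     attn_vec_einsum_w        [n_heads * dim_qkv * dim_model]
//     qkv_einsum_w             [3 * n_heads * dim_qkv * dim_model]
//     gating_einsum_w          [2 * dim_ffw_hidden * dim_model]
//     linear_w                 [dim_model * dim_ffw_hidden]
//     pre_attention_norm_scale [dim_model]
//     pre_ffw_norm_scale       [dim_model]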
|
||||
// Only called if cached loading fails.
|
||||
template <typename TConfig>
|
||||
hwy::AlignedUniquePtr<Weights<TConfig>> LoadWeights(const Path& checkpoint) {
|
||||
PROFILER_ZONE("Startup.LoadWeights");
|
||||
using TWeights = Weights<TConfig>;
|
||||
hwy::AlignedUniquePtr<TWeights> weights = hwy::MakeUniqueAligned<TWeights>();
|
||||
weights->layers =
|
||||
hwy::MakeUniqueAlignedArray<Layer<TConfig>>(TConfig::n_layers);
|
||||
|
||||
FILE* fptr;
|
||||
fptr = fopen(checkpoint.path.c_str(), "rb");
|
||||
if (fptr == nullptr) {
|
||||
HWY_ABORT("Failed to open model file %s - does it exist?",
|
||||
checkpoint.path.c_str());
|
||||
}
|
||||
bool ok = true;
|
||||
ok &= 1 == fread(&(weights->embedder_input_embedding),
|
||||
sizeof(weights->embedder_input_embedding), 1, fptr);
|
||||
ok &= 1 == fread(&(weights->final_norm_scale),
|
||||
sizeof(weights->final_norm_scale), 1, fptr);
|
||||
for (size_t layer = 0; layer < TConfig::n_layers; ++layer) {
|
||||
Layer<TConfig>* layer_view = &weights->layers[layer];
|
||||
ok &= 1 == fread(&layer_view->attn_vec_einsum_w,
|
||||
sizeof(layer_view->attn_vec_einsum_w), 1, fptr);
|
||||
ok &= 1 == fread(&layer_view->qkv_einsum_w,
|
||||
sizeof(layer_view->qkv_einsum_w), 1, fptr);
|
||||
ok &= 1 == fread(&layer_view->gating_einsum_w,
|
||||
sizeof(layer_view->gating_einsum_w), 1, fptr);
|
||||
ok &= 1 ==
|
||||
fread(&layer_view->linear_w, sizeof(layer_view->linear_w), 1, fptr);
|
||||
ok &= 1 == fread(&layer_view->pre_attention_norm_scale,
|
||||
sizeof(layer_view->pre_attention_norm_scale), 1, fptr);
|
||||
ok &= 1 == fread(&layer_view->pre_ffw_norm_scale,
|
||||
sizeof(layer_view->pre_ffw_norm_scale), 1, fptr);
|
||||
}
|
||||
if (!ok) {
|
||||
HWY_ABORT("Failed to read from %s - might be a directory, or too small?",
|
||||
checkpoint.path.c_str());
|
||||
}
|
||||
HWY_ASSERT(0 == fclose(fptr));
|
||||
return weights;
|
||||
}
|
||||
|
||||
template <class TConfig>
|
||||
struct CompressedLayer {
|
||||
// No ctor/dtor, allocated via AllocateAligned.
|
||||
|
||||
using TLayer = gcpp::Layer<TConfig>;
|
||||
|
||||
// NOLINTBEGIN(google3-readability-class-member-naming)
|
||||
static constexpr size_t dim_model = TConfig::dim_model;
|
||||
static constexpr size_t dim_ffw_hidden = TConfig::dim_ffw_hidden;
|
||||
// NOLINTEND(google3-readability-class-member-naming)
|
||||
|
||||
// Compressed Parameters
|
||||
// We don't yet have an RMSNorm that accepts all WeightT.
|
||||
CompressedArray<hwy::bfloat16_t, dim_model> c_pre_attention_norm_scale;
|
||||
CompressedArray<hwy::bfloat16_t, dim_model> c_pre_ffw_norm_scale;
|
||||
CompressedArray<WeightT, TLayer::size_gating_einsum_w> c_gating_einsum_w;
|
||||
CompressedArray<WeightT, dim_model * dim_ffw_hidden> c_linear_w;
|
||||
CompressedArray<WeightT, TLayer::size_qkv_einsum_w> c_qkv_einsum_w;
|
||||
CompressedArray<WeightT, TLayer::size_attn_vec_einsum_w> c_attn_vec_einsum_w;
|
||||
};
|
||||
|
||||
// Array instead of single large allocation for parallel mem init. Split out of
|
||||
// CompressedWeights so that only these pointers are initialized, not the
|
||||
// CompressedArray.
|
||||
template <class TConfig>
|
||||
struct CompressedLayerPointers {
|
||||
explicit CompressedLayerPointers(hwy::ThreadPool& pool) {
|
||||
pool.Run(0, TConfig::n_layers, [this](uint64_t task, size_t /*thread*/) {
|
||||
this->c_layers[task] = hwy::AllocateAligned<CompressedLayer<TConfig>>(1);
|
||||
});
|
||||
}
|
||||
|
||||
using CLayer = CompressedLayer<TConfig>;
|
||||
std::array<hwy::AlignedFreeUniquePtr<CLayer[]>, TConfig::n_layers> c_layers;
|
||||
};
|
||||
|
||||
template <class TConfig>
|
||||
struct CompressedWeights {
|
||||
// No ctor/dtor, allocated via AllocateAligned.
|
||||
|
||||
CompressedArray<EmbedderInputT, TConfig::vocab_size * TConfig::dim_model>
|
||||
c_embedder_input_embedding;
|
||||
|
||||
CompressedArray<hwy::bfloat16_t, TConfig::dim_model> c_final_norm_scale;
|
||||
|
||||
// Must be last so that the other arrays remain aligned.
|
||||
CompressedLayerPointers<TConfig> c_layer_ptrs;
|
||||
|
||||
const CompressedLayer<TConfig>* CLayer(size_t layer) const {
|
||||
return c_layer_ptrs.c_layers[layer].get();
|
||||
}
|
||||
CompressedLayer<TConfig>* CLayer(size_t layer) {
|
||||
return c_layer_ptrs.c_layers[layer].get();
|
||||
}
|
||||
};
|
||||
|
||||
// Aligned.
|
||||
template <class TConfig, size_t BatchSize>
|
||||
struct Activations {
|
||||
// NOLINTBEGIN(google3-readability-class-member-naming)
|
||||
static constexpr size_t batch_size = BatchSize;
|
||||
using LayerConfig = Layer<TConfig>;
|
||||
static constexpr size_t dim_model = TConfig::dim_model;
|
||||
static constexpr size_t dim_qkv = TConfig::dim_qkv;
|
||||
static constexpr size_t n_heads = TConfig::n_heads;
|
||||
static constexpr size_t n_kv_heads = TConfig::n_kv_heads;
|
||||
static constexpr size_t size_cache_pos =
|
||||
TConfig::n_layers * n_kv_heads * dim_qkv;
|
||||
static constexpr size_t size_cache_layer = n_kv_heads * dim_qkv;
|
||||
// NOLINTEND(google3-readability-class-member-naming)
|
||||
|
||||
std::array<float, batch_size * dim_model> x; // input
|
||||
std::array<float, batch_size * dim_model> pre_att_rms_out;
|
||||
std::array<float, batch_size * n_heads * dim_qkv> q; // query vector
|
||||
std::array<float, batch_size * n_heads * TConfig::seq_len>
|
||||
att; // attention vector
|
||||
std::array<float, batch_size * n_heads * dim_qkv>
|
||||
att_out; // attention output
|
||||
std::array<float, n_heads * batch_size * dim_model>
|
||||
att_post1; // attention output after linear transformation, per head
|
||||
std::array<float, batch_size * dim_model>
|
||||
att_post2; // accumulation of attention outputs over heads
|
||||
std::array<hwy::bfloat16_t, batch_size * dim_model> bf_pre_ffw_rms_out;
|
||||
std::array<float, batch_size * TConfig::dim_ffw_hidden * 2> ffw_hidden;
|
||||
// bf_ version can't be used until GeluMulToBF16 issue in FFW() is resolved.
|
||||
// std::array<hwy::bfloat16_t, batch_size * 2 * TConfig::dim_ffw_hidden>
|
||||
// bf_ffw_hidden;
|
||||
std::array<float, batch_size * dim_model> ffw_out;
|
||||
std::array<float, batch_size * TConfig::vocab_size> logits;
|
||||
};
|
||||
|
||||
// GemmaImpl is a template and thus cannot be exposed in gemma.h, hence we
|
||||
// define an abstract base class.
|
||||
struct GemmaInterface {
|
||||
virtual ~GemmaInterface() = default;
|
||||
|
||||
virtual const sentencepiece::SentencePieceProcessor& Tokenizer() const = 0;
|
||||
|
||||
// TODO: group pool/callbacks into struct
|
||||
virtual void Generate(const InferenceArgs& args,
|
||||
const std::vector<int>& prompt, size_t start_pos,
|
||||
hwy::ThreadPool& pool, hwy::ThreadPool& inner_pool,
|
||||
const StreamFunc& stream_token,
|
||||
const AcceptFunc& accept_token, std::mt19937& gen,
|
||||
int verbosity) = 0;
|
||||
};
|
||||
|
||||
template <class Config>
|
||||
struct GemmaImpl : public GemmaInterface {
|
||||
GemmaImpl(const LoaderArgs& args, hwy::ThreadPool& pool);
|
||||
|
||||
~GemmaImpl() {
|
||||
using CWeights = CompressedWeights<Config>;
|
||||
CWeights* c_weights = reinterpret_cast<CWeights*>(compressed_weights.get());
|
||||
c_weights->c_layer_ptrs.~CompressedLayerPointers<Config>();
|
||||
}
|
||||
|
||||
const sentencepiece::SentencePieceProcessor& Tokenizer() const {
|
||||
return tokenizer;
|
||||
}
|
||||
|
||||
void Generate(const InferenceArgs& args, const std::vector<int>& prompt,
|
||||
size_t start_pos, hwy::ThreadPool& pool,
|
||||
hwy::ThreadPool& inner_pool, const StreamFunc& stream_token,
|
||||
const AcceptFunc& accept_token, std::mt19937&, int verbosity);
|
||||
|
||||
sentencepiece::SentencePieceProcessor tokenizer;
|
||||
|
||||
// CompressedWeights<Config>
|
||||
hwy::AlignedFreeUniquePtr<uint8_t[]> compressed_weights;
|
||||
hwy::AlignedUniquePtr<Activations<Config, kPrefillBatchSize>> prefill;
|
||||
hwy::AlignedUniquePtr<Activations<Config, 1>> state;
|
||||
KVCache kv_cache;
|
||||
};
|
||||
|
||||
} // namespace gcpp
|
||||
#endif // GEMMA_ONCE
|
||||
|
||||
// SIMD code, compiled once per target.
|
||||
HWY_BEFORE_NAMESPACE();
|
||||
namespace gcpp {
|
||||
namespace HWY_NAMESPACE {
|
||||
|
||||
template <class TConfig, size_t batch_size>
|
||||
HWY_NOINLINE void Attention(size_t batch_start, size_t batch_idx, size_t layer,
|
||||
Activations<TConfig, batch_size>& activations,
|
||||
const CompressedLayer<TConfig>* c_layer,
|
||||
KVCache& kv_cache, hwy::ThreadPool& pool) {
|
||||
PROFILER_ZONE("Gen.Attention");
|
||||
const size_t pos = batch_start + batch_idx;
|
||||
HWY_DASSERT(batch_idx < batch_size);
|
||||
static constexpr size_t dim_qkv = gcpp::Activations<TConfig, 1>::dim_qkv;
|
||||
static constexpr size_t size_cache_pos =
|
||||
gcpp::Activations<TConfig, batch_size>::size_cache_pos;
|
||||
static constexpr size_t size_cache_layer =
|
||||
gcpp::Activations<TConfig, batch_size>::size_cache_layer;
|
||||
static constexpr size_t dim_model =
|
||||
gcpp::Activations<TConfig, batch_size>::dim_model;
|
||||
static constexpr size_t n_heads = TConfig::n_heads;
|
||||
const float kQueryScale = 1.0 / sqrtf(static_cast<float>(dim_qkv));
|
||||
|
||||
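// Added summary, not in the original source: for each head, the loop body
// below computes q = W_q x, writes k = W_k x and v = W_v x into the KV cache,
// applies RoPE to q and k, and then forms
//   att[p] = softmax_p(dot(q, k[p]) * kQueryScale)  for p <= pos,
//   out    = sum_p att[p] * v[p],
// before projecting out back to dim_model via attn_vec_einsum_w.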
pool.Run(0, n_heads, [&](const uint64_t head, size_t /*thread*/) HWY_ATTR {
|
||||
// linear projections to QKV
|
||||
const size_t head_offset =
|
||||
3 * dim_qkv * dim_model; // 3x for QKV dimensions
|
||||
const size_t q_offset = head * head_offset + 0 * dim_qkv * dim_model;
|
||||
const size_t k_offset = head * head_offset + 1 * dim_qkv * dim_model;
|
||||
const size_t v_offset = head * head_offset + 2 * dim_qkv * dim_model;
|
||||
|
||||
float* HWY_RESTRICT q =
|
||||
activations.q.data() + head * dim_qkv + batch_idx * n_heads * dim_qkv;
|
||||
|
||||
const size_t batch_offset = batch_idx * dim_model;
|
||||
|
||||
MatVecLoop<dim_qkv, dim_model>(
|
||||
c_layer->c_qkv_einsum_w, q_offset,
|
||||
activations.pre_att_rms_out.data() + batch_offset, q);
|
||||
|
||||
const size_t kv_offset =
|
||||
pos * size_cache_pos + layer * size_cache_layer + head * dim_qkv;
|
||||
|
||||
TwoOfsMatVecLoop<dim_qkv, dim_model>(
|
||||
c_layer->c_qkv_einsum_w, k_offset, v_offset,
|
||||
activations.pre_att_rms_out.data() + batch_offset,
|
||||
kv_cache.key_cache.get() + kv_offset,
|
||||
kv_cache.value_cache.get() + kv_offset);
|
||||
|
||||
// Calculate scores
|
||||
float* HWY_RESTRICT head_att = activations.att.data() +
|
||||
head * TConfig::seq_len +
|
||||
batch_idx * n_heads * dim_qkv;
|
||||
|
||||
Rope(q, dim_qkv, pos);
|
||||
Rope(kv_cache.key_cache.get() + kv_offset, dim_qkv, pos);
|
||||
MulByConst(kQueryScale, q, dim_qkv);
|
||||
// Compute Q dot K scores
|
||||
for (size_t pos2 = 0; pos2 <= pos; ++pos2) {
|
||||
const size_t cache_offset =
|
||||
pos2 * size_cache_pos + layer * size_cache_layer + head * dim_qkv;
|
||||
const float* HWY_RESTRICT k2 = kv_cache.key_cache.get() + cache_offset;
|
||||
const float score = Dot(q, k2, dim_qkv);
|
||||
head_att[pos2] = score;
|
||||
}
|
||||
Softmax(head_att, pos + 1);
|
||||
|
||||
// Weighted summation
|
||||
float* HWY_RESTRICT att_out = activations.att_out.data() + head * dim_qkv +
|
||||
batch_idx * n_heads * dim_qkv;
|
||||
hwy::ZeroBytes(att_out, dim_qkv * sizeof(*att_out));
|
||||
for (size_t pos2 = 0; pos2 <= pos; ++pos2) {
|
||||
const size_t cache_offset =
|
||||
pos2 * size_cache_pos + layer * size_cache_layer + head * dim_qkv;
|
||||
float* HWY_RESTRICT v2 = kv_cache.value_cache.get() + cache_offset;
|
||||
MulByConstAndAdd(head_att[pos2], v2, att_out, dim_qkv);
|
||||
}
|
||||
// linear projection from dim_qkv back to dim_model, sum projections
|
||||
// across heads
|
||||
float* HWY_RESTRICT head_out =
|
||||
head == 0
|
||||
? activations.att_post2.data() + batch_idx * dim_model
|
||||
: activations.att_post1.data() + head * batch_size * dim_model;
|
||||
MatVecLoop<dim_model, dim_qkv>(c_layer->c_attn_vec_einsum_w,
|
||||
head * dim_model * dim_qkv, att_out,
|
||||
head_out);
|
||||
});
|
||||
|
||||
// accumulate output across all heads into att_post2. head 0 already wrote
|
||||
// directly to att_post2.
|
||||
for (size_t head = 1; head < n_heads; ++head) {
|
||||
AddFrom(activations.att_post1.data() + head * batch_size * dim_model,
|
||||
activations.att_post2.data() + batch_idx * dim_model, dim_model);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename TConfig, size_t batch_size>
|
||||
HWY_NOINLINE void FFW(Activations<TConfig, batch_size>& activations,
|
||||
size_t batch_idx, const CompressedLayer<TConfig>* c_layer,
|
||||
hwy::ThreadPool& pool) {
|
||||
HWY_DASSERT(batch_idx < batch_size);
|
||||
static constexpr size_t dim_model = TConfig::dim_model;
|
||||
static constexpr size_t dim_ffw_hidden = TConfig::dim_ffw_hidden;
|
||||
const size_t hidden_offset = batch_idx * dim_ffw_hidden * 2;
|
||||
|
||||
{
|
||||
PROFILER_ZONE("Gen.FFW.GatedGELU");
|
||||
const hwy::bfloat16_t* HWY_RESTRICT vec =
|
||||
activations.bf_pre_ffw_rms_out.data() + batch_idx * dim_model;
|
||||
float* HWY_RESTRICT out = activations.ffw_hidden.data() + hidden_offset;
|
||||
float* HWY_RESTRICT out_mul = out + dim_ffw_hidden;
|
||||
|
||||
// Same matrix, first and second half of rows. Could fuse into one MatVec,
|
||||
// but separating them could help on NUMA e.g. multiple sockets.
|
||||
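// Added note, not in the original source: the two MatVecs plus the Transform1
// below compute the gated-GELU feed-forward
//   ffw_hidden = GELU(vec * W[0 : dim_ffw_hidden]) *
//                (vec * W[dim_ffw_hidden : 2 * dim_ffw_hidden]),
// where W denotes c_gating_einsum_w viewed as 2 * dim_ffw_hidden rows.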
MatVec<dim_ffw_hidden, dim_model>(c_layer->c_gating_einsum_w,
|
||||
dim_ffw_hidden * dim_model, vec, out_mul,
|
||||
pool);
|
||||
|
||||
// Gate, will go through the nonlinearity.
|
||||
MatVec<dim_ffw_hidden, dim_model>(c_layer->c_gating_einsum_w, 0, vec, out,
|
||||
pool);
|
||||
|
||||
namespace hn = hwy::HWY_NAMESPACE;
|
||||
using DF = hn::ScalableTag<float>;
|
||||
using VF = hn::Vec<DF>;
|
||||
hn::Transform1(DF(), out, dim_ffw_hidden, out_mul,
|
||||
[](DF df, VF v, VF mul)
|
||||
HWY_ATTR { return hn::Mul(mul, Gelu(df, v)); });
|
||||
}
|
||||
|
||||
PROFILER_ZONE("Gen.FFW\\GatedGELU");
|
||||
MatVec<dim_model, dim_ffw_hidden>(
|
||||
c_layer->c_linear_w, 0, activations.ffw_hidden.data() + hidden_offset,
|
||||
activations.ffw_out.data() + batch_idx * dim_model, pool);
|
||||
}
|
||||
|
||||
template <typename TConfig, size_t batch_size>
|
||||
HWY_NOINLINE void Prefill(const int* tokens, size_t num_tokens, size_t pos,
|
||||
const CompressedWeights<TConfig>& c_weights,
|
||||
Activations<TConfig, batch_size>& activations,
|
||||
KVCache& kv_cache, hwy::ThreadPool& pool,
|
||||
hwy::ThreadPool& inner_pool) {
|
||||
PROFILER_ZONE("Gen.Prefill\\Att\\FFW");
|
||||
static constexpr size_t dim_model = TConfig::dim_model;
|
||||
static const float kEmbScaling = sqrtf(static_cast<float>(dim_model));
|
||||
|
||||
pool.Run(
|
||||
0, num_tokens, [&](const uint64_t token_idx, size_t /*thread*/) HWY_ATTR {
|
||||
const int token = tokens[token_idx];
|
||||
Decompress(c_weights.c_embedder_input_embedding, token * dim_model,
|
||||
activations.x.data() + token_idx * dim_model, dim_model);
|
||||
MulByConst(kEmbScaling, activations.x.data() + token_idx * dim_model,
|
||||
dim_model);
|
||||
});
|
||||
|
||||
for (size_t layer = 0; layer < TConfig::n_layers; ++layer) {
|
||||
const CompressedLayer<TConfig>* c_layer = c_weights.CLayer(layer);
|
||||
|
||||
for (size_t token_idx = 0; token_idx < num_tokens; ++token_idx) {
|
||||
RMSNorm(activations.x.data() + token_idx * dim_model,
|
||||
c_layer->c_pre_attention_norm_scale.data(),
|
||||
activations.pre_att_rms_out.data() + token_idx * dim_model,
|
||||
dim_model);
|
||||
Attention<TConfig, batch_size>(pos, token_idx, layer, activations,
|
||||
c_layer, kv_cache, pool);
|
||||
}
|
||||
|
||||
// TODO: sink the loop into these functions, i.e. make them matmuls.
|
||||
pool.Run(
|
||||
0, num_tokens,
|
||||
[&](const uint64_t token_idx, size_t thread_id) HWY_ATTR {
|
||||
AddFrom(activations.att_post2.data() + token_idx * dim_model,
|
||||
activations.x.data() + token_idx * dim_model, dim_model);
|
||||
RMSNorm(activations.x.data() + token_idx * dim_model,
|
||||
c_layer->c_pre_ffw_norm_scale.data(),
|
||||
activations.bf_pre_ffw_rms_out.data() + token_idx * dim_model,
|
||||
dim_model);
|
||||
FFW<TConfig, batch_size>(activations, token_idx, c_layer, inner_pool);
|
||||
AddFrom(activations.ffw_out.data() + token_idx * dim_model,
|
||||
activations.x.data() + token_idx * dim_model, dim_model);
|
||||
});
|
||||
} // foreach layer
|
||||
|
||||
pool.Run(
|
||||
0, num_tokens, [&](const uint64_t token_idx, size_t /*thread*/) HWY_ATTR {
|
||||
RMSNormInplace(c_weights.c_final_norm_scale.data(),
|
||||
activations.x.data() + token_idx * dim_model, dim_model);
|
||||
});
|
||||
}
|
||||
|
||||
// n = 1 specialization
|
||||
template <class TConfig>
|
||||
void Transformer(int token, size_t pos,
|
||||
const CompressedWeights<TConfig>& c_weights,
|
||||
Activations<TConfig, 1>& activations, KVCache& kv_cache,
|
||||
hwy::ThreadPool& pool, hwy::ThreadPool& inner_pool) {
|
||||
static constexpr size_t n_layers = TConfig::n_layers;
|
||||
static constexpr size_t dim_model = TConfig::dim_model;
|
||||
|
||||
static const float kEmbScaling = sqrtf(static_cast<float>(dim_model));
|
||||
|
||||
Decompress(c_weights.c_embedder_input_embedding, token * dim_model,
|
||||
activations.x.data(), dim_model);
|
||||
|
||||
MulByConst(kEmbScaling, activations.x.data(), dim_model);
|
||||
|
||||
for (size_t layer = 0; layer < n_layers; ++layer) {
|
||||
const CompressedLayer<TConfig>* c_layer = c_weights.CLayer(layer);
|
||||
RMSNorm(activations.x.data(), c_layer->c_pre_attention_norm_scale.data(),
|
||||
activations.pre_att_rms_out.data(), dim_model);
|
||||
Attention<TConfig, 1>(pos, 0, layer, activations, c_layer, kv_cache, pool);
|
||||
AddFrom(activations.att_post2.data(), activations.x.data(), dim_model);
|
||||
RMSNorm(activations.x.data(), c_layer->c_pre_ffw_norm_scale.data(),
|
||||
activations.bf_pre_ffw_rms_out.data(), dim_model);
|
||||
FFW<TConfig, 1>(activations, /* batch_idx = */ 0, c_layer, pool);
|
||||
AddFrom(activations.ffw_out.data(), activations.x.data(), dim_model);
|
||||
}
|
||||
RMSNormInplace(c_weights.c_final_norm_scale.data(), activations.x.data(),
|
||||
dim_model);
|
||||
}
|
||||
|
||||
template <class TConfig>
|
||||
void GenerateImpl(GemmaImpl<TConfig>& gemma, const InferenceArgs& args,
|
||||
const std::vector<int>& prompt, size_t pos,
|
||||
hwy::ThreadPool& pool, hwy::ThreadPool& inner_pool,
|
||||
const StreamFunc& stream_token,
|
||||
const AcceptFunc& accept_token, std::mt19937& gen,
|
||||
int verbosity) {
|
||||
static constexpr size_t dim_model = TConfig::dim_model;
|
||||
static constexpr size_t vocab_size = TConfig::vocab_size;
|
||||
static constexpr size_t top_k = TConfig::top_k;
|
||||
Activations<TConfig, 1>& activations = *gemma.state.get();
|
||||
Activations<TConfig, kPrefillBatchSize>& prefill_activations =
|
||||
*gemma.prefill.get();
|
||||
const CompressedWeights<TConfig>& c_weights =
|
||||
*reinterpret_cast<CompressedWeights<TConfig>*>(
|
||||
gemma.compressed_weights.get());
|
||||
KVCache& kv_cache = gemma.kv_cache;
|
||||
int token;
|
||||
|
||||
// pos indexes the KV cache. In the first turn of a chat, pos = 0.
|
||||
//
|
||||
// After the first turn, pos gets passed in with > 0 corresponding to the
|
||||
// current token position in the KV cache.
|
||||
//
|
||||
// pos_offset keeps track of the relative position within the turn, starting
|
||||
// at 0 each turn. During prefill, pos_offset corresponds to the index into
|
||||
// the prompt vector.
|
||||
//
|
||||
// In single-turn (non-chat) usage, pos and pos_offset start at 0 and are
|
||||
// always equal.
|
||||
size_t pos_offset = 0; // offset relative to pos
|
||||
double prefill_start = hwy::platform::Now();
|
||||
|
||||
// Prefill stops before prompt.size() - 1 since the last prompt token is the
|
||||
// first input token for generation.
|
||||
while (pos_offset < prompt.size() - 1) {
|
||||
const size_t end_offset =
|
||||
std::min(kPrefillBatchSize, prompt.size() - 1 - pos_offset);
|
||||
HWY_DASSERT(end_offset < prompt.size());
|
||||
const int* batch_tokens = prompt.data() + pos_offset;
|
||||
Prefill<TConfig, kPrefillBatchSize>(batch_tokens, end_offset, pos,
|
||||
c_weights, prefill_activations,
|
||||
kv_cache, pool, inner_pool);
|
||||
for (size_t idx = 0; idx < end_offset; ++idx) {
|
||||
stream_token(batch_tokens[idx], 0.0);
|
||||
}
|
||||
pos += end_offset;
|
||||
pos_offset += end_offset;
|
||||
}
|
||||
|
||||
if (verbosity >= 2) {
|
||||
// In the future, this output should not occur in GenerateImpl; instead it
|
||||
// should be exposed as observable state for frontend code to handle I/O.
|
||||
double prefill_end = hwy::platform::Now();
|
||||
const double prefill_tok_sec = pos_offset / (prefill_end - prefill_start);
|
||||
std::cout << "\n[ Prefill tokens / sec = " << prefill_tok_sec << " ]\n";
|
||||
}
|
||||
|
||||
double gen_start = hwy::platform::Now();
|
||||
|
||||
HWY_DASSERT(pos_offset == prompt.size() - 1);
|
||||
|
||||
if (verbosity >= 2) {
|
||||
// Provide usage warnings if max_new_tokens is out of range.
|
||||
if (args.max_generated_tokens > args.max_tokens) {
|
||||
std::cout << "Warning: max_new_tokens should be <= max_tokens"
|
||||
<< std::endl;
|
||||
} else if ((prompt.size() + args.max_generated_tokens) > args.max_tokens) {
|
||||
std::cout << "Warning: Prompt size + max_new_tokens exceeds max_tokens."
|
||||
<< std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
auto pos_gen_start = pos_offset;
|
||||
token = prompt.at(pos_offset);
|
||||
size_t generate_pos = 0;
|
||||
for (; pos < args.max_tokens && generate_pos < args.max_generated_tokens;
|
||||
++pos, ++pos_offset, ++generate_pos) {
|
||||
Transformer(token, pos, c_weights, activations, kv_cache, pool, inner_pool);
|
||||
float* final_activation = activations.x.data();
|
||||
if (pos_offset >= prompt.size()) {
|
||||
PROFILER_ZONE("Gen.Embedding");
|
||||
// Generation phase
|
||||
MatVec<vocab_size, dim_model>(c_weights.c_embedder_input_embedding, 0,
|
||||
final_activation, activations.logits.data(),
|
||||
pool);
|
||||
// Barrier: must have all logits so we can subtract max.
|
||||
Softmax(activations.logits.data(), vocab_size);
|
||||
token = SampleTopK<top_k>(activations.logits.data(), vocab_size, gen,
|
||||
args.temperature, accept_token);
|
||||
}
|
||||
if (!stream_token(token, activations.logits[token])) {
|
||||
token = EOS_ID;
|
||||
}
|
||||
if (token == EOS_ID) {
|
||||
if (verbosity >= 2) {
|
||||
double gen_end = hwy::platform::Now();
|
||||
const double gen_tok_sec =
|
||||
(pos_offset - pos_gen_start) / (gen_end - gen_start);
|
||||
std::cout << "\n[ Generation tokens / sec = " << gen_tok_sec << " ]\n";
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void Generate2B(GemmaImpl<ConfigGemma2B>& gemma, const InferenceArgs& args,
|
||||
const std::vector<int>& prompt, size_t start_pos,
|
||||
hwy::ThreadPool& pool, hwy::ThreadPool& inner_pool,
|
||||
const StreamFunc& stream_token, const AcceptFunc& accept_token,
|
||||
std::mt19937& gen, int verbosity) {
|
||||
GenerateImpl(gemma, args, prompt, start_pos, pool, inner_pool, stream_token,
|
||||
accept_token, gen, verbosity);
|
||||
}
|
||||
|
||||
void Generate7B(GemmaImpl<ConfigGemma7B>& gemma, const InferenceArgs& args,
|
||||
const std::vector<int>& prompt, size_t start_pos,
|
||||
hwy::ThreadPool& pool, hwy::ThreadPool& inner_pool,
|
||||
const StreamFunc& stream_token, const AcceptFunc& accept_token,
|
||||
std::mt19937& gen, int verbosity) {
|
||||
GenerateImpl(gemma, args, prompt, start_pos, pool, inner_pool, stream_token,
|
||||
accept_token, gen, verbosity);
|
||||
}
|
||||
|
||||
// Calls func(name, float*, CompressedArray&) for each tensor. float* is null
|
||||
// if weights = null, which happens during the first call where we attempt to
|
||||
// load from cache.
|
||||
//
|
||||
// This avoids repeating the list of tensors between loading and compressing.
|
||||
template <class TConfig, class Func>
|
||||
void ForEachTensor(const Weights<TConfig>* weights,
|
||||
CompressedWeights<TConfig>& c_weights, Func& func) {
|
||||
func("c_embedding",
|
||||
weights ? weights->embedder_input_embedding.data() : nullptr,
|
||||
c_weights.c_embedder_input_embedding);
|
||||
func("c_final_norm", weights ? weights->final_norm_scale.data() : nullptr,
|
||||
c_weights.c_final_norm_scale);
|
||||
|
||||
char name[16];
|
||||
for (size_t layer_idx = 0; layer_idx < TConfig::n_layers; ++layer_idx) {
|
||||
Layer<TConfig>* layer = weights ? &weights->layers[layer_idx] : nullptr;
|
||||
CompressedLayer<TConfig>* c_layer = c_weights.CLayer(layer_idx);
|
||||
|
||||
snprintf(name, sizeof(name), "pre_ff_ns_%zu", layer_idx);
func(name, layer ? layer->pre_ffw_norm_scale.data() : nullptr,
c_layer->c_pre_ffw_norm_scale);

snprintf(name, sizeof(name), "gating_ein_%zu", layer_idx);
func(name, layer ? layer->gating_einsum_w.data() : nullptr,
c_layer->c_gating_einsum_w);

snprintf(name, sizeof(name), "linear_w_%zu", layer_idx);
func(name, layer ? layer->linear_w.data() : nullptr, c_layer->c_linear_w);

snprintf(name, sizeof(name), "qkv_ein_%zu", layer_idx);
func(name, layer ? layer->qkv_einsum_w.data() : nullptr,
c_layer->c_qkv_einsum_w);

snprintf(name, sizeof(name), "att_ein_%zu", layer_idx);
func(name, layer ? layer->attn_vec_einsum_w.data() : nullptr,
c_layer->c_attn_vec_einsum_w);

snprintf(name, sizeof(name), "pre_att_ns_%zu", layer_idx);
func(name, layer ? layer->pre_attention_norm_scale.data() : nullptr,
c_layer->c_pre_attention_norm_scale);
|
||||
}
|
||||
}
|
||||
|
||||
template <class TConfig>
|
||||
hwy::AlignedFreeUniquePtr<uint8_t[]> GetCompressedWeights(
|
||||
const Path& model, const Path& cache, hwy::ThreadPool& pool) {
|
||||
PROFILER_ZONE("Startup.LoadCache");
|
||||
|
||||
if (!std::filesystem::exists(model.path) &&
|
||||
!std::filesystem::exists(cache.path)) {
|
||||
HWY_ABORT(
|
||||
"Either the model weights (--weights) or cached compressed weights "
|
||||
"(--compressed_weights) must exist.");
|
||||
}
|
||||
|
||||
// Allocate compressed weights.
|
||||
using CWeights = CompressedWeights<TConfig>;
|
||||
hwy::AlignedFreeUniquePtr<uint8_t[]> c_weights_u8 =
|
||||
hwy::AllocateAligned<uint8_t>(sizeof(CWeights));
|
||||
CWeights* c_weights = reinterpret_cast<CWeights*>(c_weights_u8.get());
|
||||
new (&c_weights->c_layer_ptrs) CompressedLayerPointers<TConfig>(pool);
|
||||
|
||||
// First attempt to load them from cache, without requiring weights.
|
||||
CacheLoader loader(cache.path.c_str());
|
||||
ForEachTensor<TConfig>(nullptr, *c_weights, loader);
|
||||
if (loader.ReadAll(pool)) return c_weights_u8;
|
||||
|
||||
// Get weights, compress, and store in cache.
|
||||
hwy::AlignedUniquePtr<Weights<TConfig>> weights = LoadWeights<TConfig>(model);
|
||||
Compressor compressor(pool);
|
||||
ForEachTensor<TConfig>(weights.get(), *c_weights, compressor);
|
||||
compressor.WriteAll(pool, cache.path.c_str());
|
||||
|
||||
return c_weights_u8;
|
||||
}
|
||||
|
||||
// Type-erased because this function is called via a function pointer.
|
||||
hwy::AlignedFreeUniquePtr<uint8_t[]> GetCompressedWeightsT(
|
||||
const LoaderArgs& args, hwy::ThreadPool& pool) {
|
||||
switch (args.ModelType()) {
|
||||
case Model::GEMMA_2B:
|
||||
return GetCompressedWeights<ConfigGemma2B>(args.model, args.cache, pool);
|
||||
case Model::GEMMA_7B:
|
||||
return GetCompressedWeights<ConfigGemma7B>(args.model, args.cache, pool);
|
||||
default:
|
||||
HWY_ABORT("Model type %d unknown.", static_cast<int>(args.ModelType()));
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace HWY_NAMESPACE
|
||||
} // namespace gcpp
|
||||
HWY_AFTER_NAMESPACE();
|
||||
|
||||
#if HWY_ONCE
|
||||
namespace gcpp {
|
||||
|
||||
HWY_EXPORT(GetCompressedWeightsT);
|
||||
HWY_EXPORT(Generate2B);
|
||||
HWY_EXPORT(Generate7B);
|
||||
|
||||
KVCache CreateKVCache(size_t size_cache_pos, size_t seq_len) {
|
||||
KVCache kv_cache = {};
|
||||
kv_cache.key_cache = hwy::AllocateAligned<float>(seq_len * size_cache_pos);
|
||||
kv_cache.value_cache = hwy::AllocateAligned<float>(seq_len * size_cache_pos);
|
||||
return kv_cache;
|
||||
}
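// Note: GemmaImpl below passes size_cache_pos = n_layers * n_kv_heads *
// dim_qkv, so each of key_cache and value_cache holds
// seq_len * n_layers * n_kv_heads * dim_qkv floats.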
|
||||
|
||||
template <class Config>
|
||||
GemmaImpl<Config>::GemmaImpl(const LoaderArgs& args, hwy::ThreadPool& pool)
|
||||
: compressed_weights(
|
||||
HWY_DYNAMIC_DISPATCH(GetCompressedWeightsT)(args, pool)),
|
||||
prefill(hwy::MakeUniqueAligned<Activations<Config, kPrefillBatchSize>>()),
|
||||
state(hwy::MakeUniqueAligned<Activations<Config, 1>>()),
|
||||
kv_cache(
|
||||
CreateKVCache(Config::n_layers * Config::n_kv_heads * Config::dim_qkv,
|
||||
Config::seq_len)) {
|
||||
PROFILER_ZONE("Startup.tokenizer");
|
||||
|
||||
HWY_ASSERT(tokenizer.Load(args.tokenizer.path).ok());
|
||||
}
|
||||
|
||||
template <>
|
||||
void GemmaImpl<ConfigGemma2B>::Generate(const InferenceArgs& args,
|
||||
const std::vector<int>& prompt,
|
||||
size_t start_pos, hwy::ThreadPool& pool,
|
||||
hwy::ThreadPool& inner_pool,
|
||||
const StreamFunc& stream_token,
|
||||
const AcceptFunc& accept_token,
|
||||
std::mt19937& gen, int verbosity) {
|
||||
HWY_DYNAMIC_DISPATCH(Generate2B)
|
||||
(*this, args, prompt, start_pos, pool, inner_pool, stream_token, accept_token,
|
||||
gen, verbosity);
|
||||
}
|
||||
template <>
|
||||
void GemmaImpl<ConfigGemma7B>::Generate(const InferenceArgs& args,
|
||||
const std::vector<int>& prompt,
|
||||
size_t start_pos, hwy::ThreadPool& pool,
|
||||
hwy::ThreadPool& inner_pool,
|
||||
const StreamFunc& stream_token,
|
||||
const AcceptFunc& accept_token,
|
||||
std::mt19937& gen, int verbosity) {
|
||||
HWY_DYNAMIC_DISPATCH(Generate7B)
|
||||
(*this, args, prompt, start_pos, pool, inner_pool, stream_token, accept_token,
|
||||
gen, verbosity);
|
||||
}
|
||||
|
||||
Gemma::Gemma(const LoaderArgs& args, hwy::ThreadPool& pool) {
|
||||
const Model model_type = args.ModelType();
|
||||
model_training = args.ModelTraining();
|
||||
switch (model_type) {
|
||||
case Model::GEMMA_2B:
|
||||
impl_.reset(new GemmaImpl<ConfigGemma2B>(args, pool));
|
||||
break;
|
||||
case Model::GEMMA_7B:
|
||||
impl_.reset(new GemmaImpl<ConfigGemma7B>(args, pool));
|
||||
break;
|
||||
default:
|
||||
HWY_ABORT("Model type %d unknown.", static_cast<int>(model_type));
|
||||
}
|
||||
}
|
||||
Gemma::~Gemma() = default; // after GemmaInterface is defined
|
||||
|
||||
const sentencepiece::SentencePieceProcessor& Gemma::Tokenizer() const {
|
||||
return impl_->Tokenizer();
|
||||
}
|
||||
|
||||
void GenerateGemma(Gemma& gemma, const InferenceArgs& args,
|
||||
const std::vector<int>& prompt, size_t start_pos,
|
||||
hwy::ThreadPool& pool, hwy::ThreadPool& inner_pool,
|
||||
const StreamFunc& stream_token,
|
||||
const AcceptFunc& accept_token, std::mt19937& gen,
|
||||
int verbosity) {
|
||||
pool.SetWaitMode(hwy::PoolWaitMode::kSpin);
|
||||
gemma.impl_->Generate(args, prompt, start_pos, pool, inner_pool, stream_token,
|
||||
accept_token, gen, verbosity);
|
||||
pool.SetWaitMode(hwy::PoolWaitMode::kBlock);
|
||||
}
|
||||
|
||||
} // namespace gcpp
|
||||
#endif // HWY_ONCE
@ -0,0 +1,207 @@
// Copyright 2024 Google LLC
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// https://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#ifndef THIRD_PARTY_GEMMA_CPP_GEMMA_H_
|
||||
#define THIRD_PARTY_GEMMA_CPP_GEMMA_H_
|
||||
|
||||
#include <algorithm>
|
||||
#include <cctype>
|
||||
#include <functional>
|
||||
#include <memory>
|
||||
#include <random>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
// copybara:import_next_line:gemma_cpp
|
||||
#include "configs.h" // kSeqLen
|
||||
// copybara:import_next_line:gemma_cpp
|
||||
#include "compression/compress.h" // SfpStream/NuqStream
|
||||
// copybara:import_next_line:gemma_cpp
|
||||
#include "util/args.h" // ArgsBase
|
||||
#include "hwy/aligned_allocator.h"
|
||||
#include "hwy/base.h" // hwy::bfloat16_t
|
||||
#include "hwy/contrib/thread_pool/thread_pool.h"
|
||||
// copybara:import_next_line:sentencepiece
|
||||
#include "src/sentencepiece_processor.h"
|
||||
|
||||
namespace gcpp {
|
||||
|
||||
// Allowable types for GEMMA_WEIGHT_T (can be specified at compilation time):
|
||||
// float, hwy::bfloat16_t, SfpStream, NuqStream
|
||||
#ifndef GEMMA_WEIGHT_T
|
||||
#define GEMMA_WEIGHT_T SfpStream
|
||||
#endif // !GEMMA_WEIGHT_T
|
||||
using WeightT = GEMMA_WEIGHT_T;
|
||||
|
||||
using EmbedderInputT = hwy::bfloat16_t;
|
||||
constexpr size_t kPrefillBatchSize = 16;
|
||||
constexpr bool kSystemPrompt = false;
|
||||
|
||||
struct KVCache {
|
||||
hwy::AlignedFreeUniquePtr<float[]>
|
||||
key_cache; // batch_size * seq_len * n_layers * n_kv_heads * dim_qkv
|
||||
hwy::AlignedFreeUniquePtr<float[]>
|
||||
value_cache; // batch_size * seq_len * n_layers * n_kv_heads * dim_qkv
|
||||
};
|
||||
|
||||
// Model variants: see configs.h for details.
|
||||
enum class Model { GEMMA_2B, GEMMA_7B };
|
||||
enum class ModelTraining { GEMMA_IT, GEMMA_PT };
|
||||
|
||||
struct LoaderArgs : public ArgsBase<LoaderArgs> {
|
||||
LoaderArgs(int argc, char* argv[]) { InitAndParse(argc, argv); }
|
||||
|
||||
static std::string ToLower(const std::string& text) {
|
||||
std::string result = text;
|
||||
std::transform(begin(result), end(result), begin(result),
|
||||
[](unsigned char c) { return std::tolower(c); });
|
||||
return result;
|
||||
}
|
||||
|
||||
gcpp::Model ModelType() const {
|
||||
const std::string model_type_lc = ToLower(model_type);
|
||||
if (model_type_lc == "2b-pt" || model_type_lc == "2b-it") {
|
||||
return gcpp::Model::GEMMA_2B;
|
||||
} else {
|
||||
return gcpp::Model::GEMMA_7B;
|
||||
}
|
||||
}
|
||||
|
||||
gcpp::ModelTraining ModelTraining() const {
|
||||
const std::string model_type_lc = ToLower(model_type);
|
||||
if (model_type_lc == "7b-pt" || model_type_lc == "2b-pt") {
|
||||
return gcpp::ModelTraining::GEMMA_PT;
|
||||
} else {
|
||||
return gcpp::ModelTraining::GEMMA_IT;
|
||||
}
|
||||
}
|
||||
|
||||
// Returns error string or nullptr if OK.
|
||||
const char* Validate() const {
|
||||
const std::string model_type_lc = ToLower(model_type);
|
||||
if (model_type_lc != "2b-pt" && model_type_lc != "7b-pt" &&
|
||||
model_type_lc != "2b-it" && model_type_lc != "7b-it") {
|
||||
return "Model type must be 2b-pt, 7b-pt, 2b-it, or "
|
||||
"7b-it.";
|
||||
}
|
||||
if (tokenizer.path.empty()) {
|
||||
return "Missing --tokenizer flag, a file for the tokenizer is required.";
|
||||
}
|
||||
if (model_type.empty()) {
|
||||
return "Missing --model flag, need to specify either 2b-pt, 7b-pt, "
|
||||
"2b-it, or 7b-it.";
|
||||
}
|
||||
if (cache.path.empty()) {
|
||||
return "Missing --compressed_weights flag, a file for the compressed "
|
||||
"model.";
|
||||
}
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
Path tokenizer;
|
||||
Path model; // uncompressed weights OR
|
||||
Path cache; // compressed weights
|
||||
std::string model_type;
|
||||
|
||||
template <class Visitor>
|
||||
void ForEach(const Visitor& visitor) {
|
||||
visitor(tokenizer, "tokenizer", Path(),
|
||||
"Path name of tokenizer model file. (required)");
|
||||
visitor(
|
||||
cache, "compressed_weights", Path(),
|
||||
"Path name of compressed weights file, regenerated from `--weights` "
|
||||
"file if "
|
||||
"the compressed weights file does not exist. (required)");
|
||||
visitor(model_type, "model", std::string(),
|
||||
"Model type - can be 2b-it (2B parameters, instruction-tuned), "
|
||||
"2b-pt (2B parameters, pretrained), 7b-it (7B parameters, "
|
||||
"instruction-tuned), or 7b-pt (7B parameters, pretrained). "
|
||||
"(required)");
|
||||
visitor(model, "weights", Path(),
|
||||
"Path name of model weights (.sbs) file. Only required if "
|
||||
"compressed_weights file is not present and needs to be "
|
||||
"regenerated. Otherwise, not needed");
|
||||
}
|
||||
};
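// Typical invocation of a frontend using these flags (file names are
// placeholders):
//   ./gemma --tokenizer tokenizer.spm --compressed_weights 2b-it.sbs \
//       --model 2b-it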
|
||||
|
||||
struct GemmaInterface;
|
||||
|
||||
struct Gemma {
|
||||
Gemma(const LoaderArgs& args, hwy::ThreadPool& pool);
|
||||
~Gemma(); // must be defined after GemmaInterface's dtor is defined.
|
||||
|
||||
const sentencepiece::SentencePieceProcessor& Tokenizer() const;
|
||||
|
||||
std::unique_ptr<GemmaInterface> impl_;
|
||||
gcpp::ModelTraining model_training;
|
||||
};
|
||||
|
||||
// StreamFunc is called with (token, probability). For prompt tokens,
|
||||
// probability is 0.0f.
|
||||
using StreamFunc = std::function<bool(int, float)>;
|
||||
using AcceptFunc = std::function<bool(int)>;
|
||||
|
||||
struct InferenceArgs : public ArgsBase<InferenceArgs> {
|
||||
InferenceArgs(int argc, char* argv[]) { InitAndParse(argc, argv); }
|
||||
|
||||
size_t max_tokens;
|
||||
size_t max_generated_tokens;
|
||||
|
||||
float temperature;
|
||||
bool deterministic;
|
||||
bool multiturn;
|
||||
|
||||
// Returns error string or nullptr if OK.
|
||||
const char* Validate() const {
|
||||
if (max_tokens > gcpp::kSeqLen) {
|
||||
return "max_tokens is larger than the maximum sequence length (see "
|
||||
"configs.h).";
|
||||
}
|
||||
if (max_generated_tokens > max_tokens) {
|
||||
return "Maximum number of generated tokens is larger than the maximum "
|
||||
"total tokens.";
|
||||
}
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
template <class Visitor>
|
||||
void ForEach(const Visitor& visitor) {
|
||||
visitor(max_tokens, "max_tokens", size_t{3072},
|
||||
"Maximum number of tokens in prompt + generation.");
|
||||
visitor(max_generated_tokens, "max_generated_tokens", size_t{2048},
|
||||
"Maximum number of tokens to generate.");
|
||||
|
||||
visitor(temperature, "temperature", 1.0f, "Temperature for top-K", 2);
|
||||
visitor(deterministic, "deterministic", false,
|
||||
"Make top-k sampling deterministic", 2);
|
||||
visitor(multiturn, "multiturn", true,
|
||||
"Multiturn mode (if 0, this clears the KV cache after every "
|
||||
"interaction without quitting)",
|
||||
2);
|
||||
}
|
||||
};
|
||||
|
||||
void GenerateGemma(Gemma& gemma, const InferenceArgs& args,
|
||||
const std::vector<int>& prompt, size_t start_pos,
|
||||
hwy::ThreadPool& pool, hwy::ThreadPool& inner_pool,
|
||||
const StreamFunc& stream_token,
|
||||
const AcceptFunc& accept_token, std::mt19937& g,
|
||||
int verbosity);
|
||||
|
||||
constexpr int EOS_ID = 1;
|
||||
|
||||
} // namespace gcpp
|
||||
|
||||
#endif // THIRD_PARTY_GEMMA_CPP_GEMMA_H_
@ -0,0 +1,682 @@
// Copyright 2024 Google LLC
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// https://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
// Include guard for non-SIMD code.
|
||||
#ifndef THIRD_PARTY_GEMMA_CPP_OPS_H_
|
||||
#define THIRD_PARTY_GEMMA_CPP_OPS_H_
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
|
||||
#include <array>
|
||||
#include <cmath>
|
||||
#include <random>
|
||||
|
||||
// copybara:import_next_line:gemma_cpp
|
||||
#include "compression/compress.h"
|
||||
#include "hwy/base.h"
|
||||
#include "hwy/contrib/thread_pool/thread_pool.h"
|
||||
#include "hwy/profiler.h"
|
||||
|
||||
#endif // THIRD_PARTY_GEMMA_CPP_OPS_H_
|
||||
|
||||
// Include guard for (potentially) SIMD code.
|
||||
#if defined(THIRD_PARTY_GEMMA_CPP_OPS_TOGGLE) == defined(HWY_TARGET_TOGGLE)
|
||||
#ifdef THIRD_PARTY_GEMMA_CPP_OPS_TOGGLE
|
||||
#undef THIRD_PARTY_GEMMA_CPP_OPS_TOGGLE
|
||||
#else
|
||||
#define THIRD_PARTY_GEMMA_CPP_OPS_TOGGLE
|
||||
#endif
|
||||
|
||||
// copybara:import_next_line:gemma_cpp
|
||||
#include "compression/compress-inl.h"
|
||||
#include "hwy/cache_control.h" // FlushStream
|
||||
#include "hwy/contrib/algo/transform-inl.h"
|
||||
#include "hwy/contrib/dot/dot-inl.h"
|
||||
#include "hwy/contrib/math/math-inl.h"
|
||||
#include "hwy/contrib/matvec/matvec-inl.h"
|
||||
|
||||
HWY_BEFORE_NAMESPACE();
|
||||
namespace gcpp {
|
||||
namespace HWY_NAMESPACE {
|
||||
namespace hn = hwy::HWY_NAMESPACE;
|
||||
|
||||
HWY_INLINE constexpr size_t MaxCols() {
|
||||
// Vec + mat rows should fit into 32 KiB L1.
|
||||
return 2048;
|
||||
}
|
||||
|
||||
template <size_t kOuter>
|
||||
HWY_INLINE constexpr size_t RowsPerStrip() {
|
||||
// Aim for 128 work items to reduce pool overhead. Must be at least one
|
||||
// vector; prefer a power of two for faster division.
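// Example (illustrative): kOuter = 2048 with 8 float lanes gives
// max(8, 1 << FloorLog2(2048 / 128)) = max(8, 16) = 16 rows per strip,
// i.e. 128 strips.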
|
||||
constexpr size_t kRowsPerStrip =
|
||||
HWY_MAX(hn::ScalableTag<float>().MaxLanes(),
|
||||
1ULL << hwy::FloorLog2(kOuter / 128));
|
||||
return kRowsPerStrip;
|
||||
}
|
||||
|
||||
// Simple version without tiling nor threading.
|
||||
template <size_t kOuter, size_t kInner, typename MatT, size_t kCapacity,
|
||||
typename VecT>
|
||||
HWY_INLINE void MatVecLoop(const CompressedArray<MatT, kCapacity>& mat,
|
||||
const size_t mat_ofs,
|
||||
const VecT* HWY_RESTRICT vec_aligned,
|
||||
float* HWY_RESTRICT out) {
|
||||
PROFILER_ZONE("MatVecLoop");
|
||||
const hn::ScalableTag<float> df;
|
||||
|
||||
for (size_t idx_row = 0; idx_row < kOuter; ++idx_row) {
|
||||
const size_t row_ofs = mat_ofs + idx_row * kInner;
|
||||
out[idx_row] = Dot(df, mat, row_ofs, vec_aligned, kInner);
|
||||
}
|
||||
}
|
||||
|
||||
// Simple version without tiling nor threading, but two offsets/outputs.
|
||||
template <size_t kOuter, size_t kInner, typename MatT, size_t kCapacity,
|
||||
typename VecT>
|
||||
HWY_INLINE void TwoOfsMatVecLoop(const CompressedArray<MatT, kCapacity>& mat,
|
||||
const size_t mat_ofs0, const size_t mat_ofs1,
|
||||
const VecT* HWY_RESTRICT vec_aligned,
|
||||
float* HWY_RESTRICT out0,
|
||||
float* HWY_RESTRICT out1) {
|
||||
PROFILER_ZONE("MatVecLoop");
|
||||
const hn::ScalableTag<float> df;
|
||||
|
||||
for (size_t idx_row = 0; idx_row < kOuter; ++idx_row) {
|
||||
const size_t row_ofs0 = mat_ofs0 + (idx_row)*kInner;
|
||||
const size_t row_ofs1 = mat_ofs1 + (idx_row)*kInner;
|
||||
out0[idx_row] = Dot(df, mat, row_ofs0, vec_aligned, kInner);
|
||||
out1[idx_row] = Dot(df, mat, row_ofs1, vec_aligned, kInner);
|
||||
}
|
||||
}
|
||||
|
||||
namespace detail {
|
||||
|
||||
// For each i = [0, num_rows), compute partial (length `num_cols`) dot product
|
||||
// of row i with `vec_aligned` and add into `out[i]`. The upper-left coordinate
|
||||
// of the tile is r0, c0.
|
||||
template <class DF, typename MatT, size_t kCapacity, typename VecT>
|
||||
HWY_INLINE void AccumulatePartialDotProducts(
|
||||
DF df, const CompressedArray<MatT, kCapacity>& mat, size_t mat_ofs,
|
||||
size_t mat_stride, size_t r0, size_t c0, size_t num_rows, size_t num_cols,
|
||||
const VecT* HWY_RESTRICT vec_aligned, float* HWY_RESTRICT out) {
|
||||
for (size_t idx_row = 0; idx_row < num_rows; ++idx_row) {
|
||||
const size_t row_ofs = mat_ofs + (r0 + idx_row) * mat_stride;
|
||||
out[idx_row] += Dot(df, mat, row_ofs + c0, vec_aligned + c0, num_cols);
|
||||
}
|
||||
}
|
||||
|
||||
// Same as above, but sets out[i] to the first partial dot product, which
|
||||
// avoids having to zero-initialize and accumulate.
|
||||
template <class DF, typename MatT, size_t kCapacity, typename VecT>
|
||||
HWY_INLINE void SetFirstPartialDotProducts(
|
||||
DF df, const CompressedArray<MatT, kCapacity>& mat, size_t mat_ofs,
|
||||
size_t mat_stride, size_t r0, size_t c0, size_t num_rows, size_t num_cols,
|
||||
const VecT* HWY_RESTRICT vec_aligned, float* HWY_RESTRICT out) {
|
||||
for (size_t idx_row = 0; idx_row < num_rows; ++idx_row) {
|
||||
const size_t row_ofs = mat_ofs + (r0 + idx_row) * mat_stride;
|
||||
out[idx_row] = Dot(df, mat, row_ofs + c0, vec_aligned + c0, num_cols);
|
||||
}
|
||||
}
|
||||
|
||||
// Adds together partial dot products for all tiles with the same r0 (a
|
||||
// horizontal strip of the entire matrix); the result is the full dot product
|
||||
// for rows r in [r0, r0 + num_rows), which we store into in out[r - r0].
|
||||
template <class DF, typename MatT, size_t kCapacity, typename VecT>
|
||||
HWY_INLINE void FullDotProductsForStrip(
|
||||
DF df, const CompressedArray<MatT, kCapacity>& mat, size_t mat_ofs,
|
||||
size_t mat_stride, size_t r0, size_t num_rows,
|
||||
const VecT* HWY_RESTRICT vec_aligned, float* HWY_RESTRICT out) {
|
||||
// Tall and skinny: set `out` to the single dot product.
|
||||
if (mat_stride < MaxCols()) {
|
||||
SetFirstPartialDotProducts(df, mat, mat_ofs, mat_stride, r0, 0, num_rows,
|
||||
mat_stride, vec_aligned, out);
|
||||
return;
|
||||
}
|
||||
|
||||
// We have at least MaxCols, so start by setting `out` to that:
|
||||
SetFirstPartialDotProducts(df, mat, mat_ofs, mat_stride, r0, 0, num_rows,
|
||||
MaxCols(), vec_aligned, out);
|
||||
// For further multiples of MaxCols, accumulate. Remainders handled below.
|
||||
size_t c0 = MaxCols();
|
||||
HWY_UNROLL(1)
|
||||
for (; c0 <= mat_stride - MaxCols(); c0 += MaxCols()) {
|
||||
AccumulatePartialDotProducts(df, mat, mat_ofs, mat_stride, r0, c0, num_rows,
|
||||
MaxCols(), vec_aligned, out);
|
||||
}
|
||||
|
||||
if (c0 < mat_stride) { // Final cols
|
||||
AccumulatePartialDotProducts(df, mat, mat_ofs, mat_stride, r0, c0, num_rows,
|
||||
mat_stride - c0, vec_aligned, out);
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace detail
|
||||
|
||||
// Computes dot products of all rows of `mat` with `vec_aligned` and stores
// them to `out`, parallelizing over horizontal strips of rows.
|
||||
template <size_t kOuter, size_t kInner, typename MatT, size_t kCapacity,
|
||||
typename VecT>
|
||||
HWY_INLINE void MatVec(const CompressedArray<MatT, kCapacity>& mat,
|
||||
const size_t mat_ofs,
|
||||
const VecT* HWY_RESTRICT const vec_aligned,
|
||||
float* HWY_RESTRICT out, hwy::ThreadPool& pool) {
|
||||
PROFILER_ZONE("MatVec");
|
||||
|
||||
const hn::ScalableTag<float> df;
|
||||
constexpr size_t kRowsPerStrip = RowsPerStrip<kOuter>();
|
||||
constexpr size_t kNumStrips = kOuter / kRowsPerStrip;
|
||||
|
||||
// For each entire strip.
|
||||
pool.Run(0, kNumStrips, [&](const uint64_t strip, size_t thread) HWY_ATTR {
|
||||
PROFILER_ZONE("MatVec.lambda");
|
||||
const size_t r0 = strip * kRowsPerStrip;
|
||||
detail::FullDotProductsForStrip(df, mat, mat_ofs, kInner, r0, kRowsPerStrip,
|
||||
vec_aligned, out + r0);
|
||||
});
|
||||
|
||||
// Remaining rows
|
||||
const size_t r0 = kNumStrips * kRowsPerStrip;
|
||||
if (r0 < kOuter) {
|
||||
PROFILER_ZONE("MatVec remainder");
|
||||
const size_t num_rows = kOuter - r0;
|
||||
detail::FullDotProductsForStrip(df, mat, mat_ofs, kInner, r0, num_rows,
|
||||
vec_aligned, out + r0);
|
||||
}
|
||||
}
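// The vectorized Gelu below uses the tanh approximation
//   gelu(x) ~= 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3))),
// with sqrt(2/pi) ~= 0.7978845608.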
|
||||
|
||||
template <class D, HWY_IF_F32_D(D)>
|
||||
static HWY_INLINE hn::Vec<D> Gelu(D d, hn::Vec<D> v) {
|
||||
const hn::Vec<D> kMul = Set(d, 0.044715f);
|
||||
const hn::Vec<D> kSqrt2OverPi = hn::Set(d, 0.797884560804236f);
|
||||
const hn::Vec<D> kHalf = Set(d, 0.5f);
|
||||
|
||||
// tanh approximation matches training.
|
||||
const hn::Vec<D> v3 = hn::Mul(hn::Mul(v, v), v);
|
||||
const hn::Vec<D> arg = hn::Mul(kSqrt2OverPi, hn::MulAdd(kMul, v3, v));
|
||||
// 0.5 * (1 + tan) = MulAdd(0.5, tan, 0.5).
|
||||
const hn::Vec<D> cdf = hn::MulAdd(kHalf, hn::Tanh(d, arg), kHalf);
|
||||
return Mul(v, cdf);
|
||||
}
|
||||
|
||||
static HWY_NOINLINE HWY_MAYBE_UNUSED void Gelu(float* HWY_RESTRICT x,
|
||||
size_t size) {
|
||||
namespace hn = hwy::HWY_NAMESPACE;
|
||||
using D = hn::ScalableTag<float>;
|
||||
hn::Transform(D(), x, size, [](D d, hn::Vec<D> v) { return Gelu(d, v); });
|
||||
}
|
||||
|
||||
// out[i] = BF(mul[i] * Gelu(gelu_in[i]))
|
||||
static HWY_NOINLINE HWY_MAYBE_UNUSED void GeluMulToBF16(
|
||||
const float* HWY_RESTRICT gelu_in, const float* HWY_RESTRICT mul,
|
||||
hwy::bfloat16_t* HWY_RESTRICT out, size_t size) {
|
||||
namespace hn = hwy::HWY_NAMESPACE;
|
||||
const hn::ScalableTag<float> df;
|
||||
const hn::Repartition<hwy::bfloat16_t, decltype(df)> dbf;
|
||||
const size_t NF = hn::Lanes(df);
|
||||
using VF = hn::Vec<decltype(df)>;
|
||||
|
||||
size_t i = 0;
|
||||
if (size >= 2 * NF) {
|
||||
for (; i < size - 2 * NF; i += 2 * NF) {
|
||||
const VF mul0 = LoadU(df, mul + i);
|
||||
const VF mul1 = LoadU(df, mul + i + NF);
|
||||
const VF g0 = Mul(mul0, Gelu(df, LoadU(df, gelu_in + i)));
|
||||
const VF g1 = Mul(mul1, Gelu(df, LoadU(df, gelu_in + i + NF)));
|
||||
const hn::Vec<decltype(dbf)> bf = hn::OrderedDemote2To(dbf, g0, g1);
|
||||
StoreU(bf, dbf, out + i);
|
||||
}
|
||||
}
|
||||
if (i != size) {
|
||||
const size_t remaining = size - i;
|
||||
const VF mul0 = LoadN(df, mul + i, remaining);
|
||||
const VF g0 = Mul(mul0, Gelu(df, LoadN(df, gelu_in + i, remaining)));
|
||||
const hn::Half<decltype(dbf)> dbfh;
|
||||
const hn::Vec<decltype(dbfh)> bfh = hn::DemoteTo(dbfh, g0);
|
||||
StoreN(bfh, dbfh, out + i, remaining);
|
||||
}
|
||||
}
|
||||
|
||||
// Two matrices, same vector
|
||||
// TODO(janwas): apply optimizations from MatVec/replace with above overload
|
||||
template <size_t kOuter, size_t kInner, typename MatT, size_t kCapacity,
|
||||
typename VecT>
|
||||
HWY_NOINLINE void TwoMatVec(const CompressedArray<MatT, kCapacity>& mat0,
|
||||
const CompressedArray<MatT, kCapacity>& mat1,
|
||||
const size_t mat_ofs,
|
||||
const VecT* HWY_RESTRICT vec_aligned,
|
||||
float* HWY_RESTRICT out0, float* HWY_RESTRICT out1,
|
||||
hwy::ThreadPool& pool) {
|
||||
const hn::ScalableTag<float> df;
|
||||
const size_t NF = hn::Lanes(df);
|
||||
|
||||
// Process multiple rows at a time so that we write multiples of a cache line
|
||||
// to avoid false sharing (>= 64).
|
||||
constexpr size_t kRowsPerStrip = 128 / sizeof(float);
|
||||
const uint32_t num_strips = kOuter / kRowsPerStrip;
|
||||
|
||||
// No remainder handling after ThreadPool.
|
||||
static_assert(kOuter % kRowsPerStrip == 0, "Add remainder handling");
|
||||
|
||||
// Required for Stream loop, otherwise we might have partial vectors.
|
||||
HWY_DASSERT(kRowsPerStrip >= NF);
|
||||
pool.Run(0, num_strips,
|
||||
[&](const uint32_t strip, size_t /*thread*/) HWY_ATTR {
|
||||
// MSVC workaround: duplicate to ensure constexpr.
|
||||
constexpr size_t kRowsPerStrip = 128 / sizeof(float);
|
||||
// Software write-combining to avoid cache pollution from out.
|
||||
// Although `out` may be used later, keeping it out of the cache
|
||||
// now and avoiding RFOs is a consistent 5% overall win.
|
||||
HWY_ALIGN float buf0[kRowsPerStrip];
|
||||
HWY_ALIGN float buf1[kRowsPerStrip];
|
||||
|
||||
// Only handle entire strips here because the Stream is not masked.
|
||||
const size_t begin = strip * kRowsPerStrip;
|
||||
for (size_t idx_row = 0; idx_row < kRowsPerStrip; ++idx_row) {
|
||||
const size_t row_ofs = mat_ofs + (begin + idx_row) * kInner;
|
||||
buf0[idx_row] = Dot(df, mat0, row_ofs, vec_aligned, kInner);
|
||||
buf1[idx_row] = Dot(df, mat1, row_ofs, vec_aligned, kInner);
|
||||
}
|
||||
|
||||
HWY_UNROLL(4)
|
||||
for (size_t i = 0; i != kRowsPerStrip; i += NF) {
|
||||
hn::Stream(hn::Load(df, buf0 + i), df, out0 + begin + i);
|
||||
}
|
||||
HWY_UNROLL(4)
|
||||
for (size_t i = 0; i != kRowsPerStrip; i += NF) {
|
||||
hn::Stream(hn::Load(df, buf1 + i), df, out1 + begin + i);
|
||||
}
|
||||
});
|
||||
hwy::FlushStream();
|
||||
}
|
||||
|
||||
// Baseline Naive MatMul
|
||||
template <size_t kOuter, size_t kInner, size_t kBatchSize, typename MatT,
|
||||
size_t kCapacity, typename VecT>
|
||||
HWY_NOINLINE void MatMul(const CompressedArray<MatT, kCapacity>& mat,
|
||||
const size_t mat_ofs, const VecT* HWY_RESTRICT vec,
|
||||
float* HWY_RESTRICT out, hwy::ThreadPool& pool) {
|
||||
for (size_t i = 0; i < kBatchSize; ++i) {
|
||||
MatVec<kOuter, kInner, MatT, kCapacity, VecT>(
|
||||
mat, mat_ofs, vec + i * kInner, out + i * kOuter, pool);
|
||||
}
|
||||
}
|
||||
|
||||
static HWY_NOINLINE HWY_MAYBE_UNUSED float Dot(const float* HWY_RESTRICT a,
|
||||
const float* HWY_RESTRICT b,
|
||||
size_t size) {
|
||||
const hn::ScalableTag<float> d;
|
||||
HWY_DASSERT(size >= hn::Lanes(d));
|
||||
HWY_DASSERT(size % hn::Lanes(d) == 0);
|
||||
constexpr int kAssumptions =
|
||||
hn::Dot::kAtLeastOneVector | hn::Dot::kMultipleOfVector;
|
||||
return hn::Dot::Compute<kAssumptions>(d, a, b, size);
|
||||
}
|
||||
|
||||
// = Dot(a, a, size), but that is not allowed due to HWY_RESTRICT.
|
||||
static HWY_NOINLINE HWY_MAYBE_UNUSED float SquaredL2(
|
||||
const float* HWY_RESTRICT a, size_t size) {
|
||||
float total = 0.f;
|
||||
for (size_t i = 0; i < size; ++i) {
|
||||
total += a[i] * a[i];
|
||||
}
|
||||
return total;
|
||||
}
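// The RMSNorm variants below all compute
//   out[j] = (1 + weight[j]) * x[j] / sqrt(mean(x^2) + eps)
// for eps = 1e-6; the "+1" is the centering noted in the per-element comments.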
|
||||
|
||||
static HWY_NOINLINE HWY_MAYBE_UNUSED void RMSNorm(
|
||||
const float* HWY_RESTRICT x, const float* HWY_RESTRICT weight,
|
||||
float* HWY_RESTRICT out, size_t size) {
|
||||
constexpr float eps = 1e-6f;
|
||||
float ss = SquaredL2(x, size);
|
||||
ss = 1.0f / sqrtf(ss / static_cast<int>(size) + eps);
|
||||
for (size_t j = 0; j < size; j++) {
|
||||
// Note 1.0f centering here
|
||||
out[j] = (1.0f + weight[j]) * (ss * x[j]);
|
||||
}
|
||||
}
|
||||
|
||||
static HWY_NOINLINE HWY_MAYBE_UNUSED void RMSNorm(
|
||||
const float* HWY_RESTRICT x, const hwy::bfloat16_t* HWY_RESTRICT weight,
|
||||
float* HWY_RESTRICT out, size_t size) {
|
||||
constexpr float eps = 1e-6f;
|
||||
float ss = SquaredL2(x, size);
|
||||
ss = 1.0f / sqrtf(ss / static_cast<int>(size) + eps);
|
||||
for (size_t j = 0; j < size; j++) {
|
||||
// Note 1.0f centering here
|
||||
out[j] = (1.0f + hwy::F32FromBF16(weight[j])) * (ss * x[j]);
|
||||
}
|
||||
}
|
||||
|
||||
static HWY_NOINLINE HWY_MAYBE_UNUSED void RMSNormInplace(
|
||||
const float* HWY_RESTRICT weight, float* HWY_RESTRICT inout, size_t size) {
|
||||
constexpr float eps = 1e-6f;
|
||||
float ss = SquaredL2(inout, size);
|
||||
ss = 1.0f / sqrtf(ss / static_cast<int>(size) + eps);
|
||||
for (size_t j = 0; j < size; j++) {
|
||||
// Note 1.0f centering here
|
||||
inout[j] = (1.0f + weight[j]) * (ss * inout[j]);
|
||||
}
|
||||
}
|
||||
|
||||
// w=bf16 -> f
|
||||
static HWY_NOINLINE HWY_MAYBE_UNUSED void RMSNormInplace(
|
||||
const hwy::bfloat16_t* HWY_RESTRICT weight, float* HWY_RESTRICT inout,
|
||||
const size_t size) {
|
||||
namespace hn = hwy::HWY_NAMESPACE;
|
||||
const hn::ScalableTag<hwy::bfloat16_t> dbf;
|
||||
const hn::Repartition<float, decltype(dbf)> df32;
|
||||
using VF = hn::Vec<decltype(df32)>;
|
||||
const size_t N32 = hn::Lanes(df32);
|
||||
|
||||
constexpr float eps = 1e-6f;
|
||||
const float ss = SquaredL2(inout, size);
|
||||
const VF vss = Set(df32, 1.0f / sqrtf(ss / static_cast<int>(size) + eps));
|
||||
|
||||
HWY_DASSERT(size % (2 * MaxLanes(df32)) == 0);
|
||||
for (size_t i = 0; i < size; i += 2 * N32) {
|
||||
const hn::Vec<decltype(dbf)> w16 = hn::LoadU(dbf, weight + i);
|
||||
const VF w0 = hn::PromoteLowerTo(df32, w16);
|
||||
const VF w1 = hn::PromoteUpperTo(df32, w16);
|
||||
const VF m0 = hn::Mul(vss, hn::LoadU(df32, inout + i));
|
||||
const VF m1 = hn::Mul(vss, hn::LoadU(df32, inout + i + N32));
|
||||
// (1+weight) * m = m + weight*m = one FMA.
|
||||
hn::StoreU(hn::MulAdd(m0, w0, m0), df32, inout + i);
|
||||
hn::StoreU(hn::MulAdd(m1, w1, m1), df32, inout + i + N32);
|
||||
}
|
||||
}
|
||||
|
||||
// f, f -> bf
|
||||
// TODO(janwas): consider generic function with adapter for loading bf16/f32
|
||||
static HWY_NOINLINE HWY_MAYBE_UNUSED void RMSNorm(
|
||||
const float* HWY_RESTRICT x, const float* HWY_RESTRICT weight,
|
||||
hwy::bfloat16_t* HWY_RESTRICT out, const size_t size) {
|
||||
namespace hn = hwy::HWY_NAMESPACE;
|
||||
const hn::ScalableTag<hwy::bfloat16_t> dbf;
|
||||
const hn::Repartition<float, decltype(dbf)> df32;
|
||||
using VF = hn::Vec<decltype(df32)>;
|
||||
const size_t N32 = hn::Lanes(df32);
|
||||
|
||||
constexpr float eps = 1e-6f;
|
||||
const float ss = SquaredL2(x, size);
|
||||
const VF vss = Set(df32, 1.0f / sqrtf(ss / static_cast<int>(size) + eps));
|
||||
|
||||
HWY_DASSERT(size % (2 * MaxLanes(df32)) == 0);
|
||||
for (size_t i = 0; i < size; i += 2 * N32) {
|
||||
const VF w0 = hn::LoadU(df32, weight + i);
|
||||
const VF w1 = hn::LoadU(df32, weight + i + N32);
|
||||
const VF m0 = hn::Mul(vss, hn::LoadU(df32, x + i));
|
||||
const VF m1 = hn::Mul(vss, hn::LoadU(df32, x + i + N32));
|
||||
// (1+weight) * m = m + weight*m = one FMA.
|
||||
const VF out0 = hn::MulAdd(m0, w0, m0);
|
||||
const VF out1 = hn::MulAdd(m1, w1, m1);
|
||||
hn::StoreU(hn::OrderedDemote2To(dbf, out0, out1), dbf, out + i);
|
||||
}
|
||||
}
|
||||
|
||||
// x=f, w=bf16 -> bf16 to enable W16A16 MatVec.
|
||||
static HWY_NOINLINE HWY_MAYBE_UNUSED void RMSNorm(
|
||||
const float* HWY_RESTRICT x, const hwy::bfloat16_t* HWY_RESTRICT weight,
|
||||
hwy::bfloat16_t* HWY_RESTRICT out, const size_t size) {
|
||||
namespace hn = hwy::HWY_NAMESPACE;
|
||||
const hn::ScalableTag<hwy::bfloat16_t> dbf;
|
||||
const hn::Repartition<float, decltype(dbf)> df32;
|
||||
using VF = hn::Vec<decltype(df32)>;
|
||||
const size_t N32 = hn::Lanes(df32);
|
||||
|
||||
constexpr float eps = 1e-6f;
|
||||
const float ss = SquaredL2(x, size);
|
||||
const VF vss = Set(df32, 1.0f / sqrtf(ss / size + eps));
|
||||
|
||||
HWY_DASSERT(size % (2 * MaxLanes(df32)) == 0);
|
||||
for (size_t i = 0; i < size; i += 2 * N32) {
|
||||
const hn::Vec<decltype(dbf)> w16 = hn::LoadU(dbf, weight + i);
|
||||
const VF w0 = hn::PromoteLowerTo(df32, w16);
|
||||
const VF w1 = hn::PromoteUpperTo(df32, w16);
|
||||
const VF m0 = hn::Mul(vss, hn::LoadU(df32, x + i));
|
||||
const VF m1 = hn::Mul(vss, hn::LoadU(df32, x + i + N32));
|
||||
// (1+weight) * m = m + weight*m = one FMA.
|
||||
const VF out0 = hn::MulAdd(m0, w0, m0);
|
||||
const VF out1 = hn::MulAdd(m1, w1, m1);
|
||||
hn::StoreU(hn::OrderedDemote2To(dbf, out0, out1), dbf, out + i);
|
||||
}
|
||||
}
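// The function below adds transformer-style sinusoidal absolute positional
// embeddings: x[d] += sin(pos * inv_timescale_d) and
// x[num_timescales + d] += cos(pos * inv_timescale_d), with timescales spaced
// geometrically from 1 to 10000.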
|
||||
|
||||
static HWY_NOINLINE HWY_MAYBE_UNUSED void AddAbsolutePositionalEmbeddings(
|
||||
float* HWY_RESTRICT x, size_t dim_model, size_t pos) {
|
||||
const size_t num_timescales = dim_model / 2;
|
||||
const float log_timescale_increment =
|
||||
logf(10000.0f) /
|
||||
(num_timescales != 0
|
||||
? static_cast<float>(static_cast<int>(num_timescales) - 1)
|
||||
: 1.0f);
|
||||
for (size_t dim = 0; dim < num_timescales; ++dim) {
|
||||
const float inv_timescale =
|
||||
expf(static_cast<int>(dim) * -log_timescale_increment);
|
||||
x[dim] += sinf(pos * inv_timescale);
|
||||
x[num_timescales + dim] += cosf(pos * inv_timescale);
|
||||
}
|
||||
}
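// Rotary positional embedding (RoPE): for D = dim_qkv, each pair
// (x[d], x[d + D/2]) is rotated by theta = pos / 10000^(2d / D):
//   x[d]       -> x0 * cos(theta) - x1 * sin(theta)
//   x[d + D/2] -> x0 * sin(theta) + x1 * cos(theta)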
|
||||
|
||||
static HWY_NOINLINE HWY_MAYBE_UNUSED void Rope(float* HWY_RESTRICT x,
|
||||
size_t dim_qkv, size_t pos) {
|
||||
HWY_DASSERT(dim_qkv % 2 == 0);
|
||||
const size_t half_dim_qkv = dim_qkv / 2;
|
||||
for (size_t dim = 0; dim < half_dim_qkv; ++dim) {
|
||||
const float freq_exponents = static_cast<float>(2 * static_cast<int>(dim)) /
|
||||
static_cast<float>(dim_qkv);
|
||||
// Replacing with expf(ln(1E4) * freq_exponents) changes results noticeably.
|
||||
const float timescale = powf(10000.0f, freq_exponents);
|
||||
const float theta = pos / timescale;
|
||||
const float cos_val = cosf(theta);
|
||||
const float sin_val = sinf(theta);
|
||||
const float x0 = x[dim];
|
||||
const float x1 = x[dim + half_dim_qkv];
|
||||
x[dim] = x0 * cos_val - x1 * sin_val;
|
||||
x[dim + half_dim_qkv] = x0 * sin_val + x1 * cos_val;
|
||||
}
|
||||
}
|
||||
|
||||
static HWY_NOINLINE HWY_MAYBE_UNUSED void RopeAndMulBy(const float mul,
|
||||
float* HWY_RESTRICT x,
|
||||
size_t dim_qkv,
|
||||
size_t pos) {
|
||||
HWY_DASSERT(dim_qkv % 2 == 0);
|
||||
const size_t half_dim_qkv = dim_qkv / 2;
|
||||
for (size_t dim = 0; dim < half_dim_qkv; ++dim) {
|
||||
const float freq_exponents = static_cast<float>(2 * static_cast<int>(dim)) /
|
||||
static_cast<float>(dim_qkv);
|
||||
// Replacing with expf(ln(1E4) * freq_exponents) changes results noticeably.
|
||||
const float timescale = powf(10000.0f, freq_exponents);
|
||||
const float theta = pos / timescale;
|
||||
const float cos_val = cosf(theta);
|
||||
const float sin_val = sinf(theta);
|
||||
const float x0 = x[dim];
|
||||
const float x1 = x[dim + half_dim_qkv];
|
||||
x[dim] = mul * (x0 * cos_val - x1 * sin_val);
|
||||
x[dim + half_dim_qkv] = mul * (x0 * sin_val + x1 * cos_val);
|
||||
}
|
||||
}
|
||||
|
||||
static HWY_NOINLINE HWY_MAYBE_UNUSED void AddFrom(
|
||||
const float* HWY_RESTRICT other, float* HWY_RESTRICT x, size_t size) {
|
||||
for (size_t i = 0; i < size; ++i) {
|
||||
x[i] += other[i];
|
||||
}
|
||||
}
|
||||
|
||||
static HWY_NOINLINE void MulBy(const float* HWY_RESTRICT other,
|
||||
float* HWY_RESTRICT x, size_t size,
|
||||
size_t max_pos) {
|
||||
HWY_DASSERT(max_pos <= size);
|
||||
for (size_t i = 0; i < max_pos; ++i) {
|
||||
x[i] *= other[i];
|
||||
}
|
||||
}
|
||||
|
||||
static HWY_INLINE HWY_MAYBE_UNUSED void MulBy(const float* HWY_RESTRICT other,
|
||||
float* HWY_RESTRICT x,
|
||||
size_t size) {
|
||||
return MulBy(other, x, size, size);
|
||||
}
|
||||
|
||||
static HWY_NOINLINE void MulByConst(float c, float* HWY_RESTRICT x, size_t size,
|
||||
size_t max_pos) {
|
||||
HWY_DASSERT(max_pos <= size);
|
||||
for (size_t i = 0; i < max_pos; ++i) {
|
||||
x[i] *= c;
|
||||
}
|
||||
}
|
||||
|
||||
static HWY_INLINE HWY_MAYBE_UNUSED void MulByConst(float c,
|
||||
float* HWY_RESTRICT x,
|
||||
size_t size) {
|
||||
MulByConst(c, x, size, size);
|
||||
}
|
||||
|
||||
static HWY_NOINLINE void MulByConstAndAdd(float c, const float* HWY_RESTRICT x,
|
||||
float* HWY_RESTRICT out, size_t size,
|
||||
size_t max_pos) {
|
||||
for (size_t i = 0; i < max_pos; ++i) {
|
||||
out[i] += x[i] * c;
|
||||
}
|
||||
}
|
||||
|
||||
static HWY_INLINE HWY_MAYBE_UNUSED void MulByConstAndAdd(
|
||||
float c, const float* HWY_RESTRICT x, float* HWY_RESTRICT out,
|
||||
size_t size) {
|
||||
MulByConstAndAdd(c, x, out, size, size);
|
||||
}
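// Numerically stable softmax over x[0, mask_pos): subtracts the maximum
// before exponentiating, i.e. x[i] = exp(x[i] - max) / sum_j exp(x[j] - max),
// then rescales the first mask_pos entries to sum to 1.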
|
||||
|
||||
static HWY_NOINLINE void Softmax(float* HWY_RESTRICT x, size_t size,
|
||||
size_t mask_pos) {
|
||||
HWY_DASSERT(size != 0);
|
||||
HWY_DASSERT(mask_pos <= size);
|
||||
|
||||
namespace hn = hwy::HWY_NAMESPACE;
|
||||
using D = hn::ScalableTag<float>;
|
||||
const D d;
|
||||
using V = hn::Vec<D>;
|
||||
|
||||
// Find max so we can subtract it below.
|
||||
const V vmin = hn::Set(d, hwy::LowestValue<float>());
|
||||
V max = vmin;
|
||||
hn::Foreach(d, x, mask_pos, vmin,
|
||||
[&max](D d, V v) { max = hn::Max(max, v); });
|
||||
max = hn::MaxOfLanes(d, max); // broadcast
|
||||
|
||||
// Subtract max (avoid precision loss for large exponents) and exponentiate.
|
||||
V sum = hn::Zero(d);
|
||||
hn::Transform(d, x, mask_pos, [&sum, max](D d, V v) {
|
||||
const V out = hn::Exp(d, hn::Sub(v, max));
|
||||
sum = hn::Add(sum, out);
|
||||
return out;
|
||||
});
|
||||
|
||||
// Normalize to probability distribution
|
||||
const float mul = 1.0f / hn::ReduceSum(d, sum);
|
||||
MulByConst(mul, x, size, mask_pos);
|
||||
}
|
||||
|
||||
static HWY_INLINE HWY_MAYBE_UNUSED void Softmax(float* HWY_RESTRICT x,
|
||||
size_t size) {
|
||||
Softmax(x, size, size);
|
||||
}
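// Soft-caps values into (-cap, cap): x[i] = cap * tanh(x[i] / cap), applied
// by the function below.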
|
||||
|
||||
static HWY_NOINLINE void LogitsSoftCap(const float cap, float* HWY_RESTRICT x,
|
||||
size_t size, size_t max_pos) {
|
||||
HWY_DASSERT(max_pos <= size);
|
||||
|
||||
namespace hn = hwy::HWY_NAMESPACE;
|
||||
using D = hn::ScalableTag<float>;
|
||||
const D d;
|
||||
using V = hn::Vec<D>;
|
||||
|
||||
const V inv_cap = hn::Set(d, 1.0f / cap);
|
||||
const V vcap = hn::Set(d, cap);
|
||||
|
||||
hn::Transform(d, x, size, [vcap, inv_cap](D d, hn::Vec<D> v) {
|
||||
return hn::Mul(vcap, hn::Tanh(d, hn::Mul(inv_cap, v)));
|
||||
});
|
||||
}
|
||||
|
||||
static HWY_INLINE HWY_MAYBE_UNUSED void LogitsSoftCap(const float cap,
|
||||
float* HWY_RESTRICT x,
|
||||
size_t size) {
|
||||
LogitsSoftCap(cap, x, size, size);
|
||||
}
|
||||
|
||||
static HWY_NOINLINE HWY_MAYBE_UNUSED size_t
|
||||
SampleArgmax(const float* probabilities, size_t vocab_size) {
|
||||
size_t max_index = 0;
|
||||
float max_prob = probabilities[0];
|
||||
for (size_t i = 1; i < vocab_size; ++i) {
|
||||
if (probabilities[i] > max_prob) {
|
||||
max_index = i;
|
||||
max_prob = probabilities[i];
|
||||
}
|
||||
}
|
||||
return max_index;
|
||||
}
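// The helper below turns the top-k probabilities into a sampling distribution
// with temperature: p_i' = exp(log(p_i) / temperature), renormalized to sum
// to 1. temperature = 1 leaves the relative probabilities unchanged.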
|
||||
|
||||
template <size_t k>
|
||||
static HWY_NOINLINE HWY_MAYBE_UNUSED std::discrete_distribution<int>
|
||||
create_distribution(std::array<float, k>& top_k, float temperature) {
|
||||
// re-normalize distribution
|
||||
for (size_t i = 0; i < k; ++i) {
|
||||
top_k[i] = exp(log(top_k[i]) / temperature);
|
||||
}
|
||||
float denominator = 0.0f;
|
||||
for (size_t i = 0; i < k; ++i) {
|
||||
denominator += top_k[i];
|
||||
}
|
||||
denominator = 1.0f / denominator;
|
||||
MulByConst(denominator, top_k.data(), k);
|
||||
return std::discrete_distribution<int>(std::begin(top_k), std::end(top_k));
|
||||
}
|
||||
|
||||
template <size_t k, typename TAcceptToken>
|
||||
static HWY_NOINLINE HWY_MAYBE_UNUSED int SampleTopK(
|
||||
const float* HWY_RESTRICT probabilities, size_t vocab_size,
|
||||
std::mt19937& gen, float temperature, TAcceptToken& accept_token) {
|
||||
static_assert(k != 0, "");
|
||||
// TODO(austinvhuang): Optimize this implementation.
|
||||
std::array<float, k> top_k{}; // sorted from highest [0], to lowest [k-1]
|
||||
std::array<int, k> indices{};
|
||||
for (size_t i = 0; i < vocab_size; ++i) {
|
||||
if (probabilities[i] < top_k[k - 1] && accept_token(static_cast<int>(i))) {
|
||||
continue;
|
||||
}
|
||||
for (size_t j = 0; j < k; ++j) {
|
||||
if (probabilities[i] > top_k[j] && accept_token(static_cast<int>(i))) {
|
||||
// shift elements by 1, insert the new value, move on to next value
|
||||
for (size_t idx = k - 1; idx > j; --idx) {
|
||||
top_k[idx] = top_k[idx - 1];
|
||||
indices[idx] = indices[idx - 1];
|
||||
}
|
||||
top_k[j] = probabilities[i];
|
||||
indices[j] = static_cast<int>(i);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
return indices[create_distribution<k>(top_k, temperature)(gen)];
|
||||
}
|
||||
|
||||
// NOLINTNEXTLINE(google-readability-namespace-comments)
|
||||
} // namespace HWY_NAMESPACE
|
||||
} // namespace gcpp
|
||||
HWY_AFTER_NAMESPACE();
|
||||
|
||||
#endif // NOLINT
@ -0,0 +1,261 @@
// Copyright 2024 Google LLC
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// https://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
// Command line text interface to gemma.
|
||||
|
||||
#include <ctime>
|
||||
#include <iostream>
|
||||
#include <random>
|
||||
#include <string>
|
||||
#include <thread> // NOLINT
|
||||
#include <vector>
|
||||
|
||||
// copybara:import_next_line:gemma_cpp
|
||||
#include "compression/compress.h"
|
||||
// copybara:import_next_line:gemma_cpp
|
||||
#include "gemma.h" // Gemma
|
||||
// copybara:import_next_line:gemma_cpp
|
||||
#include "util/app.h"
|
||||
// copybara:import_next_line:gemma_cpp
|
||||
#include "util/args.h" // HasHelp
|
||||
#include "hwy/base.h"
|
||||
#include "hwy/contrib/thread_pool/thread_pool.h"
|
||||
#include "hwy/highway.h"
|
||||
#include "hwy/per_target.h"
|
||||
#include "hwy/profiler.h"
|
||||
#include "hwy/timer.h"
|
||||
|
||||
namespace gcpp {
|
||||
|
||||
void ShowHelp(gcpp::LoaderArgs& loader, gcpp::InferenceArgs& inference,
|
||||
gcpp::AppArgs& app) {
|
||||
fprintf(stderr,
|
||||
"\ngemma.cpp\n---------\n\nTo run gemma.cpp, you need to "
|
||||
"specify 3 required model loading arguments: --tokenizer, "
|
||||
"--compressed_weights, "
|
||||
"and --model.\n\nModel Loading Arguments\n\n");
|
||||
loader.Help();
|
||||
fprintf(stderr, "\nInference Arguments\n\n");
|
||||
inference.Help();
|
||||
fprintf(stderr, "\nApplication Arguments\n\n");
|
||||
app.Help();
|
||||
fprintf(stderr, "\n\n");
|
||||
}
|
||||
|
||||
void ShowConfig(LoaderArgs& loader, InferenceArgs& inference, AppArgs& app) {
|
||||
loader.Print(app.verbosity);
|
||||
inference.Print(app.verbosity);
|
||||
app.Print(app.verbosity);
|
||||
|
||||
if (app.verbosity >= 2) {
|
||||
time_t now = time(nullptr);
|
||||
char* dt = ctime(&now); // NOLINT
|
||||
std::cout << "Date & Time : " << dt
|
||||
<< "Prefill Token Batch Size : " << gcpp::kPrefillBatchSize
|
||||
<< "\n"
|
||||
<< "Hardware concurrency : "
|
||||
<< std::thread::hardware_concurrency() << std::endl
|
||||
<< "Instruction set : "
|
||||
<< hwy::TargetName(hwy::DispatchedTarget()) << " ("
|
||||
<< hwy::VectorBytes() * 8 << " bits)" << "\n"
|
||||
<< "Weight Type : "
|
||||
<< gcpp::TypeName(gcpp::WeightT()) << "\n"
|
||||
<< "EmbedderInput Type : "
|
||||
<< gcpp::TypeName(gcpp::EmbedderInputT()) << "\n";
|
||||
}
|
||||
}
|
||||
|
||||
void ReplGemma(gcpp::Gemma& model, hwy::ThreadPool& pool,
|
||||
hwy::ThreadPool& inner_pool, const InferenceArgs& args,
|
||||
int verbosity, const gcpp::AcceptFunc& accept_token) {
|
||||
PROFILER_ZONE("Gen.misc");
|
||||
int abs_pos = 0; // absolute token index over all turns
|
||||
int current_pos = 0; // token index within the current turn
|
||||
int prompt_size{};
|
||||
|
||||
std::mt19937 gen;
|
||||
if (args.deterministic) {
|
||||
gen.seed(42);
|
||||
} else {
|
||||
std::random_device rd;
|
||||
gen.seed(rd());
|
||||
}
|
||||
|
||||
// callback function invoked for each generated token.
|
||||
auto stream_token = [&abs_pos, ¤t_pos, &args, &gen, &prompt_size,
|
||||
tokenizer = &model.Tokenizer(),
|
||||
verbosity](int token, float) {
|
||||
++abs_pos;
|
||||
++current_pos;
|
||||
if (current_pos < prompt_size) {
|
||||
std::cerr << "." << std::flush;
|
||||
} else if (token == gcpp::EOS_ID) {
|
||||
if (!args.multiturn) {
|
||||
abs_pos = 0;
|
||||
if (args.deterministic) {
|
||||
gen.seed(42);
|
||||
}
|
||||
}
|
||||
if (verbosity >= 2) {
|
||||
std::cout << "\n[ End ]" << std::endl;
|
||||
}
|
||||
} else {
|
||||
std::string token_text;
|
||||
HWY_ASSERT(tokenizer->Decode(std::vector<int>{token}, &token_text).ok());
|
||||
// +1 since position is incremented above
|
||||
if (current_pos == prompt_size + 1) {
|
||||
// first token of response
|
||||
token_text.erase(0, token_text.find_first_not_of(" \t\n"));
|
||||
if (verbosity >= 1) {
|
||||
std::cout << std::endl << std::endl;
|
||||
}
|
||||
}
|
||||
// TODO(austinvhuang): is explicit space necessary?
|
||||
std::cout << token_text << std::flush;
|
||||
}
|
||||
return true;
|
||||
};
|
||||
|
||||
while (abs_pos < args.max_tokens) {
|
||||
std::string prompt_string;
|
||||
std::vector<int> prompt;
|
||||
current_pos = 0;
|
||||
{
|
||||
PROFILER_ZONE("Gen.input");
|
||||
if (verbosity >= 1) {
|
||||
std::cout << "> " << std::flush;
|
||||
}
|
||||
std::getline(std::cin, prompt_string);
|
||||
}
|
||||
|
||||
if (std::cin.fail() || prompt_string == "%q" || prompt_string == "%Q") {
|
||||
return;
|
||||
}
|
||||
|
||||
if (model.model_training == ModelTraining::GEMMA_IT) {
|
||||
// For instruction-tuned models: add control tokens.
|
||||
prompt_string = "<start_of_turn>user\n" + prompt_string +
|
||||
"<end_of_turn>\n<start_of_turn>model\n";
|
||||
if (abs_pos > 0) {
|
||||
// Prepend "<end_of_turn>" token if this is a multi-turn dialogue
|
||||
// continuation.
|
||||
prompt_string = "<end_of_turn>\n" + prompt_string;
|
||||
}
|
||||
}
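// For example, the prompt "Hi" becomes
//   <start_of_turn>user\nHi<end_of_turn>\n<start_of_turn>model\n
// (with a leading <end_of_turn>\n on continuation turns).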
|
||||
|
||||
HWY_ASSERT(model.Tokenizer().Encode(prompt_string, &prompt).ok());
|
||||
|
||||
// For both pre-trained and instruction-tuned models: prepend "<bos>" token
|
||||
// if needed.
|
||||
if (abs_pos == 0) {
|
||||
prompt.insert(prompt.begin(), 2);  // 2 is the <bos> token id.
|
||||
}
|
||||
|
||||
prompt_size = prompt.size();
|
||||
|
||||
std::cerr << std::endl << "[ Reading prompt ] " << std::flush;
|
||||
|
||||
const double time_start = hwy::platform::Now();
|
||||
GenerateGemma(model, args, prompt, abs_pos, pool, inner_pool, stream_token,
|
||||
accept_token, gen, verbosity);
|
||||
const double time_end = hwy::platform::Now();
|
||||
const double tok_sec = current_pos / (time_end - time_start);
|
||||
if (verbosity >= 2) {
|
||||
std::cout << current_pos << " tokens (" << abs_pos << " total tokens)"
|
||||
<< std::endl
|
||||
<< tok_sec << " tokens / sec" << std::endl;
|
||||
}
|
||||
std::cout << std::endl << std::endl;
|
||||
}
|
||||
std::cout
|
||||
<< "max_tokens (" << args.max_tokens
|
||||
<< ") exceeded. Use a larger value if desired using the --max_tokens "
|
||||
<< "command line flag.\n";
|
||||
}
|
||||
|
||||
void Run(LoaderArgs& loader, InferenceArgs& inference, AppArgs& app) {
|
||||
PROFILER_ZONE("Run.misc");
|
||||
|
||||
hwy::ThreadPool inner_pool(0);
|
||||
hwy::ThreadPool pool(app.num_threads);
|
||||
// For many-core, pinning threads to cores helps.
|
||||
if (app.num_threads > 10) {
|
||||
PinThreadToCore(app.num_threads - 1); // Main thread
|
||||
|
||||
pool.Run(0, pool.NumThreads(),
|
||||
[](uint64_t /*task*/, size_t thread) { PinThreadToCore(thread); });
|
||||
}
|
||||
|
||||
gcpp::Gemma model(loader, pool);
|
||||
|
||||
if (const char* error = inference.Validate()) {
|
||||
ShowHelp(loader, inference, app);
|
||||
HWY_ABORT("\nInvalid args: %s", error);
|
||||
}
|
||||
|
||||
if (app.verbosity >= 1) {
|
||||
static const std::string banner_ascii_art =
|
||||
" __ _ ___ _ __ ___ _ __ ___ __ _ ___ _ __ _ __\n"
|
||||
" / _` |/ _ \\ '_ ` _ \\| '_ ` _ \\ / _` | / __| '_ \\| '_ \\\n"
|
||||
"| (_| | __/ | | | | | | | | | | (_| || (__| |_) | |_) |\n"
|
||||
" \\__, |\\___|_| |_| |_|_| |_| |_|\\__,_(_)___| .__/| .__/\n"
|
||||
" __/ | | | | |\n"
|
||||
" |___/ |_| |_|";
|
||||
|
||||
const std::string instructions =
|
||||
"*Usage*\n"
|
||||
" Enter an instruction and press enter (%Q quits).\n\n"
|
||||
"*Examples*\n"
|
||||
" - Write an email to grandma thanking her for the cookies.\n"
|
||||
" - What are some historical attractions to visit around "
|
||||
"Massachusetts?\n"
|
||||
" - Compute the nth fibonacci number in javascript.\n"
|
||||
" - Write a standup comedy bit about GPU programming.\n";
|
||||
|
||||
std::cout << "\033[2J\033[1;1H" // clear screen
|
||||
<< banner_ascii_art << "\n\n";
|
||||
ShowConfig(loader, inference, app);
|
||||
std::cout << "\n" << instructions << "\n";
|
||||
}
|
||||
|
||||
ReplGemma(model, pool, inner_pool, inference, app.verbosity,
|
||||
/*accept_token=*/[](int) { return true; });
|
||||
}
|
||||
|
||||
} // namespace gcpp
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
{
|
||||
PROFILER_ZONE("Startup.misc");
|
||||
|
||||
gcpp::LoaderArgs loader(argc, argv);
|
||||
gcpp::InferenceArgs inference(argc, argv);
|
||||
gcpp::AppArgs app(argc, argv);
|
||||
|
||||
if (gcpp::HasHelp(argc, argv)) {
|
||||
ShowHelp(loader, inference, app);
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (const char* error = loader.Validate()) {
|
||||
ShowHelp(loader, inference, app);
|
||||
HWY_ABORT("\nInvalid args: %s", error);
|
||||
}
|
||||
|
||||
gcpp::Run(loader, inference, app);
|
||||
}
|
||||
PROFILER_PRINT_RESULTS(); // Must call outside the zone above.
|
||||
return 0;
|
||||
}
@ -0,0 +1,85 @@
// Copyright 2024 Google LLC
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// https://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
// Shared between various frontends.
|
||||
|
||||
#ifndef THIRD_PARTY_GEMMA_CPP_UTIL_APP_H_
|
||||
#define THIRD_PARTY_GEMMA_CPP_UTIL_APP_H_
|
||||
|
||||
#include <sched.h>
|
||||
#include <stddef.h>
|
||||
|
||||
#include <algorithm> // std::clamp
|
||||
#include <thread> // NOLINT>
|
||||
|
||||
// copybara:import_next_line:gemma_cpp
|
||||
#include "util/args.h"
|
||||
#include "hwy/base.h" // HWY_ASSERT
|
||||
|
||||
namespace gcpp {
|
||||
|
||||
static inline void PinThreadToCore(size_t cpu_index) {
|
||||
#if HWY_OS_LINUX
|
||||
// Forces the thread to run on the logical processor with the same number.
|
||||
cpu_set_t cset; // bit array
|
||||
CPU_ZERO(&cset); // clear all
|
||||
CPU_SET(cpu_index, &cset); // set bit indicating which processor to run on.
|
||||
HWY_ASSERT(0 == sched_setaffinity(0, sizeof(cset), &cset));
|
||||
#else
|
||||
(void)cpu_index;
|
||||
#endif
|
||||
}
|
||||
|
||||
class AppArgs : public ArgsBase<AppArgs> {
|
||||
static constexpr size_t kDefaultNumThreads = ~size_t{0};
|
||||
|
||||
void ChooseNumThreads() {
|
||||
if (num_threads == kDefaultNumThreads) {
|
||||
// This is a rough heuristic, replace with something better in the future.
|
||||
num_threads = static_cast<size_t>(std::clamp(
|
||||
static_cast<int>(std::thread::hardware_concurrency()) - 2, 1, 18));
|
||||
}
|
||||
}
|
||||
|
||||
public:
|
||||
AppArgs(int argc, char* argv[]) {
|
||||
InitAndParse(argc, argv);
|
||||
ChooseNumThreads();
|
||||
}
|
||||
|
||||
Path log; // output
|
||||
int verbosity;
|
||||
size_t num_threads;
|
||||
|
||||
template <class Visitor>
|
||||
void ForEach(const Visitor& visitor) {
|
||||
visitor(log, "log", Path{"/tmp/log.txt"}, "Logging file", 2);
|
||||
visitor(verbosity, "verbosity", 1,
|
||||
"Show verbose developer information\n 0 = only print generation "
|
||||
"output\n 1 = standard user-facing terminal ui\n 2 = show "
|
||||
"developer/debug info).\n Default = 1.",
|
||||
2);
|
||||
visitor(num_threads, "num_threads",
|
||||
kDefaultNumThreads, // see ChooseNumThreads
|
||||
"Number of threads to use. Default value is set based on an "
|
||||
"estimate of "
|
||||
"how many concurrent threads are supported.",
|
||||
2);
|
||||
}
|
||||
};
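
// Example invocation (illustrative) using only the flags declared in ForEach
// above; unspecified flags keep their defaults and --num_threads falls back to
// the ChooseNumThreads() heuristic:
//
//   ./gemma --verbosity 2 --num_threads 16 --log /tmp/gemma.log
//
// (Model and tokenizer flags come from LoaderArgs, defined elsewhere.)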

}  // namespace gcpp

#endif  // THIRD_PARTY_GEMMA_CPP_UTIL_APP_H_

@@ -0,0 +1,223 @@
// Copyright 2024 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Command line arguments.

#ifndef THIRD_PARTY_GEMMA_CPP_UTIL_ARGS_H_
#define THIRD_PARTY_GEMMA_CPP_UTIL_ARGS_H_

#include <stdio.h>

#include <algorithm>  // std::transform
#include <string>

#include "hwy/base.h"  // HWY_ABORT

namespace gcpp {

// Wrapper for strings representing a path name. Differentiates vs. arbitrary
// strings and supports shortening for display purposes.
struct Path {
  Path& operator=(const char* other) {
    path = other;
    return *this;
  }

  std::string Shortened() const {
    constexpr size_t max_len = 48;
    constexpr size_t cut_point = max_len / 2 - 5;
    if (path.size() > max_len) {
      return std::string(begin(path), begin(path) + cut_point) + " ... " +
             std::string(end(path) - cut_point, end(path));
    }
    if (path.empty()) return "[no path specified]";
    return path;
  }

  std::string path;
};
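
// Worked example for Path::Shortened() (illustrative): with max_len = 48 and
// cut_point = 48 / 2 - 5 = 19, a 60-character path is displayed as its first
// 19 characters, then " ... ", then its last 19 characters (43 characters in
// total). Paths of at most 48 characters are returned unchanged.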

// Args is a class that provides a ForEach member function which visits each of
// its member variables. ArgsBase provides functions called by Args to
// initialize values to their defaults (passed as an argument to the visitor),
// print and parse, without having to repeat the args for each usage.
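//
// For example (illustrative sketch; the member names below are hypothetical,
// but AppArgs in util/app.h follows the same pattern):
//
//   struct MyArgs : public ArgsBase<MyArgs> {
//     MyArgs(int argc, char* argv[]) { InitAndParse(argc, argv); }
//
//     size_t max_tokens;
//     float temperature;
//
//     template <class Visitor>
//     void ForEach(const Visitor& visitor) {
//       visitor(max_tokens, "max_tokens", size_t{1024},
//               "Maximum number of tokens to generate.");
//       visitor(temperature, "temperature", 1.0f, "Sampling temperature.");
//     }
//   };
//
// Constructing MyArgs(argc, argv) then applies the defaults and overrides them
// with any matching --flag value pairs found on the command line.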

template <class Args>
class ArgsBase {
  struct InitVisitor {
    template <typename T>
    void operator()(T& t, const char* /*name*/, const T& init,
                    const char* /*help*/, int /*print_verbosity*/ = 0) const {
      t = init;
    }
  };

  struct HelpVisitor {
    template <typename T>
    void operator()(T&, const char* name, T /*init*/, const char* help,
                    int /*print_verbosity*/ = 0) const {
      fprintf(stderr, " --%s : %s\n", name, help);
    }
  };

  class PrintVisitor {
   public:
    explicit PrintVisitor(int verbosity) : verbosity_(verbosity) {}

    template <typename T>
    void operator()(const T& t, const char* name, const T& /*init*/,
                    const char* /*help*/, int print_verbosity = 0) const {
      if (verbosity_ >= print_verbosity) {
        fprintf(stderr, "%-30s: %s\n", name, std::to_string(t).c_str());
      }
    }

    void operator()(const std::string& t, const char* name,
                    const std::string& /*init*/, const char* /*help*/,
                    int print_verbosity = 0) const {
      if (verbosity_ >= print_verbosity) {
        fprintf(stderr, "%-30s: %s\n", name, t.c_str());
      }
    }

    void operator()(const Path& t, const char* name, const Path& /*init*/,
                    const char* /*help*/, int print_verbosity = 0) const {
      if (verbosity_ >= print_verbosity) {
        fprintf(stderr, "%-30s: %s\n", name, t.Shortened().c_str());
      }
    }

   private:
    int verbosity_;
  };

  // Supported types: integer, float, std::string, bool, Path. This is O(N^2):
  // for each arg, we search through argv. If there are more than a dozen args,
  // consider adding a hash-map to speed this up.
  class ParseVisitor {
   public:
    ParseVisitor(int argc, char* argv[]) : argc_(argc), argv_(argv) {}

    template <typename T>
    void operator()(T& t, const char* name, const T& /*init*/,
                    const char* /*help*/, int /*print_verbosity*/ = 0) const {
      const std::string prefixed = std::string("--") + name;
      for (int i = 1; i < argc_; ++i) {
        if (std::string(argv_[i]) == prefixed) {
          if (i + 1 >= argc_) {
            HWY_ABORT("Missing value for %s\n", name);
          }
          if (!SetValue(argv_[i + 1], t)) {
            HWY_ABORT("Invalid value for %s, got %s\n", name, argv_[i + 1]);
          }
          return;
        }
      }
    }
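
    // Example (illustrative): with argv = {"gemma", "--num_threads", "16"},
    // visiting num_threads builds prefixed = "--num_threads", finds it at
    // i == 1 and calls SetValue("16", num_threads), which parses via std::stoi.
    // Flags not present in argv keep the value assigned by InitVisitor.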

   private:
    // Returns false if an invalid value is detected.
    template <typename T, HWY_IF_NOT_FLOAT(T)>
    static bool SetValue(const char* string, T& t) {
      t = std::stoi(string);
      return true;
    }

    template <typename T, HWY_IF_FLOAT(T)>
    static bool SetValue(const char* string, T& t) {
      t = std::stof(string);
      return true;
    }

    static bool SetValue(const char* string, std::string& t) {
      t = string;
      return true;
    }

    static bool SetValue(const char* string, Path& t) {
      t.path = string;
      return true;
    }

    static bool SetValue(const char* string, bool& t) {
      std::string value(string);
      // Lower-case; values are expected to be ASCII-only.
      std::transform(value.begin(), value.end(), value.begin(), [](char c) {
        return 'A' <= c && c <= 'Z' ? c - ('Z' - 'z') : c;
      });

      if (value == "true" || value == "on" || value == "1") {
        t = true;
        return true;
      } else if (value == "false" || value == "off" || value == "0") {
        t = false;
        return true;
      } else {
        return false;
      }
    }

    int argc_;
    char** argv_;
  };  // ParseVisitor

  template <class Visitor>
  void ForEach(Visitor& visitor) {
    static_cast<Args*>(this)->ForEach(visitor);
  }

 public:
  // WARNING: cannot call from ctor because the derived ctor has not yet run.
  void Init() {
    InitVisitor visitor;
    ForEach(visitor);
  }

  void Help() {
    HelpVisitor visitor;
    ForEach(visitor);
  }

  void Print(int verbosity = 0) {
    PrintVisitor visitor(verbosity);
    ForEach(visitor);
  }

  void Parse(int argc, char* argv[]) {
    ParseVisitor visitor(argc, argv);
    ForEach(visitor);
  }

  // For convenience, enables single-line constructor.
  void InitAndParse(int argc, char* argv[]) {
    Init();
    Parse(argc, argv);
  }
};

static bool HasHelp(int argc, char* argv[]) {
  // TODO(austinvhuang): handle case insensitivity
  if (argc == 1) {
    // no arguments - print help
    return true;
  }
  for (int i = 1; i < argc; ++i) {
    if (std::string(argv[i]) == "--help") {
      return true;
    }
  }
  return false;
}

}  // namespace gcpp

#endif  // THIRD_PARTY_GEMMA_CPP_UTIL_ARGS_H_