From 1a95cf32745ca3d75a3a09c948812f093113f1a0 Mon Sep 17 00:00:00 2001 From: Yuta Hayashibe Date: Sat, 24 Feb 2024 20:25:07 +0900 Subject: [PATCH 01/26] Add --eot_line option --- run.cc | 19 ++++++++++++++++--- util/app.h | 6 ++++++ 2 files changed, 22 insertions(+), 3 deletions(-) diff --git a/run.cc b/run.cc index 87d8445..526ea8f 100644 --- a/run.cc +++ b/run.cc @@ -79,7 +79,9 @@ void ShowConfig(LoaderArgs& loader, InferenceArgs& inference, AppArgs& app) { void ReplGemma(gcpp::Gemma& model, hwy::ThreadPool& pool, hwy::ThreadPool& inner_pool, const InferenceArgs& args, - int verbosity, const gcpp::AcceptFunc& accept_token) { + int verbosity, const gcpp::AcceptFunc& accept_token, + std::string &eot_line +) { PROFILER_ZONE("Gen.misc"); int abs_pos = 0; // absolute token index over all turns int current_pos = 0; // token index within the current turn @@ -137,7 +139,18 @@ void ReplGemma(gcpp::Gemma& model, hwy::ThreadPool& pool, if (verbosity >= 1) { std::cout << "> " << std::flush; } - std::getline(std::cin, prompt_string); + + if (eot_line.size() == 0) { + std::getline(std::cin, prompt_string); + } else { + std::string line; + while (std::getline(std::cin, line)) { + if (line == eot_line) { + break; + } + prompt_string += line + "\n"; + } + } } if (std::cin.fail() || prompt_string == "%q" || prompt_string == "%Q") { @@ -231,7 +244,7 @@ void Run(LoaderArgs& loader, InferenceArgs& inference, AppArgs& app) { } ReplGemma(model, pool, inner_pool, inference, app.verbosity, - /*accept_token=*/[](int) { return true; }); + /*accept_token=*/[](int) { return true; }, app.eot_line); } } // namespace gcpp diff --git a/util/app.h b/util/app.h index 966fa41..8eb672b 100644 --- a/util/app.h +++ b/util/app.h @@ -62,6 +62,7 @@ class AppArgs : public ArgsBase { Path log; // output int verbosity; size_t num_threads; + std::string eot_line; template void ForEach(const Visitor& visitor) { @@ -77,6 +78,11 @@ class AppArgs : public ArgsBase { "estimate of " "how many concurrent threads 
are supported.", 2); + visitor(eot_line, "eot_line", std::string(""), + "End of turn line. " + "When you specify this, the prompt will be all lines " + "before the line where only the given string appears.", + 2); } }; From 5fe31ad0bc6e4a2e0ecca57ff591ae23cf4ea16d Mon Sep 17 00:00:00 2001 From: Dan Zheng Date: Sat, 24 Feb 2024 12:54:47 -0800 Subject: [PATCH 02/26] Copybara: rename BUILD to BUILD.bazel. PiperOrigin-RevId: 610039263 --- run.cc | 5 ----- 1 file changed, 5 deletions(-) diff --git a/run.cc b/run.cc index 96ba316..87d8445 100644 --- a/run.cc +++ b/run.cc @@ -144,11 +144,6 @@ void ReplGemma(gcpp::Gemma& model, hwy::ThreadPool& pool, return; } - if (prompt_string == "%c" || prompt_string == "%C") { - abs_pos = 0; - continue; - } - if (model.model_training == ModelTraining::GEMMA_IT) { // For instruction-tuned models: add control tokens. prompt_string = "user\n" + prompt_string + From 3af439621e360b54c6cad660c54928a40a92d68c Mon Sep 17 00:00:00 2001 From: Dan Zheng Date: Sat, 24 Feb 2024 14:52:59 -0800 Subject: [PATCH 03/26] Rename BUILD to BUILD.bazel. (#36) * Rename BUILD to BUILD.bazel. This fixes an error on macOS due to `build` and `BUILD` having conflicting names. * Enable macos-latest in GitHub Actions CI. * Fix concurrency key in GitHub Actions. Use matrix configuration in concurrency key. --- .github/workflows/build.yml | 8 +++--- BUILD.bazel | 51 ------------------------------------- 2 files changed, 4 insertions(+), 55 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 929e140..b0d4b6e 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -1,11 +1,11 @@ -name: Build +name: build # Trigger on push or via manual dispatch. 
on: [push, workflow_dispatch] jobs: build: - runs-on: ${{matrix.os}} + runs-on: ${{ matrix.os }} name: ${{ matrix.os }} ${{ matrix.type }} timeout-minutes: 30 @@ -13,10 +13,10 @@ jobs: fail-fast: false matrix: type: ['Release'] - os: ['ubuntu-latest'] + os: ['ubuntu-latest', 'macos-latest'] concurrency: - group: ${{ github.workflow }}-${{ github.ref }} + group: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.os }}-${{ matrix.type }} cancel-in-progress: true steps: diff --git a/BUILD.bazel b/BUILD.bazel index 190690b..18dad30 100644 --- a/BUILD.bazel +++ b/BUILD.bazel @@ -114,54 +114,3 @@ cc_binary( "//:thread_pool", ], ) - -# copybara:strip_begin -cc_binary( - name = "run_csv", - srcs = [ - "run_csv.cc", - ], - deps = [ - ":app", - ":args", - ":gemma_lib", - "//compression:compress", - # copybara:import_next_line:hwy - "//:hwy", - # copybara:import_next_line:hwy - "//:nanobenchmark", - # copybara:import_next_line:hwy - "//:profiler", - # copybara:import_next_line:hwy - "//:thread_pool", - "//third_party/riegeli/bytes:file_reader", - "//third_party/riegeli/bytes:file_writer", - "//third_party/riegeli/csv:csv_reader", - "//third_party/riegeli/csv:csv_writer", - ], -) - -gensignature( - name = "gemma_sign", - srcs = [":gemma"], -) - -cc_test( - name = "benchmarks", - size = "large", - srcs = [ - "benchmarks.cc", - ], - tags = ["notap"], - deps = [ - ":app", - ":gemma_lib", - "//third_party/benchmark", - # copybara:import_next_line:hwy - "//:hwy", - # copybara:import_next_line:hwy - "//:thread_pool", - ], -) - -# copybara:strip_end From 621434e424d5f35b2913d867ddd7781b1c6cb1ad Mon Sep 17 00:00:00 2001 From: Naoki Kishida Date: Sun, 25 Feb 2024 07:21:01 +0900 Subject: [PATCH 04/26] reset conversation (#34) --- run.cc | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/run.cc b/run.cc index 87d8445..96ba316 100644 --- a/run.cc +++ b/run.cc @@ -144,6 +144,11 @@ void ReplGemma(gcpp::Gemma& model, hwy::ThreadPool& pool, return; } + if (prompt_string == "%c" || 
prompt_string == "%C") { + abs_pos = 0; + continue; + } + if (model.model_training == ModelTraining::GEMMA_IT) { // For instruction-tuned models: add control tokens. prompt_string = "user\n" + prompt_string + From 84444c93a44f484442fda2523dde7e77dbd3a53c Mon Sep 17 00:00:00 2001 From: Dan Zheng Date: Sat, 24 Feb 2024 15:14:53 -0800 Subject: [PATCH 05/26] Revert "Copybara configuration update." This reverts commit c03b5da542ef19f65a4147a52ccac7c89334e7f3. Restore lost changes due to improper Copybara syncing. --- .github/workflows/build.yml | 4 +- CMakeLists.txt | 9 ++-- CMakePresets.json | 59 ++++++++++++++++++++++ README.md | 42 +++++++++++++--- compression/blob_store.cc | 97 +++++++++++++++++++++++++++++++++---- util/app.h | 2 + 6 files changed, 191 insertions(+), 22 deletions(-) create mode 100644 CMakePresets.json diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index b0d4b6e..da63c1c 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -1,7 +1,7 @@ name: build -# Trigger on push or via manual dispatch. -on: [push, workflow_dispatch] +# Trigger on push, pull request, or via manual dispatch. +on: [push, pull_request, workflow_dispatch] jobs: build: diff --git a/CMakeLists.txt b/CMakeLists.txt index 3858968..c7828cc 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -24,7 +24,7 @@ set(CMAKE_CXX_STANDARD_REQUIRED ON) FetchContent_Declare(highway GIT_REPOSITORY https://github.com/google/highway.git GIT_TAG da250571a45826b21eebbddc1e50d0c1137dee5f) FetchContent_MakeAvailable(highway) -## Note: absl meeds tp be installed by sentencepiece. This will only happen if +## Note: absl needs to be installed by sentencepiece. 
This will only happen if ## cmake is invoked with -DSPM_ENABLE_SHARED=OFF and -DSPM_ABSL_PROVIDER=module FetchContent_Declare(sentencepiece GIT_REPOSITORY https://github.com/google/sentencepiece GIT_TAG 53de76561cfc149d3c01037f0595669ad32a5e7c) FetchContent_MakeAvailable(sentencepiece) @@ -43,14 +43,13 @@ set(SOURCES util/args.h ) -add_compile_options($<$:-O2>) if(NOT CMAKE_BUILD_TYPE) set(CMAKE_BUILD_TYPE "Release") endif() # Allowable types for WEIGHT_TYPE: # float - slow, not recommended -# hwy::bfloat16_t - bfloat16 as impemented by https://github.com/google/highway +# hwy::bfloat16_t - bfloat16 as implemented by https://github.com/google/highway # SfpStream - 8-bit switched floating point (recommended) # NuqStream - experimental, work-in-progress option(WEIGHT_TYPE "Set weight type" "") @@ -68,6 +67,8 @@ target_link_libraries(gemma hwy hwy_contrib sentencepiece) target_include_directories(gemma PRIVATE ./) FetchContent_GetProperties(sentencepiece) target_include_directories(gemma PRIVATE ${sentencepiece_SOURCE_DIR}) +target_compile_definitions(gemma PRIVATE $<$:_CRT_SECURE_NO_WARNINGS NOMINMAX>) +target_compile_options(gemma PRIVATE $<$:-Wno-deprecated-declarations>) ## Library Target @@ -77,3 +78,5 @@ set_target_properties(libgemma PROPERTIES PREFIX "") target_include_directories(libgemma PUBLIC ./) target_link_libraries(libgemma hwy hwy_contrib sentencepiece) target_include_directories(libgemma PRIVATE ${sentencepiece_SOURCE_DIR}) +target_compile_definitions(libgemma PRIVATE $<$:_CRT_SECURE_NO_WARNINGS NOMINMAX>) +target_compile_options(libgemma PRIVATE $<$:-Wno-deprecated-declarations>) diff --git a/CMakePresets.json b/CMakePresets.json new file mode 100644 index 0000000..5fe13c8 --- /dev/null +++ b/CMakePresets.json @@ -0,0 +1,59 @@ +{ + "version": 3, + "cmakeMinimumRequired": { + "major": 3, + "minor": 11, + "patch": 0 + }, + "configurePresets": [ + { + "name": "__defaults__", + "hidden": true, + "binaryDir": "${sourceDir}/build" + }, + { + "name": 
"make", + "inherits": "__defaults__", + "displayName": "Make", + "description": "Unix Makefiles", + "generator": "Unix Makefiles", + "binaryDir": "${sourceDir}/build" + }, + { + "name": "windows", + "inherits": "__defaults__", + "displayName": "Windows", + "description": "Visual Studio 2022 with Clang/LLVM frontend", + "generator": "Visual Studio 17 2022", + "toolset": "ClangCL", + "condition": { + "type": "equals", + "lhs": "${hostSystemName}", + "rhs": "Windows" + } + } + ], + "buildPresets": [ + { + "name": "__defaults__", + "hidden": true, + "targets": [ + "gemma", + "libgemma" + ] + }, + { + "name": "make", + "inherits": "__defaults__", + "displayName": "Unix Makefiles", + "configurePreset": "make" + }, + { + "name": "windows", + "inherits": "__defaults__", + "displayName": "Windows", + "configuration": "Release", + "configurePreset": "windows" + } + ] + } diff --git a/README.md b/README.md index e278833..ff1011b 100644 --- a/README.md +++ b/README.md @@ -55,6 +55,16 @@ Before starting, you should have installed: least C++17. - `tar` for extracting archives from Kaggle. +Building natively on Windows requires the Visual Studio 2012 Build Tools with the +optional Clang/LLVM C++ frontend (`clang-cl`). This can be installed from the +command line with +[`winget`](https://learn.microsoft.com/en-us/windows/package-manager/winget/): + +```sh +winget install --id Kitware.CMake +winget install --id Microsoft.VisualStudio.2022.BuildTools --force --override "--passive --wait --add Microsoft.VisualStudio.Workload.VCTools;installRecommended --add Microsoft.VisualStudio.Component.VC.Llvm.Clang --add Microsoft.VisualStudio.Component.VC.Llvm.ClangToolset" +``` + ### Step 1: Obtain model weights and tokenizer from Kaggle Visit [the Gemma model page on @@ -107,6 +117,7 @@ runtime, create a build directory and generate the build files using `cmake` from the top-level project directory. 
For the 8-bit switched floating point weights (sfp), run cmake with no options: +#### Unix-like Platforms ```sh cmake -B build ``` @@ -126,17 +137,18 @@ your weights, you can enter the `build/` directory and run `make` to build the `./gemma` executable: ```sh -cd build -make -j [number of parallel threads to use] gemma +# Configure `build` directory +cmake --preset make + +# Build project using make +cmake --build --preset make -j [number of parallel threads to use] ``` Replace `[number of parallel threads to use]` with a number - the number of -cores available on your system is a reasonable heuristic. - -For example, `make -j4 gemma` will build using 4 threads. If this is successful, -you should now have a `gemma` executable in the `build/` directory. If the -`nproc` command is available, you can use `make -j$(nproc) gemma` as a -reasonable default for the number of threads. +cores available on your system is a reasonable heuristic. For example, +`make -j4 gemma` will build using 4 threads. If the `nproc` command is +available, you can use `make -j$(nproc) gemma` as a reasonable default +for the number of threads. If you aren't sure of the right value for the `-j` flag, you can simply run `make gemma` instead and it should still build the `./gemma` executable. @@ -145,6 +157,20 @@ If you aren't sure of the right value for the `-j` flag, you can simply run > On Windows Subsystem for Linux (WSL) users should set the number of > parallel threads to 1. Using a larger number may result in errors. +If the build is successful, you should now have a `gemma` executable in the `build/` directory. + +#### Windows + +```sh +# Configure `build` directory +cmake --preset windows + +# Build project using Visual Studio Build Tools +cmake --build --preset windows -j [number of parallel threads to use] +``` + +If the build is successful, you should now have a `gemma.exe` executable in the `build/` directory. 
+ ### Step 4: Run You can now run `gemma` from inside the `build/` directory. diff --git a/compression/blob_store.cc b/compression/blob_store.cc index 8d6c1d0..550c727 100644 --- a/compression/blob_store.cc +++ b/compression/blob_store.cc @@ -16,11 +16,16 @@ // copybara:import_next_line:gemma_cpp #include "compression/blob_store.h" -#include // open #include #include // SEEK_END - unistd isn't enough for IDE. #include // O_RDONLY -#include // read, close +#include // open +#if HWY_OS_WIN +#include // read, write, close +#include +#else +#include // read, write, close +#endif #include #include @@ -30,6 +35,54 @@ #include "hwy/contrib/thread_pool/thread_pool.h" #include "hwy/detect_compiler_arch.h" +namespace { +#if HWY_OS_WIN + +// pread is not supported on Windows +static int64_t pread(int fd, void* buf, uint64_t size, uint64_t offset) { + HANDLE file = reinterpret_cast(_get_osfhandle(fd)); + if (file == INVALID_HANDLE_VALUE) { + return -1; + } + + OVERLAPPED overlapped = {0}; + overlapped.Offset = offset & 0xFFFFFFFF; + overlapped.OffsetHigh = (offset >> 32) & 0xFFFFFFFF; + + DWORD bytes_read; + if (!ReadFile(file, buf, size, &bytes_read, &overlapped)) { + if (GetLastError() != ERROR_HANDLE_EOF) { + return -1; + } + } + + return bytes_read; +} + +// pwrite is not supported on Windows +static int64_t pwrite(int fd, const void* buf, uint64_t size, uint64_t offset) { + HANDLE file = reinterpret_cast(_get_osfhandle(fd)); + if (file == INVALID_HANDLE_VALUE) { + return -1; + } + + OVERLAPPED overlapped = {0}; + overlapped.Offset = offset & 0xFFFFFFFF; + overlapped.OffsetHigh = (offset >> 32) & 0xFFFFFFFF; + + DWORD bytes_written; + if (!WriteFile(file, buf, size, &bytes_written, &overlapped)) { + if (GetLastError() != ERROR_HANDLE_EOF) { + return -1; + } + } + + return bytes_written; +} + +#endif +} + namespace gcpp { hwy::uint128_t MakeKey(const char* string) { @@ -64,19 +117,30 @@ static void EnqueueChunkRequests(uint64_t offset, uint64_t size, uint8_t* data, } } + 
struct IO { // Returns size in bytes or 0. static uint64_t FileSize(const char* filename) { int fd = open(filename, O_RDONLY); - if (fd >= 0) { - const off_t size = lseek(fd, 0, SEEK_END); - HWY_ASSERT(close(fd) != -1); - if (size != static_cast(-1)) { - return static_cast(size); - } + if (fd < 0) { + return 0; } - return 0; +#if HWY_OS_WIN + const int64_t size = _lseeki64(fd, 0, SEEK_END); + HWY_ASSERT(close(fd) != -1); + if (size < 0) { + return 0; + } +#else + const off_t size = lseek(fd, 0, SEEK_END); + HWY_ASSERT(close(fd) != -1); + if (size == static_cast(-1)) { + return 0; + } +#endif + + return static_cast(size); } static bool Read(int fd, uint64_t offset, uint64_t size, void* to) { @@ -252,7 +316,14 @@ class BlobStore { #pragma pack(pop) BlobError BlobReader::Open(const char* filename) { +#if HWY_OS_WIN + DWORD flags = FILE_ATTRIBUTE_NORMAL | FILE_FLAG_SEQUENTIAL_SCAN; + HANDLE file = CreateFileA(filename, GENERIC_READ, FILE_SHARE_READ, nullptr, OPEN_EXISTING, flags, nullptr); + if (file == INVALID_HANDLE_VALUE) return __LINE__; + fd_ = _open_osfhandle(reinterpret_cast(file), _O_RDONLY); +#else fd_ = open(filename, O_RDONLY); +#endif if (fd_ < 0) return __LINE__; #if _POSIX_C_SOURCE >= 200112L @@ -330,7 +401,14 @@ BlobError BlobWriter::WriteAll(hwy::ThreadPool& pool, keys_.data(), blobs_.data(), keys_.size()); // Create/replace existing file. 
+#if HWY_OS_WIN + DWORD flags = FILE_ATTRIBUTE_NORMAL; + HANDLE file = CreateFileA(filename, GENERIC_WRITE, 0, nullptr, CREATE_ALWAYS, flags, nullptr); + if (file == INVALID_HANDLE_VALUE) return __LINE__; + const int fd = _open_osfhandle(reinterpret_cast(file), _O_WRONLY); +#else const int fd = open(filename, O_CREAT | O_RDWR | O_TRUNC, 0644); +#endif if (fd < 0) return __LINE__; std::atomic_flag err = ATOMIC_FLAG_INIT; @@ -341,6 +419,7 @@ BlobError BlobWriter::WriteAll(hwy::ThreadPool& pool, err.test_and_set(); } }); + HWY_ASSERT(close(fd) != -1); if (err.test_and_set()) return __LINE__; return 0; } diff --git a/util/app.h b/util/app.h index 966fa41..bd665a4 100644 --- a/util/app.h +++ b/util/app.h @@ -18,7 +18,9 @@ #ifndef THIRD_PARTY_GEMMA_CPP_UTIL_APP_H_ #define THIRD_PARTY_GEMMA_CPP_UTIL_APP_H_ +#if HWY_OS_LINUX #include +#endif #include #include // std::clamp From 696597383cabbd7e78c5e581e6425b452f267ab1 Mon Sep 17 00:00:00 2001 From: Silvio Traversaro Date: Sat, 24 Feb 2024 20:41:04 -0800 Subject: [PATCH 06/26] Copybara import of the project: -- 19694e1f2e62d1c26a69309d727f2dbc5d9ada14 by Silvio Traversaro : Do not pass explicitly -O2 flag to compiler in Release build COPYBARA_INTEGRATE_REVIEW=https://github.com/google/gemma.cpp/pull/3 from traversaro:patch-1 19694e1f2e62d1c26a69309d727f2dbc5d9ada14 PiperOrigin-RevId: 610096914 --- .github/workflows/build.yml | 12 ++--- BUILD.bazel | 51 +++++++++++++++++++ CMakeLists.txt | 8 +-- CMakePresets.json | 59 ---------------------- README.md | 42 +++------------- compression/blob_store.cc | 97 ++++--------------------------------- run.cc | 5 -- util/app.h | 2 - 8 files changed, 76 insertions(+), 200 deletions(-) delete mode 100644 CMakePresets.json diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index da63c1c..929e140 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -1,11 +1,11 @@ -name: build +name: Build -# Trigger on push, pull request, or via manual dispatch. 
-on: [push, pull_request, workflow_dispatch] +# Trigger on push or via manual dispatch. +on: [push, workflow_dispatch] jobs: build: - runs-on: ${{ matrix.os }} + runs-on: ${{matrix.os}} name: ${{ matrix.os }} ${{ matrix.type }} timeout-minutes: 30 @@ -13,10 +13,10 @@ jobs: fail-fast: false matrix: type: ['Release'] - os: ['ubuntu-latest', 'macos-latest'] + os: ['ubuntu-latest'] concurrency: - group: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.os }}-${{ matrix.type }} + group: ${{ github.workflow }}-${{ github.ref }} cancel-in-progress: true steps: diff --git a/BUILD.bazel b/BUILD.bazel index 18dad30..190690b 100644 --- a/BUILD.bazel +++ b/BUILD.bazel @@ -114,3 +114,54 @@ cc_binary( "//:thread_pool", ], ) + +# copybara:strip_begin +cc_binary( + name = "run_csv", + srcs = [ + "run_csv.cc", + ], + deps = [ + ":app", + ":args", + ":gemma_lib", + "//compression:compress", + # copybara:import_next_line:hwy + "//:hwy", + # copybara:import_next_line:hwy + "//:nanobenchmark", + # copybara:import_next_line:hwy + "//:profiler", + # copybara:import_next_line:hwy + "//:thread_pool", + "//third_party/riegeli/bytes:file_reader", + "//third_party/riegeli/bytes:file_writer", + "//third_party/riegeli/csv:csv_reader", + "//third_party/riegeli/csv:csv_writer", + ], +) + +gensignature( + name = "gemma_sign", + srcs = [":gemma"], +) + +cc_test( + name = "benchmarks", + size = "large", + srcs = [ + "benchmarks.cc", + ], + tags = ["notap"], + deps = [ + ":app", + ":gemma_lib", + "//third_party/benchmark", + # copybara:import_next_line:hwy + "//:hwy", + # copybara:import_next_line:hwy + "//:thread_pool", + ], +) + +# copybara:strip_end diff --git a/CMakeLists.txt b/CMakeLists.txt index c7828cc..722e408 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -24,7 +24,7 @@ set(CMAKE_CXX_STANDARD_REQUIRED ON) FetchContent_Declare(highway GIT_REPOSITORY https://github.com/google/highway.git GIT_TAG da250571a45826b21eebbddc1e50d0c1137dee5f) FetchContent_MakeAvailable(highway) -## Note: 
absl needs to be installed by sentencepiece. This will only happen if +## Note: absl meeds tp be installed by sentencepiece. This will only happen if ## cmake is invoked with -DSPM_ENABLE_SHARED=OFF and -DSPM_ABSL_PROVIDER=module FetchContent_Declare(sentencepiece GIT_REPOSITORY https://github.com/google/sentencepiece GIT_TAG 53de76561cfc149d3c01037f0595669ad32a5e7c) FetchContent_MakeAvailable(sentencepiece) @@ -49,7 +49,7 @@ endif() # Allowable types for WEIGHT_TYPE: # float - slow, not recommended -# hwy::bfloat16_t - bfloat16 as implemented by https://github.com/google/highway +# hwy::bfloat16_t - bfloat16 as impemented by https://github.com/google/highway # SfpStream - 8-bit switched floating point (recommended) # NuqStream - experimental, work-in-progress option(WEIGHT_TYPE "Set weight type" "") @@ -67,8 +67,6 @@ target_link_libraries(gemma hwy hwy_contrib sentencepiece) target_include_directories(gemma PRIVATE ./) FetchContent_GetProperties(sentencepiece) target_include_directories(gemma PRIVATE ${sentencepiece_SOURCE_DIR}) -target_compile_definitions(gemma PRIVATE $<$:_CRT_SECURE_NO_WARNINGS NOMINMAX>) -target_compile_options(gemma PRIVATE $<$:-Wno-deprecated-declarations>) ## Library Target @@ -78,5 +76,3 @@ set_target_properties(libgemma PROPERTIES PREFIX "") target_include_directories(libgemma PUBLIC ./) target_link_libraries(libgemma hwy hwy_contrib sentencepiece) target_include_directories(libgemma PRIVATE ${sentencepiece_SOURCE_DIR}) -target_compile_definitions(libgemma PRIVATE $<$:_CRT_SECURE_NO_WARNINGS NOMINMAX>) -target_compile_options(libgemma PRIVATE $<$:-Wno-deprecated-declarations>) diff --git a/CMakePresets.json b/CMakePresets.json deleted file mode 100644 index 5fe13c8..0000000 --- a/CMakePresets.json +++ /dev/null @@ -1,59 +0,0 @@ -{ - "version": 3, - "cmakeMinimumRequired": { - "major": 3, - "minor": 11, - "patch": 0 - }, - "configurePresets": [ - { - "name": "__defaults__", - "hidden": true, - "binaryDir": "${sourceDir}/build" - }, - { - 
"name": "make", - "inherits": "__defaults__", - "displayName": "Make", - "description": "Unix Makefiles", - "generator": "Unix Makefiles", - "binaryDir": "${sourceDir}/build" - }, - { - "name": "windows", - "inherits": "__defaults__", - "displayName": "Windows", - "description": "Visual Studio 2022 with Clang/LLVM frontend", - "generator": "Visual Studio 17 2022", - "toolset": "ClangCL", - "condition": { - "type": "equals", - "lhs": "${hostSystemName}", - "rhs": "Windows" - } - } - ], - "buildPresets": [ - { - "name": "__defaults__", - "hidden": true, - "targets": [ - "gemma", - "libgemma" - ] - }, - { - "name": "make", - "inherits": "__defaults__", - "displayName": "Unix Makefiles", - "configurePreset": "make" - }, - { - "name": "windows", - "inherits": "__defaults__", - "displayName": "Windows", - "configuration": "Release", - "configurePreset": "windows" - } - ] - } diff --git a/README.md b/README.md index ff1011b..e278833 100644 --- a/README.md +++ b/README.md @@ -55,16 +55,6 @@ Before starting, you should have installed: least C++17. - `tar` for extracting archives from Kaggle. -Building natively on Windows requires the Visual Studio 2012 Build Tools with the -optional Clang/LLVM C++ frontend (`clang-cl`). This can be installed from the -command line with -[`winget`](https://learn.microsoft.com/en-us/windows/package-manager/winget/): - -```sh -winget install --id Kitware.CMake -winget install --id Microsoft.VisualStudio.2022.BuildTools --force --override "--passive --wait --add Microsoft.VisualStudio.Workload.VCTools;installRecommended --add Microsoft.VisualStudio.Component.VC.Llvm.Clang --add Microsoft.VisualStudio.Component.VC.Llvm.ClangToolset" -``` - ### Step 1: Obtain model weights and tokenizer from Kaggle Visit [the Gemma model page on @@ -117,7 +107,6 @@ runtime, create a build directory and generate the build files using `cmake` from the top-level project directory. 
For the 8-bit switched floating point weights (sfp), run cmake with no options: -#### Unix-like Platforms ```sh cmake -B build ``` @@ -137,18 +126,17 @@ your weights, you can enter the `build/` directory and run `make` to build the `./gemma` executable: ```sh -# Configure `build` directory -cmake --preset make - -# Build project using make -cmake --build --preset make -j [number of parallel threads to use] +cd build +make -j [number of parallel threads to use] gemma ``` Replace `[number of parallel threads to use]` with a number - the number of -cores available on your system is a reasonable heuristic. For example, -`make -j4 gemma` will build using 4 threads. If the `nproc` command is -available, you can use `make -j$(nproc) gemma` as a reasonable default -for the number of threads. +cores available on your system is a reasonable heuristic. + +For example, `make -j4 gemma` will build using 4 threads. If this is successful, +you should now have a `gemma` executable in the `build/` directory. If the +`nproc` command is available, you can use `make -j$(nproc) gemma` as a +reasonable default for the number of threads. If you aren't sure of the right value for the `-j` flag, you can simply run `make gemma` instead and it should still build the `./gemma` executable. @@ -157,20 +145,6 @@ If you aren't sure of the right value for the `-j` flag, you can simply run > On Windows Subsystem for Linux (WSL) users should set the number of > parallel threads to 1. Using a larger number may result in errors. -If the build is successful, you should now have a `gemma` executable in the `build/` directory. - -#### Windows - -```sh -# Configure `build` directory -cmake --preset windows - -# Build project using Visual Studio Build Tools -cmake --build --preset windows -j [number of parallel threads to use] -``` - -If the build is successful, you should now have a `gemma.exe` executable in the `build/` directory. 
- ### Step 4: Run You can now run `gemma` from inside the `build/` directory. diff --git a/compression/blob_store.cc b/compression/blob_store.cc index 550c727..8d6c1d0 100644 --- a/compression/blob_store.cc +++ b/compression/blob_store.cc @@ -16,16 +16,11 @@ // copybara:import_next_line:gemma_cpp #include "compression/blob_store.h" +#include // open #include #include // SEEK_END - unistd isn't enough for IDE. #include // O_RDONLY -#include // open -#if HWY_OS_WIN -#include // read, write, close -#include -#else -#include // read, write, close -#endif +#include // read, close #include #include @@ -35,54 +30,6 @@ #include "hwy/contrib/thread_pool/thread_pool.h" #include "hwy/detect_compiler_arch.h" -namespace { -#if HWY_OS_WIN - -// pread is not supported on Windows -static int64_t pread(int fd, void* buf, uint64_t size, uint64_t offset) { - HANDLE file = reinterpret_cast(_get_osfhandle(fd)); - if (file == INVALID_HANDLE_VALUE) { - return -1; - } - - OVERLAPPED overlapped = {0}; - overlapped.Offset = offset & 0xFFFFFFFF; - overlapped.OffsetHigh = (offset >> 32) & 0xFFFFFFFF; - - DWORD bytes_read; - if (!ReadFile(file, buf, size, &bytes_read, &overlapped)) { - if (GetLastError() != ERROR_HANDLE_EOF) { - return -1; - } - } - - return bytes_read; -} - -// pwrite is not supported on Windows -static int64_t pwrite(int fd, const void* buf, uint64_t size, uint64_t offset) { - HANDLE file = reinterpret_cast(_get_osfhandle(fd)); - if (file == INVALID_HANDLE_VALUE) { - return -1; - } - - OVERLAPPED overlapped = {0}; - overlapped.Offset = offset & 0xFFFFFFFF; - overlapped.OffsetHigh = (offset >> 32) & 0xFFFFFFFF; - - DWORD bytes_written; - if (!WriteFile(file, buf, size, &bytes_written, &overlapped)) { - if (GetLastError() != ERROR_HANDLE_EOF) { - return -1; - } - } - - return bytes_written; -} - -#endif -} - namespace gcpp { hwy::uint128_t MakeKey(const char* string) { @@ -117,30 +64,19 @@ static void EnqueueChunkRequests(uint64_t offset, uint64_t size, uint8_t* data, } } - 
struct IO { // Returns size in bytes or 0. static uint64_t FileSize(const char* filename) { int fd = open(filename, O_RDONLY); - if (fd < 0) { - return 0; + if (fd >= 0) { + const off_t size = lseek(fd, 0, SEEK_END); + HWY_ASSERT(close(fd) != -1); + if (size != static_cast(-1)) { + return static_cast(size); + } } -#if HWY_OS_WIN - const int64_t size = _lseeki64(fd, 0, SEEK_END); - HWY_ASSERT(close(fd) != -1); - if (size < 0) { - return 0; - } -#else - const off_t size = lseek(fd, 0, SEEK_END); - HWY_ASSERT(close(fd) != -1); - if (size == static_cast(-1)) { - return 0; - } -#endif - - return static_cast(size); + return 0; } static bool Read(int fd, uint64_t offset, uint64_t size, void* to) { @@ -316,14 +252,7 @@ class BlobStore { #pragma pack(pop) BlobError BlobReader::Open(const char* filename) { -#if HWY_OS_WIN - DWORD flags = FILE_ATTRIBUTE_NORMAL | FILE_FLAG_SEQUENTIAL_SCAN; - HANDLE file = CreateFileA(filename, GENERIC_READ, FILE_SHARE_READ, nullptr, OPEN_EXISTING, flags, nullptr); - if (file == INVALID_HANDLE_VALUE) return __LINE__; - fd_ = _open_osfhandle(reinterpret_cast(file), _O_RDONLY); -#else fd_ = open(filename, O_RDONLY); -#endif if (fd_ < 0) return __LINE__; #if _POSIX_C_SOURCE >= 200112L @@ -401,14 +330,7 @@ BlobError BlobWriter::WriteAll(hwy::ThreadPool& pool, keys_.data(), blobs_.data(), keys_.size()); // Create/replace existing file. 
-#if HWY_OS_WIN - DWORD flags = FILE_ATTRIBUTE_NORMAL; - HANDLE file = CreateFileA(filename, GENERIC_WRITE, 0, nullptr, CREATE_ALWAYS, flags, nullptr); - if (file == INVALID_HANDLE_VALUE) return __LINE__; - const int fd = _open_osfhandle(reinterpret_cast(file), _O_WRONLY); -#else const int fd = open(filename, O_CREAT | O_RDWR | O_TRUNC, 0644); -#endif if (fd < 0) return __LINE__; std::atomic_flag err = ATOMIC_FLAG_INIT; @@ -419,7 +341,6 @@ BlobError BlobWriter::WriteAll(hwy::ThreadPool& pool, err.test_and_set(); } }); - HWY_ASSERT(close(fd) != -1); if (err.test_and_set()) return __LINE__; return 0; } diff --git a/run.cc b/run.cc index 96ba316..87d8445 100644 --- a/run.cc +++ b/run.cc @@ -144,11 +144,6 @@ void ReplGemma(gcpp::Gemma& model, hwy::ThreadPool& pool, return; } - if (prompt_string == "%c" || prompt_string == "%C") { - abs_pos = 0; - continue; - } - if (model.model_training == ModelTraining::GEMMA_IT) { // For instruction-tuned models: add control tokens. prompt_string = "user\n" + prompt_string + diff --git a/util/app.h b/util/app.h index bd665a4..966fa41 100644 --- a/util/app.h +++ b/util/app.h @@ -18,9 +18,7 @@ #ifndef THIRD_PARTY_GEMMA_CPP_UTIL_APP_H_ #define THIRD_PARTY_GEMMA_CPP_UTIL_APP_H_ -#if HWY_OS_LINUX #include -#endif #include #include // std::clamp From 1243be71c4aa692aa9e828e0aade04a5c5285bf6 Mon Sep 17 00:00:00 2001 From: Dan Zheng Date: Sun, 25 Feb 2024 04:33:39 -0800 Subject: [PATCH 07/26] Copybara import of the project: -- e0179bad839b808265948e0141feba0844264a9d by Dan Zheng : Rename BUILD to BUILD.bazel. This fixes an error on macOS due to `build` and `BUILD` having conflicting names. -- 74b27074e10b7fcca2cac42aaae3637bea39d11b by Dan Zheng : Enable macos-latest in GitHub Actions CI. -- c08de58e6a58f685d84c9112ca2e74d354ecee77 by Dan Zheng : Fix concurrency key in GitHub Actions. Use matrix configuration in concurrency key. 
COPYBARA_INTEGRATE_REVIEW=https://github.com/google/gemma.cpp/pull/36 from dan-zheng:rename-build-bzl b4b978f02bee169ed83737af12714d1b66e3625d PiperOrigin-RevId: 610156681 --- .github/workflows/build.yml | 8 +++--- BUILD.bazel | 51 ------------------------------------- 2 files changed, 4 insertions(+), 55 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 929e140..b0d4b6e 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -1,11 +1,11 @@ -name: Build +name: build # Trigger on push or via manual dispatch. on: [push, workflow_dispatch] jobs: build: - runs-on: ${{matrix.os}} + runs-on: ${{ matrix.os }} name: ${{ matrix.os }} ${{ matrix.type }} timeout-minutes: 30 @@ -13,10 +13,10 @@ jobs: fail-fast: false matrix: type: ['Release'] - os: ['ubuntu-latest'] + os: ['ubuntu-latest', 'macos-latest'] concurrency: - group: ${{ github.workflow }}-${{ github.ref }} + group: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.os }}-${{ matrix.type }} cancel-in-progress: true steps: diff --git a/BUILD.bazel b/BUILD.bazel index 190690b..18dad30 100644 --- a/BUILD.bazel +++ b/BUILD.bazel @@ -114,54 +114,3 @@ cc_binary( "//:thread_pool", ], ) - -# copybara:strip_begin -cc_binary( - name = "run_csv", - srcs = [ - "run_csv.cc", - ], - deps = [ - ":app", - ":args", - ":gemma_lib", - "//compression:compress", - # copybara:import_next_line:hwy - "//:hwy", - # copybara:import_next_line:hwy - "//:nanobenchmark", - # copybara:import_next_line:hwy - "//:profiler", - # copybara:import_next_line:hwy - "//:thread_pool", - "//third_party/riegeli/bytes:file_reader", - "//third_party/riegeli/bytes:file_writer", - "//third_party/riegeli/csv:csv_reader", - "//third_party/riegeli/csv:csv_writer", - ], -) - -gensignature( - name = "gemma_sign", - srcs = [":gemma"], -) - -cc_test( - name = "benchmarks", - size = "large", - srcs = [ - "benchmarks.cc", - ], - tags = ["notap"], - deps = [ - ":app", - ":gemma_lib", - 
"//third_party/benchmark", - # copybara:import_next_line:hwy - "//:hwy", - # copybara:import_next_line:hwy - "//:thread_pool", - ], -) - -# copybara:strip_end From 6a3085828f123737dfb929571329067ed49f789e Mon Sep 17 00:00:00 2001 From: Jan Wassenberg Date: Sun, 25 Feb 2024 19:08:50 -0800 Subject: [PATCH 08/26] Fixes #37, lambda issue: missing HWY_ATTR, and cannot capture SVE in/out vectors. PiperOrigin-RevId: 610260610 --- ops.h | 55 +++++++++++++++++++++++++++++++++++++------------------ 1 file changed, 37 insertions(+), 18 deletions(-) diff --git a/ops.h b/ops.h index db2ae4f..7619b44 100644 --- a/ops.h +++ b/ops.h @@ -214,7 +214,8 @@ static HWY_NOINLINE HWY_MAYBE_UNUSED void Gelu(float* HWY_RESTRICT x, size_t size) { namespace hn = hwy::HWY_NAMESPACE; using D = hn::ScalableTag; - hn::Transform(D(), x, size, [](D d, hn::Vec v) { return Gelu(d, v); }); + hn::Transform(D(), x, size, + [](D d, hn::Vec v) HWY_ATTR { return Gelu(d, v); }); } // out[i] = BF(mul[i] * Gelu(gelu_in[i])) @@ -567,22 +568,41 @@ static HWY_NOINLINE void Softmax(float* HWY_RESTRICT x, size_t size, namespace hn = hwy::HWY_NAMESPACE; using D = hn::ScalableTag; const D d; - using V = hn::Vec; + const size_t N = hn::Lanes(d); - // Find max so we can subtract it below. - const V vmin = hn::Set(d, hwy::LowestValue()); - V max = vmin; - hn::Foreach(d, x, mask_pos, vmin, - [&max](D d, V v) { max = hn::Max(max, v); }); - max = hn::MaxOfLanes(d, max); // broadcast + // Find max so we can subtract it below. Avoid hn::Foreach because SVE vectors + // cannot be lambda-captured. + // TODO(janwas): could be replaced with an hn::Accumulate algo. 
+ const hn::Vec vmin = hn::Set(d, hwy::LowestValue()); + hn::Vec vmax = vmin; + size_t idx = 0; + if (mask_pos >= N) { + for (; idx <= mask_pos - N; idx += N) { + vmax = hn::Max(vmax, LoadU(d, x + idx)); + } + } + vmax = hn::Max(vmax, LoadNOr(vmin, d, x + idx, mask_pos - idx)); + vmax = hn::MaxOfLanes(d, vmax); // broadcast // Subtract max (avoid precision loss for large exponents) and exponentiate. - V sum = hn::Zero(d); - hn::Transform(d, x, mask_pos, [&sum, max](D d, V v) { - const V out = hn::Exp(d, hn::Sub(v, max)); + // Also avoid hn::Transform because the additional `sum` output vector cannot + // be captured by a lambda. + hn::Vec sum = hn::Zero(d); + idx = 0; + if (mask_pos >= N) { + for (; idx <= mask_pos - N; idx += N) { + const hn::Vec out = hn::Exp(d, hn::Sub(hn::LoadU(d, x + idx), vmax)); + sum = hn::Add(sum, out); + hn::StoreU(out, d, x + idx); + } + } + if (mask_pos > idx) { + const size_t remaining = mask_pos - idx; + const hn::Vec out = + hn::Exp(d, hn::Sub(hn::LoadN(d, x + idx, remaining), vmax)); sum = hn::Add(sum, out); - return out; - }); + hn::StoreN(out, d, x + idx, remaining); + } // Normalize to probability distribution const float mul = 1.0f / hn::ReduceSum(d, sum); @@ -601,13 +621,12 @@ static HWY_NOINLINE void LogitsSoftCap(const float cap, float* HWY_RESTRICT x, namespace hn = hwy::HWY_NAMESPACE; using D = hn::ScalableTag; const D d; - using V = hn::Vec; - const V inv_cap = hn::Set(d, 1.0f / cap); - const V vcap = hn::Set(d, cap); + const float inv_cap = 1.0f / cap; - hn::Transform(d, x, size, [vcap, inv_cap](D d, hn::Vec v) { - return hn::Mul(vcap, hn::Tanh(d, hn::Mul(inv_cap, v))); + hn::Transform(d, x, size, [cap, inv_cap](D d, hn::Vec v) HWY_ATTR { + return hn::Mul(hn::Set(d, cap), + hn::Tanh(d, hn::Mul(v, hn::Set(d, inv_cap)))); }); } From 4c155bd3df70e45837a50a3d7496733ba47e000f Mon Sep 17 00:00:00 2001 From: Dan Zheng Date: Sun, 25 Feb 2024 19:31:27 -0800 Subject: [PATCH 09/26] Restore reverted changes. 
Sync to https://github.com/google/gemma.cpp/commit/84444c93a44f484442fda2523dde7e77dbd3a53c. PiperOrigin-RevId: 610263918 --- .github/workflows/build.yml | 4 +- CMakeLists.txt | 8 ++- CMakePresets.json | 59 ++++++++++++++++++++++ README.md | 42 +++++++++++++--- compression/blob_store.cc | 97 +++++++++++++++++++++++++++++++++---- run.cc | 5 ++ util/app.h | 2 + 7 files changed, 196 insertions(+), 21 deletions(-) create mode 100644 CMakePresets.json diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index b0d4b6e..da63c1c 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -1,7 +1,7 @@ name: build -# Trigger on push or via manual dispatch. -on: [push, workflow_dispatch] +# Trigger on push, pull request, or via manual dispatch. +on: [push, pull_request, workflow_dispatch] jobs: build: diff --git a/CMakeLists.txt b/CMakeLists.txt index 722e408..c7828cc 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -24,7 +24,7 @@ set(CMAKE_CXX_STANDARD_REQUIRED ON) FetchContent_Declare(highway GIT_REPOSITORY https://github.com/google/highway.git GIT_TAG da250571a45826b21eebbddc1e50d0c1137dee5f) FetchContent_MakeAvailable(highway) -## Note: absl meeds tp be installed by sentencepiece. This will only happen if +## Note: absl needs to be installed by sentencepiece. 
This will only happen if ## cmake is invoked with -DSPM_ENABLE_SHARED=OFF and -DSPM_ABSL_PROVIDER=module FetchContent_Declare(sentencepiece GIT_REPOSITORY https://github.com/google/sentencepiece GIT_TAG 53de76561cfc149d3c01037f0595669ad32a5e7c) FetchContent_MakeAvailable(sentencepiece) @@ -49,7 +49,7 @@ endif() # Allowable types for WEIGHT_TYPE: # float - slow, not recommended -# hwy::bfloat16_t - bfloat16 as impemented by https://github.com/google/highway +# hwy::bfloat16_t - bfloat16 as implemented by https://github.com/google/highway # SfpStream - 8-bit switched floating point (recommended) # NuqStream - experimental, work-in-progress option(WEIGHT_TYPE "Set weight type" "") @@ -67,6 +67,8 @@ target_link_libraries(gemma hwy hwy_contrib sentencepiece) target_include_directories(gemma PRIVATE ./) FetchContent_GetProperties(sentencepiece) target_include_directories(gemma PRIVATE ${sentencepiece_SOURCE_DIR}) +target_compile_definitions(gemma PRIVATE $<$:_CRT_SECURE_NO_WARNINGS NOMINMAX>) +target_compile_options(gemma PRIVATE $<$:-Wno-deprecated-declarations>) ## Library Target @@ -76,3 +78,5 @@ set_target_properties(libgemma PROPERTIES PREFIX "") target_include_directories(libgemma PUBLIC ./) target_link_libraries(libgemma hwy hwy_contrib sentencepiece) target_include_directories(libgemma PRIVATE ${sentencepiece_SOURCE_DIR}) +target_compile_definitions(libgemma PRIVATE $<$:_CRT_SECURE_NO_WARNINGS NOMINMAX>) +target_compile_options(libgemma PRIVATE $<$:-Wno-deprecated-declarations>) diff --git a/CMakePresets.json b/CMakePresets.json new file mode 100644 index 0000000..5fe13c8 --- /dev/null +++ b/CMakePresets.json @@ -0,0 +1,59 @@ +{ + "version": 3, + "cmakeMinimumRequired": { + "major": 3, + "minor": 11, + "patch": 0 + }, + "configurePresets": [ + { + "name": "__defaults__", + "hidden": true, + "binaryDir": "${sourceDir}/build" + }, + { + "name": "make", + "inherits": "__defaults__", + "displayName": "Make", + "description": "Unix Makefiles", + "generator": "Unix 
Makefiles", + "binaryDir": "${sourceDir}/build" + }, + { + "name": "windows", + "inherits": "__defaults__", + "displayName": "Windows", + "description": "Visual Studio 2022 with Clang/LLVM frontend", + "generator": "Visual Studio 17 2022", + "toolset": "ClangCL", + "condition": { + "type": "equals", + "lhs": "${hostSystemName}", + "rhs": "Windows" + } + } + ], + "buildPresets": [ + { + "name": "__defaults__", + "hidden": true, + "targets": [ + "gemma", + "libgemma" + ] + }, + { + "name": "make", + "inherits": "__defaults__", + "displayName": "Unix Makefiles", + "configurePreset": "make" + }, + { + "name": "windows", + "inherits": "__defaults__", + "displayName": "Windows", + "configuration": "Release", + "configurePreset": "windows" + } + ] + } diff --git a/README.md b/README.md index e278833..d31bbaf 100644 --- a/README.md +++ b/README.md @@ -55,6 +55,16 @@ Before starting, you should have installed: least C++17. - `tar` for extracting archives from Kaggle. +Building natively on Windows requires the Visual Studio 2012 Build Tools with the +optional Clang/LLVM C++ frontend (`clang-cl`). This can be installed from the +command line with +[`winget`](https://learn.microsoft.com/en-us/windows/package-manager/winget/): + +```sh +winget install --id Kitware.CMake +winget install --id Microsoft.VisualStudio.2022.BuildTools --force --override "--passive --wait --add Microsoft.VisualStudio.Workload.VCTools;installRecommended --add Microsoft.VisualStudio.Component.VC.Llvm.Clang --add Microsoft.VisualStudio.Component.VC.Llvm.ClangToolset" +``` + ### Step 1: Obtain model weights and tokenizer from Kaggle Visit [the Gemma model page on @@ -107,6 +117,7 @@ runtime, create a build directory and generate the build files using `cmake` from the top-level project directory. 
For the 8-bit switched floating point weights (sfp), run cmake with no options: +#### Unix-like Platforms ```sh cmake -B build ``` @@ -126,17 +137,18 @@ your weights, you can enter the `build/` directory and run `make` to build the `./gemma` executable: ```sh -cd build -make -j [number of parallel threads to use] gemma +# Configure `build` directory +cmake --preset make + +# Build project using make +cmake --build --preset make -j [number of parallel threads to use] ``` Replace `[number of parallel threads to use]` with a number - the number of -cores available on your system is a reasonable heuristic. - -For example, `make -j4 gemma` will build using 4 threads. If this is successful, -you should now have a `gemma` executable in the `build/` directory. If the -`nproc` command is available, you can use `make -j$(nproc) gemma` as a -reasonable default for the number of threads. +cores available on your system is a reasonable heuristic. For example, +`make -j4 gemma` will build using 4 threads. If the `nproc` command is +available, you can use `make -j$(nproc) gemma` as a reasonable default +for the number of threads. If you aren't sure of the right value for the `-j` flag, you can simply run `make gemma` instead and it should still build the `./gemma` executable. @@ -145,6 +157,20 @@ If you aren't sure of the right value for the `-j` flag, you can simply run > On Windows Subsystem for Linux (WSL) users should set the number of > parallel threads to 1. Using a larger number may result in errors. +If the build is successful, you should now have a `gemma` executable in the `build/` directory. + +#### Windows + +```sh +# Configure `build` directory +cmake --preset windows + +# Build project using Visual Studio Build Tools +cmake --build --preset windows -j [number of parallel threads to use] +``` + +If the build is successful, you should now have a `gemma.exe` executable in the `build/` directory. 
+ ### Step 4: Run You can now run `gemma` from inside the `build/` directory. diff --git a/compression/blob_store.cc b/compression/blob_store.cc index 8d6c1d0..550c727 100644 --- a/compression/blob_store.cc +++ b/compression/blob_store.cc @@ -16,11 +16,16 @@ // copybara:import_next_line:gemma_cpp #include "compression/blob_store.h" -#include // open #include #include // SEEK_END - unistd isn't enough for IDE. #include // O_RDONLY -#include // read, close +#include // open +#if HWY_OS_WIN +#include // read, write, close +#include +#else +#include // read, write, close +#endif #include #include @@ -30,6 +35,54 @@ #include "hwy/contrib/thread_pool/thread_pool.h" #include "hwy/detect_compiler_arch.h" +namespace { +#if HWY_OS_WIN + +// pread is not supported on Windows +static int64_t pread(int fd, void* buf, uint64_t size, uint64_t offset) { + HANDLE file = reinterpret_cast(_get_osfhandle(fd)); + if (file == INVALID_HANDLE_VALUE) { + return -1; + } + + OVERLAPPED overlapped = {0}; + overlapped.Offset = offset & 0xFFFFFFFF; + overlapped.OffsetHigh = (offset >> 32) & 0xFFFFFFFF; + + DWORD bytes_read; + if (!ReadFile(file, buf, size, &bytes_read, &overlapped)) { + if (GetLastError() != ERROR_HANDLE_EOF) { + return -1; + } + } + + return bytes_read; +} + +// pwrite is not supported on Windows +static int64_t pwrite(int fd, const void* buf, uint64_t size, uint64_t offset) { + HANDLE file = reinterpret_cast(_get_osfhandle(fd)); + if (file == INVALID_HANDLE_VALUE) { + return -1; + } + + OVERLAPPED overlapped = {0}; + overlapped.Offset = offset & 0xFFFFFFFF; + overlapped.OffsetHigh = (offset >> 32) & 0xFFFFFFFF; + + DWORD bytes_written; + if (!WriteFile(file, buf, size, &bytes_written, &overlapped)) { + if (GetLastError() != ERROR_HANDLE_EOF) { + return -1; + } + } + + return bytes_written; +} + +#endif +} + namespace gcpp { hwy::uint128_t MakeKey(const char* string) { @@ -64,19 +117,30 @@ static void EnqueueChunkRequests(uint64_t offset, uint64_t size, uint8_t* data, } } + 
struct IO { // Returns size in bytes or 0. static uint64_t FileSize(const char* filename) { int fd = open(filename, O_RDONLY); - if (fd >= 0) { - const off_t size = lseek(fd, 0, SEEK_END); - HWY_ASSERT(close(fd) != -1); - if (size != static_cast(-1)) { - return static_cast(size); - } + if (fd < 0) { + return 0; } - return 0; +#if HWY_OS_WIN + const int64_t size = _lseeki64(fd, 0, SEEK_END); + HWY_ASSERT(close(fd) != -1); + if (size < 0) { + return 0; + } +#else + const off_t size = lseek(fd, 0, SEEK_END); + HWY_ASSERT(close(fd) != -1); + if (size == static_cast(-1)) { + return 0; + } +#endif + + return static_cast(size); } static bool Read(int fd, uint64_t offset, uint64_t size, void* to) { @@ -252,7 +316,14 @@ class BlobStore { #pragma pack(pop) BlobError BlobReader::Open(const char* filename) { +#if HWY_OS_WIN + DWORD flags = FILE_ATTRIBUTE_NORMAL | FILE_FLAG_SEQUENTIAL_SCAN; + HANDLE file = CreateFileA(filename, GENERIC_READ, FILE_SHARE_READ, nullptr, OPEN_EXISTING, flags, nullptr); + if (file == INVALID_HANDLE_VALUE) return __LINE__; + fd_ = _open_osfhandle(reinterpret_cast(file), _O_RDONLY); +#else fd_ = open(filename, O_RDONLY); +#endif if (fd_ < 0) return __LINE__; #if _POSIX_C_SOURCE >= 200112L @@ -330,7 +401,14 @@ BlobError BlobWriter::WriteAll(hwy::ThreadPool& pool, keys_.data(), blobs_.data(), keys_.size()); // Create/replace existing file. 
+#if HWY_OS_WIN + DWORD flags = FILE_ATTRIBUTE_NORMAL; + HANDLE file = CreateFileA(filename, GENERIC_WRITE, 0, nullptr, CREATE_ALWAYS, flags, nullptr); + if (file == INVALID_HANDLE_VALUE) return __LINE__; + const int fd = _open_osfhandle(reinterpret_cast(file), _O_WRONLY); +#else const int fd = open(filename, O_CREAT | O_RDWR | O_TRUNC, 0644); +#endif if (fd < 0) return __LINE__; std::atomic_flag err = ATOMIC_FLAG_INIT; @@ -341,6 +419,7 @@ BlobError BlobWriter::WriteAll(hwy::ThreadPool& pool, err.test_and_set(); } }); + HWY_ASSERT(close(fd) != -1); if (err.test_and_set()) return __LINE__; return 0; } diff --git a/run.cc b/run.cc index 87d8445..96ba316 100644 --- a/run.cc +++ b/run.cc @@ -144,6 +144,11 @@ void ReplGemma(gcpp::Gemma& model, hwy::ThreadPool& pool, return; } + if (prompt_string == "%c" || prompt_string == "%C") { + abs_pos = 0; + continue; + } + if (model.model_training == ModelTraining::GEMMA_IT) { // For instruction-tuned models: add control tokens. prompt_string = "user\n" + prompt_string + diff --git a/util/app.h b/util/app.h index 966fa41..bd665a4 100644 --- a/util/app.h +++ b/util/app.h @@ -18,7 +18,9 @@ #ifndef THIRD_PARTY_GEMMA_CPP_UTIL_APP_H_ #define THIRD_PARTY_GEMMA_CPP_UTIL_APP_H_ +#if HWY_OS_LINUX #include +#endif #include #include // std::clamp From 4e2efbcbd89bd634a995f8265c80677334410f01 Mon Sep 17 00:00:00 2001 From: Kewde Date: Mon, 26 Feb 2024 08:30:21 -0800 Subject: [PATCH 10/26] Copybara import of the project: -- f4f2ff3c1a13fce546112d329419b211eb2be8b1 by kewde : fix: add -fPIC to libgemma COPYBARA_INTEGRATE_REVIEW=https://github.com/google/gemma.cpp/pull/42 from kewde:kewde/enable-fpic f4f2ff3c1a13fce546112d329419b211eb2be8b1 PiperOrigin-RevId: 610416597 --- CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index c7828cc..308e258 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -75,6 +75,7 @@ target_compile_options(gemma PRIVATE $<$:-Wno-deprecated-de add_library(libgemma 
${SOURCES}) set_property(TARGET libgemma PROPERTY CXX_STANDARD 17) set_target_properties(libgemma PROPERTIES PREFIX "") +set_property(TARGET libgemma PROPERTY POSITION_INDEPENDENT_CODE ON) target_include_directories(libgemma PUBLIC ./) target_link_libraries(libgemma hwy hwy_contrib sentencepiece) target_include_directories(libgemma PRIVATE ${sentencepiece_SOURCE_DIR}) From 7ab968c957a74b5b0c312212089fc878a310c245 Mon Sep 17 00:00:00 2001 From: Naoki Kishida Date: Mon, 26 Feb 2024 08:38:49 -0800 Subject: [PATCH 11/26] Copybara import of the project: -- 26b541b666a5860ced67a3df7630b6364eedd8cb by kishida : add information for the reseting conversation COPYBARA_INTEGRATE_REVIEW=https://github.com/google/gemma.cpp/pull/40 from kishida:add_info_for_reset_conv 26b541b666a5860ced67a3df7630b6364eedd8cb PiperOrigin-RevId: 610418671 --- README.md | 2 +- run.cc | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index d31bbaf..5932726 100644 --- a/README.md +++ b/README.md @@ -273,7 +273,7 @@ max_tokens : 3072 max_generated_tokens : 2048 *Usage* - Enter an instruction and press enter (%Q quits). + Enter an instruction and press enter (%C reset conversation, %Q quits). *Examples* - Write an email to grandma thanking her for the cookies. 
diff --git a/run.cc b/run.cc index 96ba316..f83ead9 100644 --- a/run.cc +++ b/run.cc @@ -221,7 +221,8 @@ void Run(LoaderArgs& loader, InferenceArgs& inference, AppArgs& app) { const std::string instructions = "*Usage*\n" - " Enter an instruction and press enter (%Q quits).\n\n" + " Enter an instruction and press enter (%C reset conversation, " + "%Q quits).\n\n" "*Examples*\n" " - Write an email to grandma thanking her for the cookies.\n" " - What are some historical attractions to visit around " From 7aeade5c9d026eece614503fd704dcfc5ad0d625 Mon Sep 17 00:00:00 2001 From: David Coles Date: Mon, 26 Feb 2024 10:22:24 -0800 Subject: [PATCH 12/26] Copybara import of the project: -- c64b6fd3a44b385e1502d2057bd8709edaebaa58 by David Coles : Include Windows in GitHub Actions build This also preserves the `gemma` binary as a build artefact should folks want to grab a pre-built binary. Dropped the use of the lukka/cmake actions due to conflicts with `--preset`. This isn't that bad as we were mostly overriding the default behaviour anyway. It also shaves ~2 min off the build since the GitHub builders already have CMake pre-installed. 
COPYBARA_INTEGRATE_REVIEW=https://github.com/google/gemma.cpp/pull/38 from dcoles:windows-build c64b6fd3a44b385e1502d2057bd8709edaebaa58 PiperOrigin-RevId: 610449220 --- .github/workflows/build.yml | 49 +++++++++++++++++++++++-------------- 1 file changed, 30 insertions(+), 19 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index da63c1c..82b9152 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -6,17 +6,25 @@ on: [push, pull_request, workflow_dispatch] jobs: build: runs-on: ${{ matrix.os }} - name: ${{ matrix.os }} ${{ matrix.type }} + name: ${{ matrix.os }} (${{ matrix.preset }}) ${{ matrix.build_type }} timeout-minutes: 30 strategy: fail-fast: false matrix: - type: ['Release'] - os: ['ubuntu-latest', 'macos-latest'] + os: ['ubuntu-latest', 'macos-latest', 'windows-latest'] + build_type: ['Release'] + preset: ['make', 'windows'] + exclude: + - os: ubuntu-latest + preset: windows + - os: macos-latest + preset: windows + - os: windows-latest + preset: make concurrency: - group: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.os }}-${{ matrix.type }} + group: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.os }}-${{ matrix.preset }}-${{ matrix.build_type }} cancel-in-progress: true steps: @@ -26,20 +34,23 @@ jobs: - name: ccache uses: hendrikmuhs/ccache-action@v1.2 - # Install CMake - - uses: lukka/get-cmake@latest + - name: Configure CMake + run: > + cmake --preset ${{ matrix.preset }} + -S ${{ github.workspace }} -B ${{ github.workspace }}/build + -D CMAKE_BUILD_TYPE=${{ matrix.build_type }} + -D CMAKE_C_COMPILER_LAUNCHER=ccache + -D CMAKE_CXX_COMPILER_LAUNCHER=ccache - # Build via CMake - # Reference: https://github.com/lukka/run-cmake/blob/v3/action.yml - - name: Build via cmake - uses: lukka/run-cmake@v3 + - name: Build + run: cmake --build ${{ github.workspace }}/build --preset ${{ matrix.preset }} --config ${{ matrix.build_type }} + + - name: Archive production artifacts + uses: 
actions/upload-artifact@v4 with: - cmakeListsOrSettingsJson: CMakeListsTxtAdvanced - cmakeAppendedArgs: > - -D CMAKE_C_COMPILER_LAUNCHER=ccache - -D CMAKE_CXX_COMPILER_LAUNCHER=ccache - buildWithCMake: true - # Explicitly list build targets here. - # Building "all" includes test executables and takes much longer. - buildWithCMakeArgs: "-- gemma" - buildDirectory: '${{ github.workspace }}/build' + name: gemma-${{ matrix.os }}-${{ matrix.preset }}-${{ matrix.build_type }} + path: | + ${{ github.workspace }}/build/${{ matrix.build_type }}/gemma.exe + ${{ github.workspace }}/build/${{ matrix.build_type }}/libgemma.lib + ${{ github.workspace }}/build/gemma + ${{ github.workspace }}/build/libgemma.a From 129e66ada2b4e461bdf28b88b70cd2465cb213e4 Mon Sep 17 00:00:00 2001 From: austinvhuang Date: Mon, 26 Feb 2024 17:05:32 -0500 Subject: [PATCH 13/26] Reduce KV cache preallocation to 4096 and make it comptime configurable, add rm build note in readme, add note on comptime options in DEVELOPERS, make multiturn=0 the default --- DEVELOPERS.md | 18 ++++++++++++++++ README.md | 8 +++++-- configs.h | 21 +++++++++++------- gemma.h | 59 ++++++++++++++++++++++++--------------------------- 4 files changed, 65 insertions(+), 41 deletions(-) diff --git a/DEVELOPERS.md b/DEVELOPERS.md index d06b0f8..bdc02c0 100644 --- a/DEVELOPERS.md +++ b/DEVELOPERS.md @@ -70,3 +70,21 @@ The implementation code is roughly split into 4 layers, from high to low level: 4. Backend (`highway`) - Low-level hardware interface (SIMD in the case of highway) supporting the implementations in (3). + +## Compile-Time Flags (Advanced) + +There are several compile-time flags to be aware of (note these may or may not +be exposed to the build system): + +- `GEMMA_WEIGHT_T` : Sets the level of compression for weights (surfaced as + WEIGHT_TYPE in CMakeLists.txt). 
Currently this should be set to `SfpStream` + (default, if no flag is specified) for 8-bit SFP, or `hwy::bfloat16_t` to + enable for higher-fidelity (but slower) bfloat16 support. This is defined in + `gemma.h`. +- `GEMMA_MAX_SEQ_LEN` : Sets maximum sequence length to preallocate for the KV + Cache. The default is 4096 tokens but can be overridden. This is not exposed + through `CMakeLists.txt` yet. + +In the medium term both of these will likely be deprecated in favor of handling +options at runtime - allowing for multiple weight compression schemes in a single +build and dynamically resizes the KV cache as needed. diff --git a/README.md b/README.md index 5932726..8db6862 100644 --- a/README.md +++ b/README.md @@ -114,8 +114,12 @@ convenient directory location (e.g. the `build/` directory in this repo). The build system uses [CMake](https://cmake.org/). To build the gemma inference runtime, create a build directory and generate the build files using `cmake` -from the top-level project directory. For the 8-bit switched floating point -weights (sfp), run cmake with no options: +from the top-level project directory. Note if you previous ran `cmake` and are +re-running with a different setting, be sure to clean out the `build/` directory +with `rm -rf build/*` (warning this will delete any other files in the `build/` +directory. 
+ +For the 8-bit switched floating point weights (sfp), run cmake with no options: #### Unix-like Platforms ```sh diff --git a/configs.h b/configs.h index ebe6220..4be5f75 100644 --- a/configs.h +++ b/configs.h @@ -18,21 +18,26 @@ #ifndef THIRD_PARTY_GEMMA_CPP_CONFIGS_H_ #define THIRD_PARTY_GEMMA_CPP_CONFIGS_H_ +// Allow changing pre-allocated kv cache size as a compiler flag +#ifndef GEMMA_MAX_SEQLEN +#define GEMMA_MAX_SEQLEN 4096 +#endif // !GEMMA_MAX_SEQLEN + #include namespace gcpp { -static constexpr size_t kSeqLen = 7168; +static constexpr size_t kSeqLen = GEMMA_MAX_SEQLEN; struct ConfigGemma7B { static constexpr int kSeqLen = gcpp::kSeqLen; static constexpr int kVocabSize = 256128; static constexpr int kLayers = 28; static constexpr int kModelDim = 3072; - static constexpr int kFFHiddenDim = 16 * 3072 / 2; // = 24576 + static constexpr int kFFHiddenDim = 16 * 3072 / 2; // = 24576 static constexpr int kHeads = 16; - static constexpr int kKVHeads = 16; // standard MHA, no GQA or MQA - static constexpr int kQKVDim = 256; // query size == key size == value size + static constexpr int kKVHeads = 16; // standard MHA + static constexpr int kQKVDim = 256; // query size == key size == value size static constexpr int kTopK = 1; }; @@ -41,13 +46,13 @@ struct ConfigGemma2B { static constexpr int kVocabSize = 256128; static constexpr int kLayers = 18; static constexpr int kModelDim = 2048; - static constexpr int kFFHiddenDim = 16 * 2048 / 2; // = 16384 + static constexpr int kFFHiddenDim = 16 * 2048 / 2; // = 16384 static constexpr int kHeads = 8; static constexpr int kKVHeads = 8; // TODO(austinvhuang): add MQA support - static constexpr int kQKVDim = 256; // query size == key size == value size + static constexpr int kQKVDim = 256; // query size == key size == value size static constexpr int kTopK = 1; }; -} // namespace gcpp +} // namespace gcpp -#endif // THIRD_PARTY_GEMMA_CPP_CONFIGS_H_ +#endif // THIRD_PARTY_GEMMA_CPP_CONFIGS_H_ diff --git a/gemma.h b/gemma.h index 
5dc9f62..f2a130e 100644 --- a/gemma.h +++ b/gemma.h @@ -25,14 +25,14 @@ #include // copybara:import_next_line:gemma_cpp -#include "compression/compress.h" // SfpStream/NuqStream +#include "compression/compress.h" // SfpStream/NuqStream // copybara:import_next_line:gemma_cpp -#include "configs.h" // kSeqLen +#include "configs.h" // kSeqLen // copybara:import_next_line:gemma_cpp -#include "util/args.h" // ArgsBase #include "hwy/aligned_allocator.h" -#include "hwy/base.h" // hwy::bfloat16_t +#include "hwy/base.h" // hwy::bfloat16_t #include "hwy/contrib/thread_pool/thread_pool.h" +#include "util/args.h" // ArgsBase // copybara:import_next_line:sentencepiece #include "src/sentencepiece_processor.h" @@ -42,7 +42,7 @@ namespace gcpp { // float, hwy::bfloat16_t, SfpStream, NuqStream #ifndef GEMMA_WEIGHT_T #define GEMMA_WEIGHT_T SfpStream -#endif // !GEMMA_WEIGHT_T +#endif // !GEMMA_WEIGHT_T using WeightT = GEMMA_WEIGHT_T; using EmbedderInputT = hwy::bfloat16_t; @@ -51,9 +51,9 @@ constexpr bool kSystemPrompt = false; struct KVCache { hwy::AlignedFreeUniquePtr - key_cache; // batch_size * kSeqLen * kLayers * kKVHeads * kQKVDim + key_cache; // batch_size * kSeqLen * kLayers * kKVHeads * kQKVDim hwy::AlignedFreeUniquePtr - value_cache; // batch_size * kSeqLen * kLayers * kKVHeads * kQKVDim + value_cache; // batch_size * kSeqLen * kLayers * kKVHeads * kQKVDim }; // Model variants: see configs.h for details. 
@@ -61,9 +61,9 @@ enum class Model { GEMMA_2B, GEMMA_7B }; enum class ModelTraining { GEMMA_IT, GEMMA_PT }; struct LoaderArgs : public ArgsBase { - LoaderArgs(int argc, char* argv[]) { InitAndParse(argc, argv); } + LoaderArgs(int argc, char *argv[]) { InitAndParse(argc, argv); } - static std::string ToLower(const std::string& text) { + static std::string ToLower(const std::string &text) { std::string result = text; std::transform(begin(result), end(result), begin(result), [](unsigned char c) { return std::tolower(c); }); @@ -89,7 +89,7 @@ struct LoaderArgs : public ArgsBase { } // Returns error string or nullptr if OK. - const char* Validate() const { + const char *Validate() const { const std::string model_type_lc = ToLower(model_type); if (model_type_lc != "2b-pt" && model_type_lc != "7b-pt" && model_type_lc != "2b-it" && model_type_lc != "7b-it") { @@ -111,12 +111,11 @@ struct LoaderArgs : public ArgsBase { } Path tokenizer; - Path model; // uncompressed weights OR - Path cache; // compressed weights + Path model; // uncompressed weights OR + Path cache; // compressed weights std::string model_type; - template - void ForEach(const Visitor& visitor) { + template void ForEach(const Visitor &visitor) { visitor(tokenizer, "tokenizer", Path(), "Path name of tokenizer model file. (required)"); visitor( @@ -139,10 +138,10 @@ struct LoaderArgs : public ArgsBase { struct GemmaInterface; struct Gemma { - Gemma(const LoaderArgs& args, hwy::ThreadPool& pool); - ~Gemma(); // must be defined after GemmaInterface's dtor is defined. + Gemma(const LoaderArgs &args, hwy::ThreadPool &pool); + ~Gemma(); // must be defined after GemmaInterface's dtor is defined. 
- const sentencepiece::SentencePieceProcessor& Tokenizer() const; + const sentencepiece::SentencePieceProcessor &Tokenizer() const; std::unique_ptr impl_; gcpp::ModelTraining model_training; @@ -154,7 +153,7 @@ using StreamFunc = std::function; using AcceptFunc = std::function; struct InferenceArgs : public ArgsBase { - InferenceArgs(int argc, char* argv[]) { InitAndParse(argc, argv); } + InferenceArgs(int argc, char *argv[]) { InitAndParse(argc, argv); } size_t max_tokens; size_t max_generated_tokens; @@ -164,7 +163,7 @@ struct InferenceArgs : public ArgsBase { bool multiturn; // Returns error string or nullptr if OK. - const char* Validate() const { + const char *Validate() const { if (max_tokens > gcpp::kSeqLen) { return "max_tokens is larger than the maximum sequence length (see " "configs.h)."; @@ -176,8 +175,7 @@ struct InferenceArgs : public ArgsBase { return nullptr; } - template - void ForEach(const Visitor& visitor) { + template void ForEach(const Visitor &visitor) { visitor(max_tokens, "max_tokens", size_t{3072}, "Maximum number of tokens in prompt + generation."); visitor(max_generated_tokens, "max_generated_tokens", size_t{2048}, @@ -186,22 +184,21 @@ struct InferenceArgs : public ArgsBase { visitor(temperature, "temperature", 1.0f, "Temperature for top-K", 2); visitor(deterministic, "deterministic", false, "Make top-k sampling deterministic", 2); - visitor(multiturn, "multiturn", true, + visitor(multiturn, "multiturn", false, "Multiturn mode (if 0, this clears the KV cache after every " - "interaction without quitting)", - 2); + "interaction without quitting)\n Default = 0 (conversation resets every turn)"); } }; -void GenerateGemma(Gemma& gemma, const InferenceArgs& args, - const std::vector& prompt, size_t start_pos, - hwy::ThreadPool& pool, hwy::ThreadPool& inner_pool, - const StreamFunc& stream_token, - const AcceptFunc& accept_token, std::mt19937& g, +void GenerateGemma(Gemma &gemma, const InferenceArgs &args, + const std::vector &prompt, size_t 
start_pos, + hwy::ThreadPool &pool, hwy::ThreadPool &inner_pool, + const StreamFunc &stream_token, + const AcceptFunc &accept_token, std::mt19937 &g, int verbosity); constexpr int EOS_ID = 1; -} // namespace gcpp +} // namespace gcpp -#endif // THIRD_PARTY_GEMMA_CPP_GEMMA_H_ +#endif // THIRD_PARTY_GEMMA_CPP_GEMMA_H_ From 8db89304bdc21949911c9e8996c03d3a623e7a6d Mon Sep 17 00:00:00 2001 From: Dan Zheng Date: Mon, 26 Feb 2024 12:54:39 -0800 Subject: [PATCH 14/26] No public description PiperOrigin-RevId: 610498969 --- DEVELOPERS.md | 18 ---------------- README.md | 8 ++----- configs.h | 21 +++++++----------- gemma.h | 59 +++++++++++++++++++++++++++------------------------ 4 files changed, 41 insertions(+), 65 deletions(-) diff --git a/DEVELOPERS.md b/DEVELOPERS.md index bdc02c0..d06b0f8 100644 --- a/DEVELOPERS.md +++ b/DEVELOPERS.md @@ -70,21 +70,3 @@ The implementation code is roughly split into 4 layers, from high to low level: 4. Backend (`highway`) - Low-level hardware interface (SIMD in the case of highway) supporting the implementations in (3). - -## Compile-Time Flags (Advanced) - -There are several compile-time flags to be aware of (note these may or may not -be exposed to the build system): - -- `GEMMA_WEIGHT_T` : Sets the level of compression for weights (surfaced as - WEIGHT_TYPE in CMakeLists.txt). Currently this should be set to `SfpStream` - (default, if no flag is specified) for 8-bit SFP, or `hwy::bfloat16_t` to - enable for higher-fidelity (but slower) bfloat16 support. This is defined in - `gemma.h`. -- `GEMMA_MAX_SEQ_LEN` : Sets maximum sequence length to preallocate for the KV - Cache. The default is 4096 tokens but can be overridden. This is not exposed - through `CMakeLists.txt` yet. - -In the medium term both of these will likely be deprecated in favor of handling -options at runtime - allowing for multiple weight compression schemes in a single -build and dynamically resizes the KV cache as needed. 
diff --git a/README.md b/README.md index 8db6862..5932726 100644 --- a/README.md +++ b/README.md @@ -114,12 +114,8 @@ convenient directory location (e.g. the `build/` directory in this repo). The build system uses [CMake](https://cmake.org/). To build the gemma inference runtime, create a build directory and generate the build files using `cmake` -from the top-level project directory. Note if you previous ran `cmake` and are -re-running with a different setting, be sure to clean out the `build/` directory -with `rm -rf build/*` (warning this will delete any other files in the `build/` -directory. - -For the 8-bit switched floating point weights (sfp), run cmake with no options: +from the top-level project directory. For the 8-bit switched floating point +weights (sfp), run cmake with no options: #### Unix-like Platforms ```sh diff --git a/configs.h b/configs.h index 4be5f75..ebe6220 100644 --- a/configs.h +++ b/configs.h @@ -18,26 +18,21 @@ #ifndef THIRD_PARTY_GEMMA_CPP_CONFIGS_H_ #define THIRD_PARTY_GEMMA_CPP_CONFIGS_H_ -// Allow changing pre-allocated kv cache size as a compiler flag -#ifndef GEMMA_MAX_SEQLEN -#define GEMMA_MAX_SEQLEN 4096 -#endif // !GEMMA_MAX_SEQLEN - #include namespace gcpp { -static constexpr size_t kSeqLen = GEMMA_MAX_SEQLEN; +static constexpr size_t kSeqLen = 7168; struct ConfigGemma7B { static constexpr int kSeqLen = gcpp::kSeqLen; static constexpr int kVocabSize = 256128; static constexpr int kLayers = 28; static constexpr int kModelDim = 3072; - static constexpr int kFFHiddenDim = 16 * 3072 / 2; // = 24576 + static constexpr int kFFHiddenDim = 16 * 3072 / 2; // = 24576 static constexpr int kHeads = 16; - static constexpr int kKVHeads = 16; // standard MHA - static constexpr int kQKVDim = 256; // query size == key size == value size + static constexpr int kKVHeads = 16; // standard MHA, no GQA or MQA + static constexpr int kQKVDim = 256; // query size == key size == value size static constexpr int kTopK = 1; }; @@ -46,13 +41,13 @@ struct 
ConfigGemma2B { static constexpr int kVocabSize = 256128; static constexpr int kLayers = 18; static constexpr int kModelDim = 2048; - static constexpr int kFFHiddenDim = 16 * 2048 / 2; // = 16384 + static constexpr int kFFHiddenDim = 16 * 2048 / 2; // = 16384 static constexpr int kHeads = 8; static constexpr int kKVHeads = 8; // TODO(austinvhuang): add MQA support - static constexpr int kQKVDim = 256; // query size == key size == value size + static constexpr int kQKVDim = 256; // query size == key size == value size static constexpr int kTopK = 1; }; -} // namespace gcpp +} // namespace gcpp -#endif // THIRD_PARTY_GEMMA_CPP_CONFIGS_H_ +#endif // THIRD_PARTY_GEMMA_CPP_CONFIGS_H_ diff --git a/gemma.h b/gemma.h index f2a130e..5dc9f62 100644 --- a/gemma.h +++ b/gemma.h @@ -25,14 +25,14 @@ #include // copybara:import_next_line:gemma_cpp -#include "compression/compress.h" // SfpStream/NuqStream +#include "compression/compress.h" // SfpStream/NuqStream // copybara:import_next_line:gemma_cpp -#include "configs.h" // kSeqLen +#include "configs.h" // kSeqLen // copybara:import_next_line:gemma_cpp +#include "util/args.h" // ArgsBase #include "hwy/aligned_allocator.h" -#include "hwy/base.h" // hwy::bfloat16_t +#include "hwy/base.h" // hwy::bfloat16_t #include "hwy/contrib/thread_pool/thread_pool.h" -#include "util/args.h" // ArgsBase // copybara:import_next_line:sentencepiece #include "src/sentencepiece_processor.h" @@ -42,7 +42,7 @@ namespace gcpp { // float, hwy::bfloat16_t, SfpStream, NuqStream #ifndef GEMMA_WEIGHT_T #define GEMMA_WEIGHT_T SfpStream -#endif // !GEMMA_WEIGHT_T +#endif // !GEMMA_WEIGHT_T using WeightT = GEMMA_WEIGHT_T; using EmbedderInputT = hwy::bfloat16_t; @@ -51,9 +51,9 @@ constexpr bool kSystemPrompt = false; struct KVCache { hwy::AlignedFreeUniquePtr - key_cache; // batch_size * kSeqLen * kLayers * kKVHeads * kQKVDim + key_cache; // batch_size * kSeqLen * kLayers * kKVHeads * kQKVDim hwy::AlignedFreeUniquePtr - value_cache; // batch_size * kSeqLen * 
kLayers * kKVHeads * kQKVDim + value_cache; // batch_size * kSeqLen * kLayers * kKVHeads * kQKVDim }; // Model variants: see configs.h for details. @@ -61,9 +61,9 @@ enum class Model { GEMMA_2B, GEMMA_7B }; enum class ModelTraining { GEMMA_IT, GEMMA_PT }; struct LoaderArgs : public ArgsBase { - LoaderArgs(int argc, char *argv[]) { InitAndParse(argc, argv); } + LoaderArgs(int argc, char* argv[]) { InitAndParse(argc, argv); } - static std::string ToLower(const std::string &text) { + static std::string ToLower(const std::string& text) { std::string result = text; std::transform(begin(result), end(result), begin(result), [](unsigned char c) { return std::tolower(c); }); @@ -89,7 +89,7 @@ struct LoaderArgs : public ArgsBase { } // Returns error string or nullptr if OK. - const char *Validate() const { + const char* Validate() const { const std::string model_type_lc = ToLower(model_type); if (model_type_lc != "2b-pt" && model_type_lc != "7b-pt" && model_type_lc != "2b-it" && model_type_lc != "7b-it") { @@ -111,11 +111,12 @@ struct LoaderArgs : public ArgsBase { } Path tokenizer; - Path model; // uncompressed weights OR - Path cache; // compressed weights + Path model; // uncompressed weights OR + Path cache; // compressed weights std::string model_type; - template void ForEach(const Visitor &visitor) { + template + void ForEach(const Visitor& visitor) { visitor(tokenizer, "tokenizer", Path(), "Path name of tokenizer model file. (required)"); visitor( @@ -138,10 +139,10 @@ struct LoaderArgs : public ArgsBase { struct GemmaInterface; struct Gemma { - Gemma(const LoaderArgs &args, hwy::ThreadPool &pool); - ~Gemma(); // must be defined after GemmaInterface's dtor is defined. + Gemma(const LoaderArgs& args, hwy::ThreadPool& pool); + ~Gemma(); // must be defined after GemmaInterface's dtor is defined. 
- const sentencepiece::SentencePieceProcessor &Tokenizer() const; + const sentencepiece::SentencePieceProcessor& Tokenizer() const; std::unique_ptr impl_; gcpp::ModelTraining model_training; @@ -153,7 +154,7 @@ using StreamFunc = std::function; using AcceptFunc = std::function; struct InferenceArgs : public ArgsBase { - InferenceArgs(int argc, char *argv[]) { InitAndParse(argc, argv); } + InferenceArgs(int argc, char* argv[]) { InitAndParse(argc, argv); } size_t max_tokens; size_t max_generated_tokens; @@ -163,7 +164,7 @@ struct InferenceArgs : public ArgsBase { bool multiturn; // Returns error string or nullptr if OK. - const char *Validate() const { + const char* Validate() const { if (max_tokens > gcpp::kSeqLen) { return "max_tokens is larger than the maximum sequence length (see " "configs.h)."; @@ -175,7 +176,8 @@ struct InferenceArgs : public ArgsBase { return nullptr; } - template void ForEach(const Visitor &visitor) { + template + void ForEach(const Visitor& visitor) { visitor(max_tokens, "max_tokens", size_t{3072}, "Maximum number of tokens in prompt + generation."); visitor(max_generated_tokens, "max_generated_tokens", size_t{2048}, @@ -184,21 +186,22 @@ struct InferenceArgs : public ArgsBase { visitor(temperature, "temperature", 1.0f, "Temperature for top-K", 2); visitor(deterministic, "deterministic", false, "Make top-k sampling deterministic", 2); - visitor(multiturn, "multiturn", false, + visitor(multiturn, "multiturn", true, "Multiturn mode (if 0, this clears the KV cache after every " - "interaction without quitting)\n Default = 0 (conversation resets every turn)"); + "interaction without quitting)", + 2); } }; -void GenerateGemma(Gemma &gemma, const InferenceArgs &args, - const std::vector &prompt, size_t start_pos, - hwy::ThreadPool &pool, hwy::ThreadPool &inner_pool, - const StreamFunc &stream_token, - const AcceptFunc &accept_token, std::mt19937 &g, +void GenerateGemma(Gemma& gemma, const InferenceArgs& args, + const std::vector& prompt, size_t 
start_pos, + hwy::ThreadPool& pool, hwy::ThreadPool& inner_pool, + const StreamFunc& stream_token, + const AcceptFunc& accept_token, std::mt19937& g, int verbosity); constexpr int EOS_ID = 1; -} // namespace gcpp +} // namespace gcpp -#endif // THIRD_PARTY_GEMMA_CPP_GEMMA_H_ +#endif // THIRD_PARTY_GEMMA_CPP_GEMMA_H_ From afc354dcb19574fe1ddafaf7ffadb9292b3871ab Mon Sep 17 00:00:00 2001 From: Dan Zheng Date: Mon, 26 Feb 2024 19:04:33 -0800 Subject: [PATCH 15/26] Import from GitHub. PiperOrigin-RevId: 610595796 --- DEVELOPERS.md | 18 ++++++++++++++++++ README.md | 8 ++++++-- configs.h | 21 +++++++++++++-------- gemma.h | 51 ++++++++++++++++++++++++--------------------------- 4 files changed, 61 insertions(+), 37 deletions(-) diff --git a/DEVELOPERS.md b/DEVELOPERS.md index d06b0f8..bdc02c0 100644 --- a/DEVELOPERS.md +++ b/DEVELOPERS.md @@ -70,3 +70,21 @@ The implementation code is roughly split into 4 layers, from high to low level: 4. Backend (`highway`) - Low-level hardware interface (SIMD in the case of highway) supporting the implementations in (3). + +## Compile-Time Flags (Advanced) + +There are several compile-time flags to be aware of (note these may or may not +be exposed to the build system): + +- `GEMMA_WEIGHT_T` : Sets the level of compression for weights (surfaced as + WEIGHT_TYPE in CMakeLists.txt). Currently this should be set to `SfpStream` + (default, if no flag is specified) for 8-bit SFP, or `hwy::bfloat16_t` to + enable for higher-fidelity (but slower) bfloat16 support. This is defined in + `gemma.h`. +- `GEMMA_MAX_SEQ_LEN` : Sets maximum sequence length to preallocate for the KV + Cache. The default is 4096 tokens but can be overridden. This is not exposed + through `CMakeLists.txt` yet. + +In the medium term both of these will likely be deprecated in favor of handling +options at runtime - allowing for multiple weight compression schemes in a single +build and dynamically resizes the KV cache as needed. 
diff --git a/README.md b/README.md index 5932726..8db6862 100644 --- a/README.md +++ b/README.md @@ -114,8 +114,12 @@ convenient directory location (e.g. the `build/` directory in this repo). The build system uses [CMake](https://cmake.org/). To build the gemma inference runtime, create a build directory and generate the build files using `cmake` -from the top-level project directory. For the 8-bit switched floating point -weights (sfp), run cmake with no options: +from the top-level project directory. Note if you previous ran `cmake` and are +re-running with a different setting, be sure to clean out the `build/` directory +with `rm -rf build/*` (warning this will delete any other files in the `build/` +directory. + +For the 8-bit switched floating point weights (sfp), run cmake with no options: #### Unix-like Platforms ```sh diff --git a/configs.h b/configs.h index ebe6220..4be5f75 100644 --- a/configs.h +++ b/configs.h @@ -18,21 +18,26 @@ #ifndef THIRD_PARTY_GEMMA_CPP_CONFIGS_H_ #define THIRD_PARTY_GEMMA_CPP_CONFIGS_H_ +// Allow changing pre-allocated kv cache size as a compiler flag +#ifndef GEMMA_MAX_SEQLEN +#define GEMMA_MAX_SEQLEN 4096 +#endif // !GEMMA_MAX_SEQLEN + #include namespace gcpp { -static constexpr size_t kSeqLen = 7168; +static constexpr size_t kSeqLen = GEMMA_MAX_SEQLEN; struct ConfigGemma7B { static constexpr int kSeqLen = gcpp::kSeqLen; static constexpr int kVocabSize = 256128; static constexpr int kLayers = 28; static constexpr int kModelDim = 3072; - static constexpr int kFFHiddenDim = 16 * 3072 / 2; // = 24576 + static constexpr int kFFHiddenDim = 16 * 3072 / 2; // = 24576 static constexpr int kHeads = 16; - static constexpr int kKVHeads = 16; // standard MHA, no GQA or MQA - static constexpr int kQKVDim = 256; // query size == key size == value size + static constexpr int kKVHeads = 16; // standard MHA + static constexpr int kQKVDim = 256; // query size == key size == value size static constexpr int kTopK = 1; }; @@ -41,13 +46,13 @@ struct 
ConfigGemma2B { static constexpr int kVocabSize = 256128; static constexpr int kLayers = 18; static constexpr int kModelDim = 2048; - static constexpr int kFFHiddenDim = 16 * 2048 / 2; // = 16384 + static constexpr int kFFHiddenDim = 16 * 2048 / 2; // = 16384 static constexpr int kHeads = 8; static constexpr int kKVHeads = 8; // TODO(austinvhuang): add MQA support - static constexpr int kQKVDim = 256; // query size == key size == value size + static constexpr int kQKVDim = 256; // query size == key size == value size static constexpr int kTopK = 1; }; -} // namespace gcpp +} // namespace gcpp -#endif // THIRD_PARTY_GEMMA_CPP_CONFIGS_H_ +#endif // THIRD_PARTY_GEMMA_CPP_CONFIGS_H_ diff --git a/gemma.h b/gemma.h index 5dc9f62..1e76a37 100644 --- a/gemma.h +++ b/gemma.h @@ -42,7 +42,7 @@ namespace gcpp { // float, hwy::bfloat16_t, SfpStream, NuqStream #ifndef GEMMA_WEIGHT_T #define GEMMA_WEIGHT_T SfpStream -#endif // !GEMMA_WEIGHT_T +#endif // !GEMMA_WEIGHT_T using WeightT = GEMMA_WEIGHT_T; using EmbedderInputT = hwy::bfloat16_t; @@ -51,9 +51,9 @@ constexpr bool kSystemPrompt = false; struct KVCache { hwy::AlignedFreeUniquePtr - key_cache; // batch_size * kSeqLen * kLayers * kKVHeads * kQKVDim + key_cache; // batch_size * kSeqLen * kLayers * kKVHeads * kQKVDim hwy::AlignedFreeUniquePtr - value_cache; // batch_size * kSeqLen * kLayers * kKVHeads * kQKVDim + value_cache; // batch_size * kSeqLen * kLayers * kKVHeads * kQKVDim }; // Model variants: see configs.h for details. 
@@ -61,9 +61,9 @@ enum class Model { GEMMA_2B, GEMMA_7B }; enum class ModelTraining { GEMMA_IT, GEMMA_PT }; struct LoaderArgs : public ArgsBase { - LoaderArgs(int argc, char* argv[]) { InitAndParse(argc, argv); } + LoaderArgs(int argc, char *argv[]) { InitAndParse(argc, argv); } - static std::string ToLower(const std::string& text) { + static std::string ToLower(const std::string &text) { std::string result = text; std::transform(begin(result), end(result), begin(result), [](unsigned char c) { return std::tolower(c); }); @@ -89,7 +89,7 @@ struct LoaderArgs : public ArgsBase { } // Returns error string or nullptr if OK. - const char* Validate() const { + const char *Validate() const { const std::string model_type_lc = ToLower(model_type); if (model_type_lc != "2b-pt" && model_type_lc != "7b-pt" && model_type_lc != "2b-it" && model_type_lc != "7b-it") { @@ -111,12 +111,11 @@ struct LoaderArgs : public ArgsBase { } Path tokenizer; - Path model; // uncompressed weights OR - Path cache; // compressed weights + Path model; // uncompressed weights OR + Path cache; // compressed weights std::string model_type; - template - void ForEach(const Visitor& visitor) { + template void ForEach(const Visitor &visitor) { visitor(tokenizer, "tokenizer", Path(), "Path name of tokenizer model file. (required)"); visitor( @@ -139,10 +138,10 @@ struct LoaderArgs : public ArgsBase { struct GemmaInterface; struct Gemma { - Gemma(const LoaderArgs& args, hwy::ThreadPool& pool); - ~Gemma(); // must be defined after GemmaInterface's dtor is defined. + Gemma(const LoaderArgs &args, hwy::ThreadPool &pool); + ~Gemma(); // must be defined after GemmaInterface's dtor is defined. 
- const sentencepiece::SentencePieceProcessor& Tokenizer() const; + const sentencepiece::SentencePieceProcessor &Tokenizer() const; std::unique_ptr impl_; gcpp::ModelTraining model_training; @@ -154,7 +153,7 @@ using StreamFunc = std::function; using AcceptFunc = std::function; struct InferenceArgs : public ArgsBase { - InferenceArgs(int argc, char* argv[]) { InitAndParse(argc, argv); } + InferenceArgs(int argc, char *argv[]) { InitAndParse(argc, argv); } size_t max_tokens; size_t max_generated_tokens; @@ -164,7 +163,7 @@ struct InferenceArgs : public ArgsBase { bool multiturn; // Returns error string or nullptr if OK. - const char* Validate() const { + const char *Validate() const { if (max_tokens > gcpp::kSeqLen) { return "max_tokens is larger than the maximum sequence length (see " "configs.h)."; @@ -176,8 +175,7 @@ struct InferenceArgs : public ArgsBase { return nullptr; } - template - void ForEach(const Visitor& visitor) { + template void ForEach(const Visitor &visitor) { visitor(max_tokens, "max_tokens", size_t{3072}, "Maximum number of tokens in prompt + generation."); visitor(max_generated_tokens, "max_generated_tokens", size_t{2048}, @@ -186,22 +184,21 @@ struct InferenceArgs : public ArgsBase { visitor(temperature, "temperature", 1.0f, "Temperature for top-K", 2); visitor(deterministic, "deterministic", false, "Make top-k sampling deterministic", 2); - visitor(multiturn, "multiturn", true, + visitor(multiturn, "multiturn", false, "Multiturn mode (if 0, this clears the KV cache after every " - "interaction without quitting)", - 2); + "interaction without quitting)\n Default = 0 (conversation resets every turn)"); } }; -void GenerateGemma(Gemma& gemma, const InferenceArgs& args, - const std::vector& prompt, size_t start_pos, - hwy::ThreadPool& pool, hwy::ThreadPool& inner_pool, - const StreamFunc& stream_token, - const AcceptFunc& accept_token, std::mt19937& g, +void GenerateGemma(Gemma &gemma, const InferenceArgs &args, + const std::vector &prompt, size_t 
start_pos, + hwy::ThreadPool &pool, hwy::ThreadPool &inner_pool, + const StreamFunc &stream_token, + const AcceptFunc &accept_token, std::mt19937 &g, int verbosity); constexpr int EOS_ID = 1; -} // namespace gcpp +} // namespace gcpp -#endif // THIRD_PARTY_GEMMA_CPP_GEMMA_H_ +#endif // THIRD_PARTY_GEMMA_CPP_GEMMA_H_ From b3fecef45dbc4d04aa53658347f06de5449aefef Mon Sep 17 00:00:00 2001 From: Jan Wassenberg Date: Mon, 26 Feb 2024 22:31:03 -0800 Subject: [PATCH 16/26] Warning fix: sign cast PiperOrigin-RevId: 610635789 --- compression/stats.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/compression/stats.cc b/compression/stats.cc index 2013422..8e66119 100644 --- a/compression/stats.cc +++ b/compression/stats.cc @@ -114,7 +114,7 @@ std::string Stats::ToString(int exclude) const { pos += ret; } - HWY_ASSERT(pos < sizeof(buf)); + HWY_ASSERT(pos < static_cast(sizeof(buf))); return buf; } From 179ecf9e7852afbb984de6b13f410559ad464c26 Mon Sep 17 00:00:00 2001 From: Jan Wassenberg Date: Mon, 26 Feb 2024 22:45:39 -0800 Subject: [PATCH 17/26] Warn instead of assert for setaffinity. Fixes #49 PiperOrigin-RevId: 610638517 --- util/app.h | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/util/app.h b/util/app.h index bd665a4..5cd316d 100644 --- a/util/app.h +++ b/util/app.h @@ -20,8 +20,11 @@ #if HWY_OS_LINUX #include + +#include // IDE does not recognize errno.h as providing errno. #endif #include +#include #include // std::clamp #include // NOLINT> @@ -38,7 +41,13 @@ static inline void PinThreadToCore(size_t cpu_index) { cpu_set_t cset; // bit array CPU_ZERO(&cset); // clear all CPU_SET(cpu_index, &cset); // set bit indicating which processor to run on. - HWY_ASSERT(0 == sched_setaffinity(0, sizeof(cset), &cset)); + const int err = sched_setaffinity(0, sizeof(cset), &cset); + if (err != 0) { + fprintf(stderr, + "sched_setaffinity returned %d, errno %d. 
Can happen if running in " + "a container; this warning is safe to ignore.\n", + err, errno); + } #else (void)cpu_index; #endif From 9cdc9223bce51a88de74022f33666309556f14c6 Mon Sep 17 00:00:00 2001 From: austinvhuang Date: Tue, 27 Feb 2024 14:22:02 -0500 Subject: [PATCH 18/26] clean up formatting after 129e66ada2b4e461bdf28b88b70cd2465cb213e4, add .clang-format defaults, minor updates to DEVELOPERS doc --- .clang-format | 235 ++++++++++++++++++++++++++++++++++++++++++++++++++ DEVELOPERS.md | 12 +++ configs.h | 18 ++-- gemma.h | 51 +++++------ 4 files changed, 282 insertions(+), 34 deletions(-) create mode 100644 .clang-format diff --git a/.clang-format b/.clang-format new file mode 100644 index 0000000..c8f8dba --- /dev/null +++ b/.clang-format @@ -0,0 +1,235 @@ +--- +Language: Cpp +AccessModifierOffset: -2 +AlignAfterOpenBracket: Align +AlignArrayOfStructures: None +AlignConsecutiveAssignments: + Enabled: false + AcrossEmptyLines: false + AcrossComments: false + AlignCompound: false + PadOperators: true +AlignConsecutiveBitFields: + Enabled: false + AcrossEmptyLines: false + AcrossComments: false + AlignCompound: false + PadOperators: false +AlignConsecutiveDeclarations: + Enabled: false + AcrossEmptyLines: false + AcrossComments: false + AlignCompound: false + PadOperators: false +AlignConsecutiveMacros: + Enabled: false + AcrossEmptyLines: false + AcrossComments: false + AlignCompound: false + PadOperators: false +AlignConsecutiveShortCaseStatements: + Enabled: false + AcrossEmptyLines: false + AcrossComments: false + AlignCaseColons: false +AlignEscapedNewlines: Right +AlignOperands: Align +AlignTrailingComments: + Kind: Always + OverEmptyLines: 0 +AllowAllArgumentsOnNextLine: true +AllowAllParametersOfDeclarationOnNextLine: true +AllowShortBlocksOnASingleLine: Never +AllowShortCaseLabelsOnASingleLine: false +AllowShortEnumsOnASingleLine: true +AllowShortFunctionsOnASingleLine: All +AllowShortIfStatementsOnASingleLine: Never +AllowShortLambdasOnASingleLine: 
All +AllowShortLoopsOnASingleLine: false +AlwaysBreakAfterDefinitionReturnType: None +AlwaysBreakAfterReturnType: None +AlwaysBreakBeforeMultilineStrings: false +AlwaysBreakTemplateDeclarations: MultiLine +AttributeMacros: + - __capability +BinPackArguments: true +BinPackParameters: true +BitFieldColonSpacing: Both +BraceWrapping: + AfterCaseLabel: false + AfterClass: false + AfterControlStatement: Never + AfterEnum: false + AfterExternBlock: false + AfterFunction: false + AfterNamespace: false + AfterObjCDeclaration: false + AfterStruct: false + AfterUnion: false + BeforeCatch: false + BeforeElse: false + BeforeLambdaBody: false + BeforeWhile: false + IndentBraces: false + SplitEmptyFunction: true + SplitEmptyRecord: true + SplitEmptyNamespace: true +BreakAfterAttributes: Never +BreakAfterJavaFieldAnnotations: false +BreakArrays: true +BreakBeforeBinaryOperators: None +BreakBeforeConceptDeclarations: Always +BreakBeforeBraces: Attach +BreakBeforeInlineASMColon: OnlyMultiline +BreakBeforeTernaryOperators: true +BreakConstructorInitializers: BeforeColon +BreakInheritanceList: BeforeColon +BreakStringLiterals: true +ColumnLimit: 80 +CommentPragmas: '^ IWYU pragma:' +CompactNamespaces: false +ConstructorInitializerIndentWidth: 4 +ContinuationIndentWidth: 4 +Cpp11BracedListStyle: true +DerivePointerAlignment: false +DisableFormat: false +EmptyLineAfterAccessModifier: Never +EmptyLineBeforeAccessModifier: LogicalBlock +ExperimentalAutoDetectBinPacking: false +FixNamespaceComments: true +ForEachMacros: + - foreach + - Q_FOREACH + - BOOST_FOREACH +IfMacros: + - KJ_IF_MAYBE +IncludeBlocks: Preserve +IncludeCategories: + - Regex: '^"(llvm|llvm-c|clang|clang-c)/' + Priority: 2 + SortPriority: 0 + CaseSensitive: false + - Regex: '^(<|"(gtest|gmock|isl|json)/)' + Priority: 3 + SortPriority: 0 + CaseSensitive: false + - Regex: '.*' + Priority: 1 + SortPriority: 0 + CaseSensitive: false +IncludeIsMainRegex: '(Test)?$' +IncludeIsMainSourceRegex: '' +IndentAccessModifiers: false 
+IndentCaseBlocks: false +IndentCaseLabels: false +IndentExternBlock: AfterExternBlock +IndentGotoLabels: true +IndentPPDirectives: None +IndentRequiresClause: true +IndentWidth: 2 +IndentWrappedFunctionNames: false +InsertBraces: false +InsertNewlineAtEOF: false +InsertTrailingCommas: None +IntegerLiteralSeparator: + Binary: 0 + BinaryMinDigits: 0 + Decimal: 0 + DecimalMinDigits: 0 + Hex: 0 + HexMinDigits: 0 +JavaScriptQuotes: Leave +JavaScriptWrapImports: true +KeepEmptyLinesAtTheStartOfBlocks: true +KeepEmptyLinesAtEOF: false +LambdaBodyIndentation: Signature +LineEnding: DeriveLF +MacroBlockBegin: '' +MacroBlockEnd: '' +MaxEmptyLinesToKeep: 1 +NamespaceIndentation: None +ObjCBinPackProtocolList: Auto +ObjCBlockIndentWidth: 2 +ObjCBreakBeforeNestedBlockParam: true +ObjCSpaceAfterProperty: false +ObjCSpaceBeforeProtocolList: true +PackConstructorInitializers: BinPack +PenaltyBreakAssignment: 2 +PenaltyBreakBeforeFirstCallParameter: 19 +PenaltyBreakComment: 300 +PenaltyBreakFirstLessLess: 120 +PenaltyBreakOpenParenthesis: 0 +PenaltyBreakString: 1000 +PenaltyBreakTemplateDeclaration: 10 +PenaltyExcessCharacter: 1000000 +PenaltyIndentedWhitespace: 0 +PenaltyReturnTypeOnItsOwnLine: 60 +PointerAlignment: Left +PPIndentWidth: -1 +QualifierAlignment: Leave +ReferenceAlignment: Left +ReflowComments: true +RemoveBracesLLVM: false +RemoveParentheses: Leave +RemoveSemicolon: false +RequiresClausePosition: OwnLine +RequiresExpressionIndentation: OuterScope +SeparateDefinitionBlocks: Leave +ShortNamespaceLines: 1 +SortIncludes: CaseSensitive +SortJavaStaticImport: Before +SortUsingDeclarations: LexicographicNumeric +SpaceAfterCStyleCast: false +SpaceAfterLogicalNot: false +SpaceAfterTemplateKeyword: true +SpaceAroundPointerQualifiers: Default +SpaceBeforeAssignmentOperators: true +SpaceBeforeCaseColon: false +SpaceBeforeCpp11BracedList: false +SpaceBeforeCtorInitializerColon: true +SpaceBeforeInheritanceColon: true +SpaceBeforeJsonColon: false +SpaceBeforeParens: 
ControlStatements +SpaceBeforeParensOptions: + AfterControlStatements: true + AfterForeachMacros: true + AfterFunctionDefinitionName: false + AfterFunctionDeclarationName: false + AfterIfMacros: true + AfterOverloadedOperator: false + AfterRequiresInClause: false + AfterRequiresInExpression: false + BeforeNonEmptyParentheses: false +SpaceBeforeRangeBasedForLoopColon: true +SpaceBeforeSquareBrackets: false +SpaceInEmptyBlock: false +SpacesBeforeTrailingComments: 2 +SpacesInAngles: Never +SpacesInContainerLiterals: true +SpacesInLineCommentPrefix: + Minimum: 1 + Maximum: -1 +SpacesInParens: Never +SpacesInParensOptions: + InCStyleCasts: false + InConditionalStatements: false + InEmptyParentheses: false + Other: false +SpacesInSquareBrackets: false +Standard: Latest +StatementAttributeLikeMacros: + - Q_EMIT +StatementMacros: + - Q_UNUSED + - QT_REQUIRE_VERSION +TabWidth: 8 +UseTab: Never +VerilogBreakBetweenInstancePorts: true +WhitespaceSensitiveMacros: + - BOOST_PP_STRINGIZE + - CF_SWIFT_NAME + - NS_SWIFT_NAME + - PP_STRINGIZE + - STRINGIZE +... + diff --git a/DEVELOPERS.md b/DEVELOPERS.md index bdc02c0..7aad9d8 100644 --- a/DEVELOPERS.md +++ b/DEVELOPERS.md @@ -71,6 +71,18 @@ The implementation code is roughly split into 4 layers, from high to low level: 4. Backend (`highway`) - Low-level hardware interface (SIMD in the case of highway) supporting the implementations in (3). +Besides these layers, supporting utilities are: + +- `compression/` - model compression operations. the 8-bit switched floating + point model conversion is here. +- `util/` - command line argument handling and any other utilities. + +## Style and Formatting + +A `.clang-format` configuration is provided with our defaults, please run source +files through `clang-format` (or a formatter that produces equivalent behavior) +before finalizing PR for submission. 
+ ## Compile-Time Flags (Advanced) There are several compile-time flags to be aware of (note these may or may not diff --git a/configs.h b/configs.h index 4be5f75..bf25596 100644 --- a/configs.h +++ b/configs.h @@ -21,7 +21,7 @@ // Allow changing pre-allocated kv cache size as a compiler flag #ifndef GEMMA_MAX_SEQLEN #define GEMMA_MAX_SEQLEN 4096 -#endif // !GEMMA_MAX_SEQLEN +#endif // !GEMMA_MAX_SEQLEN #include @@ -34,10 +34,10 @@ struct ConfigGemma7B { static constexpr int kVocabSize = 256128; static constexpr int kLayers = 28; static constexpr int kModelDim = 3072; - static constexpr int kFFHiddenDim = 16 * 3072 / 2; // = 24576 + static constexpr int kFFHiddenDim = 16 * 3072 / 2; // = 24576 static constexpr int kHeads = 16; - static constexpr int kKVHeads = 16; // standard MHA - static constexpr int kQKVDim = 256; // query size == key size == value size + static constexpr int kKVHeads = 16; // standard MHA + static constexpr int kQKVDim = 256; // query size == key size == value size static constexpr int kTopK = 1; }; @@ -46,13 +46,13 @@ struct ConfigGemma2B { static constexpr int kVocabSize = 256128; static constexpr int kLayers = 18; static constexpr int kModelDim = 2048; - static constexpr int kFFHiddenDim = 16 * 2048 / 2; // = 16384 + static constexpr int kFFHiddenDim = 16 * 2048 / 2; // = 16384 static constexpr int kHeads = 8; - static constexpr int kKVHeads = 8; // TODO(austinvhuang): add MQA support - static constexpr int kQKVDim = 256; // query size == key size == value size + static constexpr int kKVHeads = 8; // TODO(austinvhuang): add MQA support + static constexpr int kQKVDim = 256; // query size == key size == value size static constexpr int kTopK = 1; }; -} // namespace gcpp +} // namespace gcpp -#endif // THIRD_PARTY_GEMMA_CPP_CONFIGS_H_ +#endif // THIRD_PARTY_GEMMA_CPP_CONFIGS_H_ diff --git a/gemma.h b/gemma.h index 1e76a37..12c2a77 100644 --- a/gemma.h +++ b/gemma.h @@ -27,12 +27,12 @@ // copybara:import_next_line:gemma_cpp #include 
"compression/compress.h" // SfpStream/NuqStream // copybara:import_next_line:gemma_cpp -#include "configs.h" // kSeqLen +#include "configs.h" // kSeqLen // copybara:import_next_line:gemma_cpp -#include "util/args.h" // ArgsBase #include "hwy/aligned_allocator.h" #include "hwy/base.h" // hwy::bfloat16_t #include "hwy/contrib/thread_pool/thread_pool.h" +#include "util/args.h" // ArgsBase // copybara:import_next_line:sentencepiece #include "src/sentencepiece_processor.h" @@ -42,7 +42,7 @@ namespace gcpp { // float, hwy::bfloat16_t, SfpStream, NuqStream #ifndef GEMMA_WEIGHT_T #define GEMMA_WEIGHT_T SfpStream -#endif // !GEMMA_WEIGHT_T +#endif // !GEMMA_WEIGHT_T using WeightT = GEMMA_WEIGHT_T; using EmbedderInputT = hwy::bfloat16_t; @@ -51,9 +51,9 @@ constexpr bool kSystemPrompt = false; struct KVCache { hwy::AlignedFreeUniquePtr - key_cache; // batch_size * kSeqLen * kLayers * kKVHeads * kQKVDim + key_cache; // batch_size * kSeqLen * kLayers * kKVHeads * kQKVDim hwy::AlignedFreeUniquePtr - value_cache; // batch_size * kSeqLen * kLayers * kKVHeads * kQKVDim + value_cache; // batch_size * kSeqLen * kLayers * kKVHeads * kQKVDim }; // Model variants: see configs.h for details. @@ -61,9 +61,9 @@ enum class Model { GEMMA_2B, GEMMA_7B }; enum class ModelTraining { GEMMA_IT, GEMMA_PT }; struct LoaderArgs : public ArgsBase { - LoaderArgs(int argc, char *argv[]) { InitAndParse(argc, argv); } + LoaderArgs(int argc, char* argv[]) { InitAndParse(argc, argv); } - static std::string ToLower(const std::string &text) { + static std::string ToLower(const std::string& text) { std::string result = text; std::transform(begin(result), end(result), begin(result), [](unsigned char c) { return std::tolower(c); }); @@ -89,7 +89,7 @@ struct LoaderArgs : public ArgsBase { } // Returns error string or nullptr if OK. 
- const char *Validate() const { + const char* Validate() const { const std::string model_type_lc = ToLower(model_type); if (model_type_lc != "2b-pt" && model_type_lc != "7b-pt" && model_type_lc != "2b-it" && model_type_lc != "7b-it") { @@ -111,11 +111,11 @@ struct LoaderArgs : public ArgsBase { } Path tokenizer; - Path model; // uncompressed weights OR - Path cache; // compressed weights + Path model; // uncompressed weights OR + Path cache; // compressed weights std::string model_type; - template void ForEach(const Visitor &visitor) { + template void ForEach(const Visitor& visitor) { visitor(tokenizer, "tokenizer", Path(), "Path name of tokenizer model file. (required)"); visitor( @@ -138,10 +138,10 @@ struct LoaderArgs : public ArgsBase { struct GemmaInterface; struct Gemma { - Gemma(const LoaderArgs &args, hwy::ThreadPool &pool); - ~Gemma(); // must be defined after GemmaInterface's dtor is defined. + Gemma(const LoaderArgs& args, hwy::ThreadPool& pool); + ~Gemma(); // must be defined after GemmaInterface's dtor is defined. - const sentencepiece::SentencePieceProcessor &Tokenizer() const; + const sentencepiece::SentencePieceProcessor& Tokenizer() const; std::unique_ptr impl_; gcpp::ModelTraining model_training; @@ -153,7 +153,7 @@ using StreamFunc = std::function; using AcceptFunc = std::function; struct InferenceArgs : public ArgsBase { - InferenceArgs(int argc, char *argv[]) { InitAndParse(argc, argv); } + InferenceArgs(int argc, char* argv[]) { InitAndParse(argc, argv); } size_t max_tokens; size_t max_generated_tokens; @@ -163,7 +163,7 @@ struct InferenceArgs : public ArgsBase { bool multiturn; // Returns error string or nullptr if OK. 
- const char *Validate() const { + const char* Validate() const { if (max_tokens > gcpp::kSeqLen) { return "max_tokens is larger than the maximum sequence length (see " "configs.h)."; @@ -175,7 +175,7 @@ struct InferenceArgs : public ArgsBase { return nullptr; } - template void ForEach(const Visitor &visitor) { + template void ForEach(const Visitor& visitor) { visitor(max_tokens, "max_tokens", size_t{3072}, "Maximum number of tokens in prompt + generation."); visitor(max_generated_tokens, "max_generated_tokens", size_t{2048}, @@ -186,19 +186,20 @@ struct InferenceArgs : public ArgsBase { "Make top-k sampling deterministic", 2); visitor(multiturn, "multiturn", false, "Multiturn mode (if 0, this clears the KV cache after every " - "interaction without quitting)\n Default = 0 (conversation resets every turn)"); + "interaction without quitting)\n Default = 0 (conversation " + "resets every turn)"); } }; -void GenerateGemma(Gemma &gemma, const InferenceArgs &args, - const std::vector &prompt, size_t start_pos, - hwy::ThreadPool &pool, hwy::ThreadPool &inner_pool, - const StreamFunc &stream_token, - const AcceptFunc &accept_token, std::mt19937 &g, +void GenerateGemma(Gemma& gemma, const InferenceArgs& args, + const std::vector& prompt, size_t start_pos, + hwy::ThreadPool& pool, hwy::ThreadPool& inner_pool, + const StreamFunc& stream_token, + const AcceptFunc& accept_token, std::mt19937& g, int verbosity); constexpr int EOS_ID = 1; -} // namespace gcpp +} // namespace gcpp -#endif // THIRD_PARTY_GEMMA_CPP_GEMMA_H_ +#endif // THIRD_PARTY_GEMMA_CPP_GEMMA_H_ From 874deee3028716aa5c89c6e8d903fb10904ef1dc Mon Sep 17 00:00:00 2001 From: Dan Zheng Date: Tue, 27 Feb 2024 11:32:33 -0800 Subject: [PATCH 19/26] Update DEVELOPERS.md --- DEVELOPERS.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DEVELOPERS.md b/DEVELOPERS.md index 7aad9d8..557670a 100644 --- a/DEVELOPERS.md +++ b/DEVELOPERS.md @@ -73,7 +73,7 @@ The implementation code is roughly split into 4 
layers, from high to low level: Besides these layers, supporting utilities are: -- `compression/` - model compression operations. the 8-bit switched floating +- `compression/` - model compression operations. The 8-bit switched floating point model conversion is here. - `util/` - command line argument handling and any other utilities. From f70d2de16f8acf7cad78036aa9a24e1e6c441b59 Mon Sep 17 00:00:00 2001 From: austinvhuang Date: Tue, 27 Feb 2024 15:44:03 -0500 Subject: [PATCH 20/26] use `style=Google` - dumped for .clang-format, gemma.h updated --- .clang-format | 81 ++++++++++++++++++++++++++++++++++++--------------- gemma.h | 6 ++-- 2 files changed, 62 insertions(+), 25 deletions(-) diff --git a/.clang-format b/.clang-format index c8f8dba..523dc01 100644 --- a/.clang-format +++ b/.clang-format @@ -1,6 +1,7 @@ --- Language: Cpp -AccessModifierOffset: -2 +# BasedOnStyle: Google +AccessModifierOffset: -1 AlignAfterOpenBracket: Align AlignArrayOfStructures: None AlignConsecutiveAssignments: @@ -32,7 +33,7 @@ AlignConsecutiveShortCaseStatements: AcrossEmptyLines: false AcrossComments: false AlignCaseColons: false -AlignEscapedNewlines: Right +AlignEscapedNewlines: Left AlignOperands: Align AlignTrailingComments: Kind: Always @@ -43,13 +44,13 @@ AllowShortBlocksOnASingleLine: Never AllowShortCaseLabelsOnASingleLine: false AllowShortEnumsOnASingleLine: true AllowShortFunctionsOnASingleLine: All -AllowShortIfStatementsOnASingleLine: Never +AllowShortIfStatementsOnASingleLine: WithoutElse AllowShortLambdasOnASingleLine: All -AllowShortLoopsOnASingleLine: false +AllowShortLoopsOnASingleLine: true AlwaysBreakAfterDefinitionReturnType: None AlwaysBreakAfterReturnType: None -AlwaysBreakBeforeMultilineStrings: false -AlwaysBreakTemplateDeclarations: MultiLine +AlwaysBreakBeforeMultilineStrings: true +AlwaysBreakTemplateDeclarations: Yes AttributeMacros: - __capability BinPackArguments: true @@ -91,7 +92,7 @@ CompactNamespaces: false ConstructorInitializerIndentWidth: 4 
ContinuationIndentWidth: 4 Cpp11BracedListStyle: true -DerivePointerAlignment: false +DerivePointerAlignment: true DisableFormat: false EmptyLineAfterAccessModifier: Never EmptyLineBeforeAccessModifier: LogicalBlock @@ -103,25 +104,29 @@ ForEachMacros: - BOOST_FOREACH IfMacros: - KJ_IF_MAYBE -IncludeBlocks: Preserve +IncludeBlocks: Regroup IncludeCategories: - - Regex: '^"(llvm|llvm-c|clang|clang-c)/' + - Regex: '^' Priority: 2 SortPriority: 0 CaseSensitive: false - - Regex: '^(<|"(gtest|gmock|isl|json)/)' - Priority: 3 - SortPriority: 0 - CaseSensitive: false - - Regex: '.*' + - Regex: '^<.*\.h>' Priority: 1 SortPriority: 0 CaseSensitive: false -IncludeIsMainRegex: '(Test)?$' + - Regex: '^<.*' + Priority: 2 + SortPriority: 0 + CaseSensitive: false + - Regex: '.*' + Priority: 3 + SortPriority: 0 + CaseSensitive: false +IncludeIsMainRegex: '([-_](test|unittest))?$' IncludeIsMainSourceRegex: '' IndentAccessModifiers: false IndentCaseBlocks: false -IndentCaseLabels: false +IndentCaseLabels: true IndentExternBlock: AfterExternBlock IndentGotoLabels: true IndentPPDirectives: None @@ -140,7 +145,7 @@ IntegerLiteralSeparator: HexMinDigits: 0 JavaScriptQuotes: Leave JavaScriptWrapImports: true -KeepEmptyLinesAtTheStartOfBlocks: true +KeepEmptyLinesAtTheStartOfBlocks: false KeepEmptyLinesAtEOF: false LambdaBodyIndentation: Signature LineEnding: DeriveLF @@ -148,14 +153,14 @@ MacroBlockBegin: '' MacroBlockEnd: '' MaxEmptyLinesToKeep: 1 NamespaceIndentation: None -ObjCBinPackProtocolList: Auto +ObjCBinPackProtocolList: Never ObjCBlockIndentWidth: 2 ObjCBreakBeforeNestedBlockParam: true ObjCSpaceAfterProperty: false ObjCSpaceBeforeProtocolList: true -PackConstructorInitializers: BinPack +PackConstructorInitializers: NextLine PenaltyBreakAssignment: 2 -PenaltyBreakBeforeFirstCallParameter: 19 +PenaltyBreakBeforeFirstCallParameter: 1 PenaltyBreakComment: 300 PenaltyBreakFirstLessLess: 120 PenaltyBreakOpenParenthesis: 0 @@ -163,11 +168,41 @@ PenaltyBreakString: 1000 
PenaltyBreakTemplateDeclaration: 10 PenaltyExcessCharacter: 1000000 PenaltyIndentedWhitespace: 0 -PenaltyReturnTypeOnItsOwnLine: 60 +PenaltyReturnTypeOnItsOwnLine: 200 PointerAlignment: Left PPIndentWidth: -1 QualifierAlignment: Leave -ReferenceAlignment: Left +RawStringFormats: + - Language: Cpp + Delimiters: + - cc + - CC + - cpp + - Cpp + - CPP + - 'c++' + - 'C++' + CanonicalDelimiter: '' + BasedOnStyle: google + - Language: TextProto + Delimiters: + - pb + - PB + - proto + - PROTO + EnclosingFunctions: + - EqualsProto + - EquivToProto + - PARSE_PARTIAL_TEXT_PROTO + - PARSE_TEST_PROTO + - PARSE_TEXT_PROTO + - ParseTextOrDie + - ParseTextProtoOrDie + - ParseTestProto + - ParsePartialTestProto + CanonicalDelimiter: pb + BasedOnStyle: google +ReferenceAlignment: Pointer ReflowComments: true RemoveBracesLLVM: false RemoveParentheses: Leave @@ -216,7 +251,7 @@ SpacesInParensOptions: InEmptyParentheses: false Other: false SpacesInSquareBrackets: false -Standard: Latest +Standard: Auto StatementAttributeLikeMacros: - Q_EMIT StatementMacros: diff --git a/gemma.h b/gemma.h index 12c2a77..a218878 100644 --- a/gemma.h +++ b/gemma.h @@ -115,7 +115,8 @@ struct LoaderArgs : public ArgsBase { Path cache; // compressed weights std::string model_type; - template void ForEach(const Visitor& visitor) { + template + void ForEach(const Visitor& visitor) { visitor(tokenizer, "tokenizer", Path(), "Path name of tokenizer model file. 
(required)"); visitor( @@ -175,7 +176,8 @@ struct InferenceArgs : public ArgsBase { return nullptr; } - template void ForEach(const Visitor& visitor) { + template + void ForEach(const Visitor& visitor) { visitor(max_tokens, "max_tokens", size_t{3072}, "Maximum number of tokens in prompt + generation."); visitor(max_generated_tokens, "max_generated_tokens", size_t{2048}, From 8f3bd63bf74805851147a22cd3b5fbeacd8b5fc4 Mon Sep 17 00:00:00 2001 From: austinvhuang Date: Tue, 27 Feb 2024 17:11:15 -0500 Subject: [PATCH 21/26] Fix copybara include path substitutions errors (which break the google3 build) arising from clang-format linter automation --- .clang-format | 3 +-- DEVELOPERS.md | 8 ++++++++ gemma.h | 2 +- util/make_clang_format_config.sh | 4 ++++ 4 files changed, 14 insertions(+), 3 deletions(-) create mode 100755 util/make_clang_format_config.sh diff --git a/.clang-format b/.clang-format index 523dc01..3465c13 100644 --- a/.clang-format +++ b/.clang-format @@ -1,6 +1,5 @@ --- Language: Cpp -# BasedOnStyle: Google AccessModifierOffset: -1 AlignAfterOpenBracket: Align AlignArrayOfStructures: None @@ -211,7 +210,7 @@ RequiresClausePosition: OwnLine RequiresExpressionIndentation: OuterScope SeparateDefinitionBlocks: Leave ShortNamespaceLines: 1 -SortIncludes: CaseSensitive +SortIncludes: Never SortJavaStaticImport: Before SortUsingDeclarations: LexicographicNumeric SpaceAfterCStyleCast: false diff --git a/DEVELOPERS.md b/DEVELOPERS.md index 557670a..8e09ee8 100644 --- a/DEVELOPERS.md +++ b/DEVELOPERS.md @@ -83,6 +83,14 @@ A `.clang-format` configuration is provided with our defaults, please run source files through `clang-format` (or a formatter that produces equivalent behavior) before finalizing PR for submission. +The `.clang-format` is the google style (as of feb 27 2024), except with +`SortIncludes` set to `false` to avoid breaking copybara path substitutions +which rely on adjacent comments. 
+ +For transparency, `.clang-format` can be reproduced using the +`make_clang_format_config.sh` script in `utils/` run with `clang-format` version +17.0.6. + ## Compile-Time Flags (Advanced) There are several compile-time flags to be aware of (note these may or may not diff --git a/gemma.h b/gemma.h index a218878..2d5e713 100644 --- a/gemma.h +++ b/gemma.h @@ -29,10 +29,10 @@ // copybara:import_next_line:gemma_cpp #include "configs.h" // kSeqLen // copybara:import_next_line:gemma_cpp +#include "util/args.h" // ArgsBase #include "hwy/aligned_allocator.h" #include "hwy/base.h" // hwy::bfloat16_t #include "hwy/contrib/thread_pool/thread_pool.h" -#include "util/args.h" // ArgsBase // copybara:import_next_line:sentencepiece #include "src/sentencepiece_processor.h" diff --git a/util/make_clang_format_config.sh b/util/make_clang_format_config.sh new file mode 100755 index 0000000..5261e2c --- /dev/null +++ b/util/make_clang_format_config.sh @@ -0,0 +1,4 @@ +#!/bin/bash + +# Reproduces .clang-format file. 
+clang-format -style="{BasedOnStyle: Google, SortIncludes: false}" -dump-config > .clang-format From d37f9c36042152037c5a99dffea47b448f2d876b Mon Sep 17 00:00:00 2001 From: austinvhuang Date: Tue, 27 Feb 2024 21:23:33 -0500 Subject: [PATCH 22/26] re-enable SortIncludes to conform to vanilla Google style, add comment lines to #includes in gemma.h as barriers to block destructive sorting, update doc + remove shell script --- .clang-format | 270 +------------------------------ DEVELOPERS.md | 8 - gemma.h | 4 + util/make_clang_format_config.sh | 4 - 4 files changed, 5 insertions(+), 281 deletions(-) delete mode 100755 util/make_clang_format_config.sh diff --git a/.clang-format b/.clang-format index 3465c13..f6cb8ad 100644 --- a/.clang-format +++ b/.clang-format @@ -1,269 +1 @@ ---- -Language: Cpp -AccessModifierOffset: -1 -AlignAfterOpenBracket: Align -AlignArrayOfStructures: None -AlignConsecutiveAssignments: - Enabled: false - AcrossEmptyLines: false - AcrossComments: false - AlignCompound: false - PadOperators: true -AlignConsecutiveBitFields: - Enabled: false - AcrossEmptyLines: false - AcrossComments: false - AlignCompound: false - PadOperators: false -AlignConsecutiveDeclarations: - Enabled: false - AcrossEmptyLines: false - AcrossComments: false - AlignCompound: false - PadOperators: false -AlignConsecutiveMacros: - Enabled: false - AcrossEmptyLines: false - AcrossComments: false - AlignCompound: false - PadOperators: false -AlignConsecutiveShortCaseStatements: - Enabled: false - AcrossEmptyLines: false - AcrossComments: false - AlignCaseColons: false -AlignEscapedNewlines: Left -AlignOperands: Align -AlignTrailingComments: - Kind: Always - OverEmptyLines: 0 -AllowAllArgumentsOnNextLine: true -AllowAllParametersOfDeclarationOnNextLine: true -AllowShortBlocksOnASingleLine: Never -AllowShortCaseLabelsOnASingleLine: false -AllowShortEnumsOnASingleLine: true -AllowShortFunctionsOnASingleLine: All -AllowShortIfStatementsOnASingleLine: WithoutElse 
-AllowShortLambdasOnASingleLine: All -AllowShortLoopsOnASingleLine: true -AlwaysBreakAfterDefinitionReturnType: None -AlwaysBreakAfterReturnType: None -AlwaysBreakBeforeMultilineStrings: true -AlwaysBreakTemplateDeclarations: Yes -AttributeMacros: - - __capability -BinPackArguments: true -BinPackParameters: true -BitFieldColonSpacing: Both -BraceWrapping: - AfterCaseLabel: false - AfterClass: false - AfterControlStatement: Never - AfterEnum: false - AfterExternBlock: false - AfterFunction: false - AfterNamespace: false - AfterObjCDeclaration: false - AfterStruct: false - AfterUnion: false - BeforeCatch: false - BeforeElse: false - BeforeLambdaBody: false - BeforeWhile: false - IndentBraces: false - SplitEmptyFunction: true - SplitEmptyRecord: true - SplitEmptyNamespace: true -BreakAfterAttributes: Never -BreakAfterJavaFieldAnnotations: false -BreakArrays: true -BreakBeforeBinaryOperators: None -BreakBeforeConceptDeclarations: Always -BreakBeforeBraces: Attach -BreakBeforeInlineASMColon: OnlyMultiline -BreakBeforeTernaryOperators: true -BreakConstructorInitializers: BeforeColon -BreakInheritanceList: BeforeColon -BreakStringLiterals: true -ColumnLimit: 80 -CommentPragmas: '^ IWYU pragma:' -CompactNamespaces: false -ConstructorInitializerIndentWidth: 4 -ContinuationIndentWidth: 4 -Cpp11BracedListStyle: true -DerivePointerAlignment: true -DisableFormat: false -EmptyLineAfterAccessModifier: Never -EmptyLineBeforeAccessModifier: LogicalBlock -ExperimentalAutoDetectBinPacking: false -FixNamespaceComments: true -ForEachMacros: - - foreach - - Q_FOREACH - - BOOST_FOREACH -IfMacros: - - KJ_IF_MAYBE -IncludeBlocks: Regroup -IncludeCategories: - - Regex: '^' - Priority: 2 - SortPriority: 0 - CaseSensitive: false - - Regex: '^<.*\.h>' - Priority: 1 - SortPriority: 0 - CaseSensitive: false - - Regex: '^<.*' - Priority: 2 - SortPriority: 0 - CaseSensitive: false - - Regex: '.*' - Priority: 3 - SortPriority: 0 - CaseSensitive: false -IncludeIsMainRegex: '([-_](test|unittest))?$' 
-IncludeIsMainSourceRegex: '' -IndentAccessModifiers: false -IndentCaseBlocks: false -IndentCaseLabels: true -IndentExternBlock: AfterExternBlock -IndentGotoLabels: true -IndentPPDirectives: None -IndentRequiresClause: true -IndentWidth: 2 -IndentWrappedFunctionNames: false -InsertBraces: false -InsertNewlineAtEOF: false -InsertTrailingCommas: None -IntegerLiteralSeparator: - Binary: 0 - BinaryMinDigits: 0 - Decimal: 0 - DecimalMinDigits: 0 - Hex: 0 - HexMinDigits: 0 -JavaScriptQuotes: Leave -JavaScriptWrapImports: true -KeepEmptyLinesAtTheStartOfBlocks: false -KeepEmptyLinesAtEOF: false -LambdaBodyIndentation: Signature -LineEnding: DeriveLF -MacroBlockBegin: '' -MacroBlockEnd: '' -MaxEmptyLinesToKeep: 1 -NamespaceIndentation: None -ObjCBinPackProtocolList: Never -ObjCBlockIndentWidth: 2 -ObjCBreakBeforeNestedBlockParam: true -ObjCSpaceAfterProperty: false -ObjCSpaceBeforeProtocolList: true -PackConstructorInitializers: NextLine -PenaltyBreakAssignment: 2 -PenaltyBreakBeforeFirstCallParameter: 1 -PenaltyBreakComment: 300 -PenaltyBreakFirstLessLess: 120 -PenaltyBreakOpenParenthesis: 0 -PenaltyBreakString: 1000 -PenaltyBreakTemplateDeclaration: 10 -PenaltyExcessCharacter: 1000000 -PenaltyIndentedWhitespace: 0 -PenaltyReturnTypeOnItsOwnLine: 200 -PointerAlignment: Left -PPIndentWidth: -1 -QualifierAlignment: Leave -RawStringFormats: - - Language: Cpp - Delimiters: - - cc - - CC - - cpp - - Cpp - - CPP - - 'c++' - - 'C++' - CanonicalDelimiter: '' - BasedOnStyle: google - - Language: TextProto - Delimiters: - - pb - - PB - - proto - - PROTO - EnclosingFunctions: - - EqualsProto - - EquivToProto - - PARSE_PARTIAL_TEXT_PROTO - - PARSE_TEST_PROTO - - PARSE_TEXT_PROTO - - ParseTextOrDie - - ParseTextProtoOrDie - - ParseTestProto - - ParsePartialTestProto - CanonicalDelimiter: pb - BasedOnStyle: google -ReferenceAlignment: Pointer -ReflowComments: true -RemoveBracesLLVM: false -RemoveParentheses: Leave -RemoveSemicolon: false -RequiresClausePosition: OwnLine 
-RequiresExpressionIndentation: OuterScope -SeparateDefinitionBlocks: Leave -ShortNamespaceLines: 1 -SortIncludes: Never -SortJavaStaticImport: Before -SortUsingDeclarations: LexicographicNumeric -SpaceAfterCStyleCast: false -SpaceAfterLogicalNot: false -SpaceAfterTemplateKeyword: true -SpaceAroundPointerQualifiers: Default -SpaceBeforeAssignmentOperators: true -SpaceBeforeCaseColon: false -SpaceBeforeCpp11BracedList: false -SpaceBeforeCtorInitializerColon: true -SpaceBeforeInheritanceColon: true -SpaceBeforeJsonColon: false -SpaceBeforeParens: ControlStatements -SpaceBeforeParensOptions: - AfterControlStatements: true - AfterForeachMacros: true - AfterFunctionDefinitionName: false - AfterFunctionDeclarationName: false - AfterIfMacros: true - AfterOverloadedOperator: false - AfterRequiresInClause: false - AfterRequiresInExpression: false - BeforeNonEmptyParentheses: false -SpaceBeforeRangeBasedForLoopColon: true -SpaceBeforeSquareBrackets: false -SpaceInEmptyBlock: false -SpacesBeforeTrailingComments: 2 -SpacesInAngles: Never -SpacesInContainerLiterals: true -SpacesInLineCommentPrefix: - Minimum: 1 - Maximum: -1 -SpacesInParens: Never -SpacesInParensOptions: - InCStyleCasts: false - InConditionalStatements: false - InEmptyParentheses: false - Other: false -SpacesInSquareBrackets: false -Standard: Auto -StatementAttributeLikeMacros: - - Q_EMIT -StatementMacros: - - Q_UNUSED - - QT_REQUIRE_VERSION -TabWidth: 8 -UseTab: Never -VerilogBreakBetweenInstancePorts: true -WhitespaceSensitiveMacros: - - BOOST_PP_STRINGIZE - - CF_SWIFT_NAME - - NS_SWIFT_NAME - - PP_STRINGIZE - - STRINGIZE -... - +BasedOnStyle: Google diff --git a/DEVELOPERS.md b/DEVELOPERS.md index 8e09ee8..557670a 100644 --- a/DEVELOPERS.md +++ b/DEVELOPERS.md @@ -83,14 +83,6 @@ A `.clang-format` configuration is provided with our defaults, please run source files through `clang-format` (or a formatter that produces equivalent behavior) before finalizing PR for submission. 
-The `.clang-format` is the google style (as of feb 27 2024), except with -`SortIncludes` set to `false` to avoid breaking copybara path substitutions -which rely on adjacent comments. - -For transparency, `.clang-format` can be reproduced using the -`make_clang_format_config.sh` script in `utils/` run with `clang-format` version -17.0.6. - ## Compile-Time Flags (Advanced) There are several compile-time flags to be aware of (note these may or may not diff --git a/gemma.h b/gemma.h index 2d5e713..1ff98c1 100644 --- a/gemma.h +++ b/gemma.h @@ -26,15 +26,19 @@ // copybara:import_next_line:gemma_cpp #include "compression/compress.h" // SfpStream/NuqStream +// copybara:end // copybara:import_next_line:gemma_cpp #include "configs.h" // kSeqLen +// copybara:end // copybara:import_next_line:gemma_cpp #include "util/args.h" // ArgsBase +// copybara:end #include "hwy/aligned_allocator.h" #include "hwy/base.h" // hwy::bfloat16_t #include "hwy/contrib/thread_pool/thread_pool.h" // copybara:import_next_line:sentencepiece #include "src/sentencepiece_processor.h" +// copybara:end namespace gcpp { diff --git a/util/make_clang_format_config.sh b/util/make_clang_format_config.sh deleted file mode 100755 index 5261e2c..0000000 --- a/util/make_clang_format_config.sh +++ /dev/null @@ -1,4 +0,0 @@ -#!/bin/bash - -# Reproduces .clang-format file. 
-clang-format -style="{BasedOnStyle: Google, SortIncludes: false}" -dump-config > .clang-format From 060c8862ddb4e63a862ecae6e9ea8cbec91c7cec Mon Sep 17 00:00:00 2001 From: austinvhuang Date: Tue, 27 Feb 2024 21:36:43 -0500 Subject: [PATCH 23/26] whitespace cleanup --- DEVELOPERS.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/DEVELOPERS.md b/DEVELOPERS.md index 557670a..f670c49 100644 --- a/DEVELOPERS.md +++ b/DEVELOPERS.md @@ -73,7 +73,7 @@ The implementation code is roughly split into 4 layers, from high to low level: Besides these layers, supporting utilities are: -- `compression/` - model compression operations. The 8-bit switched floating +- `compression/` - model compression operations. The 8-bit switched floating point model conversion is here. - `util/` - command line argument handling and any other utilities. @@ -85,17 +85,17 @@ before finalizing PR for submission. ## Compile-Time Flags (Advanced) -There are several compile-time flags to be aware of (note these may or may not +There are several compile-time flags to be aware of (note these may or may not be exposed to the build system): -- `GEMMA_WEIGHT_T` : Sets the level of compression for weights (surfaced as - WEIGHT_TYPE in CMakeLists.txt). Currently this should be set to `SfpStream` - (default, if no flag is specified) for 8-bit SFP, or `hwy::bfloat16_t` to +- `GEMMA_WEIGHT_T` : Sets the level of compression for weights (surfaced as + WEIGHT_TYPE in CMakeLists.txt). Currently this should be set to `SfpStream` + (default, if no flag is specified) for 8-bit SFP, or `hwy::bfloat16_t` to enable for higher-fidelity (but slower) bfloat16 support. This is defined in `gemma.h`. - `GEMMA_MAX_SEQ_LEN` : Sets maximum sequence length to preallocate for the KV Cache. The default is 4096 tokens but can be overridden. This is not exposed - through `CMakeLists.txt` yet. + through `CMakeLists.txt` yet. 
In the medium term both of these will likely be deprecated in favor of handling options at runtime - allowing for multiple weight compression schemes in a single From 272f17ddb3dfae0d45381262a44cd079564171d4 Mon Sep 17 00:00:00 2001 From: Jan Wassenberg Date: Wed, 28 Feb 2024 05:53:52 -0800 Subject: [PATCH 24/26] Warning fixes: unused member, cast, unused function PiperOrigin-RevId: 611074887 --- compression/distortion.h | 2 ++ gemma.cc | 20 +++++++++++--------- util/args.h | 2 +- 3 files changed, 14 insertions(+), 10 deletions(-) diff --git a/compression/distortion.h b/compression/distortion.h index 8c0742a..5fd778f 100644 --- a/compression/distortion.h +++ b/compression/distortion.h @@ -25,6 +25,8 @@ namespace gcpp { class DistortionStats { public: void Notify(float original, float distorted) { + (void)padding_; // prevent unused member warning + const double l1 = hwy::ScalarAbs(original - distorted); if (l1 > max_l1_) { diff --git a/gemma.cc b/gemma.cc index 70777ac..4775f89 100644 --- a/gemma.cc +++ b/gemma.cc @@ -633,30 +633,32 @@ void ForEachTensor(const Weights* weights, c_weights.c_final_norm_scale); char name[16]; - for (size_t layer_idx = 0; layer_idx < TConfig::kLayers; ++layer_idx) { - Layer* layer = weights ? &weights->layers[layer_idx] : nullptr; - CompressedLayer* c_layer = c_weights.CLayer(layer_idx); + for (int layer_idx = 0; layer_idx < static_cast(TConfig::kLayers); + ++layer_idx) { + const size_t idx = static_cast(layer_idx); + Layer* layer = weights ? &weights->layers[idx] : nullptr; + CompressedLayer* c_layer = c_weights.CLayer(idx); - snprintf(name, sizeof(name), "pre_ff_ns_%lu", layer_idx); + snprintf(name, sizeof(name), "pre_ff_ns_%d", layer_idx); func(name, layer ? layer->pre_ffw_norm_scale.data() : nullptr, c_layer->c_pre_ffw_norm_scale); - snprintf(name, sizeof(name), "gating_ein_%lu", layer_idx); + snprintf(name, sizeof(name), "gating_ein_%d", layer_idx); func(name, layer ? 
layer->gating_einsum_w.data() : nullptr, c_layer->c_gating_einsum_w); - snprintf(name, sizeof(name), "linear_w_%lu", layer_idx); + snprintf(name, sizeof(name), "linear_w_%d", layer_idx); func(name, layer ? layer->linear_w.data() : nullptr, c_layer->c_linear_w); - snprintf(name, sizeof(name), "qkv_ein_%lu", layer_idx); + snprintf(name, sizeof(name), "qkv_ein_%d", layer_idx); func(name, layer ? layer->qkv_einsum_w.data() : nullptr, c_layer->c_qkv_einsum_w); - snprintf(name, sizeof(name), "att_ein_%lu", layer_idx); + snprintf(name, sizeof(name), "att_ein_%d", layer_idx); func(name, layer ? layer->attn_vec_einsum_w.data() : nullptr, c_layer->c_attn_vec_einsum_w); - snprintf(name, sizeof(name), "pre_att_ns_%lu", layer_idx); + snprintf(name, sizeof(name), "pre_att_ns_%d", layer_idx); func(name, layer ? layer->pre_attention_norm_scale.data() : nullptr, c_layer->c_pre_attention_norm_scale); } diff --git a/util/args.h b/util/args.h index ce03ef2..b9ab985 100644 --- a/util/args.h +++ b/util/args.h @@ -204,7 +204,7 @@ class ArgsBase { } }; -static bool HasHelp(int argc, char* argv[]) { +static inline HWY_MAYBE_UNUSED bool HasHelp(int argc, char* argv[]) { // TODO(austinvhuang): handle case insensitivity if (argc == 1) { // no arguments - print help From 0ea7b993def742f79c6f4b584ac4b3b127d8edd8 Mon Sep 17 00:00:00 2001 From: austinvhuang Date: Wed, 28 Feb 2024 15:18:40 -0500 Subject: [PATCH 25/26] remove --log fixing https://github.com/google/gemma.cpp/issues/59, improve command line args help, add copybara #include sort guards in more source files, add README sections on running faster and related projects --- README.md | 30 ++++++++++++++++++++++-- gemma.h | 17 +++++++------- run.cc | 69 ++++++++++++++++++++++++++++++++++++------------------ util/app.h | 19 ++++++++------- 4 files changed, 93 insertions(+), 42 deletions(-) diff --git a/README.md b/README.md index 8db6862..331d96f 100644 --- a/README.md +++ b/README.md @@ -92,7 +92,7 @@ weights enable faster inference. 
In general, we recommend starting with the | `7b-pt` | 7 billion parameter pre-trained model, bfloat16 | | `7b-pt-sfp` | 7 billion parameter pre-trained model, 8-bit switched floating point | -> [!NOTE] +> [!NOTE] > **Important**: We strongly recommend starting off with the `2b-it-sfp` model to > get up and running. @@ -116,7 +116,7 @@ The build system uses [CMake](https://cmake.org/). To build the gemma inference runtime, create a build directory and generate the build files using `cmake` from the top-level project directory. Note if you previous ran `cmake` and are re-running with a different setting, be sure to clean out the `build/` directory -with `rm -rf build/*` (warning this will delete any other files in the `build/` +with `rm -rf build/*` (warning this will delete any other files in the `build/` directory. For the 8-bit switched floating point weights (sfp), run cmake with no options: @@ -242,6 +242,21 @@ We're working on a python script to convert a standard model format to `.sbs`, and hope have it available in the next week or so. Follow [this issue](https://github.com/google/gemma.cpp/issues/11) for updates. +**What are some easy ways to make the model run faster?** + +1. Make sure you are using the 8-bit switched floating point `-sfp` models. +2. If you're on a laptop, make sure power mode is set to maximize performance +and saving mode is **off**. For most laptops, the power saving modes get +activated automatically if the computer is not plugged in. +3. Close other unused cpu-intensive applications. +4. On macs, anecdotally we observe a "warm-up" ramp-up in speed as performance +cores get engaged. +5. Experiment with the `--num_threads` argument value. Depending on the device, +larger numbers don't always mean better performance. + +We're also working on algorithmic and optimization approaches for faster +inference, stay tuned. + ## Usage `gemma` has different usage modes, controlled by the verbosity flag. 
@@ -415,6 +430,17 @@ make -j [number of parallel threads to use] libgemma If this is successful, you should now have a `libgemma` library file in the `build/` directory. On Unix platforms, the filename is `libgemma.a`. +## Independent Projects Using gemma.cpp + +Some independent projects using gemma.cpp: + +- [gemma-cpp-python - Python bindings](https://github.com/namtranase/gemma-cpp-python) +- [lua-cgemma - Lua bindings](https://github.com/ufownl/lua-cgemma) +- [Godot engine demo project](https://github.com/Rliop913/Gemma-godot-demo-project) + +If you would like to have your project included, feel free to get in touch or +submit a PR with a `README.md` edit. + ## Acknowledgements and Contacts gemma.cpp was started in fall 2023 by [Austin Huang](mailto:austinvhuang@google.com) diff --git a/gemma.h b/gemma.h index 1ff98c1..7195bc9 100644 --- a/gemma.h +++ b/gemma.h @@ -122,21 +122,22 @@ struct LoaderArgs : public ArgsBase { template void ForEach(const Visitor& visitor) { visitor(tokenizer, "tokenizer", Path(), - "Path name of tokenizer model file. (required)"); + "Path name of tokenizer model file.\n Required argument."); visitor( cache, "compressed_weights", Path(), "Path name of compressed weights file, regenerated from `--weights` " "file if " - "the compressed weights file does not exist. (required)"); + "the compressed weights file does not exist.\n Required argument."); visitor(model_type, "model", std::string(), - "Model type - can be 2b-it (2B parameters, instruction-tuned), " - "2b-pt (2B parameters, pretrained), 7b-it (7B parameters, " - "instruction-tuned), or 7b-pt (7B parameters, pretrained). " - "(required)"); + "Model type\n 2b-it (2B parameters, instruction-tuned)\n " + "2b-pt (2B parameters, pretrained)\n 7b-it (7B parameters, " + "instruction-tuned)\n 7b-pt (7B parameters, pretrained)\n" + " Required argument."); visitor(model, "weights", Path(), "Path name of model weights (.sbs) file. 
Only required if " "compressed_weights file is not present and needs to be " - "regenerated. Otherwise, not needed"); + "regenerated. This parameter is only required for compressing " + "new model weight exports, otherwise it is not needed."); } }; @@ -192,7 +193,7 @@ struct InferenceArgs : public ArgsBase { "Make top-k sampling deterministic", 2); visitor(multiturn, "multiturn", false, "Multiturn mode (if 0, this clears the KV cache after every " - "interaction without quitting)\n Default = 0 (conversation " + "interaction without quitting)\n Default : 0 (conversation " "resets every turn)"); } }; diff --git a/run.cc b/run.cc index 2d9a15e..507979d 100644 --- a/run.cc +++ b/run.cc @@ -24,12 +24,16 @@ // copybara:import_next_line:gemma_cpp #include "compression/compress.h" +// copybara:end // copybara:import_next_line:gemma_cpp -#include "gemma.h" // Gemma +#include "gemma.h" // Gemma +// copybara:end // copybara:import_next_line:gemma_cpp #include "util/app.h" +// copybara:end // copybara:import_next_line:gemma_cpp #include "util/args.h" // HasHelp +// copybara:end #include "hwy/base.h" #include "hwy/contrib/thread_pool/thread_pool.h" #include "hwy/highway.h" @@ -39,20 +43,13 @@ namespace gcpp { -void ShowHelp(gcpp::LoaderArgs& loader, gcpp::InferenceArgs& inference, - gcpp::AppArgs& app) { - fprintf(stderr, - "\ngemma.cpp\n---------\n\nTo run gemma.cpp, you need to " - "specify 3 required model loading arguments: --tokenizer, " - "--compressed_weights, " - "and --model.\n\nModel Loading Arguments\n\n"); - loader.Help(); - fprintf(stderr, "\nInference Arguments\n\n"); - inference.Help(); - fprintf(stderr, "\nApplication Arguments\n\n"); - app.Help(); - fprintf(stderr, "\n\n"); -} +static constexpr std::string_view kAsciiArtBanner = + " __ _ ___ _ __ ___ _ __ ___ __ _ ___ _ __ _ __\n" + " / _` |/ _ \ '_ ` _ \| '_ ` _ \ / _` | / __| '_ \| '_ \\n" + "| (_| | __/ | | | | | | | | | | (_| || (__| |_) | |_) |\n" + " \__, |\___|_| |_| |_|_| |_| |_|\__,_(_)___| 
.__/| .__/\n" + " __/ | | | | |\n" + " |___/ |_| |_|"; void ShowConfig(LoaderArgs& loader, InferenceArgs& inference, AppArgs& app) { loader.Print(app.verbosity); @@ -69,7 +66,8 @@ void ShowConfig(LoaderArgs& loader, InferenceArgs& inference, AppArgs& app) { << std::thread::hardware_concurrency() << std::endl << "Instruction set : " << hwy::TargetName(hwy::DispatchedTarget()) << " (" - << hwy::VectorBytes() * 8 << " bits)" << "\n" + << hwy::VectorBytes() * 8 << " bits)" + << "\n" << "Weight Type : " << gcpp::TypeName(gcpp::WeightT()) << "\n" << "EmbedderInput Type : " @@ -77,11 +75,31 @@ void ShowConfig(LoaderArgs& loader, InferenceArgs& inference, AppArgs& app) { } } +void ShowHelp(gcpp::LoaderArgs& loader, gcpp::InferenceArgs& inference, + gcpp::AppArgs& app) { + std::cerr + << kAsciiArtBanner + << "\n\ngemma.cpp : a lightweight, standalone C++ inference engine\n" + "==========================================================\n\n" + "To run gemma.cpp, you need to " + "specify 3 required model loading arguments:\n --tokenizer\n " + "--compressed_weights\n" + " --model.\n"; + std::cerr << "\n*Example Usage*\n\n./gemma --tokenizer tokenizer.spm " + "--compressed_weights 2b-it-sfp.sbs --model 2b-it\n"; + std::cerr << "\n*Model Loading Arguments*\n\n"; + loader.Help(); + std::cerr << "\n*Inference Arguments*\n\n"; + inference.Help(); + std::cerr << "\n*Application Arguments*\n\n"; + app.Help(); + std::cerr << "\n"; +} + void ReplGemma(gcpp::Gemma& model, hwy::ThreadPool& pool, hwy::ThreadPool& inner_pool, const InferenceArgs& args, int verbosity, const gcpp::AcceptFunc& accept_token, - std::string &eot_line -) { + std::string& eot_line) { PROFILER_ZONE("Gen.misc"); int abs_pos = 0; // absolute token index over all turns int current_pos = 0; // token index within the current turn @@ -234,8 +252,12 @@ void Run(LoaderArgs& loader, InferenceArgs& inference, AppArgs& app) { const std::string instructions = "*Usage*\n" - " Enter an instruction and press enter (%C reset 
conversation, " - "%Q quits).\n\n" + " Enter an instruction and press enter (%C resets conversation, " + "%Q quits).\n" + + (inference.multiturn == 0 + ? std::string(" Since multiturn is set to 0, conversation will " + "automatically reset every turn.\n\n") + : "\n") + "*Examples*\n" " - Write an email to grandma thanking her for the cookies.\n" " - What are some historical attractions to visit around " @@ -244,13 +266,14 @@ void Run(LoaderArgs& loader, InferenceArgs& inference, AppArgs& app) { " - Write a standup comedy bit about GPU programming.\n"; std::cout << "\033[2J\033[1;1H" // clear screen - << banner_ascii_art << "\n\n"; + << kAsciiArtBanner << "\n\n"; ShowConfig(loader, inference, app); std::cout << "\n" << instructions << "\n"; } - ReplGemma(model, pool, inner_pool, inference, app.verbosity, - /*accept_token=*/[](int) { return true; }, app.eot_line); + ReplGemma( + model, pool, inner_pool, inference, app.verbosity, + /*accept_token=*/[](int) { return true; }, app.eot_line); } } // namespace gcpp diff --git a/util/app.h b/util/app.h index f66a6cd..7f926a5 100644 --- a/util/app.h +++ b/util/app.h @@ -31,6 +31,7 @@ // copybara:import_next_line:gemma_cpp #include "util/args.h" +// copybara:end #include "hwy/base.h" // HWY_ASSERT namespace gcpp { @@ -77,7 +78,6 @@ class AppArgs : public ArgsBase { template void ForEach(const Visitor& visitor) { - visitor(log, "log", Path{"/tmp/log.txt"}, "Logging file", 2); visitor(verbosity, "verbosity", 1, "Show verbose developer information\n 0 = only print generation " "output\n 1 = standard user-facing terminal ui\n 2 = show " @@ -85,15 +85,16 @@ class AppArgs : public ArgsBase { 2); visitor(num_threads, "num_threads", kDefaultNumThreads, // see ChooseNumThreads - "Number of threads to use. Default value is set based on an " - "estimate of " - "how many concurrent threads are supported.", - 2); - visitor(eot_line, "eot_line", std::string(""), - "End of turn line. 
" - "When you specify this, the prompt will be all lines " - "before the line where only the given string appears.", + "Number of threads to use.\n Default = Estimate of the " + "number of suupported concurrent threads.", 2); + visitor( + eot_line, "eot_line", std::string(""), + "End of turn line. " + "When you specify this, the prompt will be all lines " + "before the line where only the given string appears.\n Default = " + "When a newline is encountered, that signals the end of the turn.", + 2); } }; From b6aaf6bbb8a7f0b99330df8710765123ab330766 Mon Sep 17 00:00:00 2001 From: Jan Wassenberg Date: Wed, 28 Feb 2024 15:29:45 -0800 Subject: [PATCH 26/26] Fix for Android's 32-bit off_t. Fixes #62 PiperOrigin-RevId: 611249534 --- compression/blob_store.cc | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/compression/blob_store.cc b/compression/blob_store.cc index 550c727..e088fc6 100644 --- a/compression/blob_store.cc +++ b/compression/blob_store.cc @@ -13,6 +13,19 @@ // See the License for the specific language governing permissions and // limitations under the License. +// Request POSIX 2008, including `pread()` and `posix_fadvise()`. +#if !defined(_XOPEN_SOURCE) || _XOPEN_SOURCE < 700 +#undef _XOPEN_SOURCE +#define _XOPEN_SOURCE 700 +#endif +#if !defined(_POSIX_C_SOURCE) || _POSIX_C_SOURCE < 200809 +#define _POSIX_C_SOURCE 200809 +#endif + +// Make `off_t` 64-bit even on 32-bit systems. Works for Android >= r15c. 
+#undef _FILE_OFFSET_BITS +#define _FILE_OFFSET_BITS 64 + // copybara:import_next_line:gemma_cpp #include "compression/blob_store.h" @@ -81,7 +94,7 @@ static int64_t pwrite(int fd, const void* buf, uint64_t size, uint64_t offset) { } #endif -} +} // namespace namespace gcpp { @@ -133,6 +146,7 @@ struct IO { return 0; } #else + static_assert(sizeof(off_t) == 8, "64-bit off_t required"); const off_t size = lseek(fd, 0, SEEK_END); HWY_ASSERT(close(fd) != -1); if (size == static_cast(-1)) { @@ -318,7 +332,8 @@ class BlobStore { BlobError BlobReader::Open(const char* filename) { #if HWY_OS_WIN DWORD flags = FILE_ATTRIBUTE_NORMAL | FILE_FLAG_SEQUENTIAL_SCAN; - HANDLE file = CreateFileA(filename, GENERIC_READ, FILE_SHARE_READ, nullptr, OPEN_EXISTING, flags, nullptr); + HANDLE file = CreateFileA(filename, GENERIC_READ, FILE_SHARE_READ, nullptr, + OPEN_EXISTING, flags, nullptr); if (file == INVALID_HANDLE_VALUE) return __LINE__; fd_ = _open_osfhandle(reinterpret_cast(file), _O_RDONLY); #else @@ -326,7 +341,7 @@ BlobError BlobReader::Open(const char* filename) { #endif if (fd_ < 0) return __LINE__; -#if _POSIX_C_SOURCE >= 200112L +#if HWY_OS_LINUX // Doubles the readahead window, which seems slightly faster when cached. (void)posix_fadvise(fd_, 0, 0, POSIX_FADV_SEQUENTIAL); #endif @@ -403,7 +418,8 @@ BlobError BlobWriter::WriteAll(hwy::ThreadPool& pool, // Create/replace existing file. #if HWY_OS_WIN DWORD flags = FILE_ATTRIBUTE_NORMAL; - HANDLE file = CreateFileA(filename, GENERIC_WRITE, 0, nullptr, CREATE_ALWAYS, flags, nullptr); + HANDLE file = CreateFileA(filename, GENERIC_WRITE, 0, nullptr, CREATE_ALWAYS, + flags, nullptr); if (file == INVALID_HANDLE_VALUE) return __LINE__; const int fd = _open_osfhandle(reinterpret_cast(file), _O_WRONLY); #else