From 0263ccf2763c40b5866365acb8dde3db13aa5f2b Mon Sep 17 00:00:00 2001 From: "The gemma.cpp Authors" Date: Fri, 23 Feb 2024 01:19:17 -0800 Subject: [PATCH 1/4] remove UNROLL, negligible effect on codegen and may raise GCC warnings PiperOrigin-RevId: 609648249 --- ops.h | 1 - 1 file changed, 1 deletion(-) diff --git a/ops.h b/ops.h index ac91cc5..5539892 100644 --- a/ops.h +++ b/ops.h @@ -151,7 +151,6 @@ HWY_INLINE void FullDotProductsForStrip( MaxCols(), vec_aligned, out); // For further multiples of MaxCols, accumulate. Remainders handled below. size_t c0 = MaxCols(); - HWY_UNROLL(1) for (; c0 <= mat_stride - MaxCols(); c0 += MaxCols()) { AccumulatePartialDotProducts(df, mat, mat_ofs, mat_stride, r0, c0, num_rows, MaxCols(), vec_aligned, out); From 7c9954dea532ac4bee98d9d945747fb287ffaac9 Mon Sep 17 00:00:00 2001 From: "The gemma.cpp Authors" Date: Fri, 23 Feb 2024 07:12:42 -0800 Subject: [PATCH 2/4] Code update PiperOrigin-RevId: 609719211 --- README.md | 66 ++++++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 60 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index ab7bf2d..ce7fe25 100644 --- a/README.md +++ b/README.md @@ -61,7 +61,8 @@ Visit [the Gemma model page on Kaggle](https://www.kaggle.com/models/google/gemma) and select `Model Variations |> Gemma C++`. On this tab, the `Variation` dropdown includes the options below. Note bfloat16 weights are higher fidelity, while 8-bit switched floating point -weights enable faster inference. +weights enable faster inference. In general, we recommend starting with the +`-sfp` checkpoints. 2B instruction-tuned (`it`) and pre-trained (`pt`) models: @@ -81,8 +82,9 @@ weights enable faster inference. | `7b-pt` | 7 billion parameter pre-trained model, bfloat16 | | `7b-pt-sfp` | 7 billion parameter pre-trained model, 8-bit switched floating point | -> [!NOTE] -> We *recommend starting with `2b-it-sfp`* to get up and running. 
+> [!NOTE] +> **Important**: We strongly recommend starting off with the `2b-it-sfp` model to +> get up and running. ### Step 2: Extract Files @@ -102,22 +104,42 @@ convenient directory location (e.g. the `build/` directory in this repo). The build system uses [CMake](https://cmake.org/). To build the gemma inference runtime, create a build directory and generate the build files using `cmake` -from the top-level project directory: +from the top-level project directory. For the 8-bit switched floating point +weights (sfp), run cmake with no options: ```sh cmake -B build ``` -Then run `make` to build the `./gemma` executable: +**or** if you downloaded bfloat16 weights (any model *without* `-sfp` in the name), +instead of running cmake with no options as above, run cmake with WEIGHT_TYPE +set to [highway's](https://github.com/google/highway) `hwy::bfloat16_t` type +(this will be simplified in the future, we recommend using `-sfp` weights +instead of bfloat16 for faster inference): + +```sh +cmake -B build -DWEIGHT_TYPE=hwy::bfloat16_t +``` + +After running whichever of the above `cmake` invocations is appropriate for +your weights, you can enter the `build/` directory and run `make` to build the +`./gemma` executable: ```sh cd build make -j [number of parallel threads to use] gemma ``` +Replace `[number of parallel threads to use]` with a number - the number of +cores available on your system is a reasonable heuristic. + For example, `make -j4 gemma` will build using 4 threads. If this is successful, you should now have a `gemma` executable in the `build/` directory. If the -`nproc` command is available, you can use `make -j$(nproc) gemma`. +`nproc` command is available, you can use `make -j$(nproc) gemma` as a +reasonable default for the number of threads. + +If you aren't sure of the right value for the `-j` flag, you can simply run +`make gemma` instead and it should still build the `./gemma` executable. 
> [!NOTE] > On Windows Subsystem for Linux (WSL) users should set the number of @@ -158,6 +180,38 @@ Example invocation for the following configuration: --model 2b-it ``` +### Troubleshooting and FAQs + +**Running `./gemma` fails with "Failed to read cache gating_ein_0 (error 294) ..."** + +The most common problem is that `cmake` was built with the wrong weight type and +`gemma` is attempting to load `bfloat16` weights (`2b-it`, `2b-pt`, `7b-it`, +`7b-pt`) using the default switched floating point (sfp) or vice versa. Revisit +step #3 and check that the `cmake` command used to build `gemma` was correct for +the weights that you downloaded. + +In the future we will move model format handling from compile time to runtime +to simplify this. + +**Problems building in Windows / Visual Studio** + +Currently if you're using Windows, we recommend building in WSL (Windows +Subsystem for Linux). We are exploring options to enable other build +configurations, see issues for active discussion. + +**Model does not respond to instructions and produces strange output** + +A common issue is that you are using a pre-trained model, which is not +instruction-tuned and thus does not respond to instructions. Make sure you are +using an instruction-tuned model (`2b-it-sfp`, `2b-it`, `7b-it-sfp`, `7b-it`) +and not a pre-trained model (any model with a `-pt` suffix). + +**How do I convert my fine-tune to a `.sbs` compressed model file?** + +We're working on a python script to convert a standard model format to `.sbs`, +and hope to have it available in the next week or so. Follow [this +issue](https://github.com/google/gemma.cpp/issues/11) for updates. + ## Usage `gemma` has different usage modes, controlled by the verbosity flag. From a16df06cf2eb38e010b6d8e12f390be24bf63e9d Mon Sep 17 00:00:00 2001 From: "The gemma.cpp Authors" Date: Fri, 23 Feb 2024 08:22:48 -0800 Subject: [PATCH 3/4] Toward Bazel support: expose BUILD, add WORKSPACE/MODULE.bazel. 
Refs #16 PiperOrigin-RevId: 609734560 --- MODULE.bazel | 14 ++++++++++++++ WORKSPACE | 24 ++++++++++++++++++++++++ gemma.cc | 4 +--- gemma.h | 4 ++-- 4 files changed, 41 insertions(+), 5 deletions(-) create mode 100644 MODULE.bazel create mode 100644 WORKSPACE diff --git a/MODULE.bazel b/MODULE.bazel new file mode 100644 index 0000000..5c3eafa --- /dev/null +++ b/MODULE.bazel @@ -0,0 +1,14 @@ +module( + name = "gemma", + version = "0.1.0", +) + +bazel_dep( + name = "rules_license", + version = "0.0.7", +) + +bazel_dep( + name = "com_google_sentencepiece", + version = "0.1.96", +) diff --git a/WORKSPACE b/WORKSPACE new file mode 100644 index 0000000..0be580d --- /dev/null +++ b/WORKSPACE @@ -0,0 +1,24 @@ +workspace(name = "gemma") + +load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive") +load("@bazel_tools//tools/build_defs/repo:utils.bzl", "maybe") + +maybe( + http_archive, + name = "rules_license", + sha256 = "4531deccb913639c30e5c7512a054d5d875698daeb75d8cf90f284375fe7c360", + urls = [ + "https://github.com/bazelbuild/rules_license/releases/download/0.0.7/rules_license-0.0.7.tar.gz", + ], +) + +maybe( + http_archive, + name = "com_google_sentencepiece", + sha256 = "8409b0126ebd62b256c685d5757150cf7fcb2b92a2f2b98efb3f38fc36719754", + strip_prefix = "sentencepiece-0.1.96", + urls = ["https://github.com/google/sentencepiece/archive/refs/tags/v0.1.96.zip"], + build_file = "@//third_party:sentencepiece.bazel", + patches = ["@//third_party:com_google_sentencepiece.patch"], + patch_args = ["-p1"], +) diff --git a/gemma.cc b/gemma.cc index d8e1ca6..70777ac 100644 --- a/gemma.cc +++ b/gemma.cc @@ -62,7 +62,6 @@ #include "hwy/contrib/thread_pool/thread_pool.h" // copybara:import_next_line:sentencepiece #include "src/sentencepiece_processor.h" -// #include "third_party/sentencepiece/src/util.h" namespace gcpp { @@ -205,8 +204,7 @@ struct Activations { static constexpr size_t kQKVDim = TConfig::kQKVDim; static constexpr size_t kHeads = TConfig::kHeads; static 
constexpr size_t kKVHeads = TConfig::kKVHeads; - static constexpr size_t kCachePosSize = - TConfig::kLayers * kKVHeads * kQKVDim; + static constexpr size_t kCachePosSize = TConfig::kLayers * kKVHeads * kQKVDim; static constexpr size_t kCacheLayerSize = kKVHeads * kQKVDim; std::array x; // input diff --git a/gemma.h b/gemma.h index 67b7f85..5dc9f62 100644 --- a/gemma.h +++ b/gemma.h @@ -24,11 +24,11 @@ #include #include -// copybara:import_next_line:gemma_cpp -#include "configs.h" // kSeqLen // copybara:import_next_line:gemma_cpp #include "compression/compress.h" // SfpStream/NuqStream // copybara:import_next_line:gemma_cpp +#include "configs.h" // kSeqLen +// copybara:import_next_line:gemma_cpp #include "util/args.h" // ArgsBase #include "hwy/aligned_allocator.h" #include "hwy/base.h" // hwy::bfloat16_t From 52e8b88bb014c001edff7da325a30a1f509cca63 Mon Sep 17 00:00:00 2001 From: Ikko Eltociear Ashimine Date: Sat, 24 Feb 2024 05:58:39 +0900 Subject: [PATCH 4/4] Update build.yml (#22) dispath -> dispatch --- .github/workflows/build.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 2813be5..929e140 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -1,6 +1,6 @@ name: Build -# Trigger on push or via manual dispath. +# Trigger on push or via manual dispatch. on: [push, workflow_dispatch] jobs: @@ -42,4 +42,4 @@ jobs: # Explicitly list build targets here. # Building "all" includes test executables and takes much longer. buildWithCMakeArgs: "-- gemma" - buildDirectory: '${{ github.workspace }}/build' \ No newline at end of file + buildDirectory: '${{ github.workspace }}/build'