From 0263ccf2763c40b5866365acb8dde3db13aa5f2b Mon Sep 17 00:00:00 2001 From: "The gemma.cpp Authors" Date: Fri, 23 Feb 2024 01:19:17 -0800 Subject: [PATCH 1/4] remove UNROLL, negligible effect on codegen and may raise GCC warnings PiperOrigin-RevId: 609648249 --- ops.h | 1 - 1 file changed, 1 deletion(-) diff --git a/ops.h b/ops.h index ac91cc5..5539892 100644 --- a/ops.h +++ b/ops.h @@ -151,7 +151,6 @@ HWY_INLINE void FullDotProductsForStrip( MaxCols(), vec_aligned, out); // For further multiples of MaxCols, accumulate. Remainders handled below. size_t c0 = MaxCols(); - HWY_UNROLL(1) for (; c0 <= mat_stride - MaxCols(); c0 += MaxCols()) { AccumulatePartialDotProducts(df, mat, mat_ofs, mat_stride, r0, c0, num_rows, MaxCols(), vec_aligned, out); From 7c9954dea532ac4bee98d9d945747fb287ffaac9 Mon Sep 17 00:00:00 2001 From: "The gemma.cpp Authors" Date: Fri, 23 Feb 2024 07:12:42 -0800 Subject: [PATCH 2/4] Code update PiperOrigin-RevId: 609719211 --- README.md | 66 ++++++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 60 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index ab7bf2d..ce7fe25 100644 --- a/README.md +++ b/README.md @@ -61,7 +61,8 @@ Visit [the Gemma model page on Kaggle](https://www.kaggle.com/models/google/gemma) and select `Model Variations |> Gemma C++`. On this tab, the `Variation` dropdown includes the options below. Note bfloat16 weights are higher fidelity, while 8-bit switched floating point -weights enable faster inference. +weights enable faster inference. In general, we recommend starting with the +`-sfp` checkpoints. 2B instruction-tuned (`it`) and pre-trained (`pt`) models: @@ -81,8 +82,9 @@ weights enable faster inference. | `7b-pt` | 7 billion parameter pre-trained model, bfloat16 | | `7b-pt-sfp` | 7 billion parameter pre-trained model, 8-bit switched floating point | -> [!NOTE] -> We *recommend starting with `2b-it-sfp`* to get up and running. 
+> [!NOTE] +> **Important**: We strongly recommend starting off with the `2b-it-sfp` model to +> get up and running. ### Step 2: Extract Files @@ -102,22 +104,42 @@ convenient directory location (e.g. the `build/` directory in this repo). The build system uses [CMake](https://cmake.org/). To build the gemma inference runtime, create a build directory and generate the build files using `cmake` -from the top-level project directory: +from the top-level project directory. For the 8-bit switched floating point +weights (sfp), run cmake with no options: ```sh cmake -B build ``` -Then run `make` to build the `./gemma` executable: +**or** if you downloaded bfloat16 weights (any model *without* `-sfp` in the name), +instead of running cmake with no options as above, run cmake with WEIGHT_TYPE +set to [highway's](https://github.com/google/highway) `hwy::bfloat16_t` type +(this will be simplified in the future, we recommend using `-sfp` weights +instead of bfloat16 for faster inference): + +```sh +cmake -B build -DWEIGHT_TYPE=hwy::bfloat16_t +``` + +After running whichever of the above `cmake` invocations is appropriate for +your weights, you can enter the `build/` directory and run `make` to build the +`./gemma` executable: ```sh cd build make -j [number of parallel threads to use] gemma ``` +Replace `[number of parallel threads to use]` with a number - the number of +cores available on your system is a reasonable heuristic. + For example, `make -j4 gemma` will build using 4 threads. If this is successful, you should now have a `gemma` executable in the `build/` directory. If the -`nproc` command is available, you can use `make -j$(nproc) gemma`. +`nproc` command is available, you can use `make -j$(nproc) gemma` as a +reasonable default for the number of threads. + +If you aren't sure of the right value for the `-j` flag, you can simply run +`make gemma` instead and it should still build the `./gemma` executable. 
> [!NOTE] > On Windows Subsystem for Linux (WSL) users should set the number of @@ -158,6 +180,38 @@ Example invocation for the following configuration: --model 2b-it ``` +### Troubleshooting and FAQs + +**Running `./gemma` fails with "Failed to read cache gating_ein_0 (error 294) ..."** + +The most common problem is that `cmake` was built with the wrong weight type and +`gemma` is attempting to load `bfloat16` weights (`2b-it`, `2b-pt`, `7b-it`, +`7b-pt`) using the default switched floating point (sfp) or vice versa. Revisit +step #3 and check that the `cmake` command used to build `gemma` was correct for +the weights that you downloaded. + +In the future we will move model format handling from compile time to runtime +to simplify this. + +**Problems building in Windows / Visual Studio** + +Currently if you're using Windows, we recommend building in WSL (Windows +Subsystem for Linux). We are exploring options to enable other build +configurations, see issues for active discussion. + +**Model does not respond to instructions and produces strange output** + +A common issue is that you are using a pre-trained model, which is not +instruction-tuned and thus does not respond to instructions. Make sure you are +using an instruction-tuned model (`2b-it-sfp`, `2b-it`, `7b-it-sfp`, `7b-it`) +and not a pre-trained model (any model with a `-pt` suffix). + +**How do I convert my fine-tune to a `.sbs` compressed model file?** + +We're working on a python script to convert a standard model format to `.sbs`, +and hope to have it available in the next week or so. Follow [this +issue](https://github.com/google/gemma.cpp/issues/11) for updates. + ## Usage `gemma` has different usage modes, controlled by the verbosity flag. From a16df06cf2eb38e010b6d8e12f390be24bf63e9d Mon Sep 17 00:00:00 2001 From: "The gemma.cpp Authors" Date: Fri, 23 Feb 2024 08:22:48 -0800 Subject: [PATCH 3/4] Toward Bazel support: expose BUILD, add WORKSPACE/MODULE.bazel. 
Refs #16 PiperOrigin-RevId: 609734560 --- MODULE.bazel | 14 ++++++++++++++ WORKSPACE | 24 ++++++++++++++++++++++++ gemma.cc | 4 +--- gemma.h | 4 ++-- 4 files changed, 41 insertions(+), 5 deletions(-) create mode 100644 MODULE.bazel create mode 100644 WORKSPACE diff --git a/MODULE.bazel b/MODULE.bazel new file mode 100644 index 0000000..5c3eafa --- /dev/null +++ b/MODULE.bazel @@ -0,0 +1,14 @@ +module( + name = "gemma", + version = "0.1.0", +) + +bazel_dep( + name = "rules_license", + version = "0.0.7", +) + +bazel_dep( + name = "com_google_sentencepiece", + version = "0.1.96", +) diff --git a/WORKSPACE b/WORKSPACE new file mode 100644 index 0000000..0be580d --- /dev/null +++ b/WORKSPACE @@ -0,0 +1,24 @@ +workspace(name = "gemma") + +load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive") +load("@bazel_tools//tools/build_defs/repo:utils.bzl", "maybe") + +maybe( + http_archive, + name = "rules_license", + sha256 = "4531deccb913639c30e5c7512a054d5d875698daeb75d8cf90f284375fe7c360", + urls = [ + "https://github.com/bazelbuild/rules_license/releases/download/0.0.7/rules_license-0.0.7.tar.gz", + ], +) + +maybe( + http_archive, + name = "com_google_sentencepiece", + sha256 = "8409b0126ebd62b256c685d5757150cf7fcb2b92a2f2b98efb3f38fc36719754", + strip_prefix = "sentencepiece-0.1.96", + urls = ["https://github.com/google/sentencepiece/archive/refs/tags/v0.1.96.zip"], + build_file = "@//third_party:sentencepiece.bazel", + patches = ["@//third_party:com_google_sentencepiece.patch"], + patch_args = ["-p1"], +) diff --git a/gemma.cc b/gemma.cc index d8e1ca6..70777ac 100644 --- a/gemma.cc +++ b/gemma.cc @@ -62,7 +62,6 @@ #include "hwy/contrib/thread_pool/thread_pool.h" // copybara:import_next_line:sentencepiece #include "src/sentencepiece_processor.h" -// #include "third_party/sentencepiece/src/util.h" namespace gcpp { @@ -205,8 +204,7 @@ struct Activations { static constexpr size_t kQKVDim = TConfig::kQKVDim; static constexpr size_t kHeads = TConfig::kHeads; static 
constexpr size_t kKVHeads = TConfig::kKVHeads; - static constexpr size_t kCachePosSize = - TConfig::kLayers * kKVHeads * kQKVDim; + static constexpr size_t kCachePosSize = TConfig::kLayers * kKVHeads * kQKVDim; static constexpr size_t kCacheLayerSize = kKVHeads * kQKVDim; std::array x; // input diff --git a/gemma.h b/gemma.h index 67b7f85..5dc9f62 100644 --- a/gemma.h +++ b/gemma.h @@ -24,11 +24,11 @@ #include #include -// copybara:import_next_line:gemma_cpp -#include "configs.h" // kSeqLen // copybara:import_next_line:gemma_cpp #include "compression/compress.h" // SfpStream/NuqStream // copybara:import_next_line:gemma_cpp +#include "configs.h" // kSeqLen +// copybara:import_next_line:gemma_cpp #include "util/args.h" // ArgsBase #include "hwy/aligned_allocator.h" #include "hwy/base.h" // hwy::bfloat16_t From 52e8b88bb014c001edff7da325a30a1f509cca63 Mon Sep 17 00:00:00 2001 From: Ikko Eltociear Ashimine Date: Sat, 24 Feb 2024 05:58:39 +0900 Subject: [PATCH 4/4] Update build.yml (#22) dispath -> dispatch --- .github/workflows/build.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 2813be5..929e140 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -1,6 +1,6 @@ name: Build -# Trigger on push or via manual dispath. +# Trigger on push or via manual dispatch. on: [push, workflow_dispatch] jobs: @@ -42,4 +42,4 @@ jobs: # Explicitly list build targets here. # Building "all" includes test executables and takes much longer. buildWithCMakeArgs: "-- gemma" - buildDirectory: '${{ github.workspace }}/build' \ No newline at end of file + buildDirectory: '${{ github.workspace }}/build'