mirror of https://github.com/google/gemma.cpp.git
Merge branch 'dev' into clang-cl
This commit is contained in:
commit
8f27580fb6
|
|
@ -1,6 +1,6 @@
|
||||||
name: Build
|
name: Build
|
||||||
|
|
||||||
# Trigger on push or via manual dispath.
|
# Trigger on push or via manual dispatch.
|
||||||
on: [push, workflow_dispatch]
|
on: [push, workflow_dispatch]
|
||||||
|
|
||||||
jobs:
|
jobs:
|
||||||
|
|
@ -42,4 +42,4 @@ jobs:
|
||||||
# Explicitly list build targets here.
|
# Explicitly list build targets here.
|
||||||
# Building "all" includes test executables and takes much longer.
|
# Building "all" includes test executables and takes much longer.
|
||||||
buildWithCMakeArgs: "-- gemma"
|
buildWithCMakeArgs: "-- gemma"
|
||||||
buildDirectory: '${{ github.workspace }}/build'
|
buildDirectory: '${{ github.workspace }}/build'
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,14 @@
|
||||||
|
module(
|
||||||
|
name = "gemma",
|
||||||
|
version = "0.1.0",
|
||||||
|
)
|
||||||
|
|
||||||
|
bazel_dep(
|
||||||
|
name = "rules_license",
|
||||||
|
version = "0.0.7",
|
||||||
|
)
|
||||||
|
|
||||||
|
bazel_dep(
|
||||||
|
name = "com_google_sentencepiece",
|
||||||
|
version = "0.1.96",
|
||||||
|
)
|
||||||
73
README.md
73
README.md
|
|
@ -71,7 +71,8 @@ Visit [the Gemma model page on
|
||||||
Kaggle](https://www.kaggle.com/models/google/gemma) and select `Model Variations
|
Kaggle](https://www.kaggle.com/models/google/gemma) and select `Model Variations
|
||||||
|> Gemma C++`. On this tab, the `Variation` dropdown includes the options below.
|
|> Gemma C++`. On this tab, the `Variation` dropdown includes the options below.
|
||||||
Note bfloat16 weights are higher fidelity, while 8-bit switched floating point
|
Note bfloat16 weights are higher fidelity, while 8-bit switched floating point
|
||||||
weights enable faster inference.
|
weights enable faster inference. In general, we recommend starting with the
|
||||||
|
`-sfp` checkpoints.
|
||||||
|
|
||||||
2B instruction-tuned (`it`) and pre-trained (`pt`) models:
|
2B instruction-tuned (`it`) and pre-trained (`pt`) models:
|
||||||
|
|
||||||
|
|
@ -91,8 +92,9 @@ weights enable faster inference.
|
||||||
| `7b-pt` | 7 billion parameter pre-trained model, bfloat16 |
|
| `7b-pt` | 7 billion parameter pre-trained model, bfloat16 |
|
||||||
| `7b-pt-sfp` | 7 billion parameter pre-trained model, 8-bit switched floating point |
|
| `7b-pt-sfp` | 7 billion parameter pre-trained model, 8-bit switched floating point |
|
||||||
|
|
||||||
> [!NOTE]
|
> [!NOTE]
|
||||||
> We *recommend starting with `2b-it-sfp`* to get up and running.
|
> **Important**: We strongly recommend starting off with the `2b-it-sfp` model to
|
||||||
|
> get up and running.
|
||||||
|
|
||||||
### Step 2: Extract Files
|
### Step 2: Extract Files
|
||||||
|
|
||||||
|
|
@ -112,9 +114,27 @@ convenient directory location (e.g. the `build/` directory in this repo).
|
||||||
|
|
||||||
The build system uses [CMake](https://cmake.org/). To build the gemma inference
|
The build system uses [CMake](https://cmake.org/). To build the gemma inference
|
||||||
runtime, create a build directory and generate the build files using `cmake`
|
runtime, create a build directory and generate the build files using `cmake`
|
||||||
from the top-level project directory:
|
from the top-level project directory. For the 8-bit switched floating point
|
||||||
|
weights (sfp), run cmake with no options:
|
||||||
|
|
||||||
#### Unix-like Platforms
|
#### Unix-like Platforms
|
||||||
|
```sh
|
||||||
|
cmake -B build
|
||||||
|
```
|
||||||
|
|
||||||
|
**or** if you downloaded bfloat16 weights (any model *without* `-sfp` in the name),
|
||||||
|
instead of running cmake with no options as above, run cmake with WEIGHT_TYPE
|
||||||
|
set to [highway's](https://github.com/google/highway) `hwy::bfloat16_t` type
|
||||||
|
(this will be simplified in the future, we recommend using `-sfp` weights
|
||||||
|
instead of bfloat16 for faster inference):
|
||||||
|
|
||||||
|
```sh
|
||||||
|
cmake -B build -DWEIGHT_TYPE=hwy::bfloat16_t
|
||||||
|
```
|
||||||
|
|
||||||
|
After running whichever of the above `cmake` invocations is appropriate for
|
||||||
|
your weights, you can enter the `build/` directory and run `make` to build the
|
||||||
|
`./gemma` executable:
|
||||||
|
|
||||||
```sh
|
```sh
|
||||||
# Configure `build` directory
|
# Configure `build` directory
|
||||||
|
|
@ -124,14 +144,21 @@ cmake --preset make
|
||||||
cmake --build --preset make -j [number of parallel threads to use]
|
cmake --build --preset make -j [number of parallel threads to use]
|
||||||
```
|
```
|
||||||
|
|
||||||
If the `nproc` command is available, you can use `-j $(nproc)`.
|
Replace `[number of parallel threads to use]` with a number - the number of
|
||||||
|
cores available on your system is a reasonable heuristic. For example,
|
||||||
|
`make -j4 gemma` will build using 4 threads. If the `nproc` command is
|
||||||
|
available, you can use `make -j$(nproc) gemma` as a reasonable default
|
||||||
|
for the number of threads.
|
||||||
|
|
||||||
If this is successful, you should now have a `gemma` executable in the `build/` directory.
|
If you aren't sure of the right value for the `-j` flag, you can simply run
|
||||||
|
`make gemma` instead and it should still build the `./gemma` executable.
|
||||||
|
|
||||||
> [!NOTE]
|
> [!NOTE]
|
||||||
> On Windows Subsystem for Linux (WSL) users should set the number of
|
> On Windows Subsystem for Linux (WSL) users should set the number of
|
||||||
> parallel threads to 1. Using a larger number may result in errors.
|
> parallel threads to 1. Using a larger number may result in errors.
|
||||||
|
|
||||||
|
If the build is successful, you should now have a `gemma` executable in the `build/` directory.
|
||||||
|
|
||||||
#### Windows
|
#### Windows
|
||||||
|
|
||||||
```sh
|
```sh
|
||||||
|
|
@ -142,7 +169,7 @@ cmake --preset windows
|
||||||
cmake --build --preset windows -j [number of parallel threads to use]
|
cmake --build --preset windows -j [number of parallel threads to use]
|
||||||
```
|
```
|
||||||
|
|
||||||
If this is successful, you should now have a `gemma.exe` executable in the `build/` directory.
|
If the build is successful, you should now have a `gemma.exe` executable in the `build/` directory.
|
||||||
|
|
||||||
### Step 4: Run
|
### Step 4: Run
|
||||||
|
|
||||||
|
|
@ -179,6 +206,38 @@ Example invocation for the following configuration:
|
||||||
--model 2b-it
|
--model 2b-it
|
||||||
```
|
```
|
||||||
|
|
||||||
|
### Troubleshooting and FAQs
|
||||||
|
|
||||||
|
**Running `./gemma` fails with "Failed to read cache gating_ein_0 (error 294) ..."**
|
||||||
|
|
||||||
|
The most common problem is that `cmake` was run with the wrong weight type and
|
||||||
|
`gemma` is attempting to load `bfloat16` weights (`2b-it`, `2b-pt`, `7b-it`,
|
||||||
|
`7b-pt`) using the default switched floating point (sfp) or vice versa. Revisit
|
||||||
|
step #3 and check that the `cmake` command used to build `gemma` was correct for
|
||||||
|
the weights that you downloaded.
|
||||||
|
|
||||||
|
In the future we will move model format handling from compile time to runtime
|
||||||
|
to simplify this.
|
||||||
|
|
||||||
|
**Problems building in Windows / Visual Studio**
|
||||||
|
|
||||||
|
Currently if you're using Windows, we recommend building in WSL (Windows
|
||||||
|
Subsystem for Linux). We are exploring options to enable other build
|
||||||
|
configurations, see issues for active discussion.
|
||||||
|
|
||||||
|
**Model does not respond to instructions and produces strange output**
|
||||||
|
|
||||||
|
A common issue is that you are using a pre-trained model, which is not
|
||||||
|
instruction-tuned and thus does not respond to instructions. Make sure you are
|
||||||
|
using an instruction-tuned model (`2b-it-sfp`, `2b-it`, `7b-it-sfp`, `7b-it`)
|
||||||
|
and not a pre-trained model (any model with a `-pt` suffix).
|
||||||
|
|
||||||
|
**How do I convert my fine-tune to a `.sbs` compressed model file?**
|
||||||
|
|
||||||
|
We're working on a python script to convert a standard model format to `.sbs`,
|
||||||
|
and hope to have it available in the next week or so. Follow [this
|
||||||
|
issue](https://github.com/google/gemma.cpp/issues/11) for updates.
|
||||||
|
|
||||||
## Usage
|
## Usage
|
||||||
|
|
||||||
`gemma` has different usage modes, controlled by the verbosity flag.
|
`gemma` has different usage modes, controlled by the verbosity flag.
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,24 @@
|
||||||
|
workspace(name = "gemma")
|
||||||
|
|
||||||
|
load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive")
|
||||||
|
load("@bazel_tools//tools/build_defs/repo:utils.bzl", "maybe")
|
||||||
|
|
||||||
|
maybe(
|
||||||
|
http_archive,
|
||||||
|
name = "rules_license",
|
||||||
|
sha256 = "4531deccb913639c30e5c7512a054d5d875698daeb75d8cf90f284375fe7c360",
|
||||||
|
urls = [
|
||||||
|
"https://github.com/bazelbuild/rules_license/releases/download/0.0.7/rules_license-0.0.7.tar.gz",
|
||||||
|
],
|
||||||
|
)
|
||||||
|
|
||||||
|
maybe(
|
||||||
|
http_archive,
|
||||||
|
name = "com_google_sentencepiece",
|
||||||
|
sha256 = "8409b0126ebd62b256c685d5757150cf7fcb2b92a2f2b98efb3f38fc36719754",
|
||||||
|
strip_prefix = "sentencepiece-0.1.96",
|
||||||
|
urls = ["https://github.com/google/sentencepiece/archive/refs/tags/v0.1.96.zip"],
|
||||||
|
build_file = "@//third_party:sentencepiece.bazel",
|
||||||
|
patches = ["@//third_party:com_google_sentencepiece.patch"],
|
||||||
|
patch_args = ["-p1"],
|
||||||
|
)
|
||||||
4
gemma.cc
4
gemma.cc
|
|
@ -62,7 +62,6 @@
|
||||||
#include "hwy/contrib/thread_pool/thread_pool.h"
|
#include "hwy/contrib/thread_pool/thread_pool.h"
|
||||||
// copybara:import_next_line:sentencepiece
|
// copybara:import_next_line:sentencepiece
|
||||||
#include "src/sentencepiece_processor.h"
|
#include "src/sentencepiece_processor.h"
|
||||||
// #include "third_party/sentencepiece/src/util.h"
|
|
||||||
|
|
||||||
namespace gcpp {
|
namespace gcpp {
|
||||||
|
|
||||||
|
|
@ -205,8 +204,7 @@ struct Activations {
|
||||||
static constexpr size_t kQKVDim = TConfig::kQKVDim;
|
static constexpr size_t kQKVDim = TConfig::kQKVDim;
|
||||||
static constexpr size_t kHeads = TConfig::kHeads;
|
static constexpr size_t kHeads = TConfig::kHeads;
|
||||||
static constexpr size_t kKVHeads = TConfig::kKVHeads;
|
static constexpr size_t kKVHeads = TConfig::kKVHeads;
|
||||||
static constexpr size_t kCachePosSize =
|
static constexpr size_t kCachePosSize = TConfig::kLayers * kKVHeads * kQKVDim;
|
||||||
TConfig::kLayers * kKVHeads * kQKVDim;
|
|
||||||
static constexpr size_t kCacheLayerSize = kKVHeads * kQKVDim;
|
static constexpr size_t kCacheLayerSize = kKVHeads * kQKVDim;
|
||||||
|
|
||||||
std::array<float, kBatchSize * kModelDim> x; // input
|
std::array<float, kBatchSize * kModelDim> x; // input
|
||||||
|
|
|
||||||
4
gemma.h
4
gemma.h
|
|
@ -24,11 +24,11 @@
|
||||||
#include <string>
|
#include <string>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
|
|
||||||
// copybara:import_next_line:gemma_cpp
|
|
||||||
#include "configs.h" // kSeqLen
|
|
||||||
// copybara:import_next_line:gemma_cpp
|
// copybara:import_next_line:gemma_cpp
|
||||||
#include "compression/compress.h" // SfpStream/NuqStream
|
#include "compression/compress.h" // SfpStream/NuqStream
|
||||||
// copybara:import_next_line:gemma_cpp
|
// copybara:import_next_line:gemma_cpp
|
||||||
|
#include "configs.h" // kSeqLen
|
||||||
|
// copybara:import_next_line:gemma_cpp
|
||||||
#include "util/args.h" // ArgsBase
|
#include "util/args.h" // ArgsBase
|
||||||
#include "hwy/aligned_allocator.h"
|
#include "hwy/aligned_allocator.h"
|
||||||
#include "hwy/base.h" // hwy::bfloat16_t
|
#include "hwy/base.h" // hwy::bfloat16_t
|
||||||
|
|
|
||||||
1
ops.h
1
ops.h
|
|
@ -151,7 +151,6 @@ HWY_INLINE void FullDotProductsForStrip(
|
||||||
MaxCols(), vec_aligned, out);
|
MaxCols(), vec_aligned, out);
|
||||||
// For further multiples of MaxCols, accumulate. Remainders handled below.
|
// For further multiples of MaxCols, accumulate. Remainders handled below.
|
||||||
size_t c0 = MaxCols();
|
size_t c0 = MaxCols();
|
||||||
HWY_UNROLL(1)
|
|
||||||
for (; c0 <= mat_stride - MaxCols(); c0 += MaxCols()) {
|
for (; c0 <= mat_stride - MaxCols(); c0 += MaxCols()) {
|
||||||
AccumulatePartialDotProducts(df, mat, mat_ofs, mat_stride, r0, c0, num_rows,
|
AccumulatePartialDotProducts(df, mat, mat_ofs, mat_stride, r0, c0, num_rows,
|
||||||
MaxCols(), vec_aligned, out);
|
MaxCols(), vec_aligned, out);
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue