Merge pull request #130 from veluca93:weight-handling

PiperOrigin-RevId: 622405491
Committed by Copybara-Service on 2024-04-06 02:22:00 -07:00
commit 325ef06cf9
11 changed files with 424 additions and 270 deletions


@ -29,15 +29,15 @@
// copybara:import_next_line:gemma_cpp
#include "compression/blob_store.h"
#include <fcntl.h> // open
#include <stdint.h>
#include <stdio.h> // SEEK_END - unistd isn't enough for IDE.
#include <sys/stat.h> // O_RDONLY
#include <fcntl.h> // open
#if HWY_OS_WIN
#include <io.h> // read, write, close
#include <fileapi.h>
#include <io.h> // read, write, close
#else
#include <unistd.h> // read, write, close
#include <unistd.h> // read, write, close
#endif
#include <atomic>
@ -113,8 +113,9 @@ hwy::uint128_t MakeKey(const char* string) {
return ret;
}
static void EnqueueChunkRequests(uint64_t offset, uint64_t size, uint8_t* data,
std::vector<BlobIO>& requests) {
namespace {
void EnqueueChunkRequests(uint64_t offset, uint64_t size, uint8_t* data,
std::vector<BlobIO>& requests) {
// Split into chunks for load-balancing even if blob sizes vary.
constexpr size_t kChunkSize = 4 * 1024 * 1024;
@ -129,7 +130,7 @@ static void EnqueueChunkRequests(uint64_t offset, uint64_t size, uint8_t* data,
requests.emplace_back(offset + pos, size - pos, data + pos, 0);
}
}
} // namespace
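
Aside (illustration only, not part of this change): the chunking that EnqueueChunkRequests performs can be shown with a standalone sketch. The names Request and SplitIntoChunks are made up for illustration; only the 4 MiB chunk size and the split-plus-remainder idea mirror the code above, whose full loop body is not visible in this hunk.

  #include <cstdint>
  #include <cstdio>
  #include <vector>

  // Toy stand-in for BlobIO: one I/O request covering [offset, offset + size).
  struct Request {
    uint64_t offset;
    uint64_t size;
  };

  // Split one blob into ~4 MiB requests so a thread pool can load-balance
  // across blobs of very different sizes.
  std::vector<Request> SplitIntoChunks(uint64_t offset, uint64_t size) {
    constexpr uint64_t kChunkSize = 4 * 1024 * 1024;
    std::vector<Request> requests;
    uint64_t pos = 0;
    for (; pos + kChunkSize <= size; pos += kChunkSize) {
      requests.push_back({offset + pos, kChunkSize});
    }
    if (pos < size) {  // remainder smaller than one chunk
      requests.push_back({offset + pos, size - pos});
    }
    return requests;
  }

  int main() {
    // A 10 MiB blob becomes two 4 MiB chunks plus a 2 MiB remainder.
    for (const Request& r : SplitIntoChunks(/*offset=*/256, 10 * 1024 * 1024)) {
      std::printf("offset=%llu size=%llu\n",
                  static_cast<unsigned long long>(r.offset),
                  static_cast<unsigned long long>(r.size));
    }
    return 0;
  }
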
struct IO {
// Returns size in bytes or 0.
@ -197,12 +198,6 @@ static_assert(HWY_IS_LITTLE_ENDIAN, "Assumes little endian");
class BlobStore {
static constexpr uint32_t kMagic = 0x0A534253; // SBS\n
// Blob offsets on disk and memory addresses are a multiple of this, because
// we pad the header and each blob's size. This matches CUDA alignment and the
// maximum SVE vector size, and exceeds typical x86 cache line sizes (64 or
// 128), which can help performance.
static constexpr size_t kAlign = 256;
public:
// NOT including padding, so that we can also use ZeroFillPadding after
// copying the header.
@ -215,13 +210,13 @@ class BlobStore {
// blobs. Requires num_blobs_ to already be set, typically by reading
// sizeof(BlobStore) bytes from disk.
size_t PaddedHeaderSize() const {
return hwy::RoundUpTo(HeaderSize(num_blobs_), kAlign);
return hwy::RoundUpTo(HeaderSize(num_blobs_), kBlobAlign);
}
// Returns aligned offset and zero-fills between that and `offset`.
uint64_t ZeroFillPadding(uint64_t offset) {
uint8_t* const bytes = reinterpret_cast<uint8_t*>(this);
const uint64_t padded = hwy::RoundUpTo(offset, kAlign);
const uint64_t padded = hwy::RoundUpTo(offset, kBlobAlign);
hwy::ZeroBytes(bytes + offset, padded - offset);
return padded;
}
@ -236,7 +231,7 @@ class BlobStore {
for (size_t i = 0; i < num_blobs_; ++i) {
const hwy::uint128_t val = keys_[num_blobs_ + i];
if (val.lo != offset) return __LINE__;
offset = ZeroFillPadding(offset + val.hi);
offset = hwy::RoundUpTo(offset + val.hi, kBlobAlign);
}
if (offset != file_size_) return __LINE__;
@ -253,25 +248,24 @@ class BlobStore {
static std::vector<BlobIO> PrepareWriteRequests(
const hwy::uint128_t keys[], const hwy::Span<uint8_t> blobs[],
size_t num_blobs) {
size_t num_blobs, BlobStore* bs) {
// Sanity check and ensure the cast below is safe.
HWY_ASSERT(num_blobs < (1ULL << 20));
// Allocate var-length header.
const size_t header_size = HeaderSize(num_blobs);
const size_t padded_header_size = hwy::RoundUpTo(header_size, kAlign);
BlobStorePtr bs = Allocate(padded_header_size);
const size_t padded_header_size = hwy::RoundUpTo(header_size, kBlobAlign);
const uint64_t padded_header_end = bs->ZeroFillPadding(header_size);
HWY_ASSERT(padded_header_end == padded_header_size);
// All-zero buffer used to write padding to the file without copying the
// input blobs.
static uint8_t zeros[kAlign] = {0};
static uint8_t zeros[kBlobAlign] = {0};
// Total file size will be the header plus all padded blobs.
uint64_t payload = 0;
for (size_t i = 0; i < num_blobs; ++i) {
payload += hwy::RoundUpTo(blobs[i].size(), kAlign);
payload += hwy::RoundUpTo(blobs[i].size(), kBlobAlign);
}
const size_t total_size = padded_header_size + payload;
@ -285,7 +279,7 @@ class BlobStore {
std::vector<BlobIO> requests;
requests.reserve(1 + 2 * num_blobs);
requests.emplace_back(/*offset=*/0, padded_header_size,
reinterpret_cast<uint8_t*>(bs.get()), 0);
reinterpret_cast<uint8_t*>(bs), 0);
// Fill second half of keys_ with offset/size and prepare IO requests.
uint64_t offset = padded_header_end;
@ -295,10 +289,10 @@ class BlobStore {
EnqueueChunkRequests(offset, blobs[i].size(), blobs[i].data(), requests);
offset += blobs[i].size();
const size_t padded_size = hwy::RoundUpTo(blobs[i].size(), kAlign);
const size_t padded_size = hwy::RoundUpTo(blobs[i].size(), kBlobAlign);
if (padded_size != blobs[i].size()) {
const size_t padding = padded_size - blobs[i].size();
HWY_ASSERT(padding <= kAlign);
HWY_ASSERT(padding <= kBlobAlign);
requests.emplace_back(offset, padding, zeros, 0);
offset += padding;
}
@ -418,8 +412,11 @@ BlobError BlobWriter::WriteAll(hwy::ThreadPool& pool,
HWY_ASSERT(keys_.size() == blobs_.size());
// Concatenate blobs in memory.
const size_t header_size = BlobStore::HeaderSize(keys_.size());
const size_t padded_header_size = hwy::RoundUpTo(header_size, kBlobAlign);
BlobStorePtr bs = BlobStore::Allocate(padded_header_size);
std::vector<BlobIO> requests = BlobStore::PrepareWriteRequests(
keys_.data(), blobs_.data(), keys_.size());
keys_.data(), blobs_.data(), keys_.size(), bs.get());
// Create/replace existing file.
#if HWY_OS_WIN


@ -40,6 +40,12 @@ using BlobStorePtr = hwy::AlignedFreeUniquePtr<BlobStore>;
// 0 if successful, otherwise the line number of the failing check.
using BlobError = int;
// Blob offsets on disk and memory addresses are a multiple of this, because
// we pad the header and each blob's size. This matches CUDA alignment and the
// maximum SVE vector size, and exceeds typical x86 cache line sizes (64 or
// 128), which can help performance.
static constexpr size_t kBlobAlign = 256;
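
Aside (illustration only, not part of this change): with kBlobAlign = 256, every blob's start offset and padded size become multiples of 256 bytes. A small sketch of the layout computation, using a local RoundUpTo with the same meaning as hwy::RoundUpTo; the header and blob sizes are made-up example values.

  #include <cstdint>
  #include <cstdio>

  constexpr uint64_t kBlobAlign = 256;

  // Same contract as hwy::RoundUpTo: smallest multiple of `align` >= `value`.
  constexpr uint64_t RoundUpTo(uint64_t value, uint64_t align) {
    return (value + align - 1) / align * align;
  }

  int main() {
    // Example: a 1000-byte header followed by blobs of 300 and 4096 bytes.
    const uint64_t padded_header = RoundUpTo(1000, kBlobAlign);  // 1024
    uint64_t offset = padded_header;
    const uint64_t blob_sizes[] = {300, 4096};
    for (uint64_t size : blob_sizes) {
      std::printf("blob at offset %llu, %llu bytes, padded to %llu\n",
                  static_cast<unsigned long long>(offset),
                  static_cast<unsigned long long>(size),
                  static_cast<unsigned long long>(RoundUpTo(size, kBlobAlign)));
      offset += RoundUpTo(size, kBlobAlign);
    }
    // `offset` is now the padded payload end, as in CheckValidity above.
    return 0;
  }
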
struct BlobIO {
BlobIO(uint64_t offset, size_t size, void* data, uint64_t padding)
: offset(offset), size(size), data(data), padding(padding) {}


@ -381,13 +381,14 @@ HWY_INLINE void Compress(const std::array<float, kCapacity>& in,
}
// Decompresses `num` values from `compressed` starting at `compressed_ofs`.
template <typename MatT, size_t kCapacity, typename OutT>
HWY_NOINLINE void Decompress(const CompressedArray<MatT, kCapacity>& compressed,
size_t compressed_ofs, OutT* out, size_t num) {
HWY_DASSERT(compressed_ofs + num <= compressed.NumElements());
template <typename ArrayT, typename OutT>
HWY_NOINLINE void Decompress(const ArrayT& compressed, size_t compressed_ofs,
OutT* out, size_t num) {
HWY_DASSERT(compressed_ofs + num <= compressed.size());
const hn::ScalableTag<OutT> d;
using Traits = CompressTraits<MatT>;
Traits::Decompress(d, kCapacity, compressed.data(), compressed_ofs, out, num);
using Traits = CompressTraits<typename ArrayT::value_type>;
Traits::Decompress(d, compressed.size(), compressed.data(), compressed_ofs,
out, num);
}
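
Aside (illustration only, not part of this change): after this refactor, Decompress no longer needs the exact CompressedArray type; any array-like type exposing value_type, data(), and size() will do. A toy sketch of that implicit interface, with a plain-float "decompress" standing in for CompressTraits; all names here are hypothetical.

  #include <array>
  #include <cstddef>
  #include <cstdio>

  // Any type with value_type, data(), and size() satisfies the implicit
  // "ArrayT" interface used by the templated Decompress above.
  template <typename ArrayT, typename OutT>
  void ToyDecompress(const ArrayT& compressed, size_t ofs, OutT* out,
                     size_t num) {
    // Stand-in for CompressTraits<...>::Decompress: storage here is already
    // plain float, so this is just a copy/convert.
    for (size_t i = 0; i < num; ++i) {
      out[i] = static_cast<OutT>(compressed.data()[ofs + i]);
    }
  }

  // A minimal array-like wrapper; CompressedArray provides the same members.
  struct ToyArray {
    using value_type = float;
    const float* data() const { return storage.data(); }
    size_t size() const { return storage.size(); }
    std::array<float, 4> storage{1.0f, 2.0f, 3.0f, 4.0f};
  };

  int main() {
    ToyArray arr;
    double out[2];
    ToyDecompress(arr, /*ofs=*/1, out, /*num=*/2);
    std::printf("%g %g\n", out[0], out[1]);  // 2 3
    return 0;
  }
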
// As above, but with threading and benchmarking.
@ -395,7 +396,7 @@ template <typename MatT, size_t kCapacity, typename OutT>
HWY_INLINE void Decompress(const CompressedArray<MatT, kCapacity>& compressed,
size_t compressed_ofs, OutT* out, size_t num,
hwy::ThreadPool& pool) {
HWY_DASSERT(compressed_ofs + num <= compressed.NumElements());
HWY_DASSERT(compressed_ofs + num <= compressed.size());
const double t0 = hwy::platform::Now();
using Traits = CompressTraits<MatT>;
@ -407,7 +408,7 @@ HWY_INLINE void Decompress(const CompressedArray<MatT, kCapacity>& compressed,
const size_t ofs = idx_batch * kBatch;
const size_t num = idx_batch == num_batches - 1 ? (num - ofs) : kBatch;
Traits::Decompress(d, compressed.NumElements(), compressed.data(),
Traits::Decompress(d, compressed.size(), compressed.data(),
compressed_ofs + ofs, out + ofs, num);
});
@ -417,16 +418,28 @@ HWY_INLINE void Decompress(const CompressedArray<MatT, kCapacity>& compressed,
fprintf(stderr, "Decompress %.1f MB/s\n", mbps);
}
// Returns dot product with `vec_aligned` of length `num`.
template <class DF, typename ArrayT, typename VecT>
HWY_INLINE float Dot(DF df, const ArrayT& compressed, size_t compressed_ofs,
const VecT* vec_aligned, size_t num) {
HWY_DASSERT(compressed_ofs + num <= compressed.size());
HWY_DASSERT(hn::IsAligned(df, vec_aligned));
using Traits = CompressTraits<typename ArrayT::value_type>;
return Traits::Dot(df, compressed.size(), compressed.data(), compressed_ofs,
vec_aligned, num);
}
// Returns dot product with `vec_aligned` of length `num`.
template <class DF, typename MatT, size_t kCapacity, typename VecT>
HWY_INLINE float Dot(DF df, const CompressedArray<MatT, kCapacity>& compressed,
size_t compressed_ofs, const VecT* vec_aligned,
size_t num) {
HWY_DASSERT(compressed_ofs + num <= compressed.NumElements());
HWY_DASSERT(compressed_ofs + num <= compressed.size());
HWY_DASSERT(hn::IsAligned(df, vec_aligned));
using Traits = CompressTraits<MatT>;
return Traits::Dot(df, kCapacity, compressed.data(), compressed_ofs,
vec_aligned, num);
return (compressed.scale() * Traits::Dot(df, compressed.size(),
compressed.data(), compressed_ofs,
vec_aligned, num));
}
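
Aside (illustration only, not part of this change): the CompressedArray overload of Dot now folds the per-tensor scale back in. Because a dot product is linear, dividing the stored weights by `scale` before compression and multiplying the result by `scale` afterwards recovers the original value. A scalar sketch of that identity, with hypothetical names and no SIMD; 1.875 is the kMaxRange used by ScaleWeights in gemma.cc.

  #include <cstddef>
  #include <cstdio>

  // Reference (scalar) version of "dot of scaled-down weights, times scale".
  float ScaledDot(const float* stored, float scale, const float* vec, size_t n) {
    float sum = 0.0f;
    for (size_t i = 0; i < n; ++i) sum += stored[i] * vec[i];
    return scale * sum;  // same role as compressed.scale() in Dot() above
  }

  int main() {
    const float w[] = {3.0f, -2.5f, 4.0f};  // original weights, maxabs = 4
    const float v[] = {1.0f, 0.5f, 2.0f};
    const float scale = 4.0f / 1.875f;       // as ScaleWeights would compute
    float stored[3];
    for (int i = 0; i < 3; ++i) stored[i] = w[i] / scale;  // what is compressed
    float exact = 0.0f;
    for (int i = 0; i < 3; ++i) exact += w[i] * v[i];
    std::printf("exact=%g scaled=%g\n", exact, ScaledDot(stored, scale, v, 3));
    return 0;
  }
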
// Callback used by ForeachTensor.
@ -445,6 +458,12 @@ class Compressor {
compressed.CompressedSize());
}
void AddScales(float* scales, size_t len) {
if (len) {
writer_.Add(CacheKey<float>("scales"), scales, len * sizeof(scales[0]));
}
}
void WriteAll(hwy::ThreadPool& pool, const char* blob_filename) {
const BlobError err = writer_.WriteAll(pool, blob_filename);
if (err != 0) {


@ -71,10 +71,15 @@ class CompressedArray {
}
public:
using value_type = MatT;
MatT* data() { return data_.data(); }
const MatT* data() const { return data_.data(); }
constexpr size_t NumElements() const { return kCapacity; }
float scale() const { return scale_[0]; }
void set_scale(float scale) { scale_[0] = scale; }
constexpr size_t size() const { return kCapacity; }
constexpr size_t CompressedSize() const {
return NumCompressed() * sizeof(MatT);
@ -82,6 +87,7 @@ class CompressedArray {
private:
std::array<MatT, NumCompressed()> data_;
float scale_[kBlobAlign / sizeof(float)];
};
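
Aside (illustration only, not part of this change): each CompressedArray now carries one logical float scale (CacheLoader below defaults it to 1.0f), but reserves kBlobAlign/sizeof(float) floats for it, presumably so the struct layout stays a multiple of the 256-byte blob alignment. A toy analogue of the same idea; ToyCompressedArray is hypothetical.

  #include <cstddef>
  #include <cstdio>

  constexpr size_t kBlobAlign = 256;

  // Toy analogue of CompressedArray: payload plus a single logical scale,
  // stored in a block padded out to kBlobAlign bytes.
  template <typename T, size_t kCapacity>
  struct ToyCompressedArray {
    T* data() { return data_; }
    float scale() const { return scale_[0]; }
    void set_scale(float s) { scale_[0] = s; }
    constexpr size_t size() const { return kCapacity; }

   private:
    T data_[kCapacity];
    float scale_[kBlobAlign / sizeof(float)];  // only scale_[0] is used
  };

  int main() {
    ToyCompressedArray<float, 8> arr;
    arr.set_scale(1.0f);  // default when no "scales" blob exists
    std::printf("scale=%g bytes_reserved_for_scale=%zu\n", arr.scale(),
                sizeof(float) * (kBlobAlign / sizeof(float)));  // 256
    return 0;
  }
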
#if COMPRESS_STATS
@ -187,11 +193,21 @@ class CacheLoader {
err_ = reader_.Enqueue(CacheKey<MatT>(name), compressed.data(),
compressed.CompressedSize());
compressed.set_scale(1.0f);
if (err_ != 0) {
fprintf(stderr, "Failed to read cache %s (error %d)\n", name, err_);
}
}
void LoadScales(float* scales, size_t len) {
if (0 != reader_.Enqueue(CacheKey<float>("scales"), scales,
len * sizeof(scales[0]))) {
for (size_t i = 0; i < len; ++i) {
scales[i] = 1.0f;
}
}
}
// Returns whether all tensors are successfully loaded from cache.
bool ReadAll(hwy::ThreadPool& pool) {
// reader_ invalid or any Enqueue failed

View File

@ -30,6 +30,16 @@
#include <stddef.h>
// copybara:import_next_line:gemma_cpp
#include "compression/sfp.h"
#include "hwy/base.h" // hwy::bfloat16_t
// Allowable types for GEMMA_WEIGHT_T (can be specified at compilation time):
// float, hwy::bfloat16_t, SfpStream, NuqStream
#ifndef GEMMA_WEIGHT_T
#define GEMMA_WEIGHT_T SfpStream
#endif // !GEMMA_WEIGHT_T
namespace gcpp {
static constexpr size_t kSeqLen = GEMMA_MAX_SEQLEN;
@ -45,6 +55,8 @@ struct ConfigGemma7B {
static constexpr int kKVHeads = 16; // standard MHA
static constexpr int kQKVDim = 256; // query size == key size == value size
static constexpr int kTopK = gcpp::kTopK;
static constexpr int kNumTensorScales = 0;
using WeightT = GEMMA_WEIGHT_T;
};
struct ConfigGemma2B {
@ -57,6 +69,8 @@ struct ConfigGemma2B {
static constexpr int kKVHeads = 1;
static constexpr int kQKVDim = 256; // query size == key size == value size
static constexpr int kTopK = gcpp::kTopK;
static constexpr int kNumTensorScales = 0;
using WeightT = GEMMA_WEIGHT_T;
};
} // namespace gcpp


@ -19,9 +19,9 @@
#include "gemma.h"
// copybara:import_next_line:gemma_cpp
#include "util/app.h" // LoaderArgs
#include "hwy/contrib/thread_pool/thread_pool.h"
// copybara:import_next_line:gemma_cpp
#include "util/args.h"
#include "hwy/contrib/thread_pool/thread_pool.h"
std::vector<int> tokenize(
const std::string& prompt_string,
@ -43,8 +43,7 @@ int main(int argc, char** argv) {
hwy::ThreadPool pool(num_threads);
// Instantiate model and KV Cache
gcpp::Gemma model(loader.tokenizer, loader.compressed_weights,
loader.ModelType(), pool);
gcpp::Gemma model(loader.tokenizer, loader.weights, loader.ModelType(), pool);
auto kv_cache = CreateKVCache(loader.ModelType());
size_t pos = 0; // KV Cache position

gemma.cc

@ -19,18 +19,18 @@
// which we pass the filename via macro 'argument'.
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "gemma.cc" // NOLINT
#include "hwy/foreach_target.h" // IWYU pragma: keep
#include "hwy/foreach_target.h" // IWYU pragma: keep
// Must come after foreach_target.h to avoid redefinition errors.
// copybara:import_next_line:gemma_cpp
#include "compression/compress-inl.h"
// copybara:import_next_line:gemma_cpp
#include "ops.h"
// copybara:import_next_line:gemma_cpp
#include "util/args.h" // Path
#include "hwy/contrib/matvec/matvec-inl.h"
#include "hwy/highway.h"
#include "hwy/profiler.h"
#include "hwy/timer.h"
// copybara:import_next_line:gemma_cpp
#include "util/args.h" // Path
// Non-SIMD includes and types. Note that HWY_ONCE is only true on the last
// compile pass, whereas we want this defined in the first.
@ -64,6 +64,12 @@
// copybara:import_next_line:sentencepiece
#include "src/sentencepiece_processor.h"
// Setting this to true disables fread() calls that read the model file.
constexpr bool kDryRunFread = false;
// Setting this to false will load and use uncompressed weights.
constexpr bool kWeightsAreCompressed = true;
namespace gcpp {
template <class TConfig>
@ -88,70 +94,145 @@ struct Layer {
std::array<float, kModelDim> pre_ffw_norm_scale;
};
float ScaleWeights(float* data, size_t len) {
float maxabs = 0.0;
for (size_t i = 0; i < len; ++i) {
maxabs = std::max(maxabs, std::abs(data[i]));
}
const float kMaxRange = 1.875f;
if (maxabs <= kMaxRange) {
return 1.0f;
}
const float scale = maxabs / kMaxRange;
const float inv_scale = 1.0f / scale;
for (size_t i = 0; i < len; ++i) {
data[i] *= inv_scale;
}
return scale;
}
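
Aside (illustration only, not part of this change): ScaleWeights rescales a tensor only when its max-abs value exceeds kMaxRange = 1.875, and returns the factor that is later re-applied via CompressedArray::scale() in compress-inl.h. A standalone driver showing both branches; ToyScaleWeights is a local copy of the function above so the snippet compiles on its own.

  #include <algorithm>
  #include <cmath>
  #include <cstddef>
  #include <cstdio>

  // Local copy of ScaleWeights from above, for a standalone demonstration.
  float ToyScaleWeights(float* data, size_t len) {
    float maxabs = 0.0f;
    for (size_t i = 0; i < len; ++i) maxabs = std::max(maxabs, std::abs(data[i]));
    const float kMaxRange = 1.875f;
    if (maxabs <= kMaxRange) return 1.0f;  // already in range: unchanged
    const float scale = maxabs / kMaxRange;
    const float inv_scale = 1.0f / scale;
    for (size_t i = 0; i < len; ++i) data[i] *= inv_scale;
    return scale;
  }

  int main() {
    float small[] = {0.5f, -1.0f};  // within range -> scale 1
    float large[] = {7.5f, -3.0f};  // maxabs 7.5 -> scale 4
    std::printf("scale(small)=%g\n", ToyScaleWeights(small, 2));
    std::printf("scale(large)=%g largest now %g\n", ToyScaleWeights(large, 2),
                large[0]);  // 7.5 / 4 = 1.875
    return 0;
  }
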
// Array instead of single large allocation for parallel mem init. Split out of
// Weights so that only these pointers are initialized.
template <class TConfig>
struct LayerPointers {
explicit LayerPointers(hwy::ThreadPool& pool) {
pool.Run(0, TConfig::kLayers, [this](uint64_t task, size_t /*thread*/) {
this->layers[task] = hwy::AllocateAligned<Layer<TConfig>>(1);
});
}
using TLayer = Layer<TConfig>;
std::array<hwy::AlignedFreeUniquePtr<TLayer[]>, TConfig::kLayers> layers;
};
template <class TConfig>
struct Weights {
Weights() = default;
hwy::AlignedUniquePtr<Layer<TConfig>[]> layers; // kLayers
// No ctor/dtor, allocated via AllocateAligned.
std::array<float, TConfig::kVocabSize * TConfig::kModelDim>
embedder_input_embedding;
std::array<float, TConfig::kModelDim> final_norm_scale;
LayerPointers<TConfig> layer_ptrs;
std::array<float, TConfig::kNumTensorScales> scales;
const Layer<TConfig>* GetLayer(size_t layer) const {
return layer_ptrs.layers[layer].get();
}
Layer<TConfig>* GetLayer(size_t layer) {
return layer_ptrs.layers[layer].get();
}
};
// Only called if cached loading fails.
template <typename TConfig>
hwy::AlignedUniquePtr<Weights<TConfig>> LoadWeights(const Path& checkpoint) {
hwy::AlignedFreeUniquePtr<uint8_t[]> LoadWeights(
const Path& checkpoint, hwy::ThreadPool& pool,
bool scale_for_compression = false) {
PROFILER_ZONE("Startup.LoadWeights");
using TWeights = Weights<TConfig>;
hwy::AlignedUniquePtr<TWeights> weights = hwy::MakeUniqueAligned<TWeights>();
weights->layers =
hwy::MakeUniqueAlignedArray<Layer<TConfig>>(TConfig::kLayers);
if (checkpoint.path.empty()) {
HWY_ABORT(
"Loading --compressed_weights failed; we require a --weights argument. "
"Please see issue #11 on how to create this file.\n");
if (!std::filesystem::exists(checkpoint.path)) {
HWY_ABORT("The model weights file '%s' does not exist.",
checkpoint.path.c_str());
}
using TWeights = Weights<TConfig>;
hwy::AlignedFreeUniquePtr<uint8_t[]> weights_u8 =
hwy::AllocateAligned<uint8_t>(sizeof(TWeights));
TWeights* weights = reinterpret_cast<TWeights*>(weights_u8.get());
new (&weights->layer_ptrs) LayerPointers<TConfig>(pool);
size_t scale_pos = 0;
FILE* fptr;
fptr = fopen(checkpoint.path.c_str(), "rb");
if (fptr == nullptr) {
HWY_ABORT("Failed to open model file %s - does it exist?",
checkpoint.path.c_str());
if constexpr (kDryRunFread) {
fprintf(stderr, "Dry-Run, not reading model-file.\n");
} else {
fptr = fopen(checkpoint.path.c_str(), "rb");
if (fptr == nullptr) {
HWY_ABORT("Failed to open model file %s - does it exist?",
checkpoint.path.c_str());
}
}
bool ok = true;
uint64_t total_size = 0;
ok &= 1 == fread(&(weights->embedder_input_embedding),
sizeof(weights->embedder_input_embedding), 1, fptr);
ok &= 1 == fread(&(weights->final_norm_scale),
sizeof(weights->final_norm_scale), 1, fptr);
total_size += sizeof(weights->embedder_input_embedding) +
sizeof(weights->final_norm_scale);
auto do_fread = [&](void* var, int layer, const char* name, size_t size) {
if (layer == -1) {
fprintf(stderr, "Loading Parameters (size %zu): %s\n", size, name);
} else {
fprintf(stderr, "Loading Parameters (layer=%d, size %zu): %s\n", layer,
size, name);
}
if constexpr (!kDryRunFread) {
ok &= 1 == fread(var, size, 1, fptr);
total_size += size;
}
};
do_fread(&(weights->embedder_input_embedding), -1, "embedder_input_embedding",
sizeof(weights->embedder_input_embedding));
do_fread(&(weights->final_norm_scale), -1, "final_norm_scale",
sizeof(weights->final_norm_scale));
for (size_t layer = 0; layer < TConfig::kLayers; ++layer) {
Layer<TConfig>* layer_view = &weights->layers[layer];
ok &= 1 == fread(&layer_view->attn_vec_einsum_w,
sizeof(layer_view->attn_vec_einsum_w), 1, fptr);
ok &= 1 == fread(&layer_view->qkv_einsum_w,
sizeof(layer_view->qkv_einsum_w), 1, fptr);
ok &= 1 == fread(&layer_view->gating_einsum_w,
sizeof(layer_view->gating_einsum_w), 1, fptr);
ok &= 1 ==
fread(&layer_view->linear_w, sizeof(layer_view->linear_w), 1, fptr);
ok &= 1 == fread(&layer_view->pre_attention_norm_scale,
sizeof(layer_view->pre_attention_norm_scale), 1, fptr);
ok &= 1 == fread(&layer_view->pre_ffw_norm_scale,
sizeof(layer_view->pre_ffw_norm_scale), 1, fptr);
total_size += sizeof(*layer_view);
Layer<TConfig>* layer_view = weights->GetLayer(layer);
#define READ_WEIGHTS(name) \
do { \
do_fread(&(layer_view->name), layer, #name, sizeof(layer_view->name)); \
} while (0)
#define SCALE_WEIGHTS(name) \
do { \
if (ok && !kDryRunFread && scale_for_compression) { \
weights->scales[scale_pos++] = \
ScaleWeights(layer_view->name.data(), layer_view->name.size()); \
} \
} while (0)
// Make sure we don't have uninitialized memory.
hwy::ZeroBytes(layer_view, sizeof(*layer_view));
READ_WEIGHTS(attn_vec_einsum_w);
READ_WEIGHTS(qkv_einsum_w);
SCALE_WEIGHTS(attn_vec_einsum_w);
SCALE_WEIGHTS(qkv_einsum_w);
READ_WEIGHTS(gating_einsum_w);
READ_WEIGHTS(linear_w);
SCALE_WEIGHTS(gating_einsum_w);
SCALE_WEIGHTS(linear_w);
READ_WEIGHTS(pre_attention_norm_scale);
READ_WEIGHTS(pre_ffw_norm_scale);
#undef READ_WEIGHTS
}
if (!ok) {
HWY_ABORT("Failed to read from %s - might be a directory, or too small? "
"expected size: %d kB", checkpoint.path.c_str(),
static_cast<uint32_t>(total_size >> 10));
HWY_ABORT(
"Failed to read from %s - might be a directory, or too small? "
"expected size: %d kB",
checkpoint.path.c_str(), static_cast<uint32_t>(total_size >> 10));
}
HWY_ASSERT(0 == fclose(fptr));
return weights;
if (!kDryRunFread) {
HWY_ASSERT(0 == fclose(fptr));
if (scale_for_compression) {
HWY_ASSERT(scale_pos == TConfig::kNumTensorScales);
}
}
return weights_u8;
}
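
Aside (illustration only, not part of this change): the READ_WEIGHTS macro above relies on two small preprocessor tricks, stringizing the member name (#name) and taking sizeof of that member. A minimal standalone sketch of the same pattern; ToyLayer and the printf-based do_fread are stand-ins, not the real fread-backed helper.

  #include <cstddef>
  #include <cstdio>
  #include <cstring>

  struct ToyLayer {
    float attn_vec_einsum_w[8];
    float qkv_einsum_w[8];
  };

  int main() {
    ToyLayer layer{};
    // Stand-in for the real do_fread lambda: reports instead of reading a file.
    auto do_fread = [&](void* var, const char* name, size_t size) {
      std::printf("would read %zu bytes into %s\n", size, name);
      std::memset(var, 0, size);
    };
  #define READ_WEIGHTS(name) \
    do { do_fread(&(layer.name), #name, sizeof(layer.name)); } while (0)
    READ_WEIGHTS(attn_vec_einsum_w);
    READ_WEIGHTS(qkv_einsum_w);
  #undef READ_WEIGHTS
    return 0;
  }
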
template <class TConfig>
@ -159,18 +240,19 @@ struct CompressedLayer {
// No ctor/dtor, allocated via AllocateAligned.
using TLayer = gcpp::Layer<TConfig>;
using WeightT = typename TConfig::WeightT;
static constexpr size_t kModelDim = TConfig::kModelDim;
static constexpr size_t kFFHiddenDim = TConfig::kFFHiddenDim;
// Compressed Parameters
// We don't yet have an RMSNorm that accepts all WeightT.
CompressedArray<hwy::bfloat16_t, kModelDim> c_pre_attention_norm_scale;
CompressedArray<hwy::bfloat16_t, kModelDim> c_pre_ffw_norm_scale;
CompressedArray<WeightT, TLayer::kGatingEinsumWSize> c_gating_einsum_w;
CompressedArray<WeightT, kModelDim * kFFHiddenDim> c_linear_w;
CompressedArray<WeightT, TLayer::kQKVEinsumWSize> c_qkv_einsum_w;
CompressedArray<WeightT, TLayer::kAttVecEinsumWSize> c_attn_vec_einsum_w;
CompressedArray<hwy::bfloat16_t, kModelDim> pre_attention_norm_scale;
CompressedArray<hwy::bfloat16_t, kModelDim> pre_ffw_norm_scale;
CompressedArray<WeightT, TLayer::kGatingEinsumWSize> gating_einsum_w;
CompressedArray<WeightT, kModelDim * kFFHiddenDim> linear_w;
CompressedArray<WeightT, TLayer::kQKVEinsumWSize> qkv_einsum_w;
CompressedArray<WeightT, TLayer::kAttVecEinsumWSize> attn_vec_einsum_w;
};
// Array instead of single large allocation for parallel mem init. Split out of
@ -193,21 +275,25 @@ struct CompressedWeights {
// No ctor/dtor, allocated via AllocateAligned.
CompressedArray<EmbedderInputT, TConfig::kVocabSize * TConfig::kModelDim>
c_embedder_input_embedding;
embedder_input_embedding;
CompressedArray<hwy::bfloat16_t, TConfig::kModelDim> c_final_norm_scale;
CompressedArray<hwy::bfloat16_t, TConfig::kModelDim> final_norm_scale;
// Must be last so that the other arrays remain aligned.
CompressedLayerPointers<TConfig> c_layer_ptrs;
const CompressedLayer<TConfig>* CLayer(size_t layer) const {
const CompressedLayer<TConfig>* GetLayer(size_t layer) const {
return c_layer_ptrs.c_layers[layer].get();
}
CompressedLayer<TConfig>* CLayer(size_t layer) {
CompressedLayer<TConfig>* GetLayer(size_t layer) {
return c_layer_ptrs.c_layers[layer].get();
}
};
template <class TConfig>
using WeightsT = hwy::If<kWeightsAreCompressed, CompressedWeights<TConfig>,
Weights<TConfig>>;
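
Aside (illustration only, not part of this change): WeightsT uses hwy::If to pick the weight representation at compile time from the kWeightsAreCompressed flag; std::conditional_t from the standard library plays the same role. A toy sketch with made-up types:

  #include <type_traits>

  // Toy stand-ins; the real code selects between CompressedWeights<TConfig>
  // and Weights<TConfig>.
  struct CompressedToy { static constexpr bool kCompressed = true; };
  struct UncompressedToy { static constexpr bool kCompressed = false; };

  constexpr bool kWeightsAreCompressed = true;

  // std::conditional_t is the standard-library analogue of hwy::If above.
  using ToyWeightsT =
      std::conditional_t<kWeightsAreCompressed, CompressedToy, UncompressedToy>;
  static_assert(ToyWeightsT::kCompressed, "selected at compile time");

  int main() { return 0; }
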
// Aligned.
template <class TConfig, size_t TBatchSize>
struct Activations {
@ -272,16 +358,27 @@ KVCache CreateKVCache(Model type) {
}
}
namespace {
template <class Config>
void DeleteLayersPtrs(CompressedWeights<Config>* c_weights) {
c_weights->c_layer_ptrs.~CompressedLayerPointers<Config>();
}
template <class Config>
void DeleteLayersPtrs(Weights<Config>* weights) {
weights->layer_ptrs.~LayerPointers<Config>();
}
} // namespace
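
Aside (illustration only, not part of this change): the explicit destructor calls above (and in ~GemmaImpl) are needed because the weight structs live in raw byte buffers from AllocateAligned, so only the member that owns resources is ever constructed, via placement new, and it must be destroyed by hand. A standalone sketch of that pattern with toy types:

  #include <cstdint>
  #include <memory>
  #include <new>
  #include <vector>

  // Toy analogue of LayerPointers: the one member that owns heap resources.
  struct ToyLayerTable {
    ToyLayerTable() : layers(4) {}
    std::vector<std::unique_ptr<float[]>> layers;
  };

  // Toy analogue of Weights/CompressedWeights: mostly plain data, no ctor run.
  struct ToyWeights {
    float big_blob[1024];      // fine to leave unconstructed
    ToyLayerTable layer_ptrs;  // needs explicit construction/destruction
  };

  int main() {
    // Raw storage, as with hwy::AllocateAligned<uint8_t>(sizeof(ToyWeights)).
    std::unique_ptr<uint8_t[]> storage(new uint8_t[sizeof(ToyWeights)]);
    ToyWeights* w = reinterpret_cast<ToyWeights*>(storage.get());
    new (&w->layer_ptrs) ToyLayerTable();  // placement new, as in LoadWeights
    // ... fill big_blob, use layers ...
    w->layer_ptrs.~ToyLayerTable();  // explicit dtor, as in DeleteLayersPtrs
    return 0;
  }
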
template <class Config>
struct GemmaImpl : public GemmaInterface {
GemmaImpl(std::unique_ptr<sentencepiece::SentencePieceProcessor>& tokenizer,
hwy::AlignedFreeUniquePtr<uint8_t[]>& compressed_weights,
hwy::AlignedFreeUniquePtr<uint8_t[]>& weights_u8,
hwy::ThreadPool& pool);
~GemmaImpl() {
using CWeights = CompressedWeights<Config>;
CWeights* c_weights = reinterpret_cast<CWeights*>(compressed_weights.get());
c_weights->c_layer_ptrs.~CompressedLayerPointers<Config>();
WeightsT<Config>* weights =
reinterpret_cast<WeightsT<Config>*>(weights_u8.get());
DeleteLayersPtrs(weights);
}
const sentencepiece::SentencePieceProcessor* Tokenizer() const override {
@ -296,7 +393,7 @@ struct GemmaImpl : public GemmaInterface {
int verbosity) override;
std::unique_ptr<sentencepiece::SentencePieceProcessor> tokenizer;
hwy::AlignedFreeUniquePtr<uint8_t[]> compressed_weights;
hwy::AlignedFreeUniquePtr<uint8_t[]> weights_u8;
hwy::AlignedUniquePtr<Activations<Config, kPrefillBatchSize>> prefill;
hwy::AlignedUniquePtr<Activations<Config, 1>> state;
};
@ -309,11 +406,11 @@ HWY_BEFORE_NAMESPACE();
namespace gcpp {
namespace HWY_NAMESPACE {
template <class TConfig, size_t kBatchSize>
template <size_t kBatchSize, typename LayerT, class TConfig>
HWY_NOINLINE void Attention(size_t batch_start, size_t batch_idx, size_t layer,
Activations<TConfig, kBatchSize>& activations,
const CompressedLayer<TConfig>* c_layer,
KVCache& kv_cache, hwy::ThreadPool& pool) {
const LayerT* layer_weights, KVCache& kv_cache,
hwy::ThreadPool& pool) {
PROFILER_ZONE("Gen.Attention");
const size_t pos = batch_start + batch_idx;
HWY_DASSERT(batch_idx < kBatchSize);
@ -329,27 +426,26 @@ HWY_NOINLINE void Attention(size_t batch_start, size_t batch_idx, size_t layer,
static const float kQueryScale =
static_cast<float>(1.0 / sqrt(static_cast<double>(kQKVDim)));
const size_t batch_offset = batch_idx * kModelDim;
float* x = activations.pre_att_rms_out.data() + batch_idx * kModelDim;
auto ProjQ = [&](uint64_t head, size_t head_offset) HWY_ATTR {
float* HWY_RESTRICT q =
activations.q.data() + head * kQKVDim + batch_idx * kHeads * kQKVDim;
MatVecLoop<kQKVDim, kModelDim>(
c_layer->c_qkv_einsum_w, head_offset + 0 * kQKVDim * kModelDim,
activations.pre_att_rms_out.data() + batch_offset, q);
MatVecLoop<kQKVDim, kModelDim>(layer_weights->qkv_einsum_w,
head_offset + 0 * kQKVDim * kModelDim, x, q);
};
auto ProjKV =
[&](size_t k_offset, size_t v_offset, size_t kv_offset) HWY_ATTR {
TwoOfsMatVecLoop<kQKVDim, kModelDim>(
c_layer->c_qkv_einsum_w, k_offset, v_offset,
activations.pre_att_rms_out.data() + batch_offset,
kv_cache.key_cache.get() + kv_offset,
kv_cache.value_cache.get() + kv_offset);
auto ProjKV = [&](size_t k_offset, size_t v_offset,
size_t kv_offset) HWY_ATTR {
float* HWY_RESTRICT k = kv_cache.key_cache.get() + kv_offset;
float* HWY_RESTRICT v = kv_cache.value_cache.get() + kv_offset;
Rope(kv_cache.key_cache.get() + kv_offset, kQKVDim, pos);
};
TwoOfsMatVecLoop<kQKVDim, kModelDim>(layer_weights->qkv_einsum_w, k_offset,
v_offset, x, k, v);
Rope(k, kQKVDim, pos);
};
auto Attn = [&](uint64_t head, size_t head_offset) HWY_ATTR {
// Calculate scores
@ -388,7 +484,7 @@ HWY_NOINLINE void Attention(size_t batch_start, size_t batch_idx, size_t layer,
head == 0
? activations.att_post2.data() + batch_idx * kModelDim
: activations.att_post1.data() + head * kBatchSize * kModelDim;
MatVecLoop<kModelDim, kQKVDim>(c_layer->c_attn_vec_einsum_w,
MatVecLoop<kModelDim, kQKVDim>(layer_weights->attn_vec_einsum_w,
head * kModelDim * kQKVDim, att_out,
head_out);
};
@ -431,9 +527,9 @@ HWY_NOINLINE void Attention(size_t batch_start, size_t batch_idx, size_t layer,
}
}
template <typename TConfig, size_t kBatchSize>
template <size_t kBatchSize, typename LayerT, typename TConfig>
HWY_NOINLINE void FFW(Activations<TConfig, kBatchSize>& activations,
size_t batch_idx, const CompressedLayer<TConfig>* c_layer,
size_t batch_idx, const LayerT* layer_weights,
hwy::ThreadPool& pool) {
HWY_DASSERT(batch_idx < kBatchSize);
static constexpr size_t kModelDim = TConfig::kModelDim;
@ -449,12 +545,12 @@ HWY_NOINLINE void FFW(Activations<TConfig, kBatchSize>& activations,
// Same matrix, first and second half of rows. Could fuse into one MatVec,
// but separating them could help on NUMA e.g. multiple sockets.
MatVec<kFFHiddenDim, kModelDim>(c_layer->c_gating_einsum_w,
MatVec<kFFHiddenDim, kModelDim>(layer_weights->gating_einsum_w,
kFFHiddenDim * kModelDim, vec, out_mul,
pool);
// Gate, will go through the nonlinearity.
MatVec<kFFHiddenDim, kModelDim>(c_layer->c_gating_einsum_w, 0, vec, out,
MatVec<kFFHiddenDim, kModelDim>(layer_weights->gating_einsum_w, 0, vec, out,
pool);
namespace hn = hwy::HWY_NAMESPACE;
@ -467,7 +563,7 @@ HWY_NOINLINE void FFW(Activations<TConfig, kBatchSize>& activations,
PROFILER_ZONE("Gen.FFW\\GatedGELU");
MatVec<kModelDim, kFFHiddenDim>(
c_layer->c_linear_w, 0, activations.ffw_hidden.data() + hidden_offset,
layer_weights->linear_w, 0, activations.ffw_hidden.data() + hidden_offset,
activations.ffw_out.data() + batch_idx * kModelDim, pool);
}
@ -486,9 +582,9 @@ GEMMA_CONSTEXPR_EMBSCALING float EmbeddingScaling() {
Sqrt(static_cast<float>(TConfig::kModelDim))));
}
template <typename TConfig, size_t kBatchSize>
template <size_t kBatchSize, typename WeightArrayT, typename TConfig>
HWY_NOINLINE void Prefill(const int* tokens, size_t num_tokens, size_t pos,
const CompressedWeights<TConfig>& c_weights,
const WeightArrayT& weights,
Activations<TConfig, kBatchSize>& activations,
KVCache& kv_cache, hwy::ThreadPool& pool,
hwy::ThreadPool& inner_pool) {
@ -500,22 +596,22 @@ HWY_NOINLINE void Prefill(const int* tokens, size_t num_tokens, size_t pos,
pool.Run(
0, num_tokens, [&](const uint64_t token_idx, size_t /*thread*/) HWY_ATTR {
const int token = tokens[token_idx];
Decompress(c_weights.c_embedder_input_embedding, token * kModelDim,
Decompress(weights.embedder_input_embedding, token * kModelDim,
activations.x.data() + token_idx * kModelDim, kModelDim);
MulByConst(kEmbScaling, activations.x.data() + token_idx * kModelDim,
kModelDim);
});
for (size_t layer = 0; layer < TConfig::kLayers; ++layer) {
const CompressedLayer<TConfig>* c_layer = c_weights.CLayer(layer);
const auto* layer_weights = weights.GetLayer(layer);
for (size_t token_idx = 0; token_idx < num_tokens; ++token_idx) {
RMSNorm(activations.x.data() + token_idx * kModelDim,
c_layer->c_pre_attention_norm_scale.data(),
layer_weights->pre_attention_norm_scale.data(),
activations.pre_att_rms_out.data() + token_idx * kModelDim,
kModelDim);
Attention<TConfig, kBatchSize>(pos, token_idx, layer, activations,
c_layer, kv_cache, pool);
Attention<kBatchSize>(pos, token_idx, layer, activations, layer_weights,
kv_cache, pool);
}
// TODO: sink the loop into these functions, i.e. make them matmuls.
@ -525,10 +621,10 @@ HWY_NOINLINE void Prefill(const int* tokens, size_t num_tokens, size_t pos,
AddFrom(activations.att_post2.data() + token_idx * kModelDim,
activations.x.data() + token_idx * kModelDim, kModelDim);
RMSNorm(activations.x.data() + token_idx * kModelDim,
c_layer->c_pre_ffw_norm_scale.data(),
layer_weights->pre_ffw_norm_scale.data(),
activations.bf_pre_ffw_rms_out.data() + token_idx * kModelDim,
kModelDim);
FFW<TConfig, kBatchSize>(activations, token_idx, c_layer, inner_pool);
FFW<kBatchSize>(activations, token_idx, layer_weights, inner_pool);
AddFrom(activations.ffw_out.data() + token_idx * kModelDim,
activations.x.data() + token_idx * kModelDim, kModelDim);
});
@ -536,21 +632,20 @@ HWY_NOINLINE void Prefill(const int* tokens, size_t num_tokens, size_t pos,
pool.Run(
0, num_tokens, [&](const uint64_t token_idx, size_t /*thread*/) HWY_ATTR {
RMSNormInplace(c_weights.c_final_norm_scale.data(),
RMSNormInplace(weights.final_norm_scale.data(),
activations.x.data() + token_idx * kModelDim, kModelDim);
});
}
// n = 1 specialization
template <class TConfig>
void Transformer(int token, size_t pos,
const CompressedWeights<TConfig>& c_weights,
template <typename WeightArrayT, class TConfig>
void Transformer(int token, size_t pos, const WeightArrayT& weights,
Activations<TConfig, 1>& activations, KVCache& kv_cache,
hwy::ThreadPool& pool, hwy::ThreadPool& inner_pool) {
static constexpr size_t kLayers = TConfig::kLayers;
static constexpr size_t kModelDim = TConfig::kModelDim;
Decompress(c_weights.c_embedder_input_embedding, token * kModelDim,
Decompress(weights.embedder_input_embedding, token * kModelDim,
activations.x.data(), kModelDim);
GEMMA_CONSTEXPR_EMBSCALING const float kEmbScaling =
@ -558,17 +653,18 @@ void Transformer(int token, size_t pos,
MulByConst(kEmbScaling, activations.x.data(), kModelDim);
for (size_t layer = 0; layer < kLayers; ++layer) {
const CompressedLayer<TConfig>* c_layer = c_weights.CLayer(layer);
RMSNorm(activations.x.data(), c_layer->c_pre_attention_norm_scale.data(),
const auto* layer_weights = weights.GetLayer(layer);
RMSNorm(activations.x.data(),
layer_weights->pre_attention_norm_scale.data(),
activations.pre_att_rms_out.data(), kModelDim);
Attention<TConfig, 1>(pos, 0, layer, activations, c_layer, kv_cache, pool);
Attention<1>(pos, 0, layer, activations, layer_weights, kv_cache, pool);
AddFrom(activations.att_post2.data(), activations.x.data(), kModelDim);
RMSNorm(activations.x.data(), c_layer->c_pre_ffw_norm_scale.data(),
RMSNorm(activations.x.data(), layer_weights->pre_ffw_norm_scale.data(),
activations.bf_pre_ffw_rms_out.data(), kModelDim);
FFW<TConfig, 1>(activations, /* batch_idx = */ 0, c_layer, pool);
FFW<1>(activations, /* batch_idx = */ 0, layer_weights, pool);
AddFrom(activations.ffw_out.data(), activations.x.data(), kModelDim);
}
RMSNormInplace(c_weights.c_final_norm_scale.data(), activations.x.data(),
RMSNormInplace(weights.final_norm_scale.data(), activations.x.data(),
kModelDim);
}
@ -609,9 +705,9 @@ void GenerateImpl(GemmaImpl<TConfig>& gemma, size_t max_tokens,
Activations<TConfig, 1>& activations = *gemma.state.get();
Activations<TConfig, kPrefillBatchSize>& prefill_activations =
*gemma.prefill.get();
const CompressedWeights<TConfig>& c_weights =
*reinterpret_cast<CompressedWeights<TConfig>*>(
gemma.compressed_weights.get());
const WeightsT<TConfig>& weights =
*reinterpret_cast<WeightsT<TConfig>*>(gemma.weights_u8.get());
size_t prompt_size = prompt.size();
RangeChecks<TConfig>(max_tokens, max_generated_tokens, prompt_size);
@ -643,9 +739,8 @@ void GenerateImpl(GemmaImpl<TConfig>& gemma, size_t max_tokens,
HWY_DASSERT(batch_size <= kPrefillBatchSize);
HWY_DASSERT(pos_offset + batch_size <= prompt_size - 1);
const int* batch_tokens = prompt.data() + pos_offset;
Prefill<TConfig, kPrefillBatchSize>(batch_tokens, batch_size, pos,
c_weights, prefill_activations,
kv_cache, pool, inner_pool);
Prefill<kPrefillBatchSize>(batch_tokens, batch_size, pos, weights,
prefill_activations, kv_cache, pool, inner_pool);
for (size_t idx = 0; idx < batch_size; ++idx) {
stream_token(batch_tokens[idx], 0.0f);
}
@ -672,7 +767,7 @@ void GenerateImpl(GemmaImpl<TConfig>& gemma, size_t max_tokens,
for (size_t generate_pos = 0;
pos < max_tokens && generate_pos < max_generated_tokens;
++pos, ++pos_offset, ++generate_pos) {
Transformer(token, pos, c_weights, activations, kv_cache, pool, inner_pool);
Transformer(token, pos, weights, activations, kv_cache, pool, inner_pool);
float* final_activation = activations.x.data();
// The condition below is always true if we are doing Prefill above.
// We keep it here for clarity so that the code is correct even if Prefill
@ -680,9 +775,9 @@ void GenerateImpl(GemmaImpl<TConfig>& gemma, size_t max_tokens,
if (pos_offset >= prompt_size - 1) {
PROFILER_ZONE("Gen.Embedding");
// Generation phase
MatVec<kVocabSize, TConfig::kModelDim>(
c_weights.c_embedder_input_embedding, 0, final_activation,
activations.logits.data(), pool);
MatVec<kVocabSize, TConfig::kModelDim>(weights.embedder_input_embedding,
0, final_activation,
activations.logits.data(), pool);
// Barrier: must have all logits so we can subtract max.
Softmax(activations.logits.data(), kVocabSize);
token = SampleTopK<TConfig::kTopK>(activations.logits.data(), kVocabSize,
@ -743,52 +838,37 @@ void ForEachTensor(const Weights<TConfig>* weights,
CompressedWeights<TConfig>& c_weights, Func& func) {
func("c_embedding",
weights ? weights->embedder_input_embedding.data() : nullptr,
c_weights.c_embedder_input_embedding);
c_weights.embedder_input_embedding);
func("c_final_norm", weights ? weights->final_norm_scale.data() : nullptr,
c_weights.c_final_norm_scale);
c_weights.final_norm_scale);
char name[16];
for (int layer_idx = 0; layer_idx < static_cast<int>(TConfig::kLayers);
++layer_idx) {
char name_buf[16];
for (int layer_idx = 0; layer_idx < TConfig::kLayers; ++layer_idx) {
const size_t idx = static_cast<size_t>(layer_idx);
Layer<TConfig>* layer = weights ? &weights->layers[idx] : nullptr;
CompressedLayer<TConfig>* c_layer = c_weights.CLayer(idx);
const Layer<TConfig>* layer = weights ? weights->GetLayer(idx) : nullptr;
CompressedLayer<TConfig>* layer_weights = c_weights.GetLayer(idx);
snprintf(name, sizeof(name), "pre_ff_ns_%d", layer_idx);
func(name, layer ? layer->pre_ffw_norm_scale.data() : nullptr,
c_layer->c_pre_ffw_norm_scale);
#define CALL_FUNC(name, member) \
snprintf(name_buf, sizeof(name_buf), name "_%d", layer_idx); \
func(name_buf, layer ? layer->member.data() : nullptr, layer_weights->member)
snprintf(name, sizeof(name), "gating_ein_%d", layer_idx);
func(name, layer ? layer->gating_einsum_w.data() : nullptr,
c_layer->c_gating_einsum_w);
snprintf(name, sizeof(name), "linear_w_%d", layer_idx);
func(name, layer ? layer->linear_w.data() : nullptr, c_layer->c_linear_w);
snprintf(name, sizeof(name), "qkv_ein_%d", layer_idx);
func(name, layer ? layer->qkv_einsum_w.data() : nullptr,
c_layer->c_qkv_einsum_w);
snprintf(name, sizeof(name), "att_ein_%d", layer_idx);
func(name, layer ? layer->attn_vec_einsum_w.data() : nullptr,
c_layer->c_attn_vec_einsum_w);
snprintf(name, sizeof(name), "pre_att_ns_%d", layer_idx);
func(name, layer ? layer->pre_attention_norm_scale.data() : nullptr,
c_layer->c_pre_attention_norm_scale);
CALL_FUNC("pre_ff_ns", pre_ffw_norm_scale);
CALL_FUNC("gating_ein", gating_einsum_w);
CALL_FUNC("linear_w", linear_w);
CALL_FUNC("qkv_ein", qkv_einsum_w);
CALL_FUNC("att_ein", attn_vec_einsum_w);
CALL_FUNC("pre_att_ns", pre_attention_norm_scale);
#undef CALL_FUNC
}
}
template <class TConfig>
hwy::AlignedFreeUniquePtr<uint8_t[]> GetCompressedWeights(
const Path& weights_path, const Path& cache, hwy::ThreadPool& pool) {
hwy::AlignedFreeUniquePtr<uint8_t[]> LoadCompressedWeights(
const Path& weights, hwy::ThreadPool& pool) {
PROFILER_ZONE("Startup.LoadCache");
if (!std::filesystem::exists(weights_path.path) &&
!std::filesystem::exists(cache.path)) {
HWY_ABORT(
"Either the model weights (--weights) or cached compressed weights "
"(--compressed_weights) must exist.");
if (!std::filesystem::exists(weights.path)) {
HWY_ABORT("The model weights file '%s' does not exist.",
weights.path.c_str());
}
// Allocate compressed weights.
@ -798,32 +878,49 @@ hwy::AlignedFreeUniquePtr<uint8_t[]> GetCompressedWeights(
CWeights* c_weights = reinterpret_cast<CWeights*>(c_weights_u8.get());
new (&c_weights->c_layer_ptrs) CompressedLayerPointers<TConfig>(pool);
// First attempt to load them from cache, without requiring weights.
CacheLoader loader(cache.path.c_str());
std::array<float, TConfig::kNumTensorScales> scales;
CacheLoader loader(weights.path.c_str());
ForEachTensor<TConfig>(nullptr, *c_weights, loader);
if (loader.ReadAll(pool)) return c_weights_u8;
// Get weights, compress, and store in cache.
const hwy::AlignedUniquePtr<Weights<TConfig>> weights =
LoadWeights<TConfig>(weights_path);
Compressor compressor(pool);
ForEachTensor<TConfig>(weights.get(), *c_weights, compressor);
compressor.WriteAll(pool, cache.path.c_str());
loader.LoadScales(scales.data(), scales.size());
if (!loader.ReadAll(pool)) {
HWY_ABORT("Failed to load model weights.");
}
if (TConfig::kNumTensorScales > 0) {
size_t scale_pos = 0;
for (int layer_idx = 0; layer_idx < TConfig::kLayers; ++layer_idx) {
const size_t idx = static_cast<size_t>(layer_idx);
CompressedLayer<TConfig>* layer_weights = c_weights->GetLayer(idx);
layer_weights->attn_vec_einsum_w.set_scale(scales[scale_pos++]);
layer_weights->qkv_einsum_w.set_scale(scales[scale_pos++]);
layer_weights->gating_einsum_w.set_scale(scales[scale_pos++]);
layer_weights->linear_w.set_scale(scales[scale_pos++]);
}
HWY_ASSERT(scale_pos == TConfig::kNumTensorScales);
}
return c_weights_u8;
}
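
Aside (illustration only, not part of this change): the "scales" blob is a flat float array with four entries per layer, written during compression in the order attn_vec_einsum_w, qkv_einsum_w, gating_einsum_w, linear_w and consumed here in the same order, so kNumTensorScales must equal 4 * kLayers for configs that enable scaling (it is 0 for the Gemma configs in configs.h). A toy sketch of the mapping, with a made-up layer count:

  #include <array>
  #include <cstddef>
  #include <cstdio>

  constexpr size_t kLayers = 2;  // toy value
  constexpr size_t kNumTensorScales = 4 * kLayers;

  int main() {
    // As read back from the "scales" blob (LoadScales defaults these to 1.0f).
    std::array<float, kNumTensorScales> scales;
    scales.fill(1.0f);
    size_t scale_pos = 0;
    for (size_t layer = 0; layer < kLayers; ++layer) {
      const float attn_vec = scales[scale_pos++];  // attn_vec_einsum_w
      const float qkv      = scales[scale_pos++];  // qkv_einsum_w
      const float gating   = scales[scale_pos++];  // gating_einsum_w
      const float linear   = scales[scale_pos++];  // linear_w
      std::printf("layer %zu: %g %g %g %g\n", layer, attn_vec, qkv, gating,
                  linear);
    }
    return scale_pos == kNumTensorScales ? 0 : 1;
  }
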
// Type-erased because this function is called via a function pointer.
hwy::AlignedFreeUniquePtr<uint8_t[]> GetCompressedWeightsT(
gcpp::Model model, const Path& weights, const Path& compressed_weights,
hwy::ThreadPool& pool) {
hwy::AlignedFreeUniquePtr<uint8_t[]> LoadCompressedWeightsT(
gcpp::Model model, const Path& weights, hwy::ThreadPool& pool) {
switch (model) {
case Model::GEMMA_2B:
return GetCompressedWeights<ConfigGemma2B>(weights, compressed_weights,
pool);
return LoadCompressedWeights<ConfigGemma2B>(weights, pool);
case Model::GEMMA_7B:
return GetCompressedWeights<ConfigGemma7B>(weights, compressed_weights,
pool);
return LoadCompressedWeights<ConfigGemma7B>(weights, pool);
default:
HWY_ABORT("Model type %d unknown.", static_cast<int>(model));
}
}
hwy::AlignedFreeUniquePtr<uint8_t[]> LoadWeightsT(gcpp::Model model,
const Path& weights,
hwy::ThreadPool& pool) {
switch (model) {
case Model::GEMMA_2B:
return LoadWeights<ConfigGemma2B>(weights, pool);
case Model::GEMMA_7B:
return LoadWeights<ConfigGemma7B>(weights, pool);
default:
HWY_ABORT("Model type %d unknown.", static_cast<int>(model));
}
@ -846,18 +943,22 @@ void CompressWeights(const Path& weights_path,
new (&c_weights->c_layer_ptrs) CompressedLayerPointers<TConfig>(pool);
// Get weights, compress, and store.
const hwy::AlignedUniquePtr<Weights<TConfig>> weights =
LoadWeights<TConfig>(weights_path);
const bool scale_for_compression = TConfig::kNumTensorScales > 0;
const hwy::AlignedFreeUniquePtr<uint8_t[]> weights_u8 =
LoadWeights<TConfig>(weights_path, pool, scale_for_compression);
Weights<TConfig>* weights =
reinterpret_cast<Weights<TConfig>*>(weights_u8.get());
Compressor compressor(pool);
ForEachTensor<TConfig>(weights.get(), *c_weights, compressor);
ForEachTensor<TConfig>(weights, *c_weights, compressor);
compressor.AddScales(weights->scales.data(), weights->scales.size());
compressor.WriteAll(pool, compressed_weights_path.path.c_str());
weights->layer_ptrs.~LayerPointers<TConfig>();
c_weights->c_layer_ptrs.~CompressedLayerPointers<TConfig>();
}
void CompressWeightsT(gcpp::Model model, const Path& weights,
const Path& compressed_weights,
hwy::ThreadPool& pool) {
const Path& compressed_weights, hwy::ThreadPool& pool) {
switch (model) {
case Model::GEMMA_2B:
CompressWeights<ConfigGemma2B>(weights, compressed_weights, pool);
@ -877,7 +978,8 @@ HWY_AFTER_NAMESPACE();
#if HWY_ONCE
namespace gcpp {
HWY_EXPORT(GetCompressedWeightsT);
HWY_EXPORT(LoadCompressedWeightsT);
HWY_EXPORT(LoadWeightsT);
HWY_EXPORT(CompressWeightsT);
HWY_EXPORT(Generate2B);
HWY_EXPORT(Generate7B);
@ -892,10 +994,9 @@ KVCache CreateKVCache(size_t size_cache_pos, size_t seq_len) {
template <class Config>
GemmaImpl<Config>::GemmaImpl(
std::unique_ptr<sentencepiece::SentencePieceProcessor>& tokenizer,
hwy::AlignedFreeUniquePtr<uint8_t[]>& compressed_weights,
hwy::ThreadPool& pool)
hwy::AlignedFreeUniquePtr<uint8_t[]>& weights_u8, hwy::ThreadPool& pool)
: tokenizer(std::move(tokenizer)),
compressed_weights(std::move(compressed_weights)),
weights_u8(std::move(weights_u8)),
prefill(hwy::MakeUniqueAligned<Activations<Config, kPrefillBatchSize>>()),
state(hwy::MakeUniqueAligned<Activations<Config, 1>>()) {}
@ -922,10 +1023,8 @@ void GemmaImpl<ConfigGemma7B>::Generate(
kv_cache, pool, inner_pool, stream_token, accept_token, gen, verbosity);
}
Gemma::Gemma(const Path& tokenizer_path, const Path& compressed_weights_path,
const Path& weights_path, Model model_type, ModelTraining training,
hwy::ThreadPool& pool)
: model_training(training) {
Gemma::Gemma(const Path& tokenizer_path, const Path& weights, Model model_type,
hwy::ThreadPool& pool) {
std::unique_ptr<sentencepiece::SentencePieceProcessor> tokenizer;
{
PROFILER_ZONE("Startup.tokenizer");
@ -934,16 +1033,21 @@ Gemma::Gemma(const Path& tokenizer_path, const Path& compressed_weights_path,
HWY_ABORT("Failed to load the tokenizer file.");
}
}
auto compressed_weights = HWY_DYNAMIC_DISPATCH(GetCompressedWeightsT)(
model_type, weights_path, compressed_weights_path, pool);
hwy::AlignedFreeUniquePtr<uint8_t[]> weights_u8;
if constexpr (kWeightsAreCompressed) {
weights_u8 =
HWY_DYNAMIC_DISPATCH(LoadCompressedWeightsT)(model_type, weights, pool);
} else {
weights_u8 = HWY_DYNAMIC_DISPATCH(LoadWeightsT)(model_type, weights, pool);
}
switch (model_type) {
case Model::GEMMA_2B:
impl_.reset(
new GemmaImpl<ConfigGemma2B>(tokenizer, compressed_weights, pool));
impl_.reset(new GemmaImpl<ConfigGemma2B>(tokenizer, weights_u8, pool));
break;
case Model::GEMMA_7B:
impl_.reset(
new GemmaImpl<ConfigGemma7B>(tokenizer, compressed_weights, pool));
impl_.reset(new GemmaImpl<ConfigGemma7B>(tokenizer, weights_u8, pool));
break;
default:
HWY_ABORT("Model type %d unknown.", static_cast<int>(model_type));
@ -981,10 +1085,9 @@ void GenerateGemma(Gemma& gemma, RuntimeConfig runtime_config,
}
void CompressWeights(gcpp::Model model, const Path& weights,
const Path& compressed_weights,
hwy::ThreadPool& pool) {
HWY_DYNAMIC_DISPATCH(CompressWeightsT)(
model, weights, compressed_weights, pool);
const Path& compressed_weights, hwy::ThreadPool& pool) {
HWY_DYNAMIC_DISPATCH(CompressWeightsT)
(model, weights, compressed_weights, pool);
}
} // namespace gcpp

gemma.h

@ -24,22 +24,18 @@
// copybara:import_next_line:gemma_cpp
#include "compression/compress.h" // SfpStream/NuqStream
// copybara:import_next_line:gemma_cpp
#include "util/args.h" // Path
#include "configs.h"
#include "hwy/aligned_allocator.h"
#include "hwy/base.h" // hwy::bfloat16_t
#include "hwy/contrib/thread_pool/thread_pool.h"
// copybara:import_next_line:gemma_cpp
#include "util/args.h" // Path
// copybara:import_next_line:sentencepiece
#include "src/sentencepiece_processor.h"
namespace gcpp {
// Allowable types for GEMMA_WEIGHT_T (can be specified at compilation time):
// float, hwy::bfloat16_t, SfpStream, NuqStream
#ifndef GEMMA_WEIGHT_T
#define GEMMA_WEIGHT_T SfpStream
#endif // !GEMMA_WEIGHT_T
using WeightT = GEMMA_WEIGHT_T;
using GemmaWeightT = GEMMA_WEIGHT_T;
using EmbedderInputT = hwy::bfloat16_t;
constexpr size_t kPrefillBatchSize = 16;
constexpr bool kSystemPrompt = false;
@ -65,13 +61,11 @@ struct RuntimeConfig {
struct GemmaInterface;
struct Gemma {
Gemma(const Path& tokenizer_path, const Path& compressed_weights_path,
const Path& weights_path, Model model_type, ModelTraining training,
Gemma(const Path& tokenizer_path, const Path& weights, Model model_type,
hwy::ThreadPool& pool);
~Gemma(); // must be defined after GemmaInterface's dtor is defined.
const sentencepiece::SentencePieceProcessor* Tokenizer() const;
std::unique_ptr<GemmaInterface> impl_;
gcpp::ModelTraining model_training;
};
KVCache CreateKVCache(Model type); // convenient workaround for now
@ -99,8 +93,7 @@ void GenerateGemma(Gemma& gemma, RuntimeConfig runtime_config,
const StreamFunc& stream_token, std::mt19937& gen);
void CompressWeights(gcpp::Model model, const Path& weights,
const Path& compressed_weights,
hwy::ThreadPool& pool);
const Path& compressed_weights, hwy::ThreadPool& pool);
constexpr int EOS_ID = 1;


@ -369,6 +369,7 @@ CompressedArray<float, kOuter * kInner> GenerateMat(size_t offset) {
}
}
Compress(content, ws, mat, pool);
mat.set_scale(1.0f);
return mat;
}

run.cc

@ -29,14 +29,14 @@
#include "gemma.h" // Gemma
// copybara:import_next_line:gemma_cpp
#include "util/app.h"
// copybara:import_next_line:gemma_cpp
#include "util/args.h" // HasHelp
#include "hwy/base.h"
#include "hwy/contrib/thread_pool/thread_pool.h"
#include "hwy/highway.h"
#include "hwy/per_target.h"
#include "hwy/profiler.h"
#include "hwy/timer.h"
// copybara:import_next_line:gemma_cpp
#include "util/args.h" // HasHelp
namespace gcpp {
@ -66,7 +66,7 @@ void ShowConfig(LoaderArgs& loader, InferenceArgs& inference, AppArgs& app) {
<< hwy::VectorBytes() * 8 << " bits)" << "\n"
<< "Compiled config : " << CompiledConfig() << "\n"
<< "Weight Type : "
<< gcpp::TypeName(gcpp::WeightT()) << "\n"
<< gcpp::TypeName(gcpp::GemmaWeightT()) << "\n"
<< "EmbedderInput Type : "
<< gcpp::TypeName(gcpp::EmbedderInputT()) << "\n";
}
@ -93,10 +93,11 @@ void ShowHelp(gcpp::LoaderArgs& loader, gcpp::InferenceArgs& inference,
std::cerr << "\n";
}
void ReplGemma(gcpp::Gemma& model, gcpp::KVCache& kv_cache,
hwy::ThreadPool& pool, hwy::ThreadPool& inner_pool,
const InferenceArgs& args, int verbosity,
const gcpp::AcceptFunc& accept_token, std::string& eot_line) {
void ReplGemma(gcpp::Gemma& model, ModelTraining training,
gcpp::KVCache& kv_cache, hwy::ThreadPool& pool,
hwy::ThreadPool& inner_pool, const InferenceArgs& args,
int verbosity, const gcpp::AcceptFunc& accept_token,
std::string& eot_line) {
PROFILER_ZONE("Gen.misc");
int abs_pos = 0; // absolute token index over all turns
int current_pos = 0; // token index within the current turn
@ -177,7 +178,7 @@ void ReplGemma(gcpp::Gemma& model, gcpp::KVCache& kv_cache,
continue;
}
if (model.model_training == ModelTraining::GEMMA_IT) {
if (training == ModelTraining::GEMMA_IT) {
// For instruction-tuned models: add control tokens.
prompt_string = "<start_of_turn>user\n" + prompt_string +
"<end_of_turn>\n<start_of_turn>model\n";
@ -232,8 +233,7 @@ void Run(LoaderArgs& loader, InferenceArgs& inference, AppArgs& app) {
[](uint64_t /*task*/, size_t thread) { PinThreadToCore(thread); });
}
gcpp::Gemma model(loader.tokenizer, loader.compressed_weights, loader.weights,
loader.ModelType(), loader.ModelTraining(), pool);
gcpp::Gemma model(loader.tokenizer, loader.weights, loader.ModelType(), pool);
auto kv_cache = CreateKVCache(loader.ModelType());
@ -265,7 +265,8 @@ void Run(LoaderArgs& loader, InferenceArgs& inference, AppArgs& app) {
}
ReplGemma(
model, kv_cache, pool, inner_pool, inference, app.verbosity,
model, loader.ModelTraining(), kv_cache, pool, inner_pool, inference,
app.verbosity,
/*accept_token=*/[](int) { return true; }, app.eot_line);
}


@ -36,9 +36,9 @@
#include "configs.h"
// copybara:import_next_line:gemma_cpp
#include "gemma.h"
#include "hwy/base.h" // HWY_ASSERT
// copybara:import_next_line:gemma_cpp
#include "util/args.h"
#include "hwy/base.h" // HWY_ASSERT
namespace gcpp {
@ -151,7 +151,7 @@ struct LoaderArgs : public ArgsBase<LoaderArgs> {
}
// Returns error string or nullptr if OK.
const char* Validate() const {
const char* Validate() {
const std::string model_type_lc = ToLower(model_type);
if (model_type.empty()) {
return "Missing --model flag, need to specify either 2b-pt, 7b-pt, "
@ -165,37 +165,42 @@ struct LoaderArgs : public ArgsBase<LoaderArgs> {
if (tokenizer.path.empty()) {
return "Missing --tokenizer flag, a file for the tokenizer is required.";
}
if (compressed_weights.path.empty()) {
return "Missing --compressed_weights flag, a file for the compressed "
"model.";
if (!compressed_weights.path.empty()) {
if (weights.path.empty()) {
weights = compressed_weights;
} else {
return "Only one of --weights and --compressed_weights can be "
"specified. To create compressed weights use the compress_weights "
"tool.";
}
}
if (weights.path.empty()) {
return "Missing --weights flag, a file for the model weights.";
}
if (!weights.exists()) {
return "Can't open file specified with --weights flag.";
}
return nullptr;
}
Path tokenizer;
Path weights; // uncompressed weights file location
Path compressed_weights; // compressed weights file location
Path weights; // weights file location
Path compressed_weights;
std::string model_type;
template <class Visitor>
void ForEach(const Visitor& visitor) {
visitor(tokenizer, "tokenizer", Path(),
"Path name of tokenizer model file.\n Required argument.");
visitor(
compressed_weights, "compressed_weights", Path(),
"Path name of compressed weights file, regenerated from `--weights` "
"file if "
"the compressed weights file does not exist.\n Required argument.");
visitor(weights, "weights", Path(),
"Path name of model weights (.sbs) file.\n Required argument.");
visitor(compressed_weights, "compressed_weights", Path(),
"Alias for --weights.");
visitor(model_type, "model", std::string(),
"Model type\n 2b-it = 2B parameters, instruction-tuned\n "
"2b-pt = 2B parameters, pretrained\n 7b-it = 7B parameters "
"instruction-tuned\n 7b-pt = 7B parameters, pretrained\n"
" Required argument.");
visitor(weights, "weights", Path(),
"Path name of model weights (.sbs) file. Only required if "
"compressed_weights file is not present and needs to be "
"regenerated. This parameter is only required for compressing "
"new model weight exports, otherwise it is not needed.");
}
};