From 4c23932289af7737bf53239bc6ac5bdb2490a624 Mon Sep 17 00:00:00 2001 From: Luca Versari Date: Thu, 4 Apr 2024 13:39:44 +0200 Subject: [PATCH] Improve weight handling. - Allow scaling of SFP weights - Allow using uncompressed weights - Do not try to compress weights in the main model calls - Reduce code duplication in weight handling with some macros Co-authored-by: Eugene Kliuchnikov Co-authored-by: Thomas Fischbacher Co-authored-by: Zoltan Szabadka --- compression/blob_store.cc | 45 ++-- compression/blob_store.h | 6 + compression/compress-inl.h | 41 ++- compression/compress.h | 18 +- configs.h | 14 ++ examples/hello_world/run.cc | 5 +- gemma.cc | 483 ++++++++++++++++++++++-------------- gemma.h | 19 +- ops_test.cc | 1 + run.cc | 23 +- util/app.h | 39 +-- 11 files changed, 424 insertions(+), 270 deletions(-) diff --git a/compression/blob_store.cc b/compression/blob_store.cc index b47515a..2458fb9 100644 --- a/compression/blob_store.cc +++ b/compression/blob_store.cc @@ -29,15 +29,15 @@ // copybara:import_next_line:gemma_cpp #include "compression/blob_store.h" +#include // open #include #include // SEEK_END - unistd isn't enough for IDE. #include // O_RDONLY -#include // open #if HWY_OS_WIN -#include // read, write, close #include +#include // read, write, close #else -#include // read, write, close +#include // read, write, close #endif #include @@ -113,8 +113,9 @@ hwy::uint128_t MakeKey(const char* string) { return ret; } -static void EnqueueChunkRequests(uint64_t offset, uint64_t size, uint8_t* data, - std::vector& requests) { +namespace { +void EnqueueChunkRequests(uint64_t offset, uint64_t size, uint8_t* data, + std::vector& requests) { // Split into chunks for load-balancing even if blob sizes vary. constexpr size_t kChunkSize = 4 * 1024 * 1024; @@ -129,7 +130,7 @@ static void EnqueueChunkRequests(uint64_t offset, uint64_t size, uint8_t* data, requests.emplace_back(offset + pos, size - pos, data + pos, 0); } } - +} // namespace struct IO { // Returns size in bytes or 0. @@ -197,12 +198,6 @@ static_assert(HWY_IS_LITTLE_ENDIAN, "Assumes little endian"); class BlobStore { static constexpr uint32_t kMagic = 0x0A534253; // SBS\n - // Blob offsets on disk and memory addresses are a multiple of this, because - // we pad the header and each blob's size. This matches CUDA alignment and the - // maximum SVE vector size, and exceeds typical x86 cache line sizes (64 or - // 128), which can help performance. - static constexpr size_t kAlign = 256; - public: // NOT including padding, so that we can also use ZeroFillPadding after // copying the header. @@ -215,13 +210,13 @@ class BlobStore { // blobs. Requires num_blobs_ to already be set, typically by reading // sizeof(BlobStore) bytes from disk. size_t PaddedHeaderSize() const { - return hwy::RoundUpTo(HeaderSize(num_blobs_), kAlign); + return hwy::RoundUpTo(HeaderSize(num_blobs_), kBlobAlign); } // Returns aligned offset and zero-fills between that and `offset`. 
uint64_t ZeroFillPadding(uint64_t offset) { uint8_t* const bytes = reinterpret_cast(this); - const uint64_t padded = hwy::RoundUpTo(offset, kAlign); + const uint64_t padded = hwy::RoundUpTo(offset, kBlobAlign); hwy::ZeroBytes(bytes + offset, padded - offset); return padded; } @@ -236,7 +231,7 @@ class BlobStore { for (size_t i = 0; i < num_blobs_; ++i) { const hwy::uint128_t val = keys_[num_blobs_ + i]; if (val.lo != offset) return __LINE__; - offset = ZeroFillPadding(offset + val.hi); + offset = hwy::RoundUpTo(offset + val.hi, kBlobAlign); } if (offset != file_size_) return __LINE__; @@ -253,25 +248,24 @@ class BlobStore { static std::vector PrepareWriteRequests( const hwy::uint128_t keys[], const hwy::Span blobs[], - size_t num_blobs) { + size_t num_blobs, BlobStore* bs) { // Sanity check and ensure the cast below is safe. HWY_ASSERT(num_blobs < (1ULL << 20)); // Allocate var-length header. const size_t header_size = HeaderSize(num_blobs); - const size_t padded_header_size = hwy::RoundUpTo(header_size, kAlign); - BlobStorePtr bs = Allocate(padded_header_size); + const size_t padded_header_size = hwy::RoundUpTo(header_size, kBlobAlign); const uint64_t padded_header_end = bs->ZeroFillPadding(header_size); HWY_ASSERT(padded_header_end == padded_header_size); // All-zero buffer used to write padding to the file without copying the // input blobs. - static uint8_t zeros[kAlign] = {0}; + static uint8_t zeros[kBlobAlign] = {0}; // Total file size will be the header plus all padded blobs. uint64_t payload = 0; for (size_t i = 0; i < num_blobs; ++i) { - payload += hwy::RoundUpTo(blobs[i].size(), kAlign); + payload += hwy::RoundUpTo(blobs[i].size(), kBlobAlign); } const size_t total_size = padded_header_size + payload; @@ -285,7 +279,7 @@ class BlobStore { std::vector requests; requests.reserve(1 + 2 * num_blobs); requests.emplace_back(/*offset=*/0, padded_header_size, - reinterpret_cast(bs.get()), 0); + reinterpret_cast(bs), 0); // Fill second half of keys_ with offset/size and prepare IO requests. uint64_t offset = padded_header_end; @@ -295,10 +289,10 @@ class BlobStore { EnqueueChunkRequests(offset, blobs[i].size(), blobs[i].data(), requests); offset += blobs[i].size(); - const size_t padded_size = hwy::RoundUpTo(blobs[i].size(), kAlign); + const size_t padded_size = hwy::RoundUpTo(blobs[i].size(), kBlobAlign); if (padded_size != blobs[i].size()) { const size_t padding = padded_size - blobs[i].size(); - HWY_ASSERT(padding <= kAlign); + HWY_ASSERT(padding <= kBlobAlign); requests.emplace_back(offset, padding, zeros, 0); offset += padding; } @@ -418,8 +412,11 @@ BlobError BlobWriter::WriteAll(hwy::ThreadPool& pool, HWY_ASSERT(keys_.size() == blobs_.size()); // Concatenate blobs in memory. + const size_t header_size = BlobStore::HeaderSize(keys_.size()); + const size_t padded_header_size = hwy::RoundUpTo(header_size, kBlobAlign); + BlobStorePtr bs = BlobStore::Allocate(padded_header_size); std::vector requests = BlobStore::PrepareWriteRequests( - keys_.data(), blobs_.data(), keys_.size()); + keys_.data(), blobs_.data(), keys_.size(), bs.get()); // Create/replace existing file. #if HWY_OS_WIN diff --git a/compression/blob_store.h b/compression/blob_store.h index 6ced37f..8736d0f 100644 --- a/compression/blob_store.h +++ b/compression/blob_store.h @@ -40,6 +40,12 @@ using BlobStorePtr = hwy::AlignedFreeUniquePtr; // 0 if successful, otherwise the line number of the failing check. 
using BlobError = int; +// Blob offsets on disk and memory addresses are a multiple of this, because +// we pad the header and each blob's size. This matches CUDA alignment and the +// maximum SVE vector size, and exceeds typical x86 cache line sizes (64 or +// 128), which can help performance. +static constexpr size_t kBlobAlign = 256; + struct BlobIO { BlobIO(uint64_t offset, size_t size, void* data, uint64_t padding) : offset(offset), size(size), data(data), padding(padding) {} diff --git a/compression/compress-inl.h b/compression/compress-inl.h index 5f11ca1..5717545 100644 --- a/compression/compress-inl.h +++ b/compression/compress-inl.h @@ -381,13 +381,14 @@ HWY_INLINE void Compress(const std::array& in, } // Decompresses `num` values from `compressed` starting at `compressed_ofs`. -template -HWY_NOINLINE void Decompress(const CompressedArray& compressed, - size_t compressed_ofs, OutT* out, size_t num) { - HWY_DASSERT(compressed_ofs + num <= compressed.NumElements()); +template +HWY_NOINLINE void Decompress(const ArrayT& compressed, size_t compressed_ofs, + OutT* out, size_t num) { + HWY_DASSERT(compressed_ofs + num <= compressed.size()); const hn::ScalableTag d; - using Traits = CompressTraits; - Traits::Decompress(d, kCapacity, compressed.data(), compressed_ofs, out, num); + using Traits = CompressTraits; + Traits::Decompress(d, compressed.size(), compressed.data(), compressed_ofs, + out, num); } // As above, but with threading and benchmarking. @@ -395,7 +396,7 @@ template HWY_INLINE void Decompress(const CompressedArray& compressed, size_t compressed_ofs, OutT* out, size_t num, hwy::ThreadPool& pool) { - HWY_DASSERT(compressed_ofs + num <= compressed.NumElements()); + HWY_DASSERT(compressed_ofs + num <= compressed.size()); const double t0 = hwy::platform::Now(); using Traits = CompressTraits; @@ -407,7 +408,7 @@ HWY_INLINE void Decompress(const CompressedArray& compressed, const size_t ofs = idx_batch * kBatch; const size_t num = idx_batch == num_batches - 1 ? (num - ofs) : kBatch; - Traits::Decompress(d, compressed.NumElements(), compressed.data(), + Traits::Decompress(d, compressed.size(), compressed.data(), compressed_ofs + ofs, out + ofs, num); }); @@ -417,16 +418,28 @@ HWY_INLINE void Decompress(const CompressedArray& compressed, fprintf(stderr, "Decompress %.1f MB/s\n", mbps); } +// Returns dot product with `vec_aligned` of length `num`. +template +HWY_INLINE float Dot(DF df, const ArrayT& compressed, size_t compressed_ofs, + const VecT* vec_aligned, size_t num) { + HWY_DASSERT(compressed_ofs + num <= compressed.size()); + HWY_DASSERT(hn::IsAligned(df, vec_aligned)); + using Traits = CompressTraits; + return Traits::Dot(df, compressed.size(), compressed.data(), compressed_ofs, + vec_aligned, num); +} + // Returns dot product with `vec_aligned` of length `num`. template HWY_INLINE float Dot(DF df, const CompressedArray& compressed, size_t compressed_ofs, const VecT* vec_aligned, size_t num) { - HWY_DASSERT(compressed_ofs + num <= compressed.NumElements()); + HWY_DASSERT(compressed_ofs + num <= compressed.size()); HWY_DASSERT(hn::IsAligned(df, vec_aligned)); using Traits = CompressTraits; - return Traits::Dot(df, kCapacity, compressed.data(), compressed_ofs, - vec_aligned, num); + return (compressed.scale() * Traits::Dot(df, compressed.size(), + compressed.data(), compressed_ofs, + vec_aligned, num)); } // Callback used by ForeachTensor. 
@@ -445,6 +458,12 @@ class Compressor { compressed.CompressedSize()); } + void AddScales(float* scales, size_t len) { + if (len) { + writer_.Add(CacheKey("scales"), scales, len * sizeof(scales[0])); + } + } + void WriteAll(hwy::ThreadPool& pool, const char* blob_filename) { const BlobError err = writer_.WriteAll(pool, blob_filename); if (err != 0) { diff --git a/compression/compress.h b/compression/compress.h index e09d7e5..118ded2 100644 --- a/compression/compress.h +++ b/compression/compress.h @@ -71,10 +71,15 @@ class CompressedArray { } public: + using value_type = MatT; + MatT* data() { return data_.data(); } const MatT* data() const { return data_.data(); } - constexpr size_t NumElements() const { return kCapacity; } + float scale() const { return scale_[0]; } + void set_scale(float scale) { scale_[0] = scale; } + + constexpr size_t size() const { return kCapacity; } constexpr size_t CompressedSize() const { return NumCompressed() * sizeof(MatT); @@ -82,6 +87,7 @@ class CompressedArray { private: std::array data_; + float scale_[kBlobAlign / sizeof(float)]; }; #if COMPRESS_STATS @@ -187,11 +193,21 @@ class CacheLoader { err_ = reader_.Enqueue(CacheKey(name), compressed.data(), compressed.CompressedSize()); + compressed.set_scale(1.0f); if (err_ != 0) { fprintf(stderr, "Failed to read cache %s (error %d)\n", name, err_); } } + void LoadScales(float* scales, size_t len) { + if (0 != reader_.Enqueue(CacheKey("scales"), scales, + len * sizeof(scales[0]))) { + for (size_t i = 0; i < len; ++i) { + scales[i] = 1.0f; + } + } + } + // Returns whether all tensors are successfully loaded from cache. bool ReadAll(hwy::ThreadPool& pool) { // reader_ invalid or any Enqueue failed diff --git a/configs.h b/configs.h index e704664..f1d7f9d 100644 --- a/configs.h +++ b/configs.h @@ -30,6 +30,16 @@ #include +// copybara:import_next_line:gemma_cpp +#include "compression/sfp.h" +#include "hwy/base.h" // hwy::bfloat16_t + +// Allowable types for GEMMA_WEIGHT_T (can be specified at compilation time): +// float, hwy::bfloat16_t, SfpStream, NuqStream +#ifndef GEMMA_WEIGHT_T +#define GEMMA_WEIGHT_T SfpStream +#endif // !GEMMA_WEIGHT_T + namespace gcpp { static constexpr size_t kSeqLen = GEMMA_MAX_SEQLEN; @@ -45,6 +55,8 @@ struct ConfigGemma7B { static constexpr int kKVHeads = 16; // standard MHA static constexpr int kQKVDim = 256; // query size == key size == value size static constexpr int kTopK = gcpp::kTopK; + static constexpr int kNumTensorScales = 0; + using WeightT = GEMMA_WEIGHT_T; }; struct ConfigGemma2B { @@ -57,6 +69,8 @@ struct ConfigGemma2B { static constexpr int kKVHeads = 1; static constexpr int kQKVDim = 256; // query size == key size == value size static constexpr int kTopK = gcpp::kTopK; + static constexpr int kNumTensorScales = 0; + using WeightT = GEMMA_WEIGHT_T; }; } // namespace gcpp diff --git a/examples/hello_world/run.cc b/examples/hello_world/run.cc index a352250..484b4a4 100644 --- a/examples/hello_world/run.cc +++ b/examples/hello_world/run.cc @@ -19,9 +19,9 @@ #include "gemma.h" // copybara:import_next_line:gemma_cpp #include "util/app.h" // LoaderArgs +#include "hwy/contrib/thread_pool/thread_pool.h" // copybara:import_next_line:gemma_cpp #include "util/args.h" -#include "hwy/contrib/thread_pool/thread_pool.h" std::vector tokenize( const std::string& prompt_string, @@ -43,8 +43,7 @@ int main(int argc, char** argv) { hwy::ThreadPool pool(num_threads); // Instantiate model and KV Cache - gcpp::Gemma model(loader.tokenizer, loader.compressed_weights, - loader.ModelType(), pool); + 
gcpp::Gemma model(loader.tokenizer, loader.weights, loader.ModelType(), pool); auto kv_cache = CreateKVCache(loader.ModelType()); size_t pos = 0; // KV Cache position diff --git a/gemma.cc b/gemma.cc index bfaa812..77478bd 100644 --- a/gemma.cc +++ b/gemma.cc @@ -19,18 +19,18 @@ // which we pass the filename via macro 'argument'. #undef HWY_TARGET_INCLUDE #define HWY_TARGET_INCLUDE "gemma.cc" // NOLINT -#include "hwy/foreach_target.h" // IWYU pragma: keep +#include "hwy/foreach_target.h" // IWYU pragma: keep // Must come after foreach_target.h to avoid redefinition errors. // copybara:import_next_line:gemma_cpp #include "compression/compress-inl.h" // copybara:import_next_line:gemma_cpp #include "ops.h" -// copybara:import_next_line:gemma_cpp -#include "util/args.h" // Path #include "hwy/contrib/matvec/matvec-inl.h" #include "hwy/highway.h" #include "hwy/profiler.h" #include "hwy/timer.h" +// copybara:import_next_line:gemma_cpp +#include "util/args.h" // Path // Non-SIMD includes and types. Note that HWY_ONCE is only true on the last // compile pass, whereas we want this defined in the first. @@ -64,6 +64,12 @@ // copybara:import_next_line:sentencepiece #include "src/sentencepiece_processor.h" +// Setting this to true disables fread() calls that read the model file. +constexpr bool kDryRunFread = false; + +// Setting this to false will load and use uncompressed weights. +constexpr bool kWeightsAreCompressed = true; + namespace gcpp { template @@ -88,70 +94,145 @@ struct Layer { std::array pre_ffw_norm_scale; }; +float ScaleWeights(float* data, size_t len) { + float maxabs = 0.0; + for (size_t i = 0; i < len; ++i) { + maxabs = std::max(maxabs, std::abs(data[i])); + } + const float kMaxRange = 1.875f; + if (maxabs <= kMaxRange) { + return 1.0f; + } + const float scale = maxabs / kMaxRange; + const float inv_scale = 1.0f / scale; + for (size_t i = 0; i < len; ++i) { + data[i] *= inv_scale; + } + return scale; +} + +// Array instead of single large allocation for parallel mem init. Split out of +// Weights so that only these pointers are initialized. +template +struct LayerPointers { + explicit LayerPointers(hwy::ThreadPool& pool) { + pool.Run(0, TConfig::kLayers, [this](uint64_t task, size_t /*thread*/) { + this->layers[task] = hwy::AllocateAligned>(1); + }); + } + + using TLayer = Layer; + std::array, TConfig::kLayers> layers; +}; + template struct Weights { - Weights() = default; - - hwy::AlignedUniquePtr[]> layers; // kLayers + // No ctor/dtor, allocated via AllocateAligned. std::array embedder_input_embedding; std::array final_norm_scale; + + LayerPointers layer_ptrs; + + std::array scales; + + const Layer* GetLayer(size_t layer) const { + return layer_ptrs.layers[layer].get(); + } + Layer* GetLayer(size_t layer) { + return layer_ptrs.layers[layer].get(); + } }; -// Only called if cached loading fails. template -hwy::AlignedUniquePtr> LoadWeights(const Path& checkpoint) { +hwy::AlignedFreeUniquePtr LoadWeights( + const Path& checkpoint, hwy::ThreadPool& pool, + bool scale_for_compression = false) { PROFILER_ZONE("Startup.LoadWeights"); - using TWeights = Weights; - hwy::AlignedUniquePtr weights = hwy::MakeUniqueAligned(); - weights->layers = - hwy::MakeUniqueAlignedArray>(TConfig::kLayers); - - if (checkpoint.path.empty()) { - HWY_ABORT( - "Loading --compressed_weights failed; we require a --weights argument. 
" - "Please see issue #11 on how to create this file.\n"); + if (!std::filesystem::exists(checkpoint.path)) { + HWY_ABORT("The model weights file '%s' does not exist.", + checkpoint.path.c_str()); } + using TWeights = Weights; + hwy::AlignedFreeUniquePtr weights_u8 = + hwy::AllocateAligned(sizeof(TWeights)); + TWeights* weights = reinterpret_cast(weights_u8.get()); + new (&weights->layer_ptrs) LayerPointers(pool); + + size_t scale_pos = 0; FILE* fptr; - fptr = fopen(checkpoint.path.c_str(), "rb"); - if (fptr == nullptr) { - HWY_ABORT("Failed to open model file %s - does it exist?", - checkpoint.path.c_str()); + if constexpr (kDryRunFread) { + fprintf(stderr, "Dry-Run, not reading model-file.\n"); + } else { + fptr = fopen(checkpoint.path.c_str(), "rb"); + if (fptr == nullptr) { + HWY_ABORT("Failed to open model file %s - does it exist?", + checkpoint.path.c_str()); + } } bool ok = true; uint64_t total_size = 0; - ok &= 1 == fread(&(weights->embedder_input_embedding), - sizeof(weights->embedder_input_embedding), 1, fptr); - ok &= 1 == fread(&(weights->final_norm_scale), - sizeof(weights->final_norm_scale), 1, fptr); - total_size += sizeof(weights->embedder_input_embedding) + - sizeof(weights->final_norm_scale); + auto do_fread = [&](void* var, int layer, const char* name, size_t size) { + if (layer == -1) { + fprintf(stderr, "Loading Parameters (size %zu): %s\n", size, name); + } else { + fprintf(stderr, "Loading Parameters (layer=%d, size %zu): %s\n", layer, + size, name); + } + if constexpr (!kDryRunFread) { + ok &= 1 == fread(var, size, 1, fptr); + total_size += size; + } + }; + do_fread(&(weights->embedder_input_embedding), -1, "embedder_input_embedding", + sizeof(weights->embedder_input_embedding)); + do_fread(&(weights->final_norm_scale), -1, "final_norm_scale", + sizeof(weights->final_norm_scale)); for (size_t layer = 0; layer < TConfig::kLayers; ++layer) { - Layer* layer_view = &weights->layers[layer]; - ok &= 1 == fread(&layer_view->attn_vec_einsum_w, - sizeof(layer_view->attn_vec_einsum_w), 1, fptr); - ok &= 1 == fread(&layer_view->qkv_einsum_w, - sizeof(layer_view->qkv_einsum_w), 1, fptr); - ok &= 1 == fread(&layer_view->gating_einsum_w, - sizeof(layer_view->gating_einsum_w), 1, fptr); - ok &= 1 == - fread(&layer_view->linear_w, sizeof(layer_view->linear_w), 1, fptr); - ok &= 1 == fread(&layer_view->pre_attention_norm_scale, - sizeof(layer_view->pre_attention_norm_scale), 1, fptr); - ok &= 1 == fread(&layer_view->pre_ffw_norm_scale, - sizeof(layer_view->pre_ffw_norm_scale), 1, fptr); - total_size += sizeof(*layer_view); + Layer* layer_view = weights->GetLayer(layer); + +#define READ_WEIGHTS(name) \ + do { \ + do_fread(&(layer_view->name), layer, #name, sizeof(layer_view->name)); \ + } while (0) + +#define SCALE_WEIGHTS(name) \ + do { \ + if (ok && !kDryRunFread && scale_for_compression) { \ + weights->scales[scale_pos++] = \ + ScaleWeights(layer_view->name.data(), layer_view->name.size()); \ + } \ + } while (0) + // Make sure we don't have uninitialized memory. + hwy::ZeroBytes(layer_view, sizeof(*layer_view)); + READ_WEIGHTS(attn_vec_einsum_w); + READ_WEIGHTS(qkv_einsum_w); + SCALE_WEIGHTS(attn_vec_einsum_w); + SCALE_WEIGHTS(qkv_einsum_w); + READ_WEIGHTS(gating_einsum_w); + READ_WEIGHTS(linear_w); + SCALE_WEIGHTS(gating_einsum_w); + SCALE_WEIGHTS(linear_w); + READ_WEIGHTS(pre_attention_norm_scale); + READ_WEIGHTS(pre_ffw_norm_scale); +#undef READ_WEIGHTS } if (!ok) { - HWY_ABORT("Failed to read from %s - might be a directory, or too small? 
" - "expected size: %d kB", checkpoint.path.c_str(), - static_cast(total_size >> 10)); + HWY_ABORT( + "Failed to read from %s - might be a directory, or too small? " + "expected size: %d kB", + checkpoint.path.c_str(), static_cast(total_size >> 10)); } - HWY_ASSERT(0 == fclose(fptr)); - return weights; + if (!kDryRunFread) { + HWY_ASSERT(0 == fclose(fptr)); + if (scale_for_compression) { + HWY_ASSERT(scale_pos == TConfig::kNumTensorScales); + } + } + return weights_u8; } template @@ -159,18 +240,19 @@ struct CompressedLayer { // No ctor/dtor, allocated via AllocateAligned. using TLayer = gcpp::Layer; + using WeightT = typename TConfig::WeightT; static constexpr size_t kModelDim = TConfig::kModelDim; static constexpr size_t kFFHiddenDim = TConfig::kFFHiddenDim; // Compressed Parameters // We don't yet have an RMSNorm that accepts all WeightT. - CompressedArray c_pre_attention_norm_scale; - CompressedArray c_pre_ffw_norm_scale; - CompressedArray c_gating_einsum_w; - CompressedArray c_linear_w; - CompressedArray c_qkv_einsum_w; - CompressedArray c_attn_vec_einsum_w; + CompressedArray pre_attention_norm_scale; + CompressedArray pre_ffw_norm_scale; + CompressedArray gating_einsum_w; + CompressedArray linear_w; + CompressedArray qkv_einsum_w; + CompressedArray attn_vec_einsum_w; }; // Array instead of single large allocation for parallel mem init. Split out of @@ -193,21 +275,25 @@ struct CompressedWeights { // No ctor/dtor, allocated via AllocateAligned. CompressedArray - c_embedder_input_embedding; + embedder_input_embedding; - CompressedArray c_final_norm_scale; + CompressedArray final_norm_scale; // Must be last so that the other arrays remain aligned. CompressedLayerPointers c_layer_ptrs; - const CompressedLayer* CLayer(size_t layer) const { + const CompressedLayer* GetLayer(size_t layer) const { return c_layer_ptrs.c_layers[layer].get(); } - CompressedLayer* CLayer(size_t layer) { + CompressedLayer* GetLayer(size_t layer) { return c_layer_ptrs.c_layers[layer].get(); } }; +template +using WeightsT = hwy::If, + Weights>; + // Aligned. 
template struct Activations { @@ -272,16 +358,27 @@ KVCache CreateKVCache(Model type) { } } +namespace { +template +void DeleteLayersPtrs(CompressedWeights* c_weights) { + c_weights->c_layer_ptrs.~CompressedLayerPointers(); +} +template +void DeleteLayersPtrs(Weights* weights) { + weights->layer_ptrs.~LayerPointers(); +} +} // namespace + template struct GemmaImpl : public GemmaInterface { GemmaImpl(std::unique_ptr& tokenizer, - hwy::AlignedFreeUniquePtr& compressed_weights, + hwy::AlignedFreeUniquePtr& weights_u8, hwy::ThreadPool& pool); ~GemmaImpl() { - using CWeights = CompressedWeights; - CWeights* c_weights = reinterpret_cast(compressed_weights.get()); - c_weights->c_layer_ptrs.~CompressedLayerPointers(); + WeightsT* weights = + reinterpret_cast*>(weights_u8.get()); + DeleteLayersPtrs(weights); } const sentencepiece::SentencePieceProcessor* Tokenizer() const override { @@ -296,7 +393,7 @@ struct GemmaImpl : public GemmaInterface { int verbosity) override; std::unique_ptr tokenizer; - hwy::AlignedFreeUniquePtr compressed_weights; + hwy::AlignedFreeUniquePtr weights_u8; hwy::AlignedUniquePtr> prefill; hwy::AlignedUniquePtr> state; }; @@ -309,11 +406,11 @@ HWY_BEFORE_NAMESPACE(); namespace gcpp { namespace HWY_NAMESPACE { -template +template HWY_NOINLINE void Attention(size_t batch_start, size_t batch_idx, size_t layer, Activations& activations, - const CompressedLayer* c_layer, - KVCache& kv_cache, hwy::ThreadPool& pool) { + const LayerT* layer_weights, KVCache& kv_cache, + hwy::ThreadPool& pool) { PROFILER_ZONE("Gen.Attention"); const size_t pos = batch_start + batch_idx; HWY_DASSERT(batch_idx < kBatchSize); @@ -329,27 +426,26 @@ HWY_NOINLINE void Attention(size_t batch_start, size_t batch_idx, size_t layer, static const float kQueryScale = static_cast(1.0 / sqrt(static_cast(kQKVDim))); - const size_t batch_offset = batch_idx * kModelDim; + float* x = activations.pre_att_rms_out.data() + batch_idx * kModelDim; auto ProjQ = [&](uint64_t head, size_t head_offset) HWY_ATTR { float* HWY_RESTRICT q = activations.q.data() + head * kQKVDim + batch_idx * kHeads * kQKVDim; - MatVecLoop( - c_layer->c_qkv_einsum_w, head_offset + 0 * kQKVDim * kModelDim, - activations.pre_att_rms_out.data() + batch_offset, q); + MatVecLoop(layer_weights->qkv_einsum_w, + head_offset + 0 * kQKVDim * kModelDim, x, q); }; - auto ProjKV = - [&](size_t k_offset, size_t v_offset, size_t kv_offset) HWY_ATTR { - TwoOfsMatVecLoop( - c_layer->c_qkv_einsum_w, k_offset, v_offset, - activations.pre_att_rms_out.data() + batch_offset, - kv_cache.key_cache.get() + kv_offset, - kv_cache.value_cache.get() + kv_offset); + auto ProjKV = [&](size_t k_offset, size_t v_offset, + size_t kv_offset) HWY_ATTR { + float* HWY_RESTRICT k = kv_cache.key_cache.get() + kv_offset; + float* HWY_RESTRICT v = kv_cache.value_cache.get() + kv_offset; - Rope(kv_cache.key_cache.get() + kv_offset, kQKVDim, pos); - }; + TwoOfsMatVecLoop(layer_weights->qkv_einsum_w, k_offset, + v_offset, x, k, v); + + Rope(k, kQKVDim, pos); + }; auto Attn = [&](uint64_t head, size_t head_offset) HWY_ATTR { // Calculate scores @@ -388,7 +484,7 @@ HWY_NOINLINE void Attention(size_t batch_start, size_t batch_idx, size_t layer, head == 0 ? 
activations.att_post2.data() + batch_idx * kModelDim : activations.att_post1.data() + head * kBatchSize * kModelDim; - MatVecLoop(c_layer->c_attn_vec_einsum_w, + MatVecLoop(layer_weights->attn_vec_einsum_w, head * kModelDim * kQKVDim, att_out, head_out); }; @@ -431,9 +527,9 @@ HWY_NOINLINE void Attention(size_t batch_start, size_t batch_idx, size_t layer, } } -template +template HWY_NOINLINE void FFW(Activations& activations, - size_t batch_idx, const CompressedLayer* c_layer, + size_t batch_idx, const LayerT* layer_weights, hwy::ThreadPool& pool) { HWY_DASSERT(batch_idx < kBatchSize); static constexpr size_t kModelDim = TConfig::kModelDim; @@ -449,12 +545,12 @@ HWY_NOINLINE void FFW(Activations& activations, // Same matrix, first and second half of rows. Could fuse into one MatVec, // but separating them could help on NUMA e.g. multiple sockets. - MatVec(c_layer->c_gating_einsum_w, + MatVec(layer_weights->gating_einsum_w, kFFHiddenDim * kModelDim, vec, out_mul, pool); // Gate, will go through the nonlinearity. - MatVec(c_layer->c_gating_einsum_w, 0, vec, out, + MatVec(layer_weights->gating_einsum_w, 0, vec, out, pool); namespace hn = hwy::HWY_NAMESPACE; @@ -467,7 +563,7 @@ HWY_NOINLINE void FFW(Activations& activations, PROFILER_ZONE("Gen.FFW\\GatedGELU"); MatVec( - c_layer->c_linear_w, 0, activations.ffw_hidden.data() + hidden_offset, + layer_weights->linear_w, 0, activations.ffw_hidden.data() + hidden_offset, activations.ffw_out.data() + batch_idx * kModelDim, pool); } @@ -486,9 +582,9 @@ GEMMA_CONSTEXPR_EMBSCALING float EmbeddingScaling() { Sqrt(static_cast(TConfig::kModelDim)))); } -template +template HWY_NOINLINE void Prefill(const int* tokens, size_t num_tokens, size_t pos, - const CompressedWeights& c_weights, + const WeightArrayT& weights, Activations& activations, KVCache& kv_cache, hwy::ThreadPool& pool, hwy::ThreadPool& inner_pool) { @@ -500,22 +596,22 @@ HWY_NOINLINE void Prefill(const int* tokens, size_t num_tokens, size_t pos, pool.Run( 0, num_tokens, [&](const uint64_t token_idx, size_t /*thread*/) HWY_ATTR { const int token = tokens[token_idx]; - Decompress(c_weights.c_embedder_input_embedding, token * kModelDim, + Decompress(weights.embedder_input_embedding, token * kModelDim, activations.x.data() + token_idx * kModelDim, kModelDim); MulByConst(kEmbScaling, activations.x.data() + token_idx * kModelDim, kModelDim); }); for (size_t layer = 0; layer < TConfig::kLayers; ++layer) { - const CompressedLayer* c_layer = c_weights.CLayer(layer); + const auto* layer_weights = weights.GetLayer(layer); for (size_t token_idx = 0; token_idx < num_tokens; ++token_idx) { RMSNorm(activations.x.data() + token_idx * kModelDim, - c_layer->c_pre_attention_norm_scale.data(), + layer_weights->pre_attention_norm_scale.data(), activations.pre_att_rms_out.data() + token_idx * kModelDim, kModelDim); - Attention(pos, token_idx, layer, activations, - c_layer, kv_cache, pool); + Attention(pos, token_idx, layer, activations, layer_weights, + kv_cache, pool); } // TODO: sink the loop into these functions, i.e. make them matmuls. 
@@ -525,10 +621,10 @@ HWY_NOINLINE void Prefill(const int* tokens, size_t num_tokens, size_t pos, AddFrom(activations.att_post2.data() + token_idx * kModelDim, activations.x.data() + token_idx * kModelDim, kModelDim); RMSNorm(activations.x.data() + token_idx * kModelDim, - c_layer->c_pre_ffw_norm_scale.data(), + layer_weights->pre_ffw_norm_scale.data(), activations.bf_pre_ffw_rms_out.data() + token_idx * kModelDim, kModelDim); - FFW(activations, token_idx, c_layer, inner_pool); + FFW(activations, token_idx, layer_weights, inner_pool); AddFrom(activations.ffw_out.data() + token_idx * kModelDim, activations.x.data() + token_idx * kModelDim, kModelDim); }); @@ -536,21 +632,20 @@ HWY_NOINLINE void Prefill(const int* tokens, size_t num_tokens, size_t pos, pool.Run( 0, num_tokens, [&](const uint64_t token_idx, size_t /*thread*/) HWY_ATTR { - RMSNormInplace(c_weights.c_final_norm_scale.data(), + RMSNormInplace(weights.final_norm_scale.data(), activations.x.data() + token_idx * kModelDim, kModelDim); }); } // n = 1 specialization -template -void Transformer(int token, size_t pos, - const CompressedWeights& c_weights, +template +void Transformer(int token, size_t pos, const WeightArrayT& weights, Activations& activations, KVCache& kv_cache, hwy::ThreadPool& pool, hwy::ThreadPool& inner_pool) { static constexpr size_t kLayers = TConfig::kLayers; static constexpr size_t kModelDim = TConfig::kModelDim; - Decompress(c_weights.c_embedder_input_embedding, token * kModelDim, + Decompress(weights.embedder_input_embedding, token * kModelDim, activations.x.data(), kModelDim); GEMMA_CONSTEXPR_EMBSCALING const float kEmbScaling = @@ -558,17 +653,18 @@ void Transformer(int token, size_t pos, MulByConst(kEmbScaling, activations.x.data(), kModelDim); for (size_t layer = 0; layer < kLayers; ++layer) { - const CompressedLayer* c_layer = c_weights.CLayer(layer); - RMSNorm(activations.x.data(), c_layer->c_pre_attention_norm_scale.data(), + const auto* layer_weights = weights.GetLayer(layer); + RMSNorm(activations.x.data(), + layer_weights->pre_attention_norm_scale.data(), activations.pre_att_rms_out.data(), kModelDim); - Attention(pos, 0, layer, activations, c_layer, kv_cache, pool); + Attention<1>(pos, 0, layer, activations, layer_weights, kv_cache, pool); AddFrom(activations.att_post2.data(), activations.x.data(), kModelDim); - RMSNorm(activations.x.data(), c_layer->c_pre_ffw_norm_scale.data(), + RMSNorm(activations.x.data(), layer_weights->pre_ffw_norm_scale.data(), activations.bf_pre_ffw_rms_out.data(), kModelDim); - FFW(activations, /* batch_idx = */ 0, c_layer, pool); + FFW<1>(activations, /* batch_idx = */ 0, layer_weights, pool); AddFrom(activations.ffw_out.data(), activations.x.data(), kModelDim); } - RMSNormInplace(c_weights.c_final_norm_scale.data(), activations.x.data(), + RMSNormInplace(weights.final_norm_scale.data(), activations.x.data(), kModelDim); } @@ -609,9 +705,9 @@ void GenerateImpl(GemmaImpl& gemma, size_t max_tokens, Activations& activations = *gemma.state.get(); Activations& prefill_activations = *gemma.prefill.get(); - const CompressedWeights& c_weights = - *reinterpret_cast*>( - gemma.compressed_weights.get()); + + const WeightsT& weights = + *reinterpret_cast*>(gemma.weights_u8.get()); size_t prompt_size = prompt.size(); RangeChecks(max_tokens, max_generated_tokens, prompt_size); @@ -643,9 +739,8 @@ void GenerateImpl(GemmaImpl& gemma, size_t max_tokens, HWY_DASSERT(batch_size <= kPrefillBatchSize); HWY_DASSERT(pos_offset + batch_size <= prompt_size - 1); const int* batch_tokens = 
prompt.data() + pos_offset; - Prefill(batch_tokens, batch_size, pos, - c_weights, prefill_activations, - kv_cache, pool, inner_pool); + Prefill(batch_tokens, batch_size, pos, weights, + prefill_activations, kv_cache, pool, inner_pool); for (size_t idx = 0; idx < batch_size; ++idx) { stream_token(batch_tokens[idx], 0.0f); } @@ -672,7 +767,7 @@ void GenerateImpl(GemmaImpl& gemma, size_t max_tokens, for (size_t generate_pos = 0; pos < max_tokens && generate_pos < max_generated_tokens; ++pos, ++pos_offset, ++generate_pos) { - Transformer(token, pos, c_weights, activations, kv_cache, pool, inner_pool); + Transformer(token, pos, weights, activations, kv_cache, pool, inner_pool); float* final_activation = activations.x.data(); // The condition below is always true if we are doing Prefill above. // We keep it here for clarity so that the code is correct even if Prefill @@ -680,9 +775,9 @@ void GenerateImpl(GemmaImpl& gemma, size_t max_tokens, if (pos_offset >= prompt_size - 1) { PROFILER_ZONE("Gen.Embedding"); // Generation phase - MatVec( - c_weights.c_embedder_input_embedding, 0, final_activation, - activations.logits.data(), pool); + MatVec(weights.embedder_input_embedding, + 0, final_activation, + activations.logits.data(), pool); // Barrier: must have all logits so we can subtract max. Softmax(activations.logits.data(), kVocabSize); token = SampleTopK(activations.logits.data(), kVocabSize, @@ -743,52 +838,37 @@ void ForEachTensor(const Weights* weights, CompressedWeights& c_weights, Func& func) { func("c_embedding", weights ? weights->embedder_input_embedding.data() : nullptr, - c_weights.c_embedder_input_embedding); + c_weights.embedder_input_embedding); func("c_final_norm", weights ? weights->final_norm_scale.data() : nullptr, - c_weights.c_final_norm_scale); + c_weights.final_norm_scale); - char name[16]; - for (int layer_idx = 0; layer_idx < static_cast(TConfig::kLayers); - ++layer_idx) { + char name_buf[16]; + for (int layer_idx = 0; layer_idx < TConfig::kLayers; ++layer_idx) { const size_t idx = static_cast(layer_idx); - Layer* layer = weights ? &weights->layers[idx] : nullptr; - CompressedLayer* c_layer = c_weights.CLayer(idx); + const Layer* layer = weights ? weights->GetLayer(idx) : nullptr; + CompressedLayer* layer_weights = c_weights.GetLayer(idx); - snprintf(name, sizeof(name), "pre_ff_ns_%d", layer_idx); - func(name, layer ? layer->pre_ffw_norm_scale.data() : nullptr, - c_layer->c_pre_ffw_norm_scale); +#define CALL_FUNC(name, member) \ + snprintf(name_buf, sizeof(name_buf), name "_%d", layer_idx); \ + func(name_buf, layer ? layer->member.data() : nullptr, layer_weights->member) - snprintf(name, sizeof(name), "gating_ein_%d", layer_idx); - func(name, layer ? layer->gating_einsum_w.data() : nullptr, - c_layer->c_gating_einsum_w); - - snprintf(name, sizeof(name), "linear_w_%d", layer_idx); - func(name, layer ? layer->linear_w.data() : nullptr, c_layer->c_linear_w); - snprintf(name, sizeof(name), "qkv_ein_%d", layer_idx); - - func(name, layer ? layer->qkv_einsum_w.data() : nullptr, - c_layer->c_qkv_einsum_w); - snprintf(name, sizeof(name), "att_ein_%d", layer_idx); - - func(name, layer ? layer->attn_vec_einsum_w.data() : nullptr, - c_layer->c_attn_vec_einsum_w); - - snprintf(name, sizeof(name), "pre_att_ns_%d", layer_idx); - func(name, layer ? 
layer->pre_attention_norm_scale.data() : nullptr, - c_layer->c_pre_attention_norm_scale); + CALL_FUNC("pre_ff_ns", pre_ffw_norm_scale); + CALL_FUNC("gating_ein", gating_einsum_w); + CALL_FUNC("linear_w", linear_w); + CALL_FUNC("qkv_ein", qkv_einsum_w); + CALL_FUNC("att_ein", attn_vec_einsum_w); + CALL_FUNC("pre_att_ns", pre_attention_norm_scale); +#undef CALL_FUNC } } template -hwy::AlignedFreeUniquePtr GetCompressedWeights( - const Path& weights_path, const Path& cache, hwy::ThreadPool& pool) { +hwy::AlignedFreeUniquePtr LoadCompressedWeights( + const Path& weights, hwy::ThreadPool& pool) { PROFILER_ZONE("Startup.LoadCache"); - - if (!std::filesystem::exists(weights_path.path) && - !std::filesystem::exists(cache.path)) { - HWY_ABORT( - "Either the model weights (--weights) or cached compressed weights " - "(--compressed_weights) must exist."); + if (!std::filesystem::exists(weights.path)) { + HWY_ABORT("The model weights file '%s' does not exist.", + weights.path.c_str()); } // Allocate compressed weights. @@ -798,32 +878,49 @@ hwy::AlignedFreeUniquePtr GetCompressedWeights( CWeights* c_weights = reinterpret_cast(c_weights_u8.get()); new (&c_weights->c_layer_ptrs) CompressedLayerPointers(pool); - // First attempt to load them from cache, without requiring weights. - CacheLoader loader(cache.path.c_str()); + std::array scales; + CacheLoader loader(weights.path.c_str()); ForEachTensor(nullptr, *c_weights, loader); - if (loader.ReadAll(pool)) return c_weights_u8; - - // Get weights, compress, and store in cache. - const hwy::AlignedUniquePtr> weights = - LoadWeights(weights_path); - Compressor compressor(pool); - ForEachTensor(weights.get(), *c_weights, compressor); - compressor.WriteAll(pool, cache.path.c_str()); - + loader.LoadScales(scales.data(), scales.size()); + if (!loader.ReadAll(pool)) { + HWY_ABORT("Failed to load model weights."); + } + if (TConfig::kNumTensorScales > 0) { + size_t scale_pos = 0; + for (int layer_idx = 0; layer_idx < TConfig::kLayers; ++layer_idx) { + const size_t idx = static_cast(layer_idx); + CompressedLayer* layer_weights = c_weights->GetLayer(idx); + layer_weights->attn_vec_einsum_w.set_scale(scales[scale_pos++]); + layer_weights->qkv_einsum_w.set_scale(scales[scale_pos++]); + layer_weights->gating_einsum_w.set_scale(scales[scale_pos++]); + layer_weights->linear_w.set_scale(scales[scale_pos++]); + } + HWY_ASSERT(scale_pos == TConfig::kNumTensorScales); + } return c_weights_u8; } // Type-erased because this function is called via a function pointer. 
-hwy::AlignedFreeUniquePtr GetCompressedWeightsT( - gcpp::Model model, const Path& weights, const Path& compressed_weights, - hwy::ThreadPool& pool) { +hwy::AlignedFreeUniquePtr LoadCompressedWeightsT( + gcpp::Model model, const Path& weights, hwy::ThreadPool& pool) { switch (model) { case Model::GEMMA_2B: - return GetCompressedWeights(weights, compressed_weights, - pool); + return LoadCompressedWeights(weights, pool); case Model::GEMMA_7B: - return GetCompressedWeights(weights, compressed_weights, - pool); + return LoadCompressedWeights(weights, pool); + default: + HWY_ABORT("Model type %d unknown.", static_cast(model)); + } +} + +hwy::AlignedFreeUniquePtr LoadWeightsT(gcpp::Model model, + const Path& weights, + hwy::ThreadPool& pool) { + switch (model) { + case Model::GEMMA_2B: + return LoadWeights(weights, pool); + case Model::GEMMA_7B: + return LoadWeights(weights, pool); default: HWY_ABORT("Model type %d unknown.", static_cast(model)); } @@ -846,18 +943,22 @@ void CompressWeights(const Path& weights_path, new (&c_weights->c_layer_ptrs) CompressedLayerPointers(pool); // Get weights, compress, and store. - const hwy::AlignedUniquePtr> weights = - LoadWeights(weights_path); + const bool scale_for_compression = TConfig::kNumTensorScales > 0; + const hwy::AlignedFreeUniquePtr weights_u8 = + LoadWeights(weights_path, pool, scale_for_compression); + Weights* weights = + reinterpret_cast*>(weights_u8.get()); Compressor compressor(pool); - ForEachTensor(weights.get(), *c_weights, compressor); + ForEachTensor(weights, *c_weights, compressor); + compressor.AddScales(weights->scales.data(), weights->scales.size()); compressor.WriteAll(pool, compressed_weights_path.path.c_str()); + weights->layer_ptrs.~LayerPointers(); c_weights->c_layer_ptrs.~CompressedLayerPointers(); } void CompressWeightsT(gcpp::Model model, const Path& weights, - const Path& compressed_weights, - hwy::ThreadPool& pool) { + const Path& compressed_weights, hwy::ThreadPool& pool) { switch (model) { case Model::GEMMA_2B: CompressWeights(weights, compressed_weights, pool); @@ -877,7 +978,8 @@ HWY_AFTER_NAMESPACE(); #if HWY_ONCE namespace gcpp { -HWY_EXPORT(GetCompressedWeightsT); +HWY_EXPORT(LoadCompressedWeightsT); +HWY_EXPORT(LoadWeightsT); HWY_EXPORT(CompressWeightsT); HWY_EXPORT(Generate2B); HWY_EXPORT(Generate7B); @@ -892,10 +994,9 @@ KVCache CreateKVCache(size_t size_cache_pos, size_t seq_len) { template GemmaImpl::GemmaImpl( std::unique_ptr& tokenizer, - hwy::AlignedFreeUniquePtr& compressed_weights, - hwy::ThreadPool& pool) + hwy::AlignedFreeUniquePtr& weights_u8, hwy::ThreadPool& pool) : tokenizer(std::move(tokenizer)), - compressed_weights(std::move(compressed_weights)), + weights_u8(std::move(weights_u8)), prefill(hwy::MakeUniqueAligned>()), state(hwy::MakeUniqueAligned>()) {} @@ -922,10 +1023,8 @@ void GemmaImpl::Generate( kv_cache, pool, inner_pool, stream_token, accept_token, gen, verbosity); } -Gemma::Gemma(const Path& tokenizer_path, const Path& compressed_weights_path, - const Path& weights_path, Model model_type, ModelTraining training, - hwy::ThreadPool& pool) - : model_training(training) { +Gemma::Gemma(const Path& tokenizer_path, const Path& weights, Model model_type, + hwy::ThreadPool& pool) { std::unique_ptr tokenizer; { PROFILER_ZONE("Startup.tokenizer"); @@ -934,16 +1033,21 @@ Gemma::Gemma(const Path& tokenizer_path, const Path& compressed_weights_path, HWY_ABORT("Failed to load the tokenizer file."); } } - auto compressed_weights = HWY_DYNAMIC_DISPATCH(GetCompressedWeightsT)( - model_type, weights_path, 
compressed_weights_path, pool); + + hwy::AlignedFreeUniquePtr weights_u8; + if constexpr (kWeightsAreCompressed) { + weights_u8 = + HWY_DYNAMIC_DISPATCH(LoadCompressedWeightsT)(model_type, weights, pool); + } else { + weights_u8 = HWY_DYNAMIC_DISPATCH(LoadWeightsT)(model_type, weights, pool); + } + switch (model_type) { case Model::GEMMA_2B: - impl_.reset( - new GemmaImpl(tokenizer, compressed_weights, pool)); + impl_.reset(new GemmaImpl(tokenizer, weights_u8, pool)); break; case Model::GEMMA_7B: - impl_.reset( - new GemmaImpl(tokenizer, compressed_weights, pool)); + impl_.reset(new GemmaImpl(tokenizer, weights_u8, pool)); break; default: HWY_ABORT("Model type %d unknown.", static_cast(model_type)); @@ -981,10 +1085,9 @@ void GenerateGemma(Gemma& gemma, RuntimeConfig runtime_config, } void CompressWeights(gcpp::Model model, const Path& weights, - const Path& compressed_weights, - hwy::ThreadPool& pool) { - HWY_DYNAMIC_DISPATCH(CompressWeightsT)( - model, weights, compressed_weights, pool); + const Path& compressed_weights, hwy::ThreadPool& pool) { + HWY_DYNAMIC_DISPATCH(CompressWeightsT) + (model, weights, compressed_weights, pool); } } // namespace gcpp diff --git a/gemma.h b/gemma.h index a3caa43..dc96c7b 100644 --- a/gemma.h +++ b/gemma.h @@ -24,22 +24,18 @@ // copybara:import_next_line:gemma_cpp #include "compression/compress.h" // SfpStream/NuqStream // copybara:import_next_line:gemma_cpp -#include "util/args.h" // Path +#include "configs.h" #include "hwy/aligned_allocator.h" #include "hwy/base.h" // hwy::bfloat16_t #include "hwy/contrib/thread_pool/thread_pool.h" +// copybara:import_next_line:gemma_cpp +#include "util/args.h" // Path // copybara:import_next_line:sentencepiece #include "src/sentencepiece_processor.h" namespace gcpp { -// Allowable types for GEMMA_WEIGHT_T (can be specified at compilation time): -// float, hwy::bfloat16_t, SfpStream, NuqStream -#ifndef GEMMA_WEIGHT_T -#define GEMMA_WEIGHT_T SfpStream -#endif // !GEMMA_WEIGHT_T -using WeightT = GEMMA_WEIGHT_T; - +using GemmaWeightT = GEMMA_WEIGHT_T; using EmbedderInputT = hwy::bfloat16_t; constexpr size_t kPrefillBatchSize = 16; constexpr bool kSystemPrompt = false; @@ -65,13 +61,11 @@ struct RuntimeConfig { struct GemmaInterface; struct Gemma { - Gemma(const Path& tokenizer_path, const Path& compressed_weights_path, - const Path& weights_path, Model model_type, ModelTraining training, + Gemma(const Path& tokenizer_path, const Path& weights, Model model_type, hwy::ThreadPool& pool); ~Gemma(); // must be defined after GemmaInterface's dtor is defined. 
const sentencepiece::SentencePieceProcessor* Tokenizer() const; std::unique_ptr impl_; - gcpp::ModelTraining model_training; }; KVCache CreateKVCache(Model type); // convenient workaround for now @@ -99,8 +93,7 @@ void GenerateGemma(Gemma& gemma, RuntimeConfig runtime_config, const StreamFunc& stream_token, std::mt19937& gen); void CompressWeights(gcpp::Model model, const Path& weights, - const Path& compressed_weights, - hwy::ThreadPool& pool); + const Path& compressed_weights, hwy::ThreadPool& pool); constexpr int EOS_ID = 1; diff --git a/ops_test.cc b/ops_test.cc index c711946..d74ceb8 100644 --- a/ops_test.cc +++ b/ops_test.cc @@ -369,6 +369,7 @@ CompressedArray GenerateMat(size_t offset) { } } Compress(content, ws, mat, pool); + mat.set_scale(1.0f); return mat; } diff --git a/run.cc b/run.cc index 46ac1ba..7ebf7b4 100644 --- a/run.cc +++ b/run.cc @@ -29,14 +29,14 @@ #include "gemma.h" // Gemma // copybara:import_next_line:gemma_cpp #include "util/app.h" -// copybara:import_next_line:gemma_cpp -#include "util/args.h" // HasHelp #include "hwy/base.h" #include "hwy/contrib/thread_pool/thread_pool.h" #include "hwy/highway.h" #include "hwy/per_target.h" #include "hwy/profiler.h" #include "hwy/timer.h" +// copybara:import_next_line:gemma_cpp +#include "util/args.h" // HasHelp namespace gcpp { @@ -66,7 +66,7 @@ void ShowConfig(LoaderArgs& loader, InferenceArgs& inference, AppArgs& app) { << hwy::VectorBytes() * 8 << " bits)" << "\n" << "Compiled config : " << CompiledConfig() << "\n" << "Weight Type : " - << gcpp::TypeName(gcpp::WeightT()) << "\n" + << gcpp::TypeName(gcpp::GemmaWeightT()) << "\n" << "EmbedderInput Type : " << gcpp::TypeName(gcpp::EmbedderInputT()) << "\n"; } @@ -93,10 +93,11 @@ void ShowHelp(gcpp::LoaderArgs& loader, gcpp::InferenceArgs& inference, std::cerr << "\n"; } -void ReplGemma(gcpp::Gemma& model, gcpp::KVCache& kv_cache, - hwy::ThreadPool& pool, hwy::ThreadPool& inner_pool, - const InferenceArgs& args, int verbosity, - const gcpp::AcceptFunc& accept_token, std::string& eot_line) { +void ReplGemma(gcpp::Gemma& model, ModelTraining training, + gcpp::KVCache& kv_cache, hwy::ThreadPool& pool, + hwy::ThreadPool& inner_pool, const InferenceArgs& args, + int verbosity, const gcpp::AcceptFunc& accept_token, + std::string& eot_line) { PROFILER_ZONE("Gen.misc"); int abs_pos = 0; // absolute token index over all turns int current_pos = 0; // token index within the current turn @@ -177,7 +178,7 @@ void ReplGemma(gcpp::Gemma& model, gcpp::KVCache& kv_cache, continue; } - if (model.model_training == ModelTraining::GEMMA_IT) { + if (training == ModelTraining::GEMMA_IT) { // For instruction-tuned models: add control tokens. 
prompt_string = "user\n" + prompt_string + "\nmodel\n"; @@ -232,8 +233,7 @@ void Run(LoaderArgs& loader, InferenceArgs& inference, AppArgs& app) { [](uint64_t /*task*/, size_t thread) { PinThreadToCore(thread); }); } - gcpp::Gemma model(loader.tokenizer, loader.compressed_weights, loader.weights, - loader.ModelType(), loader.ModelTraining(), pool); + gcpp::Gemma model(loader.tokenizer, loader.weights, loader.ModelType(), pool); auto kv_cache = CreateKVCache(loader.ModelType()); @@ -265,7 +265,8 @@ void Run(LoaderArgs& loader, InferenceArgs& inference, AppArgs& app) { } ReplGemma( - model, kv_cache, pool, inner_pool, inference, app.verbosity, + model, loader.ModelTraining(), kv_cache, pool, inner_pool, inference, + app.verbosity, /*accept_token=*/[](int) { return true; }, app.eot_line); } diff --git a/util/app.h b/util/app.h index 4735e8f..2348051 100644 --- a/util/app.h +++ b/util/app.h @@ -36,9 +36,9 @@ #include "configs.h" // copybara:import_next_line:gemma_cpp #include "gemma.h" +#include "hwy/base.h" // HWY_ASSERT // copybara:import_next_line:gemma_cpp #include "util/args.h" -#include "hwy/base.h" // HWY_ASSERT namespace gcpp { @@ -151,7 +151,7 @@ struct LoaderArgs : public ArgsBase { } // Returns error string or nullptr if OK. - const char* Validate() const { + const char* Validate() { const std::string model_type_lc = ToLower(model_type); if (model_type.empty()) { return "Missing --model flag, need to specify either 2b-pt, 7b-pt, " @@ -165,37 +165,42 @@ struct LoaderArgs : public ArgsBase { if (tokenizer.path.empty()) { return "Missing --tokenizer flag, a file for the tokenizer is required."; } - if (compressed_weights.path.empty()) { - return "Missing --compressed_weights flag, a file for the compressed " - "model."; + if (!compressed_weights.path.empty()) { + if (weights.path.empty()) { + weights = compressed_weights; + } else { + return "Only one of --weights and --compressed_weights can be " + "specified. To create compressed weights use the compress_weights " + "tool."; + } } + if (weights.path.empty()) { + return "Missing --weights flag, a file for the model weights."; + } + if (!weights.exists()) { + return "Can't open file specified with --weights flag."; + } return nullptr; } Path tokenizer; - Path weights; // uncompressed weights file location - Path compressed_weights; // compressed weights file location + Path weights; // weights file location + Path compressed_weights; std::string model_type; template void ForEach(const Visitor& visitor) { visitor(tokenizer, "tokenizer", Path(), "Path name of tokenizer model file.\n Required argument."); - visitor( - compressed_weights, "compressed_weights", Path(), - "Path name of compressed weights file, regenerated from `--weights` " - "file if " - "the compressed weights file does not exist.\n Required argument."); + visitor(weights, "weights", Path(), + "Path name of model weights (.sbs) file.\n Required argument."); + visitor(compressed_weights, "compressed_weights", Path(), + "Alias for --weights."); visitor(model_type, "model", std::string(), "Model type\n 2b-it = 2B parameters, instruction-tuned\n " "2b-pt = 2B parameters, pretrained\n 7b-it = 7B parameters " "instruction-tuned\n 7b-pt = 7B parameters, pretrained\n" " Required argument."); - visitor(weights, "weights", Path(), - "Path name of model weights (.sbs) file. Only required if " - "compressed_weights file is not present and needs to be " - "regenerated. This parameter is only required for compressing " - "new model weight exports, otherwise it is not needed."); } };
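The per-tensor scaling is the least self-evident part of the diff, so here is a minimal standalone sketch (plain C++, not part of the patch) of the scheme it introduces: ScaleWeights() in gemma.cc divides a tensor down so that its largest magnitude fits kMaxRange, the returned factor is stored next to the compressed data via CompressedArray::set_scale() (written/read as the "scales" blob), and the Dot() overload in compress-inl.h multiplies the factor back into the result. The plain std::vector<float> below merely stands in for the SFP-compressed array, and the name ScaledDot is illustrative only.

// Standalone sketch of the scaling scheme from this patch. The plain float
// vector stands in for the real SFP codec; kMaxRange matches the constant
// used by ScaleWeights() in gemma.cc.
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <cstdio>
#include <vector>

namespace {

constexpr float kMaxRange = 1.875f;  // largest magnitude kept after scaling

// Same logic as ScaleWeights() in gemma.cc: scale in place, return the factor.
float ScaleWeights(float* data, size_t len) {
  float maxabs = 0.0f;
  for (size_t i = 0; i < len; ++i) {
    maxabs = std::max(maxabs, std::abs(data[i]));
  }
  if (maxabs <= kMaxRange) return 1.0f;  // already fits, nothing to do
  const float scale = maxabs / kMaxRange;
  const float inv_scale = 1.0f / scale;
  for (size_t i = 0; i < len; ++i) data[i] *= inv_scale;
  return scale;
}

// Dot product against the scaled tensor; multiplying by `scale` restores the
// original magnitude, mirroring `compressed.scale() * Traits::Dot(...)`.
float ScaledDot(const std::vector<float>& weights, float scale,
                const std::vector<float>& vec) {
  float dot = 0.0f;
  for (size_t i = 0; i < weights.size(); ++i) dot += weights[i] * vec[i];
  return scale * dot;
}

}  // namespace

int main() {
  // Weights exceed kMaxRange, so ScaleWeights() shrinks them and reports scale.
  std::vector<float> weights = {4.0f, -2.0f, 0.5f, 3.0f};
  const std::vector<float> vec = {1.0f, 2.0f, 3.0f, 4.0f};

  const float reference = 4.0f * 1 - 2.0f * 2 + 0.5f * 3 + 3.0f * 4;  // 13.5

  const float scale = ScaleWeights(weights.data(), weights.size());
  const float result = ScaledDot(weights, scale, vec);

  printf("scale=%g  scaled dot=%g  reference=%g\n", scale, result, reference);
  return 0;
}

The kMaxRange constant in ScaleWeights() suggests SFP represents magnitudes only up to about 1.875; tensors with larger weights would otherwise clip during compression, so CompressWeights() rescales them before compressing (when kNumTensorScales > 0) and the stored factors are re-applied at dot-product time, leaving results unchanged.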