// Copyright 2024 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#ifndef THIRD_PARTY_GEMMA_CPP_COMPRESSION_WEIGHTS_RAW_H_
#define THIRD_PARTY_GEMMA_CPP_COMPRESSION_WEIGHTS_RAW_H_

// Historical note: this was the original f32-only simple on-disk format
// created by convert_weights.py. BlobStore is now the preferred on-disk
// format, and we load that into CompressedWeights.
//
// NOTE: this file should only be used by compress_weights. It is currently
// also referenced by backprop because it supports T = std::complex, which
// CompressedWeights might not yet.

#include <stddef.h>
#include <stdint.h>

#include <array>
#include <complex>
#include <random>

#include "gemma/configs.h"
#include "util/allocator.h"
#include "hwy/aligned_allocator.h"
#include "hwy/base.h"
#include "hwy/contrib/thread_pool/thread_pool.h"

namespace gcpp {

// Per-layer weight tensors, stored as flat std::array members whose sizes
// are compile-time constants derived from TConfig.
template <typename T, class TConfig>
struct Layer {
  Layer() {}
  static constexpr size_t kHeads = TConfig::kHeads;
  static constexpr size_t kKVHeads = TConfig::kKVHeads;
  static constexpr size_t kModelDim = TConfig::kModelDim;
  static constexpr size_t kQKVDim = TConfig::kQKVDim;
  static constexpr size_t kFFHiddenDim = TConfig::kFFHiddenDim;
  static constexpr size_t kAttVecEinsumWSize = kHeads * kQKVDim * kModelDim;
  static constexpr size_t kQKVEinsumWSize =
      (kHeads + 2 * kKVHeads) * kQKVDim * kModelDim;
  // 2x for (gelu gating vector, gated vector)
  static constexpr size_t kGatingEinsumWSize = 2 * kFFHiddenDim * kModelDim;
  static constexpr size_t kConv1dWidth = TConfig::kConv1dWidth;
  static constexpr bool kFFBiases = TConfig::kFFBiases;
  static constexpr PostNormType kPostNorm = TConfig::kPostNorm;
  static constexpr size_t kAOBiasDim =
      TConfig::kSoftmaxAttnOutputBiases ? kModelDim : 0;
  static constexpr size_t kGriffinDim =
      TConfig::kGriffinLayers > 0 ? kModelDim : 0;

  // A layer is either an attention layer or a Griffin (recurrent) layer,
  // never both, so the two sets of tensors share storage via a union.
  union {
    struct {
      std::array<T, kAttVecEinsumWSize> attn_vec_einsum_w;
      std::array<T, kQKVEinsumWSize> qkv_einsum_w;
      std::array<T, kAOBiasDim> attention_output_biases;
    };

    struct {
      std::array<T, kGriffinDim * kGriffinDim> linear_x_w;
      std::array<T, kGriffinDim> linear_x_biases;
      std::array<T, kGriffinDim * kGriffinDim> linear_y_w;
      std::array<T, kGriffinDim> linear_y_biases;
      std::array<T, kGriffinDim * kGriffinDim> linear_out_w;
      std::array<T, kGriffinDim> linear_out_biases;
      std::array<T, kConv1dWidth * kGriffinDim> conv_w;
      std::array<T, kGriffinDim> conv_biases;
      std::array<T, kGriffinDim * kGriffinDim / kHeads * 2> gate_w;
      std::array<T, kGriffinDim * 2> gate_biases;
      std::array<T, kGriffinDim> a;
    } griffin;
  };

  std::array<T, kGatingEinsumWSize> gating_einsum_w;
  std::array<T, kModelDim * kFFHiddenDim> linear_w;
  std::array<T, kModelDim> pre_attention_norm_scale;
  std::array<T, kModelDim> pre_ffw_norm_scale;
  std::array<T, kPostNorm == PostNormType::Scale ? kModelDim : 0>
      post_attention_norm_scale;
  std::array<T, kPostNorm == PostNormType::Scale ? kModelDim : 0>
      post_ffw_norm_scale;

  std::array<T, kFFBiases ? 2 * kFFHiddenDim : 0> ffw_gating_biases;
  std::array<T, kFFBiases ? kModelDim : 0> ffw_output_biases;
};

template <class TConfig>
using LayerF = Layer<float, TConfig>;

// Array instead of single large allocation for parallel mem init. Split out
// of Weights so that only these pointers are initialized.
template <typename T, class TConfig>
struct LayerPointers {
  explicit LayerPointers(hwy::ThreadPool& pool) {
    pool.Run(0, TConfig::kLayers, [this](uint64_t task, size_t /*thread*/) {
      this->layers[task] = hwy::AllocateAligned<Layer<T, TConfig>>(1);
    });
  }

  using TLayer = Layer<T, TConfig>;
  std::array<hwy::AlignedFreeUniquePtr<TLayer[]>, TConfig::kLayers> layers;
};
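// Illustrative only (not part of this header): for a hypothetical TConfig
// with kHeads = 8, kKVHeads = 1, kQKVDim = 256, kModelDim = 2048, the
// attention tensor sizes above work out to
//   kAttVecEinsumWSize = 8 * 256 * 2048           = 4,194,304 elements
//   kQKVEinsumWSize    = (8 + 2 * 1) * 256 * 2048 = 5,242,880 elements
// A minimal allocation sketch, assuming some config struct from
// gemma/configs.h (called ConfigX here for illustration):
//
//   hwy::ThreadPool pool(0);  // 0 extra workers: tasks run on caller thread
//   LayerPointers<float, ConfigX> layer_ptrs(pool);
//   // AllocateAligned returns uninitialized memory, so every tensor must be
//   // written before it is read.
//   LayerF<ConfigX>* layer0 = layer_ptrs.layers[0].get();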
template <typename T, class TConfig>
struct Weights {
  // No ctor/dtor, allocated via AllocateAligned.

  std::array<T, TConfig::kVocabSize * TConfig::kModelDim>
      embedder_input_embedding;

  std::array<T, TConfig::kModelDim> final_norm_scale;

  LayerPointers<T, TConfig> layer_ptrs;

  std::array<float, TConfig::kNumTensorScales> scales;

  const Layer<T, TConfig>* GetLayer(size_t layer) const {
    return layer_ptrs.layers[layer].get();
  }
  Layer<T, TConfig>* GetLayer(size_t layer) {
    return layer_ptrs.layers[layer].get();
  }
};

template <class TConfig>
using WeightsF = Weights<float, TConfig>;

// TODO: can we use TConfig::Weight instead of T?
template <typename T, class TConfig>
struct AllocateWeights {
  ByteStorageT operator()(hwy::ThreadPool& pool) const {
    using TWeights = Weights<T, TConfig>;
    ByteStorageT weights_u8 = AllocateSizeof<TWeights>();
    TWeights* weights = reinterpret_cast<TWeights*>(weights_u8.get());
    // Placement-new only the layer pointers; the tensors themselves remain
    // uninitialized.
    new (&weights->layer_ptrs) LayerPointers<T, TConfig>(pool);
    return weights_u8;
  }
};

template <class TConfig>
struct AllocateWeightsF {
  ByteStorageT operator()(hwy::ThreadPool& pool) const {
    return AllocateWeights<float, TConfig>()(pool);
  }
};

// TODO: make a member of Weights.
template <typename T, class TConfig>
struct ZeroInitWeights {
  void operator()(ByteStorageT& weights, hwy::ThreadPool& pool) const {
    Weights<T, TConfig>& w =
        *reinterpret_cast<Weights<T, TConfig>*>(weights.get());
    hwy::ZeroBytes(&w.embedder_input_embedding,
                   sizeof(w.embedder_input_embedding));
    hwy::ZeroBytes(&w.final_norm_scale, sizeof(w.final_norm_scale));
    for (int i = 0; i < TConfig::kLayers; ++i) {
      hwy::ZeroBytes(w.GetLayer(i), sizeof(*w.GetLayer(i)));
    }
  }
};

template <class TConfig>
struct ZeroInitWeightsF {
  void operator()(ByteStorageT& weights, hwy::ThreadPool& pool) const {
    ZeroInitWeights<float, TConfig>()(weights, pool);
  }
};

template <typename T, class TConfig>
struct CopyWeights {
  void operator()(Weights<T, TConfig>& dst,
                  const Weights<T, TConfig>& src) const {
    hwy::CopyBytes(&src.embedder_input_embedding,
                   &dst.embedder_input_embedding,
                   sizeof(src.embedder_input_embedding));
    hwy::CopyBytes(&src.final_norm_scale, &dst.final_norm_scale,
                   sizeof(src.final_norm_scale));
    for (int i = 0; i < TConfig::kLayers; ++i) {
      hwy::CopyBytes(src.GetLayer(i), dst.GetLayer(i),
                     sizeof(*dst.GetLayer(i)));
    }
  }
};

// Fills x with i.i.d. samples from a normal distribution with mean 0 and the
// given standard deviation.
template <typename T, size_t kLen>
void RandInit(std::array<T, kLen>& x, T stddev, std::mt19937& gen) {
  std::normal_distribution<T> dist(0.0, stddev);
  for (size_t i = 0; i < kLen; ++i) {
    x[i] = dist(gen);
  }
}

// TODO: make a member of Layer.
template <typename T, class TConfig>
void RandInit(Layer<T, TConfig>& w, T stddev, std::mt19937& gen) {
  RandInit(w.pre_attention_norm_scale, stddev, gen);
  RandInit(w.attn_vec_einsum_w, stddev, gen);
  RandInit(w.qkv_einsum_w, stddev, gen);
  RandInit(w.pre_ffw_norm_scale, stddev, gen);
  RandInit(w.gating_einsum_w, stddev, gen);
  RandInit(w.linear_w, stddev, gen);
}

template <typename T, class TConfig>
void RandInit(Weights<T, TConfig>& w, T stddev, std::mt19937& gen) {
  static constexpr size_t kLayers = TConfig::kLayers;
  RandInit(w.embedder_input_embedding, stddev, gen);
  RandInit(w.final_norm_scale, stddev, gen);
  for (size_t i = 0; i < kLayers; ++i) {
    RandInit(*w.GetLayer(i), stddev, gen);
  }
}

// Owns weights and provides access to TConfig.
template <typename T, class TConfig>
class WeightsWrapper {
 public:
  WeightsWrapper()
      : pool_(0),
        data_(AllocateWeights<T, TConfig>()(pool_)),
        weights_(reinterpret_cast<Weights<T, TConfig>*>(data_.get())) {}

  ~WeightsWrapper() { get().layer_ptrs.~LayerPointers(); }

  const Weights<T, TConfig>& get() const { return *weights_; }
  Weights<T, TConfig>& get() { return *weights_; }
  void clear() { ZeroInitWeights<T, TConfig>()(data_, pool_); }
  void copy(const WeightsWrapper<T, TConfig>& other) {
    CopyWeights<T, TConfig>()(get(), other.get());
  }

 private:
  hwy::ThreadPool pool_;
  ByteStorageT data_;
  Weights<T, TConfig>* weights_;
};

}  // namespace gcpp

#endif  // THIRD_PARTY_GEMMA_CPP_COMPRESSION_WEIGHTS_RAW_H_
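// Usage sketch (illustrative; ConfigX again stands for any config struct
// from gemma/configs.h): WeightsWrapper ties allocation, zero-init, copy,
// and random init together, e.g. for the backprop code mentioned in the
// header note.
//
//   gcpp::WeightsWrapper<float, ConfigX> weights;
//   weights.clear();                            // zero all tensors
//   std::mt19937 gen(42);
//   gcpp::RandInit(weights.get(), 0.02f, gen);  // i.i.d. N(0, 0.02^2)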