Added MatPtr/MatPtrT/MatStorageT/MatStorage as a dynamically sized replacement for CompressedArray.

The array size is now defined in the constructor.
Allocation is a separate step and is parallelized.
All users of weights_raw.h have been migrated to CompressedWeights, and weights_raw.h has been deleted.
Replaced all previous ForEachTensor functions with a single unified function.
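
For orientation while reading the diff, here is a minimal usage sketch of the new classes, inferred from the call sites in this commit; the sizes and the wrapper function are placeholders, not part of the change:

#include <cstddef>
#include "compression/compress.h"  // MatPtr, MatPtrT, MatStorageT

void UsageSketch() {
  constexpr size_t kRows = 4, kCols = 8;                // placeholder dimensions
  // Dimensions are constructor arguments rather than template parameters.
  gcpp::MatStorageT<float> grad("grad", kRows, kCols);
  grad.ZeroInit();                                      // replaces memset/hwy::ZeroBytes
  float* data = grad.data();                            // raw pointer, as before
  const size_t num = grad.NumElements();                // rows * cols, known at run time
  (void)data; (void)num;
  // Weight structs defer allocation so it can be parallelized, e.g.
  // CompressedLayer<TConfig> w; w.Allocate(); as used in the tests below.
}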

PiperOrigin-RevId: 684451604
Ray Smith 2024-10-10 08:21:39 -07:00 committed by Copybara-Service
parent a570e3f662
commit 85958f5fd3
35 changed files with 1568 additions and 1381 deletions


@ -75,9 +75,7 @@ cc_library(
":allocator",
":threading",
"//compression:compress",
"//compression:sfp",
"@hwy//:algo",
"@hwy//:dot",
"@hwy//:hwy",
"@hwy//:math",
"@hwy//:matvec",
@ -149,7 +147,6 @@ cc_test(
"//compression:compress",
"@hwy//:hwy",
"@hwy//:hwy_test_util",
"@hwy//:nanobenchmark",
"@hwy//:thread_pool",
],
)
@ -281,11 +278,9 @@ cc_library(
"//paligemma:image",
"@hwy//:hwy",
"@hwy//:bit_set",
"@hwy//:matvec",
"@hwy//:nanobenchmark", # timer
"@hwy//:profiler",
"@hwy//:thread_pool",
"@hwy//:topology",
],
)
@ -481,6 +476,7 @@ cc_library(
":ops",
":prompt",
":weights",
"//compression:compress",
"@hwy//:dot",
"@hwy//:hwy", # base.h
"@hwy//:thread_pool",
@ -498,9 +494,10 @@ cc_library(
deps = [
":allocator",
":common",
":gemma_lib",
":prompt",
"//compression:weights_raw",
":weights",
"//compression:compress",
"@hwy//:hwy",
],
)
@ -512,13 +509,15 @@ cc_test(
"backprop/test_util.h",
],
deps = [
":allocator",
":backprop_scalar",
":common",
":gemma_lib",
":prompt",
":sampler",
":weights",
"@googletest//:gtest_main",
"//compression:weights_raw",
"//compression:compress",
"@hwy//:thread_pool",
],
)
@ -534,6 +533,7 @@ cc_test(
"mem": "28g",
},
deps = [
":allocator",
":backprop",
":backprop_scalar",
":common",
@ -541,8 +541,9 @@ cc_test(
":ops",
":prompt",
":sampler",
":weights",
"@googletest//:gtest_main",
"//compression:weights_raw",
"//compression:compress",
"@hwy//:hwy",
"@hwy//:hwy_test_util",
"@hwy//:thread_pool",


@ -22,7 +22,7 @@ set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
FetchContent_Declare(highway GIT_REPOSITORY https://github.com/google/highway.git GIT_TAG 457c891775a7397bdb0376bb1031e6e027af1c48 EXCLUDE_FROM_ALL)
FetchContent_Declare(highway GIT_REPOSITORY https://github.com/google/highway.git GIT_TAG bb6c3f36b0c8dde8a8ef98b0f0884f4de820a7ca EXCLUDE_FROM_ALL)
FetchContent_MakeAvailable(highway)
## Note: absl needs to be installed by sentencepiece. This will only happen if
@ -39,6 +39,7 @@ FetchContent_MakeAvailable(benchmark)
set(SOURCES
compression/blob_store.cc
compression/blob_store.h
compression/compress.cc
compression/compress.h
compression/compress-inl.h
compression/io_win.cc
@ -48,7 +49,6 @@ set(SOURCES
compression/sfp-inl.h
compression/shared.h
compression/test_util-inl.h
compression/weights_raw.h
backprop/activations.h
backprop/backward.cc
backprop/backward.h


@ -20,32 +20,51 @@
#include <array>
#include "compression/compress.h" // MatStorageT
#include "util/allocator.h" // ByteStorageT
namespace gcpp {
template <typename T, typename TConfig>
struct ForwardLayer {
ForwardLayer()
: input("input", kSeqLen, kModelDim),
pre_att_rms_out("pre_att_rms_out", kSeqLen, kModelDim),
qkv("qkv", kSeqLen * (kHeads + 2), kQKVDim),
att("att", kSeqLen * kHeads, kSeqLen),
att_out("att_out", kSeqLen * kHeads, kQKVDim),
att_post1("att_post1", kSeqLen, kModelDim),
attention_out("attention_out", kSeqLen, kModelDim),
bf_pre_ffw_rms_out("bf_pre_ffw_rms_out", kSeqLen, kModelDim),
ffw_hidden("ffw_hidden", kSeqLen, kFFHiddenDim * 2),
ffw_hidden_gated("ffw_hidden_gated", kSeqLen, kFFHiddenDim) {}
static constexpr size_t kSeqLen = TConfig::kSeqLen;
static constexpr size_t kModelDim = TConfig::kModelDim;
static constexpr size_t kQKVDim = TConfig::kQKVDim;
static constexpr size_t kHeads = TConfig::kHeads;
static constexpr size_t kFFHiddenDim = TConfig::kFFHiddenDim;
std::array<T, kSeqLen * kModelDim> input;
std::array<T, kSeqLen * kModelDim> pre_att_rms_out;
std::array<T, kSeqLen * (kHeads + 2) * kQKVDim> qkv;
std::array<T, kSeqLen * kHeads * kSeqLen> att;
std::array<T, kSeqLen * kHeads * kQKVDim> att_out;
std::array<T, kSeqLen * kModelDim> att_post1;
std::array<T, kSeqLen * kModelDim> attention_out;
std::array<T, kSeqLen * kModelDim> bf_pre_ffw_rms_out;
std::array<T, kSeqLen * kFFHiddenDim * 2> ffw_hidden;
std::array<T, kSeqLen * kFFHiddenDim> ffw_hidden_gated;
MatStorageT<T> input;
MatStorageT<T> pre_att_rms_out;
MatStorageT<T> qkv;
MatStorageT<T> att;
MatStorageT<T> att_out;
MatStorageT<T> att_post1;
MatStorageT<T> attention_out;
MatStorageT<T> bf_pre_ffw_rms_out;
MatStorageT<T> ffw_hidden;
MatStorageT<T> ffw_hidden_gated;
};
template <typename T, typename TConfig>
struct ForwardPass {
ForwardPass() {} // prevents placement-new calling memset
ForwardPass()
: final_layer_output("final_layer_output", kSeqLen, kModelDim),
final_norm_output("final_norm_output", kSeqLen, kModelDim),
logits("logits", kSeqLen, kVocabSize),
probs("probs", kSeqLen, kVocabSize) {
} // prevents placement-new calling memset
static constexpr size_t kSeqLen = TConfig::kSeqLen;
static constexpr size_t kModelDim = TConfig::kModelDim;
@ -53,16 +72,20 @@ struct ForwardPass {
static constexpr size_t kLayers = TConfig::kLayers;
std::array<ForwardLayer<T, TConfig>, kLayers> layers;
std::array<T, kSeqLen * kModelDim> final_layer_output;
std::array<T, kSeqLen * kModelDim> final_norm_output;
std::array<T, kSeqLen * kVocabSize> logits;
std::array<T, kSeqLen * kVocabSize> probs;
MatStorageT<T> final_layer_output;
MatStorageT<T> final_norm_output;
MatStorageT<T> logits;
MatStorageT<T> probs;
};
template <typename TConfig>
struct AllocateForwardPass {
ByteStorageT operator()() const {
return AllocateSizeof<ForwardPass<float, TConfig>>();
ByteStorageT c_weights_u8 = AllocateSizeof<ForwardPass<float, TConfig>>();
auto* c_weights =
reinterpret_cast<ForwardPass<float, TConfig>*>(c_weights_u8.get());
new (c_weights) ForwardPass<float, TConfig>();
return c_weights_u8;
}
};
@ -74,7 +97,7 @@ class ActivationsWrapper {
public:
ActivationsWrapper()
: data_(AllocateSizeof<WrappedT>()),
activations_(*reinterpret_cast<WrappedT*>(data_.get())) {}
activations_(*(new(data_.get()) WrappedT())) {}
const WrappedT& get() const { return activations_; }
WrappedT& get() { return activations_; }


@ -168,11 +168,11 @@ static HWY_NOINLINE void InputEmbeddingVJP(
}
}
template <typename TConfig, template <typename> typename LayerT>
void LayerVJP(const LayerT<TConfig>& weights,
template <typename TConfig, typename LayerT>
void LayerVJP(const LayerT& weights,
const ForwardLayer<float, TConfig>& forward,
const float* HWY_RESTRICT next_layer_grad, size_t num_tokens,
LayerT<TConfig>& grad, ForwardLayer<float, TConfig>& backward,
LayerT& grad, ForwardLayer<float, TConfig>& backward,
const RowVectorBatch<float>& inv_timescale,
hwy::ThreadPool& pool) {
static constexpr size_t kModelDim = TConfig::kModelDim;
@ -226,8 +226,7 @@ void LayerVJP(const LayerT<TConfig>& weights,
backward.attention_out.data() + pos * kModelDim, kModelDim);
}
hwy::ZeroBytes(backward.qkv.data(),
num_tokens * (kHeads + 2) * kQKVDim * sizeof(backward.qkv[0]));
backward.qkv.ZeroInit();
MultiHeadMatMulVJP<kHeads, kQKVDim, kModelDim>(
weights.attn_vec_einsum_w.data(), forward.att_out.data(),
@ -343,12 +342,10 @@ static HWY_NOINLINE void CrossEntropyLossGrad(
}
}
template <typename TConfig, template <typename...> typename WeightsT,
template <typename> typename LayerT>
void CrossEntropyLossBackwardPass(const Prompt& prompt,
const WeightsT<TConfig>& weights,
template <typename TConfig, typename WeightsT, typename LayerT>
void CrossEntropyLossBackwardPass(const Prompt& prompt, const WeightsT& weights,
const ForwardPass<float, TConfig>& forward,
WeightsT<TConfig>& grad,
WeightsT& grad,
ForwardPass<float, TConfig>& backward,
RowVectorBatch<float>& inv_timescale,
hwy::ThreadPool& pool) {


@ -52,7 +52,8 @@ void CrossEntropyLossBackwardPass(const Prompt& prompt,
using TAct = ForwardPass<float, TConfig>;
const auto& forward = *reinterpret_cast<const TAct*>(forward_u8.get());
auto& backward = *reinterpret_cast<TAct*>(backward_u8.get());
CrossEntropyLossBackwardPass<TConfig, CompressedWeights, CompressedLayer>(
CrossEntropyLossBackwardPass<TConfig, CompressedWeights<TConfig>,
CompressedLayer<TConfig>>(
prompt, weights, forward, grad, backward, inv_timescale, pool);
}


@ -25,8 +25,8 @@
#include "backprop/activations.h"
#include "backprop/common_scalar.h"
#include "backprop/prompt.h"
#include "compression/weights_raw.h"
#include "gemma/common.h" // EmbeddingScaling
#include "gemma/weights.h"
namespace gcpp {
template<typename T>
@ -199,13 +199,11 @@ void InputEmbeddingVJPT(const T* w, const std::vector<int>& tokens, T scaling,
}
}
template<typename T, typename TConfig>
void LayerVJP(const Layer<T, TConfig>& weights,
const ForwardLayer<T, TConfig>& forward,
const T* dy,
Layer<T, TConfig>& grad,
ForwardLayer<T, TConfig>& backward,
size_t num_tokens) {
template <typename T, typename TConfig>
void LayerVJP(const CompressedLayer<TConfig>& weights,
const ForwardLayer<T, TConfig>& forward, const T* dy,
CompressedLayer<TConfig>& grad,
ForwardLayer<T, TConfig>& backward, size_t num_tokens) {
static constexpr size_t kModelDim = TConfig::kModelDim;
static constexpr size_t kSeqLen = TConfig::kSeqLen;
static constexpr size_t kQKVDim = TConfig::kQKVDim;
@ -298,11 +296,11 @@ void CrossEntropyLossGrad(const T* x, T* dx, const Prompt& prompt, size_t V) {
}
}
template<typename T, typename TConfig>
template <typename T, typename TConfig>
void CrossEntropyLossBackwardPass(const Prompt& prompt,
const Weights<T, TConfig>& weights,
const CompressedWeights<TConfig>& weights,
const ForwardPass<T, TConfig>& forward,
Weights<T, TConfig>& grad,
CompressedWeights<TConfig>& grad,
ForwardPass<T, TConfig>& backward) {
static constexpr size_t kModelDim = TConfig::kModelDim;
static constexpr size_t kVocabSize = TConfig::kVocabSize;


@ -17,7 +17,7 @@
#include <stddef.h>
#include <stdio.h>
#include <string.h> // memset
#include <string.h> // memcpy
#include <array>
#include <complex>
@ -32,8 +32,9 @@
#include "backprop/prompt.h"
#include "backprop/sampler.h"
#include "backprop/test_util.h"
#include "compression/weights_raw.h"
#include "compression/compress.h"
#include "gemma/configs.h"
#include "gemma/weights.h"
namespace gcpp {
@ -44,14 +45,14 @@ TEST(BackPropTest, MatMulVJP) {
std::mt19937 gen(42);
using T = double;
using TC = std::complex<T>;
std::array<T, kRows * kCols> weights;
std::array<T, kTokens * kCols> x;
std::array<T, kRows * kCols> grad;
std::array<T, kTokens * kCols> dx;
std::array<TC, kRows * kCols> c_weights;
std::array<TC, kTokens * kCols> c_x;
std::array<TC, kTokens * kRows> c_y;
std::array<T, kTokens * kRows> dy;
MatStorageT<T> weights("weights", kRows, kCols);
MatStorageT<T> x("x", kTokens, kCols);
MatStorageT<T> grad("grad", kRows, kCols);
MatStorageT<T> dx("dx", kTokens, kCols);
MatStorageT<TC> c_weights("c_weights", kRows, kCols);
MatStorageT<TC> c_x("c_x", kTokens, kCols);
MatStorageT<TC> c_y("c_y", kTokens, kRows);
MatStorageT<T> dy("dy", kTokens, kRows);
for (int iter = 0; iter < 10; ++iter) {
RandInit(weights, 1.0 * (1 << iter), gen);
@ -63,7 +64,7 @@ TEST(BackPropTest, MatMulVJP) {
MatMulT(c_weights.data(), c_x.data(), c_y.data(), kRows, kCols, kTokens);
return DotT(dy.data(), c_y.data(), kTokens * kRows);
};
memset(&grad, 0, sizeof(grad));
grad.ZeroInit();
MatMulVJPT(weights.data(), x.data(), dy.data(), grad.data(), dx.data(),
kRows, kCols, kTokens);
TestGradient(dx, c_x, func, 1e-11, 1e-12, __LINE__);
@ -79,14 +80,14 @@ TEST(BackPropTest, MultiHeadMatMulVJP) {
std::mt19937 gen(42);
using T = double;
using TC = std::complex<T>;
std::array<T, kRows * kCols * kHeads> weights;
std::array<T, kTokens * kCols * kHeads> x;
std::array<T, kRows * kCols * kHeads> grad;
std::array<T, kTokens * kCols * kHeads> dx;
std::array<TC, kRows * kCols * kHeads> c_weights;
std::array<TC, kTokens * kCols * kHeads> c_x;
std::array<TC, kTokens * kRows> c_y;
std::array<T, kTokens * kRows> dy;
MatStorageT<T> weights("weights", kRows, kCols * kHeads);
MatStorageT<T> x("x", kTokens, kCols * kHeads);
MatStorageT<T> grad("grad", kRows, kCols * kHeads);
MatStorageT<T> dx("dx", kTokens, kCols * kHeads);
MatStorageT<TC> c_weights("c_weights", kRows, kCols * kHeads);
MatStorageT<TC> c_x("c_x", kTokens, kCols * kHeads);
MatStorageT<TC> c_y("c_y", kTokens, kRows);
MatStorageT<T> dy("dy", kTokens, kRows);
for (int iter = 0; iter < 10; ++iter) {
RandInit(weights, 1.0 * (1 << iter), gen);
@ -99,7 +100,7 @@ TEST(BackPropTest, MultiHeadMatMulVJP) {
kCols, kTokens);
return DotT(dy.data(), c_y.data(), kTokens * kRows);
};
memset(&grad, 0, sizeof(grad));
grad.ZeroInit();
MultiHeadMatMulVJPT(weights.data(), x.data(), dy.data(), grad.data(),
dx.data(), kHeads, kRows, kCols, kTokens);
TestGradient(dx, c_x, func, 1e-15, 1e-13, __LINE__);
@ -113,14 +114,14 @@ TEST(BackPropTest, RMSNormVJP) {
std::mt19937 gen(42);
using T = double;
using TC = std::complex<T>;
std::array<T, N> weights;
std::array<T, N> grad;
std::array<T, K * N> x;
std::array<T, K * N> dx;
std::array<T, K * N> dy;
std::array<TC, N> c_weights;
std::array<TC, K * N> c_x;
std::array<TC, K * N> c_y;
MatStorageT<T> weights("weights", N, 1);
MatStorageT<T> grad("grad", N, 1);
MatStorageT<T> x("x", K, N);
MatStorageT<T> dx("dx", K, N);
MatStorageT<T> dy("dy", K, N);
MatStorageT<TC> c_weights("c_weights", N, 1);
MatStorageT<TC> c_x("c_x", K, N);
MatStorageT<TC> c_y("c_y", K, N);
for (int iter = 0; iter < 10; ++iter) {
RandInit(weights, 1.0 * (1 << iter), gen);
@ -132,7 +133,7 @@ TEST(BackPropTest, RMSNormVJP) {
RMSNormT(c_weights.data(), c_x.data(), c_y.data(), N, K);
return DotT(dy.data(), c_y.data(), K * N);
};
memset(&grad, 0, sizeof(grad));
grad.ZeroInit();
RMSNormVJPT(weights.data(), x.data(), dy.data(), grad.data(), dx.data(),
N, K);
TestGradient(dx, c_x, func, 1e-15, 1e-14, __LINE__);
@ -145,23 +146,23 @@ TEST(BackPropTest, SoftmaxVJP) {
std::mt19937 gen(42);
using T = double;
using TC = std::complex<T>;
std::array<T, N> x;
std::array<T, N> dx;
std::array<T, N> dy;
std::array<TC, N> c_x;
std::array<TC, N> c_y;
MatStorageT<T> x("x", N, 1);
MatStorageT<T> dx("dx", N, 1);
MatStorageT<T> dy("dy", N, 1);
MatStorageT<TC> c_x("c_x", N, 1);
MatStorageT<TC> c_y("c_y", N, 1);
for (int iter = 0; iter < 10; ++iter) {
RandInit(x, 1.0 * (1 << iter), gen);
Complexify(x, c_x);
RandInit(dy, 1.0, gen);
auto func = [&]() {
memcpy(c_y.data(), c_x.data(), sizeof(c_x));
memcpy(c_y.data(), c_x.data(), c_x.SizeBytes());
Softmax(c_y.data(), N);
return DotT(dy.data(), c_y.data(), N);
};
Softmax(x.data(), N);
memcpy(dx.data(), dy.data(), N * sizeof(dx[0]));
memcpy(dx.data(), dy.data(), dx.SizeBytes());
SoftmaxVJPT(x.data(), dx.data(), N);
TestGradient(dx, c_x, func, 1e-15, 1e-15, __LINE__);
}
@ -171,15 +172,16 @@ TEST(BackPropTest, MaskedSoftmaxVJP) {
static const size_t kSeqLen = 16;
static const size_t kHeads = 2;
static const size_t kTokens = 14;
static const size_t N = kHeads * kSeqLen * kSeqLen;
static const size_t N = kTokens * kHeads * kSeqLen;
std::mt19937 gen(42);
using T = double;
using TC = std::complex<T>;
std::array<T, N> x;
std::array<T, N> dy;
std::array<T, N> dx = {};
std::array<TC, N> c_x;
std::array<TC, N> c_y;
MatStorageT<T> x("x", N, 1);
MatStorageT<T> dy("dy", N, 1);
MatStorageT<T> dx("dx", N, 1);
MatStorageT<TC> c_x("c_x", N, 1);
MatStorageT<TC> c_y("c_y", N, 1);
dx.ZeroInit();
for (int iter = 0; iter < 10; ++iter) {
RandInit(x, 1.0 * (1 << iter), gen);
@ -187,12 +189,12 @@ TEST(BackPropTest, MaskedSoftmaxVJP) {
RandInit(dy, 1.0, gen);
auto func = [&]() {
memcpy(c_y.data(), c_x.data(),
kTokens * kHeads * kSeqLen * sizeof(c_x[0]));
kTokens * kHeads * kSeqLen * sizeof(c_x.At(0)));
MaskedSoftmax(c_y.data(), kTokens, kHeads, kSeqLen);
return DotT(dy.data(), c_y.data(), N);
};
MaskedSoftmax(x.data(), kTokens, kHeads, kSeqLen);
memcpy(dx.data(), dy.data(), kTokens * kHeads * kSeqLen * sizeof(dx[0]));
memcpy(dx.data(), dy.data(), kTokens * kHeads * kSeqLen * sizeof(dx.At(0)));
MaskedSoftmaxVJPT(x.data(), dx.data(), kTokens, kHeads, kSeqLen);
TestGradient(dx, c_x, func, 1e-14, 1e-15, __LINE__);
}
@ -203,11 +205,11 @@ TEST(BackPropTest, SoftcapVJP) {
std::mt19937 gen(42);
using T = double;
using TC = std::complex<T>;
std::array<T, N> x;
std::array<T, N> dx;
std::array<T, N> dy;
std::array<TC, N> c_x;
std::array<TC, N> c_y;
MatStorageT<T> x("x", N, 1);
MatStorageT<T> dx("dx", N, 1);
MatStorageT<T> dy("dy", N, 1);
MatStorageT<TC> c_x("c_x", N, 1);
MatStorageT<TC> c_y("c_y", N, 1);
constexpr float kCap = 30.0f;
for (int iter = 0; iter < 10; ++iter) {
@ -215,12 +217,12 @@ TEST(BackPropTest, SoftcapVJP) {
Complexify(x, c_x);
RandInit(dy, 1.0, gen);
auto func = [&]() {
memcpy(c_y.data(), c_x.data(), N * sizeof(c_x[0]));
memcpy(c_y.data(), c_x.data(), N * sizeof(c_x.At(0)));
Softcap(kCap, c_y.data(), N);
return DotT(dy.data(), c_y.data(), N);
};
Softcap(kCap, x.data(), N);
memcpy(dx.data(), dy.data(), N * sizeof(dx[0]));
memcpy(dx.data(), dy.data(), dx.SizeBytes());
SoftcapVJPT(kCap, x.data(), dx.data(), N);
TestGradient(dx, c_x, func, 1e-15, 1e-14, __LINE__);
}
@ -232,9 +234,9 @@ TEST(BackPropTest, CrossEntropyLossGrad) {
std::mt19937 gen(42);
using T = double;
using TC = std::complex<T>;
std::array<T, K * V> x;
std::array<T, K * V> dx;
std::array<TC, K * V> c_x;
MatStorageT<T> x("x", K, V);
MatStorageT<T> dx("dx", K, V);
MatStorageT<TC> c_x("c_x", K, V);
Prompt prompt;
prompt.tokens = { 0, 1, 2, 3, 0, 3, 2, 1, 0 };
@ -259,11 +261,11 @@ TEST(BackPropTest, GatedGeluVJP) {
std::mt19937 gen(42);
using T = double;
using TC = std::complex<T>;
std::array<T, K * 2 * N> x;
std::array<T, K * 2 * N> dx;
std::array<T, K * N> dy;
std::array<TC, K * 2 * N> c_x;
std::array<TC, K * N> c_y;
MatStorageT<T> x("x", K, 2 * N);
MatStorageT<T> dx("dx", K, 2 * N);
MatStorageT<T> dy("dy", K, N);
MatStorageT<TC> c_x("c_x", K, 2 * N);
MatStorageT<TC> c_y("c_y", K, N);
for (int iter = 0; iter < 10; ++iter) {
RandInit(x, 1.0, gen);
@ -284,15 +286,17 @@ TEST(BackPropTest, MaskedAttentionVJP) {
static const size_t kQKVDim = 8;
static const size_t kTokens = 14;
static const size_t kQKVSize = kSeqLen * (kHeads + 2) * kQKVDim;
static const size_t kOutSize = kSeqLen * kHeads * kSeqLen;
static const size_t kOutSize = kTokens * kHeads * kSeqLen;
std::mt19937 gen(42);
using T = double;
using TC = std::complex<T>;
std::array<T, kQKVSize> x;
std::array<T, kQKVSize> dx = {};
std::array<T, kOutSize> dy;
std::array<TC, kQKVSize> c_x;
std::array<TC, kOutSize> c_y;
MatStorageT<T> x("x", kQKVSize, 1);
MatStorageT<T> dx("dx", kQKVSize, 1);
MatStorageT<T> dy("dy", kOutSize, 1);
MatStorageT<TC> c_x("c_x", kQKVSize, 1);
MatStorageT<TC> c_y("c_y", kOutSize, 1);
dx.ZeroInit();
c_y.ZeroInit();
for (int iter = 0; iter < 10; ++iter) {
RandInit(x, 1.0, gen);
@ -320,14 +324,17 @@ TEST(BackPropTest, MixByAttentionVJP) {
std::mt19937 gen(42);
using T = double;
using TC = std::complex<T>;
std::array<T, kQKVSize> qkv;
std::array<T, kQKVSize> dqkv = {};
std::array<T, kAttnSize> attn;
std::array<T, kAttnSize> dattn = {};
std::array<T, kOutSize> dy;
std::array<TC, kQKVSize> c_qkv;
std::array<TC, kAttnSize> c_attn;
std::array<TC, kOutSize> c_y;
MatStorageT<T> qkv("qkv", kQKVSize, 1);
MatStorageT<T> dqkv("dqkv", kQKVSize, 1);
MatStorageT<T> attn("attn", kAttnSize, 1);
MatStorageT<T> dattn("dattn", kAttnSize, 1);
MatStorageT<T> dy("dy", kOutSize, 1);
MatStorageT<TC> c_qkv("c_qkv", kQKVSize, 1);
MatStorageT<TC> c_attn("c_attn", kAttnSize, 1);
MatStorageT<TC> c_y("c_y", kOutSize, 1);
dqkv.ZeroInit();
dattn.ZeroInit();
c_y.ZeroInit();
for (int iter = 0; iter < 10; ++iter) {
RandInit(qkv, 1.0, gen);
@ -354,11 +361,11 @@ TEST(BackPropTest, InputEmbeddingVJP) {
std::mt19937 gen(42);
using T = double;
using TC = std::complex<T>;
std::array<T, kVocabSize * kModelDim> weights;
std::array<T, kVocabSize * kModelDim> grad;
std::array<T, kSeqLen * kModelDim> dy;
std::array<TC, kVocabSize * kModelDim> c_weights;
std::array<TC, kSeqLen * kModelDim> c_y;
MatStorageT<T> weights("weights", kVocabSize, kModelDim);
MatStorageT<T> grad("grad", kVocabSize, kModelDim);
MatStorageT<T> dy("dy", kSeqLen, kModelDim);
MatStorageT<TC> c_weights("c_weights", kVocabSize, kModelDim);
MatStorageT<TC> c_y("c_y", kSeqLen, kModelDim);
std::vector<int> tokens = { 0, 1, 2, 3, 0, 1, 2 };
size_t num_tokens = tokens.size() - 1;
@ -370,14 +377,16 @@ TEST(BackPropTest, InputEmbeddingVJP) {
InputEmbedding(c_weights.data(), tokens, TC(3.0), c_y.data(), kModelDim);
return DotT(dy.data(), c_y.data(), num_tokens * kModelDim);
};
memset(&grad, 0, sizeof(grad));
grad.ZeroInit();
InputEmbeddingVJPT(weights.data(), tokens, 3.0, dy.data(), grad.data(),
kModelDim);
TestGradient(grad, c_weights, func, 1e-16, 1e-14, __LINE__);
}
}
template <typename T>
struct TestConfig : ConfigBaseGemmaV2 {
using Weight = T;
static constexpr int kSeqLen = 18;
static constexpr int kVocabSize = 12;
static constexpr int kModelDim = 32;
@ -399,17 +408,21 @@ TEST(BackPropTest, LayerVJP) {
std::mt19937 gen(42);
using T = double;
using TC = std::complex<T>;
const size_t kOutputSize = TestConfig::kSeqLen * TestConfig::kModelDim;
Layer<T, TestConfig> weights;
Layer<T, TestConfig> grad;
ForwardLayer<T, TestConfig> forward;
ForwardLayer<T, TestConfig> backward = {};
Layer<TC, TestConfig> c_weights;
ForwardLayer<TC, TestConfig> c_forward;
const size_t kOutputSize = TestConfig<T>::kSeqLen * TestConfig<T>::kModelDim;
CompressedLayer<TestConfig<T>> weights;
CompressedLayer<TestConfig<T>> grad;
ForwardLayer<T, TestConfig<T>> forward;
ForwardLayer<T, TestConfig<T>> backward = {};
CompressedLayer<TestConfig<TC>> c_weights;
ForwardLayer<TC, TestConfig<TC>> c_forward;
std::array<T, kOutputSize> y;
std::array<T, kOutputSize> dy;
MatStorageT<T> dy("dy", kOutputSize, 1);
std::array<TC, kOutputSize> c_y;
const size_t num_tokens = 3;
weights.Allocate();
grad.Allocate();
c_weights.Allocate();
backward.input.ZeroInit();
for (size_t iter = 0; iter < 10; ++iter) {
RandInit(weights, 1.0, gen);
@ -419,9 +432,9 @@ TEST(BackPropTest, LayerVJP) {
Complexify(forward.input, c_forward.input);
auto func = [&]() {
ApplyLayer(c_weights, c_forward, num_tokens, c_y.data());
return DotT(dy.data(), c_y.data(), num_tokens * TestConfig::kModelDim);
return DotT(dy.data(), c_y.data(), num_tokens * TestConfig<T>::kModelDim);
};
memset(&grad, 0, sizeof(grad));
grad.ZeroInit(/*layer_idx=*/0);
ApplyLayer(weights, forward, num_tokens, y.data());
LayerVJP(weights, forward, dy.data(), grad, backward, num_tokens);
TestGradient(backward.input, c_forward.input, func, 1e-11, 5e-11,
@ -434,12 +447,12 @@ TEST(BackPropTest, EndToEnd) {
std::mt19937 gen(42);
using T = double;
using TC = std::complex<T>;
WeightsWrapper<T, TestConfig> weights;
WeightsWrapper<T, TestConfig> grad;
ForwardPass<T, TestConfig> forward;
ForwardPass<T, TestConfig> backward;
WeightsWrapper<TC, TestConfig> c_weights;
ForwardPass<TC, TestConfig> c_forward;
WeightsWrapper<TestConfig<T>> weights;
WeightsWrapper<TestConfig<T>> grad;
ForwardPass<T, TestConfig<T>> forward;
ForwardPass<T, TestConfig<T>> backward;
WeightsWrapper<TestConfig<TC>> c_weights;
ForwardPass<TC, TestConfig<TC>> c_forward;
ReverseSequenceSampler training_task({0, 0, 1, 1});
std::vector<Prompt> batch = training_task.SampleBatch(3, gen);
@ -448,7 +461,7 @@ TEST(BackPropTest, EndToEnd) {
ReverseSequenceSampler::LogPrompt(prompt);
RandInit(weights.get(), 1.0, gen);
CrossEntropyLossForwardPass(prompt, weights.get(), forward);
grad.clear();
grad.ZeroInit();
CrossEntropyLossBackwardPass(
prompt, weights.get(), forward, grad.get(), backward);
@ -461,9 +474,9 @@ TEST(BackPropTest, EndToEnd) {
}
}
template<typename T, typename TConfig>
void MulByConstAndAddT(T c, const Layer<T, TConfig>& x,
Layer<T, TConfig>& out) {
template <typename T, typename TConfig>
void MulByConstAndAddT(T c, const CompressedLayer<TConfig>& x,
CompressedLayer<TConfig>& out) {
MulByConstAndAddT(c, x.pre_attention_norm_scale,
out.pre_attention_norm_scale);
MulByConstAndAddT(c, x.attn_vec_einsum_w, out.attn_vec_einsum_w);
@ -473,9 +486,9 @@ void MulByConstAndAddT(T c, const Layer<T, TConfig>& x,
MulByConstAndAddT(c, x.linear_w, out.linear_w);
}
template<typename T, typename TConfig>
void MulByConstAndAddT(T c, const Weights<T, TConfig>& x,
Weights<T, TConfig>& out) {
template <typename T, typename TConfig>
void MulByConstAndAddT(T c, const CompressedWeights<TConfig>& x,
CompressedWeights<TConfig>& out) {
static constexpr size_t kLayers = TConfig::kLayers;
MulByConstAndAddT(c, x.embedder_input_embedding,
out.embedder_input_embedding);
@ -486,9 +499,9 @@ void MulByConstAndAddT(T c, const Weights<T, TConfig>& x,
}
// Evaluates forward pass on a batch.
template<typename T, typename TConfig>
template <typename T, typename TConfig>
T CrossEntropyLossForwardPass(const std::vector<Prompt>& batch,
const WeightsWrapper<T, TConfig>& weights,
const WeightsWrapper<TConfig>& weights,
ForwardPass<T, TConfig>& forward) {
T loss = 0.0;
for (const Prompt& prompt : batch) {
@ -501,14 +514,13 @@ T CrossEntropyLossForwardPass(const std::vector<Prompt>& batch,
// Evaluates forward pass on a batch by applying gradient with the given
// learning rate. Does not update weights, but uses the given tmp weights
// instead.
template<typename T, typename TConfig>
T CrossEntropyLossForwardPass(T learning_rate,
const std::vector<Prompt>& batch,
const WeightsWrapper<T, TConfig>& weights,
const WeightsWrapper<T, TConfig>& grad,
WeightsWrapper<T, TConfig>& tmp,
template <typename T, typename TConfig>
T CrossEntropyLossForwardPass(T learning_rate, const std::vector<Prompt>& batch,
const WeightsWrapper<TConfig>& weights,
const WeightsWrapper<TConfig>& grad,
WeightsWrapper<TConfig>& tmp,
ForwardPass<T, TConfig>& forward) {
tmp.copy(weights);
tmp.CopyFrom(weights);
const T scale = -learning_rate / batch.size();
MulByConstAndAddT(scale, grad.get(), tmp.get());
return CrossEntropyLossForwardPass(batch, tmp, forward);
@ -517,13 +529,13 @@ T CrossEntropyLossForwardPass(T learning_rate,
// Uses line search in the negative gradient direction to update weights. We do
// this so that we can test that each step during the gradient descent can
// decrease the objective function value.
template<typename T, typename TConfig>
T FindOptimalUpdate(const WeightsWrapper<T, TConfig>& grad,
WeightsWrapper<T, TConfig>& weights,
WeightsWrapper<T, TConfig>& tmp,
template <typename T, typename TConfig>
T FindOptimalUpdate(const WeightsWrapper<TConfig>& grad,
WeightsWrapper<TConfig>& weights,
WeightsWrapper<TConfig>& tmp,
ForwardPass<T, TConfig>& forward,
const std::vector<Prompt>& batch,
T loss, T initial_learning_rate) {
const std::vector<Prompt>& batch, T loss,
T initial_learning_rate) {
T lr0 = initial_learning_rate;
T loss0 = CrossEntropyLossForwardPass(
lr0, batch, weights, grad, tmp, forward);
@ -556,13 +568,13 @@ TEST(BackProptest, Convergence) {
std::mt19937 gen(42);
using T = float;
using TC = std::complex<double>;
WeightsWrapper<T, TestConfig> weights;
WeightsWrapper<T, TestConfig> grad;
WeightsWrapper<T, TestConfig> tmp;
ForwardPass<T, TestConfig> forward;
ForwardPass<T, TestConfig> backward;
WeightsWrapper<TC, TestConfig> c_weights;
ForwardPass<TC, TestConfig> c_forward;
WeightsWrapper<TestConfig<T>> weights;
WeightsWrapper<TestConfig<T>> grad;
WeightsWrapper<TestConfig<T>> tmp;
ForwardPass<T, TestConfig<T>> forward;
ForwardPass<T, TestConfig<T>> backward;
WeightsWrapper<TestConfig<TC>> c_weights;
ForwardPass<TC, TestConfig<TC>> c_forward;
constexpr size_t kBatchSize = 5;
ReverseSequenceSampler training_task({0, 0, 0, 1, 1});
T learning_rate = 0.01;
@ -579,7 +591,7 @@ TEST(BackProptest, Convergence) {
size_t step = 0;
while (!stop) {
T loss = 0.0;
grad.clear();
grad.ZeroInit();
std::mt19937 sgen(42);
std::vector<Prompt> batch = training_task.SampleBatch(kBatchSize, sgen);
for (const Prompt& prompt : batch) {


@ -32,9 +32,9 @@
#include "backprop/prompt.h"
#include "backprop/sampler.h"
#include "backprop/test_util.h"
#include "compression/weights_raw.h"
#include "gemma/activations.h"
#include "gemma/configs.h"
#include "gemma/weights.h"
#include "hwy/base.h"
#include "hwy/contrib/thread_pool/thread_pool.h"
@ -48,6 +48,7 @@
// After highway.h
#include "backprop/backward-inl.h"
#include "backprop/forward-inl.h"
#include "compression/compress.h"
#include "ops/ops-inl.h"
HWY_BEFORE_NAMESPACE();
@ -60,17 +61,17 @@ void TestMatMulVJP() {
static const size_t kTokens = 5;
hwy::ThreadPool pool(8);
std::mt19937 gen(42);
HWY_ALIGN std::array<float, kRows * kCols> weights;
HWY_ALIGN std::array<float, kTokens * kCols> x;
HWY_ALIGN std::array<float, kTokens * kRows> dy;
HWY_ALIGN std::array<float, kRows * kCols> grad;
HWY_ALIGN std::array<float, kTokens * kCols> dx;
HWY_ALIGN std::array<float, kRows * kCols> grad_scalar;
HWY_ALIGN std::array<float, kTokens * kCols> dx_scalar;
MatStorageT<float> weights("weights", kRows, kCols);
MatStorageT<float> x("x", kTokens, kCols);
MatStorageT<float> dy("dy", kTokens, kRows);
MatStorageT<float> grad("grad", kRows, kCols);
MatStorageT<float> dx("dx", kTokens, kCols);
MatStorageT<float> grad_scalar("grad_scalar", kRows, kCols);
MatStorageT<float> dx_scalar("dx_scalar", kTokens, kCols);
using TC = std::complex<double>;
std::array<TC, kRows * kCols> c_weights;
std::array<TC, kTokens * kCols> c_x;
std::array<TC, kTokens * kRows> c_y;
MatStorageT<TC> c_weights("c_weights", kRows, kCols);
MatStorageT<TC> c_x("c_x", kTokens, kCols);
MatStorageT<TC> c_y("c_y", kTokens, kRows);
for (int iter = 0; iter < 10; ++iter) {
RandInit(weights, 1.0f * (1 << iter), gen);
@ -83,13 +84,13 @@ void TestMatMulVJP() {
return DotT(dy.data(), c_y.data(), kTokens * kRows);
};
hwy::ZeroBytes(&grad, sizeof(grad));
grad.ZeroInit();
MatMulVJP<kCols, kRows>(weights.data(), x.data(), dy.data(), kTokens,
grad.data(), dx.data(), pool);
TestGradient(dx, c_x, func, 5e-5, 5e-5, __LINE__);
TestGradient(grad, c_weights, func, 5e-5, 5e-5, __LINE__);
TestGradient(dx, c_x, func, 5e-5f, 5e-5f, __LINE__);
TestGradient(grad, c_weights, func, 5e-5f, 5e-5f, __LINE__);
hwy::ZeroBytes(&grad_scalar, sizeof(grad_scalar));
grad_scalar.ZeroInit();
MatMulVJPT(weights.data(), x.data(), dy.data(), grad_scalar.data(),
dx_scalar.data(), kRows, kCols, kTokens);
TestNear(dx, dx_scalar, 5e-5, 1e-4, __LINE__);
@ -104,17 +105,17 @@ void TestMultiHeadMatMulVJP() {
static const size_t kTokens = 3;
hwy::ThreadPool pool(8);
std::mt19937 gen(42);
HWY_ALIGN std::array<float, kRows * kCols * kHeads> weights;
HWY_ALIGN std::array<float, kTokens * kCols * kHeads> x;
HWY_ALIGN std::array<float, kRows * kCols * kHeads> grad;
HWY_ALIGN std::array<float, kTokens * kCols * kHeads> dx;
HWY_ALIGN std::array<float, kTokens * kRows> dy;
HWY_ALIGN std::array<float, kRows * kCols * kHeads> grad_scalar;
HWY_ALIGN std::array<float, kTokens * kCols * kHeads> dx_scalar;
MatStorageT<float> weights("weights", kRows, kCols * kHeads);
MatStorageT<float> x("x", kTokens, kCols * kHeads);
MatStorageT<float> grad("grad", kRows, kCols * kHeads);
MatStorageT<float> dx("dx", kTokens, kCols * kHeads);
MatStorageT<float> dy("dy", kTokens, kRows);
MatStorageT<float> grad_scalar("grad_scalar", kRows, kCols * kHeads);
MatStorageT<float> dx_scalar("dx_scalar", kTokens, kCols * kHeads);
using TC = std::complex<double>;
std::array<TC, kRows * kCols * kHeads> c_weights;
std::array<TC, kTokens * kCols * kHeads> c_x;
std::array<TC, kTokens * kRows> c_y;
MatStorageT<TC> c_weights("c_weights", kRows, kCols * kHeads);
MatStorageT<TC> c_x("c_x", kTokens, kCols * kHeads);
MatStorageT<TC> c_y("c_y", kTokens, kRows);
for (int iter = 0; iter < 10; ++iter) {
RandInit(weights, 1.0f * (1 << iter), gen);
@ -128,14 +129,14 @@ void TestMultiHeadMatMulVJP() {
return DotT(dy.data(), c_y.data(), kTokens * kRows);
};
hwy::ZeroBytes(&grad, sizeof(grad));
grad.ZeroInit();
MultiHeadMatMulVJP<kHeads, kCols, kRows>(
weights.data(), x.data(), dy.data(), kTokens, grad.data(), dx.data(),
pool);
TestGradient(dx, c_x, func, 5e-5, 5e-5, __LINE__);
TestGradient(grad, c_weights, func, 5e-5, 5e-5, __LINE__);
TestGradient(dx, c_x, func, 5e-5f, 5e-5f, __LINE__);
TestGradient(grad, c_weights, func, 5e-5f, 5e-5f, __LINE__);
hwy::ZeroBytes(&grad_scalar, sizeof(grad_scalar));
grad_scalar.ZeroInit();
MultiHeadMatMulVJPT(weights.data(), x.data(), dy.data(), grad_scalar.data(),
dx_scalar.data(), kHeads, kRows, kCols, kTokens);
TestNear(dx, dx_scalar, 5e-5, 5e-5, __LINE__);
@ -148,17 +149,17 @@ void TestRMSNormVJP() {
static const size_t N = 64;
hwy::ThreadPool pool(8);
std::mt19937 gen(42);
HWY_ALIGN std::array<float, N> weights;
HWY_ALIGN std::array<float, K * N> x;
HWY_ALIGN std::array<float, N> grad;
HWY_ALIGN std::array<float, K * N> dx;
HWY_ALIGN std::array<float, K * N> dy;
HWY_ALIGN std::array<float, N> grad_scalar;
HWY_ALIGN std::array<float, K * N> dx_scalar;
MatStorageT<float> weights("weights", N, 1);
MatStorageT<float> x("x", K, N);
MatStorageT<float> grad("grad", N, 1);
MatStorageT<float> dx("dx", K, N);
MatStorageT<float> dy("dy", K, N);
MatStorageT<float> grad_scalar("grad_scalar", N, 1);
MatStorageT<float> dx_scalar("dx_scalar", K, N);
using TC = std::complex<double>;
std::array<TC, N> c_weights;
std::array<TC, K * N> c_x;
std::array<TC, K * N> c_y;
MatStorageT<TC> c_weights("c_weights", N, 1);
MatStorageT<TC> c_x("c_x", K, N);
MatStorageT<TC> c_y("c_y", K, N);
for (int iter = 0; iter < 10; ++iter) {
RandInit(weights, 1.0f * (1 << iter), gen);
@ -171,13 +172,13 @@ void TestRMSNormVJP() {
return DotT(dy.data(), c_y.data(), K * N);
};
hwy::ZeroBytes(&grad, sizeof(grad));
grad.ZeroInit();
RMSNormVJP(weights.data(), x.data(), dy.data(), N, K, grad.data(),
dx.data(), pool);
TestGradient(dx, c_x, func, 5e-5, 5e-5, __LINE__);
TestGradient(grad, c_weights, func, 5e-5, 5e-5, __LINE__);
TestGradient(dx, c_x, func, 5e-5f, 5e-5f, __LINE__);
TestGradient(grad, c_weights, func, 5e-5f, 5e-5f, __LINE__);
hwy::ZeroBytes(&grad_scalar, sizeof(grad_scalar));
grad_scalar.ZeroInit();
RMSNormVJPT(weights.data(), x.data(), dy.data(), grad_scalar.data(),
dx_scalar.data(), N, K);
TestNear(dx, dx_scalar, 0, 2e-5, __LINE__);
@ -185,7 +186,9 @@ void TestRMSNormVJP() {
}
}
struct TestConfig : public ConfigBaseGemmaV2 {
template <typename T>
struct TestConfig : ConfigBaseGemmaV2 {
using Weight = T;
static constexpr int kSeqLen = 24;
static constexpr int kVocabSize = 16;
static constexpr int kModelDim = 32;
@ -206,20 +209,22 @@ struct TestConfig : public ConfigBaseGemmaV2 {
void TestEndToEnd() {
std::mt19937 gen(42);
hwy::ThreadPool pool(0);
WeightsWrapper<float, TestConfig> weights;
WeightsWrapper<float, TestConfig> grad;
ActivationsWrapper<float, TestConfig> forward0;
ActivationsWrapper<float, TestConfig> forward1;
ActivationsWrapper<float, TestConfig> backward;
using WeightsF = CompressedWeights<TestConfig<float>>;
using LayerF = CompressedLayer<TestConfig<float>>;
WeightsWrapper<TestConfig<float>> weights;
WeightsWrapper<TestConfig<float>> grad;
ActivationsWrapper<float, TestConfig<float>> forward0;
ActivationsWrapper<float, TestConfig<float>> forward1;
ActivationsWrapper<float, TestConfig<float>> backward;
using TC = std::complex<double>;
WeightsWrapper<TC, TestConfig> c_weights;
ForwardPass<TC, TestConfig> c_forward;
WeightsWrapper<TestConfig<TC>> c_weights;
ForwardPass<TC, TestConfig<TC>> c_forward;
ReverseSequenceSampler training_task({0, 0, 1, 1});
std::vector<Prompt> batch = training_task.SampleBatch(3, gen);
RowVectorBatch<float> inv_timescale =
Activations::CreateInvTimescale<TestConfig>();
Activations::CreateInvTimescale<TestConfig<float>>();
for (const Prompt& prompt : batch) {
ReverseSequenceSampler::LogPrompt(prompt);
RandInit(weights.get(), 1.0f, gen);
@ -227,14 +232,15 @@ void TestEndToEnd() {
float loss0 = CrossEntropyLossForwardPass(
prompt, weights.get(), forward0.get());
float loss1 = CrossEntropyLossForwardPass<TestConfig, WeightsF, LayerF>(
float loss1 =
CrossEntropyLossForwardPass<TestConfig<float>, WeightsF, LayerF>(
prompt.tokens, prompt.context_size, weights.get(), forward1.get(),
inv_timescale, pool);
EXPECT_NEAR(loss1, loss0, std::abs(loss0) * 2e-5);
grad.clear();
CrossEntropyLossBackwardPass<TestConfig, WeightsF, LayerF>(
grad.ZeroInit();
CrossEntropyLossBackwardPass<TestConfig<float>, WeightsF, LayerF>(
prompt, weights.get(), forward1.get(), grad.get(), backward.get(),
inv_timescale, pool);


@ -18,9 +18,10 @@
#include <stddef.h>
#include <array>
#include <complex>
#include "compression/compress.h" // MatStorageT
namespace gcpp {
template<typename T, typename U>
@ -57,9 +58,9 @@ void MulByConstAndAddT(T c, const T* x, T* out, size_t N) {
}
}
template<typename T, size_t N>
void MulByConstAndAddT(T c, const std::array<T, N>& x, std::array<T, N>& out) {
MulByConstAndAddT(c, x.data(), out.data(), N);
template <typename T>
void MulByConstAndAddT(T c, const MatPtrT<T>& x, MatPtrT<T>& out) {
MulByConstAndAddT(c, x.data(), out.data(), x.NumElements());
}
template<typename T>


@ -93,8 +93,8 @@ static HWY_NOINLINE float CrossEntropyLoss(const float* HWY_RESTRICT probs,
return loss * scaling;
}
template <typename TConfig, template <typename> typename LayerT>
void ApplyForwardLayer(const LayerT<TConfig>& weights,
template <typename TConfig, typename LayerT>
void ApplyForwardLayer(const LayerT& weights,
ForwardLayer<float, TConfig>& activations,
size_t num_tokens, float* HWY_RESTRICT output,
const RowVectorBatch<float>& inv_timescale,
@ -171,8 +171,7 @@ void ApplyForwardLayer(const LayerT<TConfig>& weights,
}
});
hwy::ZeroBytes(activations.attention_out.data(),
num_tokens * kModelDim * sizeof(activations.attention_out[0]));
activations.attention_out.ZeroInit();
for (size_t pos = 0; pos < num_tokens; ++pos) {
for (size_t head = 0; head < kHeads; ++head) {
MatVec<kModelDim, kQKVDim>(
@ -227,11 +226,9 @@ void ApplyForwardLayer(const LayerT<TConfig>& weights,
}
}
template <typename TConfig, template <typename...> typename WeightsT,
template <typename> typename LayerT>
template <typename TConfig, typename WeightsT, typename LayerT>
float CrossEntropyLossForwardPass(const std::vector<int>& prompt,
size_t context_size,
const WeightsT<TConfig>& weights,
size_t context_size, const WeightsT& weights,
ForwardPass<float, TConfig>& forward,
const RowVectorBatch<float>& inv_timescale,
hwy::ThreadPool& pool) {
@ -281,7 +278,7 @@ float CrossEntropyLossForwardPass(const std::vector<int>& prompt,
}
hwy::CopyBytes(forward.logits.data(), forward.probs.data(),
num_tokens * kVocabSize * sizeof(forward.logits[0]));
num_tokens * kVocabSize * sizeof(forward.logits.At(0)));
for (size_t pos = 0; pos < num_tokens; ++pos) {
Softmax(forward.probs.data() + pos * kVocabSize, kVocabSize);


@ -46,8 +46,8 @@ float CrossEntropyLossForwardPass(const Prompt& prompt,
*reinterpret_cast<CompressedWeights<TConfig>*>(weights_u8.get());
auto& forward =
*reinterpret_cast<ForwardPass<float, TConfig>*>(forward_u8.get());
return CrossEntropyLossForwardPass<TConfig, CompressedWeights,
CompressedLayer>(
return CrossEntropyLossForwardPass<TConfig, CompressedWeights<TConfig>,
CompressedLayer<TConfig>>(
prompt.tokens, prompt.context_size, weights, forward, inv_timescale,
pool);
}


@ -26,8 +26,9 @@
#include "backprop/activations.h"
#include "backprop/common_scalar.h"
#include "backprop/prompt.h"
#include "compression/weights_raw.h"
#include "gemma/common.h" // EmbeddingScaling
#include "gemma/weights.h"
#include "hwy/base.h"
namespace gcpp {
@ -116,6 +117,8 @@ void GatedGelu(const T* in, T* out, size_t N, size_t K) {
template<typename T>
void InputEmbedding(const T* w, const std::vector<int>& tokens, T scaling,
T* y, size_t N) {
HWY_ASSERT(w != nullptr);
HWY_ASSERT(y != nullptr);
const size_t num_tokens = tokens.empty() ? 0 : tokens.size() - 1;
for (size_t i = 0; i < num_tokens; ++i) {
int token = tokens[i];
@ -166,10 +169,10 @@ void MixByAttention(const T* qkv, const T* attention, T* output,
}
}
}
template<typename T, typename TConfig>
void ApplyLayer(const Layer<T, TConfig>& weights,
ForwardLayer<T, TConfig>& activations,
size_t num_tokens, T* output) {
template <typename T, typename TConfig>
void ApplyLayer(const CompressedLayer<TConfig>& weights,
ForwardLayer<T, TConfig>& activations, size_t num_tokens,
T* output) {
static constexpr size_t kModelDim = TConfig::kModelDim;
static constexpr size_t kSeqLen = TConfig::kSeqLen;
static constexpr size_t kQKVDim = TConfig::kQKVDim;
@ -244,9 +247,9 @@ T CrossEntropyLoss(const T* x, const Prompt& prompt, size_t V) {
return loss * scaling;
}
template<typename T, typename TConfig>
template <typename T, typename TConfig>
T CrossEntropyLossForwardPass(const Prompt& prompt,
const Weights<T, TConfig>& weights,
const CompressedWeights<TConfig>& weights,
ForwardPass<T, TConfig>& forward) {
static constexpr size_t kModelDim = TConfig::kModelDim;
static constexpr size_t kVocabSize = TConfig::kVocabSize;
@ -282,7 +285,7 @@ T CrossEntropyLossForwardPass(const Prompt& prompt,
}
memcpy(forward.probs.data(), forward.logits.data(),
num_tokens * kVocabSize * sizeof(forward.logits[0]));
num_tokens * kVocabSize * sizeof(forward.logits.At(0)));
Softmax(forward.probs.data(), kVocabSize, num_tokens);
return CrossEntropyLoss(forward.probs.data(), prompt, kVocabSize);


@ -21,6 +21,8 @@
#include "compression/compress.h"
#include "gemma/common.h"
#include "gemma/weights.h"
#include "util/allocator.h"
#include "hwy/aligned_allocator.h"
#include "hwy/base.h"
#include "hwy/contrib/thread_pool/thread_pool.h"
@ -32,14 +34,14 @@ class WeightInitializer {
public:
WeightInitializer(std::mt19937& gen) : dist_(0.0f, 1.0f), gen_(gen) {}
template <size_t N>
void operator()(const char* name, CompressedArray<float, N>& tensor) {
float* data = tensor.data();
for (size_t i = 0; i < N; ++i) {
void operator()(const char* name, hwy::Span<MatPtr*> tensors) {
float* data = tensors[0]->data<float>();
for (size_t i = 0; i < tensors[0]->NumElements(); ++i) {
data[i] = dist_(gen_);
}
tensor.set_scale(1.0f);
tensors[0]->set_scale(1.0f);
}
private:
std::normal_distribution<float> dist_;
std::mt19937& gen_;
@ -54,7 +56,8 @@ struct RandInitWeightsT {
// TODO(szabadka) Use the same weight initialization method as in the python
// version.
WeightInitializer init(gen);
ForEachTensor1<TConfig>(init, weights);
CompressedWeights<TConfig>::ForEachTensor({&weights},
ForEachType::kLoadNoToc, init);
}
};
@ -66,17 +69,13 @@ class AdamUpdater {
cbeta2_(1.0f - beta2), norm1_(1.0 / (1.0 - std::pow(beta1, t))),
norm2_(1.0 / (1.0 - std::pow(beta2, t))), epsilon_(epsilon) {}
template <size_t kCapacity>
void operator()(const char* name,
const CompressedArray<float, kCapacity>& grad,
CompressedArray<float, kCapacity>& weights,
CompressedArray<float, kCapacity>& grad_m,
CompressedArray<float, kCapacity>& grad_v) {
const float* HWY_RESTRICT g = grad.data();
float* HWY_RESTRICT w = weights.data();
float* HWY_RESTRICT m = grad_m.data();
float* HWY_RESTRICT v = grad_v.data();
for (size_t i = 0; i < kCapacity; ++i) {
void operator()(const char* name, const MatPtr& grad, MatPtr& weights,
MatPtr& grad_m, MatPtr& grad_v) {
const float* HWY_RESTRICT g = grad.data<float>();
float* HWY_RESTRICT w = weights.data<float>();
float* HWY_RESTRICT m = grad_m.data<float>();
float* HWY_RESTRICT v = grad_v.data<float>();
for (size_t i = 0; i < grad.NumElements(); ++i) {
m[i] *= beta1_;
m[i] += cbeta1_ * g[i];
v[i] *= beta2_;
@ -105,12 +104,16 @@ struct AdamUpdateT {
const ByteStorageT& weights_u8, const ByteStorageT& grad_m_u8,
const ByteStorageT& grad_v_u8, hwy::ThreadPool& pool) const {
using TWeights = CompressedWeights<TConfig>;
const auto& grad = *reinterpret_cast<const TWeights*>(grad_u8.get());
auto& grad = *reinterpret_cast<TWeights*>(grad_u8.get());
auto& weights = *reinterpret_cast<TWeights*>(weights_u8.get());
auto& grad_m = *reinterpret_cast<TWeights*>(grad_m_u8.get());
auto& grad_v = *reinterpret_cast<TWeights*>(grad_v_u8.get());
AdamUpdater updater(alpha, beta1, beta2, epsilon, t);
ForEachTensor4<TConfig>(updater, grad, weights, grad_m, grad_v);
TWeights::ForEachTensor(
{&grad, &weights, &grad_m, &grad_v}, ForEachType::kLoadNoToc,
[&updater](const char* name, hwy::Span<MatPtr*> tensors) {
updater(name, *tensors[0], *tensors[1], *tensors[2], *tensors[3]);
});
}
};
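
The hunk above uses the single unified ForEachTensor: the callback receives the tensor's name and a hwy::Span<MatPtr*> with one entry per weight struct passed in the braced list. A rough sketch of that calling convention only; the plain-SGD body and the weights/grad objects are illustrative, not part of this commit:

using TWeights = CompressedWeights<TConfig>;
TWeights::ForEachTensor(
    {&weights, &grad}, ForEachType::kLoadNoToc,
    [](const char* name, hwy::Span<MatPtr*> tensors) {
      // tensors[0]/tensors[1] are the tensor called `name` in weights/grad.
      float* w = tensors[0]->data<float>();
      const float* g = tensors[1]->data<float>();
      for (size_t i = 0; i < tensors[0]->NumElements(); ++i) {
        w[i] -= 0.01f * g[i];  // illustrative update only
      }
    });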


@ -18,27 +18,57 @@
#include <stddef.h>
#include <array>
#include <cmath>
#include <complex>
#include <random>
#include "gtest/gtest.h"
#include "compression/weights_raw.h"
#include "compression/compress.h"
#include "gemma/weights.h"
#include "util/allocator.h"
#include "hwy/contrib/thread_pool/thread_pool.h"
namespace gcpp {
template<typename T, typename U, size_t kLen>
void Complexify(const std::array<T, kLen>& x,
std::array<std::complex<U>, kLen>& c_x) {
for (size_t i = 0; i < kLen; ++i) {
c_x[i] = std::complex<U>(x[i], 0.0);
template <typename T>
void RandInit(MatPtrT<T>& x, T stddev, std::mt19937& gen) {
std::normal_distribution<T> dist(0.0, stddev);
for (size_t i = 0; i < x.NumElements(); ++i) {
x.At(i) = dist(gen);
}
}
// TODO: make a member of Layer<T>.
template <typename T, typename TConfig>
void RandInit(CompressedLayer<TConfig>& w, T stddev, std::mt19937& gen) {
RandInit(w.pre_attention_norm_scale, stddev, gen);
RandInit(w.attn_vec_einsum_w, stddev, gen);
RandInit(w.qkv_einsum_w, stddev, gen);
RandInit(w.pre_ffw_norm_scale, stddev, gen);
RandInit(w.gating_einsum_w, stddev, gen);
RandInit(w.linear_w, stddev, gen);
}
template<typename T, typename U, typename TConfig>
void Complexify(const Layer<T, TConfig>& w,
Layer<std::complex<U>, TConfig>& c_w) {
template <typename T, typename TConfig>
void RandInit(CompressedWeights<TConfig>& w, T stddev, std::mt19937& gen) {
static constexpr size_t kLayers = TConfig::kLayers;
RandInit(w.embedder_input_embedding, stddev, gen);
RandInit(w.final_norm_scale, stddev, gen);
for (size_t i = 0; i < kLayers; ++i) {
RandInit(*w.GetLayer(i), stddev, gen);
}
}
template <typename T, typename U>
void Complexify(const MatPtrT<T>& x, MatPtrT<std::complex<U>>& c_x) {
for (size_t i = 0; i < x.NumElements(); ++i) {
c_x.At(i) = std::complex<U>(x.At(i), 0.0);
}
}
template <typename TConfig, typename UConfig>
void Complexify(const CompressedLayer<TConfig>& w,
CompressedLayer<UConfig>& c_w) {
Complexify(w.pre_attention_norm_scale, c_w.pre_attention_norm_scale);
Complexify(w.attn_vec_einsum_w, c_w.attn_vec_einsum_w);
Complexify(w.qkv_einsum_w, c_w.qkv_einsum_w);
@ -47,9 +77,9 @@ void Complexify(const Layer<T, TConfig>& w,
Complexify(w.linear_w, c_w.linear_w);
}
template<typename T, typename U, typename TConfig>
void Complexify(const Weights<T, TConfig>& w,
Weights<std::complex<U>, TConfig>& c_w) {
template <typename TConfig, typename UConfig>
void Complexify(const CompressedWeights<TConfig>& w,
CompressedWeights<UConfig>& c_w) {
static constexpr size_t kLayers = TConfig::kLayers;
Complexify(w.embedder_input_embedding, c_w.embedder_input_embedding);
Complexify(w.final_norm_scale, c_w.final_norm_scale);
@ -58,19 +88,41 @@ void Complexify(const Weights<T, TConfig>& w,
}
}
template<typename T, typename U, size_t N>
void TestNear(const std::array<T, N>& actual, const std::array<U, N>& expected,
// Owns weights and provides access to TConfig.
template <typename TConfig>
class WeightsWrapper {
public:
WeightsWrapper()
: pool_(0),
data_(AllocateCompressedWeights<TConfig>()(pool_)),
weights_(reinterpret_cast<CompressedWeights<TConfig>*>(data_.get())) {}
const CompressedWeights<TConfig>& get() const { return *weights_; }
CompressedWeights<TConfig>& get() { return *weights_; }
void ZeroInit() { weights_->ZeroInit(); }
void CopyFrom(const WeightsWrapper<TConfig>& other) {
get().CopyFrom(other.get());
}
private:
hwy::ThreadPool pool_;
ByteStorageT data_;
CompressedWeights<TConfig>* weights_;
};
template <typename T, typename U>
void TestNear(const MatPtrT<T>& actual, const MatPtrT<U>& expected,
double max_abs_err, double max_rel_err, int line) {
double sum0 = 0;
double sum1 = 0;
double sum01 = 0;
for (size_t i = 0; i < N; ++i) {
sum0 += actual[i] * actual[i];
sum1 += expected[i] * expected[i];
sum01 += actual[i] * expected[i];
ASSERT_NEAR(actual[i], expected[i],
std::max(max_abs_err, std::abs(expected[i]) * max_rel_err))
<< "line: " << line << " dim=" << N << " i=" << i;
for (size_t i = 0; i < actual.NumElements(); ++i) {
sum0 += actual.At(i) * actual.At(i);
sum1 += expected.At(i) * expected.At(i);
sum01 += actual.At(i) * expected.At(i);
ASSERT_NEAR(actual.At(i), expected.At(i),
std::max(max_abs_err, std::abs(expected.At(i)) * max_rel_err))
<< "line: " << line << " dim=" << expected.NumElements() << " i=" << i;
}
if (sum0 > 1e-40) {
double norm_dot = sum01 / std::sqrt(sum0) / std::sqrt(sum1);
@ -93,48 +145,37 @@ void TestNear(const std::array<T, N>& actual, const std::array<U, N>& expected,
// This method is more numerically stable than the real-valued finite difference
// method since we don't need to subtract floating point numbers that are near
// to each other.
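// (For reference, the complex-step identity relied on here, assuming func is
//  analytic: f(x + ih) = f(x) + ih*f'(x) + O(h^2), hence
//  f'(x) ≈ Im(f(x + ih)) / h, with no subtractive cancellation.)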
template<typename T, typename U, size_t N, typename FUNC>
void TestGradient(const std::array<T, N>& grad,
std::array<std::complex<U>, N>& x, FUNC func,
U step, T max_abs_err, T max_rel_err, int line) {
std::array<T, N> exp_grad;
template <typename FUNC, typename T, typename U>
void TestGradient(const MatPtrT<T>& grad, MatPtrT<std::complex<U>>& x,
FUNC func, U step, T max_abs_err, T max_rel_err, int line) {
MatStorageT<T> exp_grad("exp_grad", x.Rows(), x.Cols());
const U inv_step = 1.0 / step;
for (size_t i = 0; i < N; ++i) {
const U x0 = std::real(x[i]);
for (size_t i = 0; i < x.NumElements(); ++i) {
const U x0 = std::real(x.At(i));
const std::complex<U> x1 = std::complex<U>(x0, step);
x[i] = x1;
x.At(i) = x1;
const std::complex<U> f1 = func();
exp_grad [i] = std::imag(f1) * inv_step;
x[i] = x0;
exp_grad.At(i) = std::imag(f1) * inv_step;
x.At(i) = x0;
}
TestNear(grad, exp_grad, max_abs_err, max_rel_err, line);
}
template<size_t N, typename FUNC>
void TestGradient(const std::array<float, N>& grad,
std::array<std::complex<float>, N>& x, FUNC func,
float max_abs_err, float max_rel_error, int line) {
template <typename FUNC>
void TestGradient(const MatPtrT<float>& grad, MatPtrT<std::complex<float>>& x,
FUNC func, float max_abs_err, float max_rel_error, int line) {
TestGradient(grad, x, func, 1e-30f, max_abs_err, max_rel_error, line);
}
template<size_t N, typename FUNC>
void TestGradient(const std::array<float, N>& grad,
std::array<std::complex<double>, N>& x, FUNC func,
float max_abs_err, float max_rel_error, int line) {
template <typename FUNC, typename T>
void TestGradient(const MatPtrT<T>& grad, MatPtrT<std::complex<double>>& x,
FUNC func, T max_abs_err, T max_rel_error, int line) {
TestGradient(grad, x, func, 1e-50, max_abs_err, max_rel_error, line);
}
template<size_t N, typename FUNC>
void TestGradient(const std::array<double, N>& grad,
std::array<std::complex<double>, N>& x, FUNC func,
double max_abs_err, double max_rel_error, int line) {
TestGradient(grad, x, func, 1e-50, max_abs_err, max_rel_error, line);
}
template<typename T, typename U, typename TConfig, typename FUNC>
void TestGradient(const Layer<T, TConfig>& grad,
Layer<std::complex<U>, TConfig>& c_weights,
FUNC func, T max_err) {
template <typename T, typename TConfig, typename UConfig, typename FUNC>
void TestGradient(const CompressedLayer<TConfig>& grad,
CompressedLayer<UConfig>& c_weights, FUNC func, T max_err) {
TestGradient(grad.pre_attention_norm_scale,
c_weights.pre_attention_norm_scale,
func, max_err, max_err, __LINE__);
@ -150,10 +191,9 @@ void TestGradient(const Layer<T, TConfig>& grad,
func, max_err, max_err, __LINE__);
}
template<typename T, typename U, typename TConfig, typename FUNC>
void TestGradient(const Weights<T, TConfig>& grad,
Weights<std::complex<U>, TConfig>& c_weights,
FUNC func, T max_err) {
template <typename T, typename TConfig, typename UConfig, typename FUNC>
void TestGradient(const CompressedWeights<TConfig>& grad,
CompressedWeights<UConfig>& c_weights, FUNC func, T max_err) {
TestGradient(grad.embedder_input_embedding,
c_weights.embedder_input_embedding,
func, 2 * max_err, max_err, __LINE__);


@ -152,6 +152,7 @@ cc_test(
cc_library(
name = "compress",
srcs = ["compress.cc"],
hdrs = [
"compress.h",
"shared.h",
@ -207,30 +208,17 @@ cc_library(
],
)
cc_library(
name = "weights_raw",
hdrs = ["weights_raw.h"],
deps = [
"//:allocator",
"//:common",
"@hwy//:hwy",
"@hwy//:thread_pool",
],
)
cc_binary(
name = "compress_weights",
srcs = ["compress_weights.cc"],
deps = [
":compress",
":io",
":weights_raw",
"//:allocator",
"//:args",
"//:common",
"//:weights",
"@hwy//:hwy",
"@hwy//:profiler",
"@hwy//:thread_pool",
],
)


@ -19,7 +19,10 @@
#include <stdint.h>
#include <atomic>
#include <cstdio>
#include <memory>
#include <new>
#include <string>
#include <vector>
#include "compression/io.h"
@ -45,6 +48,13 @@ hwy::uint128_t MakeKey(const char* string) {
return ret;
}
std::string StringFromKey(hwy::uint128_t key) {
std::string name(sizeof(key) + 1, '\0');
hwy::CopyBytes(&key, name.data(), sizeof(key));
name.resize(name.find('\0'));
return name;
}
namespace {
void EnqueueChunkRequests(uint64_t offset, uint64_t size, uint8_t* data,
std::vector<BlobIO>& requests) {
@ -226,15 +236,23 @@ BlobError BlobReader::Open(const Path& filename) {
return blob_store_->CheckValidity(file_->FileSize());
}
size_t BlobReader::BlobSize(hwy::uint128_t key) const {
uint64_t offset;
size_t size;
if (!blob_store_->FindKey(key, offset, size)) return 0;
return size;
}
BlobError BlobReader::Enqueue(hwy::uint128_t key, void* data, size_t size) {
uint64_t offset;
size_t actual_size;
if (!blob_store_->FindKey(key, offset, actual_size)) return __LINE__;
if (actual_size != size) {
fprintf(stderr,
"Mismatch between expected %d and actual %d KiB size. Please see "
"README.md on how to update the weights.\n",
static_cast<int>(size >> 10), static_cast<int>(actual_size >> 10));
"Mismatch between expected %d and actual %d KiB size of blob %s. "
"Please see README.md on how to update the weights.\n",
static_cast<int>(size >> 10), static_cast<int>(actual_size >> 10),
StringFromKey(key).c_str());
return __LINE__;
}
@ -265,6 +283,17 @@ BlobError BlobReader::ReadAll(hwy::ThreadPool& pool) {
return 0;
}
BlobError BlobReader::ReadOne(hwy::uint128_t key, void* data,
size_t size) const {
uint64_t offset;
size_t actual_size;
if (!blob_store_->FindKey(key, offset, actual_size)) return __LINE__;
if (!file_->Read(offset, actual_size, data)) {
return __LINE__;
}
return 0;
}
BlobError BlobWriter::WriteAll(hwy::ThreadPool& pool, const Path& filename) {
HWY_ASSERT(keys_.size() == blobs_.size());


@ -20,6 +20,7 @@
#include <stdint.h>
#include <memory>
#include <string>
#include <vector>
#include "compression/io.h"
@ -32,6 +33,9 @@ namespace gcpp {
// Convenient way to construct a key from a string (<= 16 chars).
hwy::uint128_t MakeKey(const char* string);
// Returns a string from a key.
std::string StringFromKey(hwy::uint128_t key);
// Ordered list of opaque blobs (~hundreds), identified by unique opaque
// 128-bit keys.
class BlobStore;
@ -67,6 +71,9 @@ class BlobReader {
// Opens `filename` and reads its header.
BlobError Open(const Path& filename);
// Returns the size of the blob identified by `key`, or 0 if not found.
size_t BlobSize(hwy::uint128_t key) const;
// Enqueues read requests if `key` is found and its size matches `size`, which
// is in units of bytes.
BlobError Enqueue(hwy::uint128_t key, void* data, size_t size);
@ -74,6 +81,9 @@ class BlobReader {
// Reads all enqueued requests.
BlobError ReadAll(hwy::ThreadPool& pool);
// Reads one blob directly.
BlobError ReadOne(hwy::uint128_t key, void* data, size_t size) const;
private:
BlobStorePtr blob_store_; // holds header, not the entire file
std::vector<BlobIO> requests_;
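
The new BlobSize and ReadOne entry points above support reading a single blob outside the batched Enqueue/ReadAll path. A rough usage sketch, assuming `reader` is a BlobReader whose Open() already succeeded, with "toc" as a placeholder blob name:

const hwy::uint128_t key = MakeKey("toc");   // placeholder key
const size_t bytes = reader.BlobSize(key);   // 0 if the key is absent
if (bytes != 0) {
  std::vector<uint8_t> buffer(bytes);
  if (reader.ReadOne(key, buffer.data(), bytes) != 0) {
    // nonzero BlobError: key missing or read failed
  }
}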


@ -471,14 +471,15 @@ HWY_NOINLINE void Compress(const float* HWY_RESTRICT raw, size_t num,
}
}
// Adapter that compresses into `CompressedArray`. `raw` must already be scaled
// Adapter that compresses into `MatStorageT`. `raw` must already be scaled
// to fit the value range, if `Packed` is `SfpStream`.
template <typename Packed, size_t kCapacity>
template <typename Packed>
HWY_INLINE void CompressScaled(const float* HWY_RESTRICT raw, size_t num,
CompressWorkingSet& work,
CompressedArray<Packed, kCapacity>& compressed,
MatStorageT<Packed>& compressed,
hwy::ThreadPool& pool) {
Compress(raw, num, work, MakeSpan(compressed.data(), kCapacity),
Compress(raw, num, work,
MakeSpan(compressed.data(), compressed.NumElements()),
/*packed_ofs=*/0, pool);
}
@ -674,28 +675,24 @@ class Compressor {
public:
explicit Compressor(hwy::ThreadPool& pool) : pool_(pool) {}
template <typename Packed>
void operator()(MatPtrT<Packed>* compressed, const char* decorated_name,
const float* HWY_RESTRICT weights) {
const size_t num_weights = compressed->NumElements();
const size_t num_compressed = compressed->NumElements();
PackedSpan<Packed> packed = MakeSpan(compressed->data(), num_compressed);
fprintf(stderr, "Compressing %s (%zuM), please wait\n", decorated_name,
num_weights / (1000 * 1000));
Compress(weights, num_weights, work_, packed, /*packed_ofs=*/0, pool_);
const size_t num_bytes = packed.num * sizeof(Packed);
writer_.Add(MakeKey(decorated_name), packed.ptr, num_bytes);
}
void AddScales(const float* scales, size_t len) {
if (len) {
MatPtrT<float> scales_ptr("scales", 0, 1);
writer_.Add(MakeKey(scales_ptr.CacheName().c_str()), scales,
len * sizeof(scales[0]));
}
}

compression/compress.cc (new file, 22 lines)

@ -0,0 +1,22 @@
// Copyright 2024 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "compression/compress.h"
namespace gcpp {
MatPtr::~MatPtr() {}
} // namespace gcpp


@ -23,7 +23,11 @@
#include <stdio.h>
#include <array>
#include <cstdio>
#include <cstring>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>
// IWYU pragma: begin_exports
@ -32,7 +36,8 @@
#include "compression/shared.h"
// IWYU pragma: end_exports
#include "compression/distortion.h"
#include "hwy/base.h"
#include "hwy/aligned_allocator.h"
#include "hwy/base.h" // BF16
#include "hwy/contrib/thread_pool/thread_pool.h"
#if COMPRESS_STATS
#include "hwy/stats.h"
@ -82,6 +87,376 @@ class CompressedArray {
float scale_[kBlobAlign / sizeof(float)];
};
// Yet another array class. This one is intended to be compatible with
// CompressedArray, but supports both run-time sizing and a compile-time
// constant size.
// It also provides easy conversion from/to a table of contents for a BlobStore
// file, and a templated (compile-time) accessor for a 2-d array of fixed inner
// dimension and type.
// The base class is intended for accessing the metadata, without needing to
// know any of the template arguments.
// It holds only a borrowed pointer to the data, but all metadata.
// It is designed to be put in a vector, and has default copy and operator=, so
// it is easy to read/write a blob_store file.
// The derived class or an external class owns the data.
class MatPtr {
public:
// Full constructor for dynamic sizing.
MatPtr(const std::string& name, const std::string& type, size_t element_size,
size_t rows, size_t cols)
: name_(name),
type_(type),
element_size_(element_size),
num_elements_(rows * cols),
rows_(rows),
cols_(cols),
ptr_(nullptr) {}
// Default constructor doesn't set anything.
MatPtr() = default;
virtual ~MatPtr();
// Number of hwy::uint128_t in a TOC entry.
// Note that the old-style BlobStore files only have a list of keys and sizes.
// The new-style BlobStore files have an entry called "toc" that contains a
// vector of 4-tuples of
// (name, type, (num_elements, element_size), (rows, cols)).
// The listed blobs can be read directly into MatPtr from the BlobStore
// file, without needing any external knowledge of the number of elements,
// element size or type of the data.
static constexpr size_t kNumU128InTocEntry = 4;
// Construct from a TOC entry.
MatPtr(const hwy::uint128_t& key0, const hwy::uint128_t& key1,
const hwy::uint128_t& key2, const hwy::uint128_t& key3)
: name_(StringFromKey(key0)),
type_(StringFromKey(key1)),
element_size_(key2.hi),
num_elements_(key2.lo),
rows_(key3.lo),
cols_(key3.hi) {}
// Adds this tensor's metadata entry to the table of contents.
void AddToToc(std::vector<hwy::uint128_t>& toc) const {
toc.push_back(MakeKey(name_.c_str()));
toc.push_back(MakeKey(type_.c_str()));
toc.push_back({num_elements_, element_size_});
toc.push_back({rows_, cols_});
}
// Compatibility interface for CompressedArray.
template <typename T>
T* data() {
return HWY_RCAST_ALIGNED(T*, ptr_);
}
template <typename T>
const T* data() const {
return HWY_RCAST_ALIGNED(const T*, ptr_);
}
const void* Ptr() const { return ptr_; }
void* Ptr() { return ptr_; }
// Sets the pointer from another MatPtr.
void SetPtr(const MatPtr& other) { ptr_ = other.ptr_; }
// Copying allowed as the metadata is small.
MatPtr(const MatPtr& other) = default;
MatPtr& operator=(const MatPtr& other) = default;
// Returns the name of the blob.
const std::string& Name() const { return name_; }
void SetName(const std::string& name) { name_ = name; }
// Returns the type of the blob.
const std::string& Type() const { return type_; }
// Returns the size of each element in bytes.
size_t ElementSize() const { return element_size_; }
// Returns the number of elements in the array.
size_t NumElements() const { return num_elements_; }
// Returns the number of bytes in the array.
size_t SizeBytes() const { return num_elements_ * element_size_; }
size_t CompressedSize() const { return SizeBytes(); }
// Returns the number of rows in the 2-d array (outer dimension).
size_t Rows() const { return rows_; }
// Returns the number of columns in the 2-d array (inner dimension).
size_t Cols() const { return cols_; }
// Decoded elements should be multiplied by this to restore their original
// range. This is required because SfpStream can only encode a limited range
// of magnitudes.
float scale() const { return scale_; }
void set_scale(float scale) { scale_ = scale; }
std::string LayerName(int layer) const {
std::string name = name_ + std::to_string(layer);
HWY_ASSERT(name.size() <= sizeof(hwy::uint128_t));
return name;
}
// Adds the blob to the writer.
void AddToWriter(BlobWriter& writer) const {
fprintf(stderr, "Adding %s to writer\n", name_.c_str());
writer.Add(MakeKey(name_.c_str()), ptr_, SizeBytes());
}
// Sets all data to zero.
void ZeroInit() {
if (ptr_ == nullptr)
HWY_ABORT("ptr_ is null on tensor %s\n", name_.c_str());
hwy::ZeroBytes(ptr_, SizeBytes());
}
// Calls func on the upcasted type. Since MatPtr by design is not templated,
// here we provide a way to get to the derived type, provided that the type
// matches one of a known short-list.
template <class FuncT, typename... TArgs>
decltype(auto) CallUpcasted(FuncT& func, TArgs&&... args);
protected:
// Arbitrary name for the array, preferably <= 16 characters.
std::string name_;
// Should be the result of TypeName<T> for CallUpcasted() to work.
std::string type_;
// sizeof(T)
size_t element_size_ = 0;
// Number of elements in the array.
size_t num_elements_ = 0; // In element_size units.
// Number of rows in the 2-d array (outer dimension).
size_t rows_ = 0;
// Number of columns in the 2-d array (inner dimension).
size_t cols_ = 0;
// Scaling to apply to each element.
float scale_ = 1.0f;
// Aligned data array. This is always a borrowed pointer. It should never be
// freed. The underlying memory is owned by a subclass or some external class
// and must outlive this object.
void* ptr_ = nullptr;
};
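// Illustrative sketch (not part of the class): metadata round-trips through a
// TOC entry. AddToToc() serializes name/type/shape into four u128 keys and the
// 4-key constructor above restores them, which is what BlobToc relies on
// further below. Note that MakeKey() requires names of <= 16 characters.
inline MatPtr RoundTripTocEntryForIllustration(const MatPtr& in) {
  std::vector<hwy::uint128_t> toc;
  in.AddToToc(toc);
  HWY_ASSERT(toc.size() == MatPtr::kNumU128InTocEntry);
  // scale_ and the borrowed ptr_ are not part of the TOC, so not restored.
  return MatPtr(toc[0], toc[1], toc[2], toc[3]);
}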
// MatPtrT adds a single template argument to MatPtr for an explicit type.
// Use this class as a function argument where the type needs to be known.
// Use MatPtr where the type does not need to be known.
template <typename MatT>
class MatPtrT : public MatPtr {
public:
using value_type = MatT;
// Full constructor for dynamic sizing.
MatPtrT(const std::string& name, size_t rows, size_t cols)
: MatPtr(name, TypeName<MatT>(), sizeof(MatT), rows, cols) {}
// Copying allowed as the metadata is small.
MatPtrT(const MatPtr& other) : MatPtr(other) {}
MatPtrT& operator=(const MatPtr& other) {
MatPtr::operator=(other);
return *this;
}
MatPtrT(const MatPtrT& other) = default;
MatPtrT& operator=(const MatPtrT& other) = default;
std::string CacheName(int layer = -1, char separator = ' ',
int index = -1) const {
// Already used/retired: s, S, n, 1
const char prefix = hwy::IsSame<MatT, float>() ? 'F'
: hwy::IsSame<MatT, BF16>() ? 'B'
: hwy::IsSame<MatT, SfpStream>() ? '$'
: hwy::IsSame<MatT, NuqStream>() ? '2'
: '?';
std::string name = std::string(1, prefix) + name_;
if (layer >= 0 || index >= 0) {
name += '_';
if (layer >= 0) name += std::to_string(layer);
if (index >= 0) {
name += separator + std::to_string(index);
}
}
return name;
}
// Sets the number of elements in the array. Only for use when the number of
// elements is != rows * cols.
void SetNumElements(size_t num_elements) {
num_elements_ = CompressedArrayElements<MatT>(num_elements);
}
// Fast 2-d accessor for a 2-d array of fixed inner dimension and type.
template <typename T = MatT, size_t kInner>
const T& AtT(size_t row, size_t col) const {
size_t index = row * kInner + col;
HWY_DASSERT(index < num_elements_);
return HWY_RCAST_ALIGNED(const T*, ptr_)[index];
}
// 2-d Accessor for a specific type but with a dynamic inner dimension.
template <typename T = MatT>
const T& At(size_t row, size_t col) const {
size_t index = row * cols_ + col;
HWY_DASSERT(index < num_elements_);
return HWY_RCAST_ALIGNED(const T*, ptr_)[index];
}
// 1-d Accessor for a specific type.
template <typename T = MatT>
const T& At(size_t index) const {
HWY_DASSERT(index < num_elements_);
return HWY_RCAST_ALIGNED(const T*, ptr_)[index];
}
template <typename T = MatT>
T& At(size_t index) {
return HWY_RCAST_ALIGNED(T*, ptr_)[index];
}
// Compatibility interface for CompressedArray.
template <typename T = MatT>
T* data() {
return HWY_RCAST_ALIGNED(T*, ptr_);
}
template <typename T = MatT>
const T* data() const {
return HWY_RCAST_ALIGNED(const T*, ptr_);
}
// The const accessor data_scale1() asserts (!) that the scale is 1.0f, so
// calling it means "I am sure the scale is 1 and therefore ignore the scale".
// A scale of 0 indicates that the scale has likely never been set, so is
// "implicitly 1".
const MatT* data_scale1() const {
HWY_ASSERT(scale() == 1.f);
return HWY_RCAST_ALIGNED(const MatT*, ptr_);
}
};
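// Illustrative sketch (not part of the header): CacheName() reproduces the
// type-prefix mangling that CacheKey<> used to provide, e.g. an SfpStream
// tensor named "qkv_ein" in layer 2 becomes "$qkv_ein_2".
inline std::string ExampleCacheName() {
  MatPtrT<SfpStream> mat("qkv_ein", /*rows=*/16, /*cols=*/16);
  return mat.CacheName(/*layer=*/2);  // "$qkv_ein_2"
}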
template <class FuncT, typename... TArgs>
decltype(auto) MatPtr::CallUpcasted(FuncT& func, TArgs&&... args) {
if (type_ == TypeName<float>()) {
return func(dynamic_cast<MatPtrT<float>*>(this),
std::forward<TArgs>(args)...);
} else if (type_ == TypeName<BF16>()) {
return func(dynamic_cast<MatPtrT<BF16>*>(this),
std::forward<TArgs>(args)...);
} else if (type_ == TypeName<SfpStream>()) {
return func(dynamic_cast<MatPtrT<SfpStream>*>(this),
std::forward<TArgs>(args)...);
} else {
HWY_ABORT("Type %s unknown.", type_.c_str());
}
}
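// Illustrative sketch (assumption, not part of the header): CallUpcasted()
// dispatches on the stored type string, so the functor must be callable with
// any of the supported MatPtrT<T>* types, e.g. via a templated operator().
struct ExamplePrintSizeFunc {
  template <typename T>
  void operator()(MatPtrT<T>* mat) const {
    fprintf(stderr, "%s: %zu bytes\n", mat->Name().c_str(), mat->SizeBytes());
  }
};
// Usage: ExamplePrintSizeFunc print_size; some_mat_ptr.CallUpcasted(print_size);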
// MatStorageT adds the actual data storage to MatPtrT.
template <typename MatT>
class MatStorageT : public MatPtrT<MatT> {
public:
// Full constructor for dynamic sizing.
MatStorageT(const std::string& name, size_t rows, size_t cols)
: MatPtrT<MatT>(name, rows, cols),
data_(hwy::AllocateAligned<MatT>(
hwy::DivCeil(this->SizeBytes(), sizeof(MatT)))) {
this->ptr_ = data_.get();
}
// Can copy the metadata from a MatPtr and allocate later.
MatStorageT(const MatPtr& other) : MatPtrT<MatT>(other) {}
// No copying of MatStorageT as it contains big data.
MatStorageT(const MatStorageT& other) = delete;
MatStorageT& operator=(const MatStorageT& other) = delete;
MatStorageT(MatStorageT&& other) = default;
MatStorageT& operator=(MatStorageT&& other) = default;
// Allocate the memory and copy the pointer to the MatPtr.
// num_elements is in elements. In the default (zero) case, it is computed
// from the current num_elements_ which was set by the constructor from the
// rows and cols.
void Allocate(size_t num_elements = 0) {
if (num_elements == 0) {
num_elements = hwy::DivCeil(this->SizeBytes(), sizeof(MatT));
} else {
this->num_elements_ = num_elements;
}
data_ = hwy::AllocateAligned<MatT>(num_elements);
this->ptr_ = data_.get();
}
// Zeros the content.
void ZeroInit() {
HWY_ASSERT(data_ != nullptr);
hwy::ZeroBytes(data_.get(), this->SizeBytes());
}
private:
// Aligned data array.
// std::unique_ptr<MatT[]> data_;
hwy::AlignedFreeUniquePtr<MatT[]> data_;
};
// MatStorage allows heterogeneous tensors to be stored in a single vector.
using MatStorage = MatStorageT<hwy::uint128_t>;
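// Illustrative sketch (not part of the header): typical MatStorageT lifetime.
// The sizing constructor allocates immediately; a MatStorageT built from a
// bare MatPtr (e.g. a TOC entry) defers allocation until Allocate() is called.
inline void ExampleMatStorageLifetime() {
  MatStorageT<float> owned("example", /*rows=*/2, /*cols=*/3);  // allocated
  owned.ZeroInit();
  const MatPtr meta = owned;       // metadata only; the pointer is borrowed
  MatStorageT<float> later(meta);  // copies metadata, no allocation yet
  later.Allocate();                // now owns its own aligned buffer
}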
// Table of contents for a blob store file. Full metadata, but not actual data.
class BlobToc {
public:
BlobToc() = default;
// Adds all blobs to the blob writer. Note that the blobs must have unique
// names.
static void AddAllToBlobWriter(const std::vector<MatStorage>& blobs,
BlobWriter& writer) {
std::vector<hwy::uint128_t> toc;
for (const auto& blob : blobs) {
blob.AddToToc(toc);
blob.AddToWriter(writer);
}
writer.Add(MakeKey(kTocName), toc.data(), toc.size() * sizeof(toc[0]));
}
// Loads the table of contents from the given reader.
BlobError LoadToc(BlobReader& reader) {
hwy::uint128_t toc_key = MakeKey(kTocName);
size_t toc_size = reader.BlobSize(toc_key);
if (toc_size != 0) {
std::vector<hwy::uint128_t> toc(toc_size / sizeof(hwy::uint128_t));
BlobError err = reader.ReadOne(toc_key, toc.data(), toc_size);
if (err != 0) {
fprintf(stderr, "Failed to read toc (error %d)\n", err);
return err;
}
for (size_t i = 0; i < toc.size(); i += MatPtr::kNumU128InTocEntry) {
AddToToc(MatPtr(toc[i], toc[i + 1], toc[i + 2], toc[i + 3]));
}
}
return 0;
}
bool Empty() const { return toc_map_.empty(); }
// Returns true if the table of contents contains the given name.
bool Contains(const std::string& name) const {
return toc_map_.find(name) != toc_map_.end();
}
// Returns the blob with the given name, or nullptr if not found.
const MatPtr* Get(const std::string& name) const {
auto it = toc_map_.find(name);
if (it == toc_map_.end()) return nullptr;
return &toc_[it->second];
}
private:
// The name of the toc in the blob store file.
static constexpr char kTocName[] = "toc";
// Adds the blob to the table of contents.
void AddToToc(const MatPtr& blob) {
HWY_ASSERT(!Contains(blob.Name()));
toc_map_[blob.Name()] = toc_.size();
toc_.push_back(blob);
}
std::unordered_map<std::string, size_t> toc_map_;
std::vector<MatPtr> toc_;
};
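// Illustrative sketch (assumption: simplified flow, some error handling
// elided): writing a BlobStore file with a TOC and reading the TOC back.
// `path` and the tensor shape are placeholders.
inline BlobError ExampleTocRoundTrip(const Path& path, hwy::ThreadPool& pool) {
  std::vector<MatStorage> blobs;
  blobs.emplace_back(MatPtrT<float>("example", /*rows=*/2, /*cols=*/3));
  blobs.back().Allocate();
  BlobWriter writer;
  BlobToc::AddAllToBlobWriter(blobs, writer);
  BlobError err = writer.WriteAll(pool, path);
  if (err != 0) return err;

  BlobReader reader;
  err = reader.Open(path);
  if (err != 0) return err;
  BlobToc toc;
  err = toc.LoadToc(reader);
  if (err != 0) return err;
  HWY_ASSERT(toc.Contains("example"));
  return 0;
}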
#if COMPRESS_STATS
class CompressStats {
public:
@ -146,21 +521,6 @@ struct CompressWorkingSet {
std::vector<CompressPerThread> tls;
};
// Returns key for the given tensor name. Also encodes the type, so that
// changing the representation automatically invalidates prior cached files
// (the new blob name will not be found).
template <typename Packed>
hwy::uint128_t CacheKey(const char* name) {
// Already used/retired: s, S, n, 1
const char prefix = hwy::IsSame<Packed, float>() ? 'F'
: hwy::IsSame<Packed, BF16>() ? 'B'
: hwy::IsSame<Packed, SfpStream>() ? '$'
: hwy::IsSame<Packed, NuqStream>() ? '2'
: '?';
return MakeKey((std::string(1, prefix) + name).c_str());
}
// Functor called for each tensor, which loads them and their scaling factors
// from BlobStore.
class CacheLoader {
@ -170,43 +530,82 @@ class CacheLoader {
if (err_ != 0) {
fprintf(stderr,
"Cached compressed weights does not exist yet (code %d), "
"compressing weights and creating file: %s.\n",
"loading from file: %s.\n",
err_, blob_filename.path.c_str());
}
err_ = file_toc_.LoadToc(reader_);
if (err_ != 0) {
fprintf(stderr, "Found a TOC, but failed to load it (code %d)\n", err_);
}
}
// Returns true if there is a TOC.
bool HaveToc() const { return !file_toc_.Empty(); }
// Called for each tensor; registers it so that ReadAll() can allocate it and
// enqueue its read request.
void operator()(const char* name, hwy::Span<MatPtr*> tensors) {
if (file_toc_.Empty() || file_toc_.Contains(name)) {
if (tensors[0]->NumElements() == 0)
fprintf(stderr, "Zero elements for %s\n", name);
model_toc_.push_back(tensors[0]);
file_keys_.push_back(name);
}
}
BlobError LoadScales(float* scales, size_t len) {
for (size_t i = 0; i < len; ++i) {
scales[i] = 1.0f;
}
MatPtrT<float> scales_ptr("scales", 0, 1);
auto key = MakeKey(scales_ptr.CacheName().c_str());
if (reader_.BlobSize(key) == 0) return 0;
return reader_.Enqueue(key, scales, len * sizeof(scales[0]));
}
// Returns whether all tensors are successfully loaded from cache.
bool ReadAll(hwy::ThreadPool& pool, std::vector<MatStorage>& model_memory) {
// reader_ invalid or any Enqueue failed
if (err_ != 0) return false;
// Setup the model_memory.
for (size_t b = 0; b < model_toc_.size(); ++b) {
const std::string& file_key = file_keys_[b];
MatPtr* blob = model_toc_[b];
if (!file_toc_.Empty()) {
const MatPtr* toc_blob = file_toc_.Get(file_key);
if (toc_blob == nullptr) {
fprintf(stderr, "Blob %s not found in TOC\n", file_key.c_str());
return false;
}
if (toc_blob->Rows() != blob->Rows() ||
toc_blob->Cols() != blob->Cols()) {
fprintf(stderr, "Blob %s has size mismatch TOC\n", file_key.c_str());
return false;
}
MatStorage toc_blob_array(*toc_blob);
model_memory.push_back(std::move(toc_blob_array));
} else {
model_memory.emplace_back(*blob);
model_memory.back().SetName(file_key);
}
}
// Allocate in parallel using the pool.
pool.Run(0, model_memory.size(),
[this, &model_memory](uint64_t task, size_t /*thread*/) {
model_memory[task].Allocate();
model_toc_[task]->SetPtr(model_memory[task]);
});
// Enqueue the read requests.
for (auto& blob : model_memory) {
err_ = reader_.Enqueue(MakeKey(blob.Name().c_str()), blob.data(),
blob.SizeBytes());
if (err_ != 0) {
fprintf(stderr,
"Failed to read blob %s (error %d) of size %zu x %zu x %zu\n",
blob.Name().c_str(), err_, blob.Rows(), blob.Cols(),
blob.ElementSize());
return false;
}
}
err_ = reader_.ReadAll(pool);
if (err_ != 0) {
@ -220,6 +619,13 @@ class CacheLoader {
private:
BlobReader reader_;
BlobError err_ = 0;
// Table of contents from the file, if present.
BlobToc file_toc_;
// Table of contents from the model. Pointers to original MatPtrT so the
// data pointers can be updated.
std::vector<MatPtr*> model_toc_;
// Mangled names of the tensors in model_toc_ for reading from the file.
std::vector<std::string> file_keys_;
};
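// Illustrative sketch (assumptions: the loader is normally driven via
// ForEachTensor in gemma/weights.h; a single tensor stands in for the model
// here, and the blob name is left unmangled for brevity).
inline bool ExampleCacheLoad(const Path& blob_path, hwy::ThreadPool& pool) {
  CacheLoader loader(blob_path);
  MatPtrT<float> tensor("example", /*rows=*/2, /*cols=*/3);
  MatPtr* tensors[1] = {&tensor};
  loader("example", hwy::Span<MatPtr*>(tensors, 1));
  std::vector<MatStorage> model_memory;  // receives the allocations
  // On success, tensor.data() points into model_memory.
  return loader.ReadAll(pool, model_memory);
}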
} // namespace gcpp


@ -36,155 +36,23 @@
#include <iostream>
#include <string>
#include <thread> // NOLINT
#include <vector>
#include "compression/compress.h"
#include "compression/io.h" // Path
#include "compression/shared.h"
#include "compression/weights_raw.h"
#include "gemma/common.h" // Model
#include "gemma/weights.h"
#include "util/allocator.h"
#include "util/args.h"
#include "hwy/base.h"
#include "hwy/contrib/thread_pool/thread_pool.h"
#include "hwy/profiler.h"
namespace gcpp {
// Setting this to true disables fread() calls that read the model file.
constexpr bool kDryRunFread = false;
namespace {
#define READ_WEIGHTS(name) \
do { \
do_fread(&(layer_view->name), layer, #name, sizeof(layer_view->name)); \
} while (0)
#define SCALE_WEIGHTS(name) \
do { \
if (ok && !kDryRunFread && scale_for_compression) { \
weights->scales[scale_pos++] = \
ScaleWeights(layer_view->name.data(), layer_view->name.size()); \
} \
} while (0)
template <typename TConfig>
struct LoadRawWeightsT {
ByteStorageT operator()(const Path& checkpoint, hwy::ThreadPool& pool,
bool scale_for_compression) const {
PROFILER_ZONE("Startup.LoadWeights");
if (!checkpoint.Exists()) {
HWY_ABORT("The model weights file '%s' does not exist.",
checkpoint.path.c_str());
}
ByteStorageT weights_u8 = AllocateWeightsF<TConfig>()(pool);
auto* weights = reinterpret_cast<WeightsF<TConfig>*>(weights_u8.get());
size_t scale_pos = 0;
FILE* fptr;
if constexpr (kDryRunFread) {
fprintf(stderr, "Dry-Run, not reading model-file.\n");
} else {
fptr = fopen(checkpoint.path.c_str(), "rb");
if (fptr == nullptr) {
HWY_ABORT("Failed to open model file %s - does it exist?",
checkpoint.path.c_str());
}
}
bool ok = true;
uint64_t total_size = 0;
auto do_fread = [&](void* var, int layer, const char* name, size_t size) {
if (layer == -1) {
fprintf(stderr, "Loading Parameters (size %zu): %s\n", size, name);
} else {
fprintf(stderr, "Loading Parameters (layer=%d, size %zu): %s\n", layer,
size, name);
}
if constexpr (!kDryRunFread) {
ok &= 1 == fread(var, size, 1, fptr);
total_size += size;
}
};
do_fread(&(weights->embedder_input_embedding), -1,
"embedder_input_embedding",
sizeof(weights->embedder_input_embedding));
do_fread(&(weights->final_norm_scale), -1, "final_norm_scale",
sizeof(weights->final_norm_scale));
for (size_t layer = 0; layer < TConfig::kLayers; ++layer) {
auto type = TConfig::kLayerConfig[layer];
LayerF<TConfig>* layer_view = weights->GetLayer(layer);
// Make sure we don't have uninitialized memory.
hwy::ZeroBytes(layer_view, sizeof(*layer_view));
if (type == LayerAttentionType::kGemma) {
READ_WEIGHTS(attn_vec_einsum_w);
READ_WEIGHTS(qkv_einsum_w);
SCALE_WEIGHTS(attn_vec_einsum_w);
SCALE_WEIGHTS(qkv_einsum_w);
} else {
READ_WEIGHTS(griffin.linear_x_w);
READ_WEIGHTS(griffin.linear_x_biases);
READ_WEIGHTS(griffin.linear_y_w);
READ_WEIGHTS(griffin.linear_y_biases);
READ_WEIGHTS(griffin.linear_out_w);
READ_WEIGHTS(griffin.linear_out_biases);
READ_WEIGHTS(griffin.conv_w);
READ_WEIGHTS(griffin.conv_biases);
READ_WEIGHTS(griffin.gate_w);
READ_WEIGHTS(griffin.gate_biases);
READ_WEIGHTS(griffin.a);
SCALE_WEIGHTS(griffin.linear_x_w);
SCALE_WEIGHTS(griffin.linear_y_w);
SCALE_WEIGHTS(griffin.linear_out_w);
SCALE_WEIGHTS(griffin.gate_w);
}
READ_WEIGHTS(gating_einsum_w);
READ_WEIGHTS(linear_w);
SCALE_WEIGHTS(gating_einsum_w);
SCALE_WEIGHTS(linear_w);
READ_WEIGHTS(pre_attention_norm_scale);
READ_WEIGHTS(pre_ffw_norm_scale);
if (TConfig::kPostNorm == PostNormType::Scale) {
READ_WEIGHTS(post_attention_norm_scale);
READ_WEIGHTS(post_ffw_norm_scale);
}
if (TConfig::kFFBiases) {
READ_WEIGHTS(ffw_gating_biases);
READ_WEIGHTS(ffw_output_biases);
}
if (TConfig::kSoftmaxAttnOutputBiases &&
type == LayerAttentionType::kGemma) {
READ_WEIGHTS(attention_output_biases);
}
}
if (!ok) {
HWY_ABORT(
"Failed to read from %s - might be a directory, or too small? "
"expected size: %d kB",
checkpoint.path.c_str(), static_cast<uint32_t>(total_size >> 10));
}
if (!kDryRunFread) {
HWY_ASSERT(0 == fclose(fptr));
if (scale_for_compression) {
HWY_ASSERT(scale_pos == TConfig::kNumTensorScales);
}
}
return weights_u8;
}
};
#undef READ_WEIGHTS
#undef SCALE_WEIGHTS
} // namespace
ByteStorageT LoadRawWeights(const Path& weights, Model model_type,
Type weight_type, hwy::ThreadPool& pool,
bool scale_for_compression) {
return CallForModelAndWeight<LoadRawWeightsT>(
model_type, weight_type, weights, pool, scale_for_compression);
}
struct Args : public ArgsBase<Args> {
static constexpr size_t kDefaultNumThreads = ~size_t{0};
@ -282,7 +150,7 @@ HWY_BEFORE_NAMESPACE();
namespace gcpp {
namespace HWY_NAMESPACE {
template <class TConfig>
template <class Configs>
void CompressWeights(const Path& weights_path,
const Path& compressed_weights_path, Model model_type,
Type weight_type, hwy::ThreadPool& pool) {
@ -290,26 +158,53 @@ void CompressWeights(const Path& weights_path,
HWY_ABORT("The model weights file '%s' does not exist.",
weights_path.path.c_str());
}
printf("Compressing weights from %s to %s\n", weights_path.path.c_str(),
compressed_weights_path.path.c_str());
using CConfig = Configs::c;
using UCConfig = Configs::uc;
// Allocate compressed weights.
using CWeights = CompressedWeights<TConfig>;
ByteStorageT c_weights_u8 = AllocateSizeof<CWeights>();
using CWeights = CompressedWeights<CConfig>;
ByteStorageT c_weights_u8 = AllocateCompressedWeights<CConfig>()(pool);
CWeights* c_weights = reinterpret_cast<CWeights*>(c_weights_u8.get());
new (&c_weights->c_layer_ptrs) CompressedLayerPointers<TConfig>(pool);
// Get weights, compress, and store.
const bool scale_for_compression = TConfig::kNumTensorScales > 0;
const ByteStorageT weights_u8 = gcpp::LoadRawWeights(
weights_path, model_type, weight_type, pool, scale_for_compression);
WeightsF<TConfig>* weights =
reinterpret_cast<WeightsF<TConfig>*>(weights_u8.get());
// Allocate uncompressed weights.
using UCWeights = CompressedWeights<UCConfig>;
ByteStorageT uc_weights_u8 = AllocateCompressedWeights<UCConfig>()(pool);
UCWeights* uc_weights = reinterpret_cast<UCWeights*>(uc_weights_u8.get());
// Get uncompressed weights, compress, and store.
FILE* fptr = fopen(weights_path.path.c_str(), "rb");
if (fptr == nullptr) {
HWY_ABORT("Failed to open model file %s - does it exist?",
weights_path.path.c_str());
}
bool ok = true;
uint64_t total_size = 0;
CompressedWeights<UCConfig>::ForEachTensor(
{uc_weights}, ForEachType::kLoadNoToc,
[&](const char* name, hwy::Span<MatPtr*> tensors) {
fprintf(stderr, "Loading Parameters (size %zu): %s\n",
tensors[0]->SizeBytes(), name);
ok &= 1 == fread(tensors[0]->Ptr(), tensors[0]->SizeBytes(), 1, fptr);
total_size += tensors[0]->SizeBytes();
});
const bool scale_for_compression = UCConfig::kNumTensorScales > 0;
std::vector<float> scales;
if (scale_for_compression) {
uc_weights->GetOrApplyScales(scales);
}
Compressor compressor(pool);
ForEachTensor<TConfig, LayerF<TConfig>>(weights, *c_weights, compressor);
compressor.AddScales(weights->scales.data(), weights->scales.size());
CompressedWeights<CConfig>::ForEachTensor(
{reinterpret_cast<CompressedWeights<CConfig>*>(uc_weights), c_weights},
ForEachType::kLoadNoToc,
[&compressor](const char* name, hwy::Span<MatPtr*> tensors) {
tensors[1]->CallUpcasted(
compressor, name,
reinterpret_cast<const float*>(tensors[0]->Ptr()));
});
compressor.AddScales(scales.data(), scales.size() * sizeof(scales[0]));
compressor.WriteAll(pool, compressed_weights_path);
weights->layer_ptrs.~LayerPointers<float, TConfig>();
c_weights->c_layer_ptrs.~CompressedLayerPointers<TConfig>();
}
} // namespace HWY_NAMESPACE


@ -53,34 +53,35 @@ namespace HWY_NAMESPACE {
class SbsWriterImpl : public WriterInterface {
template <typename Packed>
void AllocateAndCompress(const std::string& name,
absl::Span<const float> weights) {
const size_t num_packed = CompressedArrayElements<Packed>(weights.size());
MatPtrT<Packed> storage(name, 1, num_packed);
model_memory_.push_back(storage);
model_memory_.back().Allocate();
storage.SetPtr(model_memory_.back());
std::string decorated_name = storage.CacheName();
compressor_(&storage, decorated_name.c_str(), weights.data());
}
public:
SbsWriterImpl() : pool_(0), compressor_(pool_) {}
void Insert(std::string name, absl::Span<const float> weights) override {
sfp_streams_.push_back(AllocateAndCompress<SfpStream>(name, weights));
AllocateAndCompress<SfpStream>(name, weights);
}
void InsertNUQ(std::string name, absl::Span<const float> weights) override {
nuq_streams_.push_back(AllocateAndCompress<NuqStream>(name, weights));
AllocateAndCompress<NuqStream>(name, weights);
}
void InsertBfloat16(std::string name,
absl::Span<const float> weights) override {
bf16_streams_.push_back(AllocateAndCompress<BF16>(name, weights));
AllocateAndCompress<BF16>(name, weights);
}
void InsertFloat(std::string name, absl::Span<const float> weights) override {
f32_streams_.push_back(AllocateAndCompress<float>(name, weights));
AllocateAndCompress<float>(name, weights);
}
void AddScales(const std::vector<float>& scales) override {
@ -96,10 +97,7 @@ class SbsWriterImpl : public WriterInterface {
hwy::ThreadPool pool_;
Compressor compressor_;
CompressWorkingSet working_set_;
std::vector<hwy::AlignedFreeUniquePtr<SfpStream[]>> sfp_streams_;
std::vector<hwy::AlignedFreeUniquePtr<NuqStream[]>> nuq_streams_;
std::vector<hwy::AlignedFreeUniquePtr<BF16[]>> bf16_streams_;
std::vector<hwy::AlignedFreeUniquePtr<float[]>> f32_streams_;
std::vector<MatStorage> model_memory_;
std::vector<float> scales_;
};


@ -22,6 +22,7 @@
#include <stddef.h>
#include <stdint.h>
#include <complex>
#include <cstdio>
#include "hwy/aligned_allocator.h"
@ -184,6 +185,12 @@ const char* TypeName() {
return "sfp";
} else if constexpr (hwy::IsSame<Packed, NuqStream>()) {
return "nuq";
} else if constexpr (hwy::IsSame<Packed, double>()) {
return "f64";
} else if constexpr (hwy::IsSame<Packed, std::complex<double>>()) {
return "c64";
} else if constexpr (hwy::IsSame<Packed, hwy::uint128_t>()) {
return "u128";
} else {
HWY_DASSERT(false);
return "unknown";


@ -1,247 +0,0 @@
// Copyright 2024 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef THIRD_PARTY_GEMMA_CPP_COMPRESSION_WEIGHTS_RAW_H_
#define THIRD_PARTY_GEMMA_CPP_COMPRESSION_WEIGHTS_RAW_H_
// Historical note: this was the original f32-only simple on-disk format
// created by convert_weights.py. BlobStore is now the preferred on-disk
// format, and we load that into CompressedWeights.
//
// NOTE: this file should only be used by compress_weights. It is currently
// also referenced by backprop because it supports T = std::complex, and
// CompressedWeights might not yet.
#include <random>
#include "gemma/configs.h"
#include "util/allocator.h"
#include "hwy/aligned_allocator.h"
#include "hwy/base.h"
#include "hwy/contrib/thread_pool/thread_pool.h"
namespace gcpp {
template <typename T, class TConfig>
struct Layer {
Layer() {}
static constexpr size_t kHeads = TConfig::kHeads;
static constexpr size_t kKVHeads = TConfig::kKVHeads;
static constexpr size_t kModelDim = TConfig::kModelDim;
static constexpr size_t kQKVDim = TConfig::kQKVDim;
static constexpr size_t kFFHiddenDim = TConfig::kFFHiddenDim;
static constexpr size_t kAttVecEinsumWSize = kHeads * kQKVDim * kModelDim;
static constexpr size_t kQKVEinsumWSize =
(kHeads + 2 * kKVHeads) * kQKVDim * kModelDim;
// 2x for (gelu gating vector, gated vector)
static constexpr size_t kGatingEinsumWSize = 2 * kFFHiddenDim * kModelDim;
static constexpr size_t kConv1dWidth = TConfig::kConv1dWidth;
static constexpr bool kFFBiases = TConfig::kFFBiases;
static constexpr PostNormType kPostNorm = TConfig::kPostNorm;
static constexpr size_t kAOBiasDim =
TConfig::kSoftmaxAttnOutputBiases ? kModelDim : 0;
static constexpr size_t kGriffinDim =
TConfig::kGriffinLayers > 0 ? kModelDim : 0;
union {
struct {
std::array<T, kAttVecEinsumWSize> attn_vec_einsum_w;
std::array<T, kQKVEinsumWSize> qkv_einsum_w;
std::array<T, kAOBiasDim> attention_output_biases;
};
struct {
std::array<T, kGriffinDim * kGriffinDim> linear_x_w;
std::array<T, kGriffinDim> linear_x_biases;
std::array<T, kGriffinDim * kGriffinDim> linear_y_w;
std::array<T, kGriffinDim> linear_y_biases;
std::array<T, kGriffinDim * kGriffinDim> linear_out_w;
std::array<T, kGriffinDim> linear_out_biases;
std::array<T, kConv1dWidth * kGriffinDim> conv_w;
std::array<T, kGriffinDim> conv_biases;
std::array<T, kGriffinDim * kGriffinDim / kHeads * 2> gate_w;
std::array<T, kGriffinDim * 2> gate_biases;
std::array<T, kGriffinDim> a;
} griffin;
};
std::array<T, kGatingEinsumWSize> gating_einsum_w;
std::array<T, kModelDim * kFFHiddenDim> linear_w;
std::array<T, kModelDim> pre_attention_norm_scale;
std::array<T, kModelDim> pre_ffw_norm_scale;
std::array<T, kPostNorm == PostNormType::Scale ? kModelDim : 0>
post_attention_norm_scale;
std::array<T, kPostNorm == PostNormType::Scale ? kModelDim : 0>
post_ffw_norm_scale;
std::array<T, kFFBiases ? 2 * kFFHiddenDim : 0> ffw_gating_biases;
std::array<T, kFFBiases ? kModelDim : 0> ffw_output_biases;
};
template <class TConfig>
using LayerF = Layer<float, TConfig>;
// Array instead of single large allocation for parallel mem init. Split out of
// Weights so that only these pointers are initialized.
template <typename T, class TConfig>
struct LayerPointers {
explicit LayerPointers(hwy::ThreadPool& pool) {
pool.Run(0, TConfig::kLayers, [this](uint64_t task, size_t /*thread*/) {
this->layers[task] = hwy::AllocateAligned<Layer<T, TConfig>>(1);
});
}
using TLayer = Layer<T, TConfig>;
std::array<hwy::AlignedFreeUniquePtr<TLayer[]>, TConfig::kLayers> layers;
};
template <typename T, class TConfig>
struct Weights {
// No ctor/dtor, allocated via AllocateAligned.
std::array<T, TConfig::kVocabSize * TConfig::kModelDim>
embedder_input_embedding;
std::array<T, TConfig::kModelDim> final_norm_scale;
LayerPointers<T, TConfig> layer_ptrs;
std::array<T, TConfig::kNumTensorScales> scales;
const Layer<T, TConfig>* GetLayer(size_t layer) const {
return layer_ptrs.layers[layer].get();
}
Layer<T, TConfig>* GetLayer(size_t layer) {
return layer_ptrs.layers[layer].get();
}
};
template <class TConfig>
using WeightsF = Weights<float, TConfig>;
// TODO: can we use TConfig::Weight instead of T?
template <typename T, typename TConfig>
struct AllocateWeights {
ByteStorageT operator()(hwy::ThreadPool& pool) const {
using TWeights = Weights<T, TConfig>;
ByteStorageT weights_u8 = AllocateSizeof<TWeights>();
TWeights* weights = reinterpret_cast<TWeights*>(weights_u8.get());
new (&weights->layer_ptrs) LayerPointers<T, TConfig>(pool);
return weights_u8;
}
};
template <typename TConfig>
struct AllocateWeightsF {
ByteStorageT operator()(hwy::ThreadPool& pool) const {
return AllocateWeights<float, TConfig>()(pool);
}
};
// TODO: make a member of Weights<T>.
template <typename T, typename TConfig>
struct ZeroInitWeights {
void operator()(ByteStorageT& weights, hwy::ThreadPool& pool) const {
Weights<T, TConfig>& w =
*reinterpret_cast<Weights<T, TConfig>*>(weights.get());
hwy::ZeroBytes(&w.embedder_input_embedding,
sizeof(w.embedder_input_embedding));
hwy::ZeroBytes(&w.final_norm_scale, sizeof(w.final_norm_scale));
for (int i = 0; i < TConfig::kLayers; ++i) {
hwy::ZeroBytes(w.GetLayer(i), sizeof(*w.GetLayer(i)));
}
}
};
template <typename TConfig>
struct ZeroInitWeightsF {
void operator()(ByteStorageT& weights, hwy::ThreadPool& pool) const {
ZeroInitWeights<float, TConfig>()(weights, pool);
}
};
template <typename T, typename TConfig>
struct CopyWeights {
void operator()(Weights<T, TConfig>& dst,
const Weights<T, TConfig>& src) const {
hwy::CopyBytes(&src.embedder_input_embedding, &dst.embedder_input_embedding,
sizeof(src.embedder_input_embedding));
hwy::CopyBytes(&src.final_norm_scale, &dst.final_norm_scale,
sizeof(src.final_norm_scale));
for (int i = 0; i < TConfig::kLayers; ++i) {
hwy::CopyBytes(src.GetLayer(i), dst.GetLayer(i),
sizeof(*dst.GetLayer(i)));
}
}
};
template <typename T, size_t kLen>
void RandInit(std::array<T, kLen>& x, T stddev, std::mt19937& gen) {
std::normal_distribution<T> dist(0.0, stddev);
for (size_t i = 0; i < kLen; ++i) {
x[i] = dist(gen);
}
}
// TODO: make a member of Layer<T>.
template <typename T, typename TConfig>
void RandInit(Layer<T, TConfig>& w, T stddev, std::mt19937& gen) {
RandInit(w.pre_attention_norm_scale, stddev, gen);
RandInit(w.attn_vec_einsum_w, stddev, gen);
RandInit(w.qkv_einsum_w, stddev, gen);
RandInit(w.pre_ffw_norm_scale, stddev, gen);
RandInit(w.gating_einsum_w, stddev, gen);
RandInit(w.linear_w, stddev, gen);
}
template <typename T, typename TConfig>
void RandInit(Weights<T, TConfig>& w, T stddev, std::mt19937& gen) {
static constexpr size_t kLayers = TConfig::kLayers;
RandInit(w.embedder_input_embedding, stddev, gen);
RandInit(w.final_norm_scale, stddev, gen);
for (size_t i = 0; i < kLayers; ++i) {
RandInit(*w.GetLayer(i), stddev, gen);
}
}
// Owns weights and provides access to TConfig.
template <typename T, typename TConfig>
class WeightsWrapper {
public:
WeightsWrapper()
: pool_(0),
data_(AllocateWeights<T, TConfig>()(pool_)),
weights_(reinterpret_cast<Weights<T, TConfig>*>(data_.get())) {}
~WeightsWrapper() {
get().layer_ptrs.~LayerPointers<T, TConfig>();
}
const Weights<T, TConfig>& get() const { return *weights_; }
Weights<T, TConfig>& get() { return *weights_; }
void clear() { ZeroInitWeights<T, TConfig>()(data_, pool_); }
void copy(const WeightsWrapper<T, TConfig>& other) {
CopyWeights<T, TConfig>()(get(), other.get());
}
private:
hwy::ThreadPool pool_;
ByteStorageT data_;
Weights<T, TConfig>* weights_;
};
} // namespace gcpp
#endif // THIRD_PARTY_GEMMA_CPP_COMPRESSION_WEIGHTS_RAW_H_


@ -149,43 +149,45 @@ decltype(auto) CallForModelAndWeight(Model model, Type weight,
#define GEMMA_DISPATCH_MODEL(MODEL, TWEIGHT, FUNC, ARGS) \
switch (MODEL) { \
case Model::GEMMA_TINY: { \
HWY_EXPORT_AND_DYNAMIC_DISPATCH_T(FUNC<ConfigGemmaTiny<TWEIGHT>>) \
ARGS; \
using CP = ConfigPair<ConfigGemmaTiny<TWEIGHT>, ConfigGemmaTiny<float>>; \
HWY_EXPORT_AND_DYNAMIC_DISPATCH_T(FUNC<CP>) ARGS; \
break; \
} \
case Model::GEMMA_2B: { \
HWY_EXPORT_AND_DYNAMIC_DISPATCH_T(FUNC<ConfigGemma2B<TWEIGHT>>) \
ARGS; \
using CP = ConfigPair<ConfigGemma2B<TWEIGHT>, ConfigGemma2B<float>>; \
HWY_EXPORT_AND_DYNAMIC_DISPATCH_T(FUNC<CP>) ARGS; \
break; \
} \
case Model::GEMMA_7B: { \
HWY_EXPORT_AND_DYNAMIC_DISPATCH_T(FUNC<ConfigGemma7B<TWEIGHT>>) \
ARGS; \
using CP = ConfigPair<ConfigGemma7B<TWEIGHT>, ConfigGemma7B<float>>; \
HWY_EXPORT_AND_DYNAMIC_DISPATCH_T(FUNC<CP>) ARGS; \
break; \
} \
case Model::GRIFFIN_2B: { \
HWY_EXPORT_AND_DYNAMIC_DISPATCH_T(FUNC<ConfigGriffin2B<TWEIGHT>>) \
ARGS; \
using CP = ConfigPair<ConfigGriffin2B<TWEIGHT>, ConfigGriffin2B<float>>; \
HWY_EXPORT_AND_DYNAMIC_DISPATCH_T(FUNC<CP>) ARGS; \
break; \
} \
case Model::GEMMA2_2B: { \
HWY_EXPORT_AND_DYNAMIC_DISPATCH_T(FUNC<ConfigGemma2_2B<TWEIGHT>>) \
ARGS; \
using CP = ConfigPair<ConfigGemma2_2B<TWEIGHT>, ConfigGemma2_2B<float>>; \
HWY_EXPORT_AND_DYNAMIC_DISPATCH_T(FUNC<CP>) ARGS; \
break; \
} \
case Model::GEMMA2_9B: { \
HWY_EXPORT_AND_DYNAMIC_DISPATCH_T(FUNC<ConfigGemma2_9B<TWEIGHT>>) \
ARGS; \
using CP = ConfigPair<ConfigGemma2_9B<TWEIGHT>, ConfigGemma2_9B<float>>; \
HWY_EXPORT_AND_DYNAMIC_DISPATCH_T(FUNC<CP>) ARGS; \
break; \
} \
case Model::GEMMA2_27B: { \
HWY_EXPORT_AND_DYNAMIC_DISPATCH_T(FUNC<ConfigGemma2_27B<TWEIGHT>>) \
ARGS; \
using CP = \
ConfigPair<ConfigGemma2_27B<TWEIGHT>, ConfigGemma2_27B<float>>; \
HWY_EXPORT_AND_DYNAMIC_DISPATCH_T(FUNC<CP>) ARGS; \
break; \
} \
case Model::PALIGEMMA_224: { \
HWY_EXPORT_AND_DYNAMIC_DISPATCH_T(FUNC<ConfigPaliGemma_224<TWEIGHT>>)\
ARGS; \
using CP = ConfigPair<ConfigPaliGemma_224<TWEIGHT>, \
ConfigPaliGemma_224<float>>; \
HWY_EXPORT_AND_DYNAMIC_DISPATCH_T(FUNC<CP>) ARGS; \
break; \
} \
default: \


@ -244,10 +244,18 @@ class GemmaAttention {
const auto pre_att_rms_out =
ConstMat(activations_.pre_att_rms_out.All(), kModelDim);
const auto w_q1 =
layer_weights_.qkv_einsum_w.data() == nullptr
? ConstMat(layer_weights_.qkv_einsum_w1.data(), kModelDim)
: ConstMat(layer_weights_.qkv_einsum_w.data(), kModelDim);
const auto w_q2 =
layer_weights_.qkv_einsum_w.data() == nullptr
? ConstMat(layer_weights_.qkv_einsum_w2.data(), kModelDim)
: ConstMat(layer_weights_.qkv_einsum_w.data(), kModelDim, kModelDim,
kHeads * kQKVDim * kModelDim);
MatMul</*kAdd=*/false>(num_interleaved, pre_att_rms_out, w_q1,
layer_weights_.qkv_einsum_w.scale(), /*add=*/nullptr,
activations_.env,
MutableMat(activations_.q.All(), kHeads * kQStride));
if constexpr (kIsMHA) {
@ -263,9 +271,7 @@ class GemmaAttention {
// KV structure is [k, v, k, v, ....] = kKVHeads pairs of (k, v).
float* HWY_RESTRICT kv = kv_caches_[0].kv_cache.get() + kv_ofs;
MatMul</*kAdd=*/false>(
num_tokens_, pre_att_rms_out, w_q2,
layer_weights_.qkv_einsum_w.scale(), /*add=*/nullptr,
activations_.env,
MutableMat(kv, kKVHeads * 2 * kQKVDim, kCachePosSize));
@ -283,9 +289,14 @@ class GemmaAttention {
cache_pos * kCachePosSize + layer_ * kCacheLayerSize;
float* HWY_RESTRICT kv = kv_cache.kv_cache.get() + kv_offset;
// KV structure is [k, v, k, v, ....] = kKVHeads pairs of (k, v).
if (layer_weights_.qkv_einsum_w.data() == nullptr) {
MatVec<kKVHeads * 2 * kQKVDim, kModelDim>(
layer_weights_.qkv_einsum_w2, 0, x, kv, pool_);
} else {
MatVec<kKVHeads * 2 * kQKVDim, kModelDim>(
layer_weights_.qkv_einsum_w, kHeads * kQKVDim * kModelDim, x,
kv, pool_);
}
}
}
}
@ -692,10 +703,16 @@ HWY_NOINLINE void FFW(Activations& activations, size_t num_interleaved,
output_bias = layer_weights->ffw_output_biases.data_scale1();
}
if constexpr (!kIsVit) {
w1 = ConstMat(layer_weights->gating_einsum_w.data(), kModelDim);
w2 = ConstMat(layer_weights->gating_einsum_w.data(), kModelDim, kModelDim,
kModelDim * kFFHiddenDim);
scale = layer_weights->gating_einsum_w.scale();
w1 = layer_weights->gating_einsum_w.data() == nullptr
? ConstMat(layer_weights->gating_einsum_w1.data(), kModelDim)
: ConstMat(layer_weights->gating_einsum_w.data(), kModelDim);
w2 = layer_weights->gating_einsum_w.data() == nullptr
? ConstMat(layer_weights->gating_einsum_w2.data(), kModelDim)
: ConstMat(layer_weights->gating_einsum_w.data(), kModelDim,
kModelDim, kModelDim * kFFHiddenDim);
scale = layer_weights->gating_einsum_w.data() == nullptr
? layer_weights->gating_einsum_w1.scale()
: layer_weights->gating_einsum_w.scale();
w_output = ConstMat(layer_weights->linear_w.data(), kFFHiddenDim);
output_scale = layer_weights->linear_w.scale();
} else {


@ -52,8 +52,6 @@ Gemma::Gemma(GemmaTokenizer&& tokenizer, const ModelInfo& info,
}
Gemma::~Gemma() {
CallForModelAndWeight<DeleteCompressedWeights>(info_.model, info_.weight,
weights_u8_);
}
// There are >100 instantiations of the inference code. To reduce compile time,


@ -15,15 +15,15 @@
#include "gemma/weights.h"
#include <stdio.h>
#include <cstdio>
#include <cstdlib>
#include <vector>
#include "compression/compress.h"
#include "compression/io.h" // Path
#include "gemma/common.h"
#include "gemma/configs.h"
#include "util/allocator.h"
#include "hwy/aligned_allocator.h"
#include "hwy/base.h" // HWY_ABORT
#include "hwy/contrib/thread_pool/thread_pool.h"
#include "hwy/profiler.h"
@ -47,32 +47,23 @@ struct LoadCompressedWeightsT {
CWeights* c_weights = reinterpret_cast<CWeights*>(c_weights_u8.get());
new (c_weights) CWeights(pool);
std::array<float, TConfig::kNumTensorScales> scales;
CacheLoader loader(weights);
ForEachTensor<TConfig>(nullptr, *c_weights, loader);
ForEachType fet =
loader.HaveToc() ? ForEachType::kLoadWithToc : ForEachType::kLoadNoToc;
CWeights::ForEachTensor(
{c_weights}, fet,
[&loader](const char* name, hwy::Span<MatPtr*> tensors) {
loader(name, tensors);
});
std::vector<float> scales(TConfig::kNumTensorScales);
if (TConfig::kNumTensorScales > 0) {
loader.LoadScales(scales.data(), scales.size());
}
if (!loader.ReadAll(pool, c_weights->model_storage)) {
HWY_ABORT("Failed to load model weights.");
}
if (TConfig::kNumTensorScales > 0) {
size_t scale_pos = 0;
for (int layer_idx = 0; layer_idx < TConfig::kLayers; ++layer_idx) {
auto type = TConfig::kLayerConfig[layer_idx];
const size_t idx = static_cast<size_t>(layer_idx);
CompressedLayer<TConfig>* layer_weights = c_weights->GetLayer(idx);
if (type == LayerAttentionType::kGemma) {
layer_weights->attn_vec_einsum_w.set_scale(scales[scale_pos++]);
layer_weights->qkv_einsum_w.set_scale(scales[scale_pos++]);
} else {
layer_weights->griffin.linear_x_w.set_scale(scales[scale_pos++]);
layer_weights->griffin.linear_y_w.set_scale(scales[scale_pos++]);
layer_weights->griffin.linear_out_w.set_scale(scales[scale_pos++]);
layer_weights->griffin.gate_w.set_scale(scales[scale_pos++]);
}
layer_weights->gating_einsum_w.set_scale(scales[scale_pos++]);
layer_weights->linear_w.set_scale(scales[scale_pos++]);
}
HWY_ASSERT(scale_pos == TConfig::kNumTensorScales);
c_weights->GetOrApplyScales(scales);
}
{
PROFILER_ZONE("Startup.Reshape");
@ -102,13 +93,13 @@ void HWY_MAYBE_UNUSED LogVec(const char* name, const float* data, size_t len) {
class WeightLogger {
public:
void operator()(const char* name, hwy::Span<MatPtr*> tensors) {
const MatPtr& tensor = *tensors[0];
if (tensor.scale() != 1.0f) {
printf("[scale=%f] ", tensor.scale());
}
LogVec(name, tensor.data<float>(), tensor.NumElements());
total_weights += tensor.NumElements();
}
size_t total_weights = 0;
};
@ -116,10 +107,11 @@ class WeightLogger {
template <typename TConfig>
struct LogWeightStatsT {
void operator()(const ByteStorageT& weights_u8) const {
auto& weights =
*reinterpret_cast<CompressedWeights<TConfig>*>(weights_u8.get());
WeightLogger logger;
CompressedWeights<TConfig>::ForEachTensor(
{&weights}, ForEachType::kIgnoreNulls, logger);
printf("%-20s %12zu\n", "Total", logger.total_weights);
}
};


@ -18,7 +18,15 @@
#include <stddef.h>
#include <array>
#include <complex>
#include <cstdio>
#include <string>
#include <unordered_set>
#include <vector>
#include "compression/compress.h"
#include "compression/shared.h"
#include "gemma/common.h"
#include "gemma/configs.h"
#include "util/allocator.h"
@ -28,16 +36,82 @@
namespace gcpp {
// Different tensors need to appear in ForEachTensor, depending on what is
// happening.
enum class ForEachType {
// Under normal circumstances, when not initializing or loading, we can
// include all tensors and ignore the null ones.
kIgnoreNulls,
// If there is a table of contents, we can include all tensors.
kLoadWithToc,
// There is no table of contents, so we have to be careful to only include
// tensors that are actually present.
kLoadNoToc,
// We need to initialize all tensors needed when there is no table of
// contents. This differs from kLoadNoToc in that we need to include any
// tensor that is allocated but not loaded directly from file.
kInitNoToc,
};
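// For example (mirroring weights.cc), a loader picks between the two load
// modes depending on whether the file provides a table of contents:
//   ForEachType fet = loader.HaveToc() ? ForEachType::kLoadWithToc
//                                      : ForEachType::kLoadNoToc;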
template <class TConfig>
struct CompressedLayer {
// Large data is constructed separately.
CompressedLayer()
: attn_vec_einsum_w("att_ein", kModelDim, kHeads * kQKVDim),
qkv_einsum_w("qkv_ein", (kHeads + 2 * kKVHeads) * kQKVDim, kModelDim),
qkv_einsum_w1("qkv1_w", kHeads * kQKVDim, kModelDim),
qkv_einsum_w2("qkv2_w", 2 * kKVHeads * kQKVDim, kModelDim),
attention_output_biases("attn_ob", 1, kAOBiasDim),
griffin({.linear_x_w = {"gr_lin_x_w", kGriffinDim, kGriffinDim},
.linear_x_biases = {"gr_lin_x_b", 1, kGriffinDim},
.linear_y_w = {"gr_lin_y_w", kGriffinDim, kGriffinDim},
.linear_y_biases = {"gr_lin_y_b", 1, kGriffinDim},
.linear_out_w = {"gr_lin_out_w", kGriffinDim, kGriffinDim},
.linear_out_biases = {"gr_lin_out_b", 1, kGriffinDim},
.conv_w = {"gr_conv_w", kConv1dWidth, kGriffinDim},
.conv_biases = {"gr_conv_b", 1, kGriffinDim},
.gate_w = {"gr_gate_w", 2 * kGriffinDim, kGriffinDim / kHeads},
.gate_biases = {"gr_gate_b", 1, kGriffinDim * 2},
.a = {"gr_a", 1, kGriffinDim}}),
// MultiHeadDotProductAttention.
vit({.attn_out_w = {"attn_out_w", kHeads * kQKVDim, kModelDim},
.attn_out_b = {"attn_out_b", 1, kModelDim},
.qkv_einsum_w = {"qkv_ein_w", (kHeads + 2 * kKVHeads) * kQKVDim,
kModelDim},
.qkv_einsum_b = {"qkv_ein_b", (kHeads + 2 * kKVHeads), kQKVDim},
.linear_0_w = {"linear_0_w", kModelDim, kFFHiddenDim},
.linear_0_b = {"linear_0_b", 1, kFFHiddenDim},
.linear_1_w = {"linear_1_w", kFFHiddenDim, kModelDim},
.linear_1_b = {"linear_1_b", 1, kModelDim},
.layer_norm_0_bias = {"ln_0_bias", 1, kModelDim},
.layer_norm_0_scale = {"ln_0_scale", 1, kModelDim},
.layer_norm_1_bias = {"ln_1_bias", 1, kModelDim},
.layer_norm_1_scale = {"ln_1_scale", 1, kModelDim}}),
gating_einsum_w("gating_ein", 2 * kFFHiddenDim, kModelDim),
gating_einsum_w1("gating1_w", kFFHiddenDim, kModelDim),
gating_einsum_w2("gating2_w", kFFHiddenDim, kModelDim),
linear_w("linear_w", kModelDim, kFFHiddenDim),
pre_attention_norm_scale("pre_att_ns", 1, kModelDim),
pre_ffw_norm_scale("pre_ff_ns", 1, kModelDim),
post_attention_norm_scale(
"post_att_ns", 1, kPostNorm == PostNormType::Scale ? kModelDim : 0),
post_ffw_norm_scale("post_ff_ns", 1,
kPostNorm == PostNormType::Scale ? kModelDim : 0),
ffw_gating_biases("ffw_gat_b", 1, kFFBiases ? 2 * kFFHiddenDim : 0),
ffw_output_biases("ffw_out_b", 1, kFFBiases ? kModelDim : 0),
att_weights("att_w", kModelDim, kHeads * kQKVDim)
{}
~CompressedLayer() = default;
using Weight = typename TConfig::Weight;
// If weights are f32, also f32; otherwise at least bf16. Useful for ops that
// do not yet support smaller compressed types, or require at least bf16. When
// weights are f32, we also want such tensors to be f32.
using WeightF32OrBF16 =
hwy::If<hwy::IsSame<Weight, float>(), float, hwy::bfloat16_t>;
// If weights are complex, this is also complex.
using WeightF32OrBF16 = hwy::If<
hwy::IsSame<Weight, std::complex<double>>(), std::complex<double>,
hwy::If<hwy::IsSame<Weight, double>(), double,
hwy::If<hwy::IsSame<Weight, float>(), float, hwy::bfloat16_t>>>;
static constexpr size_t kHeads = TConfig::kHeads;
static constexpr size_t kKVHeads = TConfig::kKVHeads;
@ -58,69 +132,75 @@ struct CompressedLayer {
static constexpr size_t kGriffinDim =
TConfig::kGriffinLayers > 0 ? kModelDim : 0;
template <class T, size_t N>
using ArrayT = CompressedArray<T, N>;
template <class T>
using ArrayT = MatPtrT<T>;
union {
struct {
ArrayT<Weight, kAttVecEinsumWSize> attn_vec_einsum_w;
ArrayT<Weight, kQKVEinsumWSize> qkv_einsum_w;
ArrayT<float, kAOBiasDim> attention_output_biases;
};
ArrayT<Weight> attn_vec_einsum_w;
// qkv_einsum_w holds 2 different matrices, which may be separated out.
// On loading, which is used depends on what is in the file.
// At inference, the one with a non-null ptr is used.
ArrayT<Weight> qkv_einsum_w;
ArrayT<Weight> qkv_einsum_w1;
ArrayT<Weight> qkv_einsum_w2;
ArrayT<float> attention_output_biases;
struct {
ArrayT<Weight, kGriffinDim * kGriffinDim> linear_x_w;
ArrayT<float, kGriffinDim> linear_x_biases;
ArrayT<Weight, kGriffinDim * kGriffinDim> linear_y_w;
ArrayT<float, kGriffinDim> linear_y_biases;
ArrayT<Weight, kGriffinDim * kGriffinDim> linear_out_w;
ArrayT<float, kGriffinDim> linear_out_biases;
ArrayT<float, kConv1dWidth * kGriffinDim> conv_w;
ArrayT<float, kGriffinDim> conv_biases;
ArrayT<Weight, kGriffinDim * kGriffinDim / kHeads * 2> gate_w;
ArrayT<float, kGriffinDim * 2> gate_biases;
ArrayT<float, kGriffinDim> a;
ArrayT<Weight> linear_x_w;
ArrayT<float> linear_x_biases;
ArrayT<Weight> linear_y_w;
ArrayT<float> linear_y_biases;
ArrayT<Weight> linear_out_w;
ArrayT<float> linear_out_biases;
ArrayT<float> conv_w;
ArrayT<float> conv_biases;
ArrayT<Weight> gate_w;
ArrayT<float> gate_biases;
ArrayT<float> a;
} griffin;
struct {
// MultiHeadDotProductAttention.
ArrayT<WeightF32OrBF16, kAttVecEinsumWSize> attn_out_w;
ArrayT<float, kModelDim> attn_out_b;
ArrayT<WeightF32OrBF16, kQKVEinsumWSize> qkv_einsum_w;
ArrayT<float, kQKVEinsumBSize> qkv_einsum_b;
ArrayT<WeightF32OrBF16> attn_out_w;
ArrayT<float> attn_out_b;
ArrayT<WeightF32OrBF16> qkv_einsum_w;
ArrayT<float> qkv_einsum_b;
// MlpBlock.
ArrayT<WeightF32OrBF16, kModelDim * kFFHiddenDim> linear_0_w;
ArrayT<float, kFFHiddenDim> linear_0_b;
ArrayT<WeightF32OrBF16, kFFHiddenDim * kModelDim> linear_1_w;
ArrayT<float, kModelDim> linear_1_b;
ArrayT<WeightF32OrBF16> linear_0_w;
ArrayT<float> linear_0_b;
ArrayT<WeightF32OrBF16> linear_1_w;
ArrayT<float> linear_1_b;
// LayerNorm.
ArrayT<WeightF32OrBF16, kModelDim> layer_norm_0_bias;
ArrayT<WeightF32OrBF16, kModelDim> layer_norm_0_scale;
ArrayT<WeightF32OrBF16, kModelDim> layer_norm_1_bias;
ArrayT<WeightF32OrBF16, kModelDim> layer_norm_1_scale;
ArrayT<WeightF32OrBF16> layer_norm_0_bias;
ArrayT<WeightF32OrBF16> layer_norm_0_scale;
ArrayT<WeightF32OrBF16> layer_norm_1_bias;
ArrayT<WeightF32OrBF16> layer_norm_1_scale;
} vit;
};
ArrayT<Weight, kGatingEinsumWSize> gating_einsum_w;
ArrayT<Weight, kModelDim * kFFHiddenDim> linear_w;
// gating_einsum_w holds 2 different matrices, which may be separated out.
// On loading, which is used depends on what is in the file.
// At inference, the one with a non-null ptr is used.
ArrayT<Weight> gating_einsum_w;
ArrayT<Weight> gating_einsum_w1;
ArrayT<Weight> gating_einsum_w2;
ArrayT<Weight> linear_w;
// We don't yet have an RMSNorm that accepts all Weight.
ArrayT<WeightF32OrBF16, kModelDim> pre_attention_norm_scale;
ArrayT<WeightF32OrBF16, kModelDim> pre_ffw_norm_scale;
ArrayT<WeightF32OrBF16, kPostNorm == PostNormType::Scale ? kModelDim : 0>
post_attention_norm_scale;
ArrayT<WeightF32OrBF16, kPostNorm == PostNormType::Scale ? kModelDim : 0>
post_ffw_norm_scale;
ArrayT<WeightF32OrBF16> pre_attention_norm_scale;
ArrayT<WeightF32OrBF16> pre_ffw_norm_scale;
ArrayT<WeightF32OrBF16> post_attention_norm_scale;
ArrayT<WeightF32OrBF16> post_ffw_norm_scale;
ArrayT<float, kFFBiases ? 2 * kFFHiddenDim : 0> ffw_gating_biases;
ArrayT<float, kFFBiases ? kModelDim : 0> ffw_output_biases;
ArrayT<float> ffw_gating_biases;
ArrayT<float> ffw_output_biases;
// Reshaped attention; not loaded from disk via ForEachTensor.
ArrayT<Weight, kModelDim * kHeads * kQKVDim> att_weights;
ArrayT<Weight> att_weights;
// Initializes att_weights from attn_vec_einsum_w, hence this must be called
// after loading weights via ForEachTensor.
// TODO: update compression/convert_weights to bake this in.
void Reshape(MatStorage& storage) {
if (attn_vec_einsum_w.data() == nullptr) return;
constexpr size_t kModelDim = TConfig::kModelDim;
constexpr size_t kHeads = TConfig::kHeads;
constexpr size_t kQKVDim = TConfig::kQKVDim;
@ -129,6 +209,8 @@ struct CompressedLayer {
static_assert(!hwy::IsSame<Weight, NuqStream>());
// Reshape [kHeads, kModelDim, kQKVDim] to [kModelDim, kHeads * kQKVDim].
storage.Allocate();
att_weights.SetPtr(storage);
for (size_t m = 0; m < kModelDim; ++m) {
Weight* HWY_RESTRICT out_row = att_weights.data() + m * kHeads * kQKVDim;
for (size_t h = 0; h < kHeads; ++h) {
@ -139,118 +221,291 @@ struct CompressedLayer {
}
att_weights.set_scale(attn_vec_einsum_w.scale());
}
// Used by ForEachTensor for per-layer tensors.
#define GEMMA_CALL_FUNC(member) \
{ \
for (int i = 0; i < ptrs.size(); ++i) { \
tensors[i] = &ptrs[i]->member; \
} \
if (tensors[0]->Ptr() != nullptr || fet != ForEachType::kIgnoreNulls) { \
func(ptrs[0]->member.CacheName(layer_idx, sep, sep_index).c_str(), \
hwy::Span<MatPtr*>(tensors, ptrs.size())); \
} \
}
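// For illustration (manual expansion, an assumption): GEMMA_CALL_FUNC(linear_w)
// expands roughly to
//   for (int i = 0; i < ptrs.size(); ++i) tensors[i] = &ptrs[i]->linear_w;
//   if (tensors[0]->Ptr() != nullptr || fet != ForEachType::kIgnoreNulls) {
//     func(ptrs[0]->linear_w.CacheName(layer_idx, sep, sep_index).c_str(),
//          hwy::Span<MatPtr*>(tensors, ptrs.size()));
//   }
// i.e. the callback receives the mangled name plus one MatPtr* per model.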
template <class Func>
static void ForEachTensor(const std::vector<CompressedLayer<TConfig>*>& ptrs,
int layer_idx, ForEachType fet, Func func,
char sep = ' ', int sep_index = -1) {
MatPtr* tensors[ptrs.size()];
auto type = TConfig::kLayerConfig[layer_idx];
if (type == LayerAttentionType::kVit) {
// MHA.
GEMMA_CALL_FUNC(vit.attn_out_w);
GEMMA_CALL_FUNC(vit.attn_out_b);
GEMMA_CALL_FUNC(vit.qkv_einsum_w);
GEMMA_CALL_FUNC(vit.qkv_einsum_b);
// MlpBlock.
GEMMA_CALL_FUNC(vit.linear_0_w);
GEMMA_CALL_FUNC(vit.linear_0_b);
GEMMA_CALL_FUNC(vit.linear_1_w);
GEMMA_CALL_FUNC(vit.linear_1_b);
// LayerNorm.
GEMMA_CALL_FUNC(vit.layer_norm_0_bias);
GEMMA_CALL_FUNC(vit.layer_norm_0_scale);
GEMMA_CALL_FUNC(vit.layer_norm_1_bias);
GEMMA_CALL_FUNC(vit.layer_norm_1_scale);
return;
}
if (type == LayerAttentionType::kGemma) {
if (fet != ForEachType::kLoadNoToc) {
GEMMA_CALL_FUNC(att_weights);
}
if (fet == ForEachType::kInitNoToc || fet == ForEachType::kLoadNoToc ||
fet == ForEachType::kIgnoreNulls) {
GEMMA_CALL_FUNC(attn_vec_einsum_w);
}
GEMMA_CALL_FUNC(qkv_einsum_w);
if (fet == ForEachType::kIgnoreNulls ||
fet == ForEachType::kLoadWithToc) {
// The unwanted ones will be null or not in the toc.
GEMMA_CALL_FUNC(qkv_einsum_w1);
GEMMA_CALL_FUNC(qkv_einsum_w2);
}
} else {
GEMMA_CALL_FUNC(griffin.linear_x_w);
GEMMA_CALL_FUNC(griffin.linear_x_biases);
GEMMA_CALL_FUNC(griffin.linear_y_w);
GEMMA_CALL_FUNC(griffin.linear_y_biases);
GEMMA_CALL_FUNC(griffin.linear_out_w);
GEMMA_CALL_FUNC(griffin.linear_out_biases);
GEMMA_CALL_FUNC(griffin.conv_w);
GEMMA_CALL_FUNC(griffin.conv_biases);
GEMMA_CALL_FUNC(griffin.gate_w);
GEMMA_CALL_FUNC(griffin.gate_biases);
GEMMA_CALL_FUNC(griffin.a);
}
GEMMA_CALL_FUNC(gating_einsum_w);
if (fet == ForEachType::kIgnoreNulls || fet == ForEachType::kLoadWithToc) {
// The unwanted ones will be null or not in the toc.
GEMMA_CALL_FUNC(gating_einsum_w1);
GEMMA_CALL_FUNC(gating_einsum_w2);
}
GEMMA_CALL_FUNC(linear_w);
GEMMA_CALL_FUNC(pre_attention_norm_scale);
GEMMA_CALL_FUNC(pre_ffw_norm_scale);
if (TConfig::kPostNorm == PostNormType::Scale) {
GEMMA_CALL_FUNC(post_attention_norm_scale);
GEMMA_CALL_FUNC(post_ffw_norm_scale);
}
if (TConfig::kFFBiases) {
GEMMA_CALL_FUNC(ffw_gating_biases);
GEMMA_CALL_FUNC(ffw_output_biases);
}
if (TConfig::kSoftmaxAttnOutputBiases &&
type == LayerAttentionType::kGemma) {
GEMMA_CALL_FUNC(attention_output_biases);
}
}
using CLayer = CompressedLayer<TConfig>;
std::array<hwy::AlignedFreeUniquePtr<CLayer[]>, TConfig::kLayers> c_layers;
using CVitLayer = CompressedLayer<typename TConfig::VitConfig>;
std::array<hwy::AlignedFreeUniquePtr<CVitLayer[]>,
TConfig::VitConfig::kLayers>
c_vit_layers;
// Sets all the tensors in the layer to zero. Memory must have been allocated.
void ZeroInit(int layer_idx) {
ForEachTensor({this}, layer_idx, ForEachType::kIgnoreNulls,
[](const char*, hwy::Span<MatPtr*> tensors) {
tensors[0]->ZeroInit();
});
}
// Allocates memory for all the tensors in the layer.
// Note that this is slow and only used for a stand-alone layer.
void Allocate() {
layer_storage.clear();
ForEachTensor({this}, /*layer_idx=*/0, ForEachType::kInitNoToc,
[this](const char* name, hwy::Span<MatPtr*> tensors) {
this->layer_storage.emplace_back(*tensors[0]);
layer_storage.back().Allocate();
tensors[0]->SetPtr(layer_storage.back());
});
}
// Storage for all the matrices and vectors. Only used for a stand-alone
// layer. For a model, the CompressedWeights::model_storage is used instead.
std::vector<MatStorage> layer_storage;
};
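For orientation, a small usage sketch of the stand-alone-layer path described above (not part of this commit; `SomeConfig` is a placeholder config type and the layer is assumed to be default-constructible):
// Sketch only: allocate, zero and traverse a stand-alone layer.
template <class SomeConfig>
void ExampleStandAloneLayer() {
  CompressedLayer<SomeConfig> layer;  // assumed default-constructible
  layer.Allocate();                   // fills layer_storage, sets tensor ptrs
  layer.ZeroInit(/*layer_idx=*/0);    // zeroes every allocated tensor
  // The per-layer callback receives the tensor name plus one MatPtr* per layer
  // object passed in the first argument.
  CompressedLayer<SomeConfig>::ForEachTensor(
      {&layer}, /*layer_idx=*/0, ForEachType::kIgnoreNulls,
      [](const char* name, hwy::Span<MatPtr*> tensors) {
        HWY_ASSERT(name != nullptr && tensors.size() == 1);
      });
}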
template <class TConfig>
struct CompressedWeights {
// Must be allocated via AllocateAligned and initialized with placement new.
void* operator new(size_t, void* addr) { return addr; }
void* operator new(size_t) = delete;
void* operator new[](size_t) = delete;
void operator delete(void*) = delete;
void operator delete[](void*) = delete;
explicit CompressedWeights(hwy::ThreadPool& pool)
: embedder_input_embedding("c_embedding", TConfig::kVocabSize,
TConfig::kModelDim),
final_norm_scale("c_final_norm", 1, TConfig::kModelDim),
vit_encoder_norm_bias("c_vit_encoder_norm_bias", 1,
TConfig::VitConfig::kModelDim),
vit_encoder_norm_scale("c_vit_encoder_norm_scale", 1,
TConfig::VitConfig::kModelDim),
vit_img_embedding_bias("c_vit_img_embedding_bias", 1,
TConfig::VitConfig::kModelDim),
vit_img_embedding_kernel("c_vit_img_embedding_kernel", 14 * 14 * 3,
TConfig::VitConfig::kModelDim),
vit_img_pos_embedding("c_vit_img_pos_embedding", 256,
TConfig::VitConfig::kModelDim),
vit_img_head_bias("c_vit_img_head_bias", 1, TConfig::kModelDim),
vit_img_head_kernel("c_vit_img_head_kernel",
TConfig::VitConfig::kModelDim, TConfig::kModelDim),
scale_names({"att_ein", "qkv_ein", "gr_lin_x_w", "gr_lin_y_w",
"gr_lin_out_w", "gr_gate_w", "gating_ein", "linear_w"}) {}
~CompressedWeights() = default;
using Weight = typename TConfig::Weight;
using WeightF32OrBF16 = typename CompressedLayer<TConfig>::WeightF32OrBF16;
using WeightF32OrInputT =
hwy::If<hwy::IsSame<Weight, float>(), float, EmbedderInputT>;
CompressedArray<WeightF32OrInputT, TConfig::kVocabSize * TConfig::kModelDim>
embedder_input_embedding;
using WeightF32OrInputT =
    hwy::If<hwy::IsSame<WeightF32OrBF16, hwy::bfloat16_t>(), EmbedderInputT,
            WeightF32OrBF16>;
using WeightF32OrBF16 =
hwy::If<hwy::IsSame<Weight, float>(), float, hwy::bfloat16_t>;
CompressedArray<WeightF32OrBF16, TConfig::kModelDim> final_norm_scale;
MatPtrT<WeightF32OrInputT> embedder_input_embedding;
MatPtrT<WeightF32OrBF16> final_norm_scale;
// Vit parts.
CompressedArray<WeightF32OrBF16, TConfig::VitConfig::kModelDim>
vit_encoder_norm_bias;
CompressedArray<WeightF32OrBF16, TConfig::VitConfig::kModelDim>
vit_encoder_norm_scale;
CompressedArray<float, TConfig::VitConfig::kModelDim> vit_img_embedding_bias;
CompressedArray<WeightF32OrBF16, TConfig::VitConfig::kModelDim * 14 * 14 * 3>
vit_img_embedding_kernel;
CompressedArray<float, 256 * TConfig::VitConfig::kModelDim>
vit_img_pos_embedding;
MatPtrT<WeightF32OrBF16> vit_encoder_norm_bias;
MatPtrT<WeightF32OrBF16> vit_encoder_norm_scale;
MatPtrT<float> vit_img_embedding_bias;
MatPtrT<WeightF32OrBF16> vit_img_embedding_kernel;
MatPtrT<float> vit_img_pos_embedding;
// The head maps from VitConfig::kModelDim (Vit final layer) to
// kModelDim (LLM input).
CompressedArray<float, TConfig::kModelDim> vit_img_head_bias;
CompressedArray<WeightF32OrBF16,
TConfig::VitConfig::kModelDim * TConfig::kModelDim>
vit_img_head_kernel;
MatPtrT<float> vit_img_head_bias;
MatPtrT<WeightF32OrBF16> vit_img_head_kernel;
// Must be last so that the other arrays remain aligned.
CompressedLayerPointers<TConfig> c_layer_ptrs;
// Storage for all the matrices and vectors.
std::vector<MatStorage> model_storage;
std::unordered_set<std::string> scale_names;
explicit CompressedWeights(hwy::ThreadPool& pool)
: c_layer_ptrs(pool)
{}
CompressedLayer<TConfig> c_layers[TConfig::kLayers];
CompressedLayer<typename TConfig::VitConfig>
vit_layers[TConfig::VitConfig::kLayers];
// Called by weights.cc after ForEachTensor.
void Reshape(hwy::ThreadPool& pool) {
pool.Run(0, TConfig::kLayers, [this](uint64_t layer, size_t /*thread*/) {
GetLayer(layer)->Reshape();
size_t storage_index = model_storage.size();
for (size_t layer = 0; layer < TConfig::kLayers; ++layer) {
model_storage.emplace_back(GetLayer(layer)->att_weights);
}
pool.Run(0, TConfig::kLayers,
[this, storage_index](uint64_t layer, size_t /*thread*/) {
GetLayer(layer)->Reshape(model_storage[storage_index + layer]);
});
}
void ZeroInit() {
hwy::ZeroBytes(&embedder_input_embedding, sizeof(embedder_input_embedding));
hwy::ZeroBytes(&final_norm_scale, sizeof(final_norm_scale));
hwy::ZeroBytes(&vit_encoder_norm_bias, sizeof(vit_encoder_norm_bias));
hwy::ZeroBytes(&vit_encoder_norm_scale, sizeof(vit_encoder_norm_scale));
hwy::ZeroBytes(&vit_img_embedding_bias, sizeof(vit_img_embedding_bias));
hwy::ZeroBytes(&vit_img_embedding_kernel, sizeof(vit_img_embedding_kernel));
hwy::ZeroBytes(&vit_img_head_bias, sizeof(vit_img_head_bias));
hwy::ZeroBytes(&vit_img_head_kernel, sizeof(vit_img_head_kernel));
hwy::ZeroBytes(&vit_img_pos_embedding, sizeof(vit_img_pos_embedding));
embedder_input_embedding.ZeroInit();
final_norm_scale.ZeroInit();
for (int i = 0; i < TConfig::kLayers; ++i) {
hwy::ZeroBytes(GetLayer(i), sizeof(*GetLayer(i)));
}
if constexpr (TConfig::VitConfig::kLayers > 0) {
for (int i = 0; i < TConfig::VitConfig::kLayers; ++i) {
hwy::ZeroBytes(GetVitLayer(i), sizeof(*GetVitLayer(i)));
}
c_layers[i].ZeroInit(i);
}
}
const CompressedLayer<TConfig>* GetLayer(size_t layer) const {
return c_layer_ptrs.c_layers[layer].get();
}
CompressedLayer<TConfig>* GetLayer(size_t layer) {
return c_layer_ptrs.c_layers[layer].get();
return &c_layers[layer];
}
CompressedLayer<TConfig>* GetLayer(size_t layer) { return &c_layers[layer]; }
const CompressedLayer<typename TConfig::VitConfig>* GetVitLayer(
size_t layer) const {
return c_layer_ptrs.c_vit_layers[layer].get();
return &vit_layers[layer];
}
CompressedLayer<typename TConfig::VitConfig>* GetVitLayer(size_t layer) {
return c_layer_ptrs.c_vit_layers[layer].get();
return &vit_layers[layer];
}
// Copies the data from other to *this.
void CopyFrom(const CompressedWeights<TConfig>& other) {
ForEachTensor({this, const_cast<CompressedWeights<TConfig>*>(&other)},
ForEachType::kIgnoreNulls,
[](const char*, hwy::Span<MatPtr*> tensors) {
hwy::CopyBytes(tensors[1]->Ptr(), tensors[0]->Ptr(),
tensors[1]->SizeBytes());
});
}
// If scales is empty, computes the scale factors for the tensors and appends
// them to scales; otherwise applies the given scale factors to the tensors.
void GetOrApplyScales(std::vector<float>& scales) {
int scale_pos = 0;
ForEachTensor(
{this}, ForEachType::kIgnoreNulls,
[&scales, &scale_pos, this](const char*, hwy::Span<MatPtr*> tensors) {
if (this->scale_names.count(tensors[0]->Name())) {
if (scale_pos < scales.size()) {
tensors[0]->set_scale(scales[scale_pos]);
} else {
float scale = ScaleWeights(tensors[0]->data<float>(),
tensors[0]->NumElements());
scales.push_back(scale);
}
++scale_pos;
}
});
HWY_ASSERT(scale_pos == TConfig::kNumTensorScales);
}
template <class Func>
static void ForEachTensor(
const std::vector<CompressedWeights<TConfig>*>& ptrs, ForEachType fet,
Func func) {
std::vector<CompressedLayer<TConfig>*> layers(ptrs.size());
std::vector<CompressedLayer<typename TConfig::VitConfig>*> vit_layers(
ptrs.size());
MatPtr* tensors[ptrs.size()];
// Variables used by GEMMA_CALL_FUNC.
int layer_idx = -1;
char sep = ' ';
int sep_index = -1;
GEMMA_CALL_FUNC(embedder_input_embedding);
GEMMA_CALL_FUNC(final_norm_scale);
if constexpr (TConfig::VitConfig::kLayers > 0) {
// Vit parts.
GEMMA_CALL_FUNC(vit_encoder_norm_bias);
GEMMA_CALL_FUNC(vit_encoder_norm_scale);
GEMMA_CALL_FUNC(vit_img_embedding_bias);
GEMMA_CALL_FUNC(vit_img_embedding_kernel);
GEMMA_CALL_FUNC(vit_img_pos_embedding);
GEMMA_CALL_FUNC(vit_img_head_bias);
GEMMA_CALL_FUNC(vit_img_head_kernel);
}
for (int layer_idx = 0; layer_idx < TConfig::kLayers; ++layer_idx) {
for (int i = 0; i < ptrs.size(); ++i) {
layers[i] = ptrs[i]->GetLayer(layer_idx);
}
CompressedLayer<TConfig>::ForEachTensor(layers, layer_idx, fet, func);
}
// Vit layers. Not supported for compress_weights.
if constexpr (TConfig::VitConfig::kLayers > 0) {
for (int layer_idx = 0; layer_idx < TConfig::VitConfig::kLayers;
++layer_idx) {
auto type = TConfig::VitConfig::kLayerConfig[layer_idx];
HWY_ASSERT(type == LayerAttentionType::kVit);
for (int i = 0; i < ptrs.size(); ++i) {
vit_layers[i] = ptrs[i]->GetVitLayer(layer_idx);
}
CompressedLayer<typename TConfig::VitConfig>::ForEachTensor(
vit_layers, layer_idx, fet, func);
}
}
}
};
#undef GEMMA_CALL_FUNC
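As a rough illustration of the unified iterator (a sketch, not code from this commit; `SomeConfig` is a placeholder), a caller can visit every tensor of a model with a single lambda:
// Sketch only: totals the bytes of every non-null tensor in a model.
template <class SomeConfig>
size_t TotalWeightBytes(CompressedWeights<SomeConfig>& weights) {
  size_t total = 0;
  CompressedWeights<SomeConfig>::ForEachTensor(
      {&weights}, ForEachType::kIgnoreNulls,
      [&total](const char* /*name*/, hwy::Span<MatPtr*> tensors) {
        total += tensors[0]->SizeBytes();
      });
  return total;
}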
// Pair of configs for the compressed and uncompressed weights.
template <class CConfig, class UCConfig>
struct ConfigPair {
using uc = UCConfig;
using c = CConfig;
};
// ----------------------------------------------------------------------------
@ -263,6 +518,20 @@ struct AllocateCompressedWeights {
ByteStorageT weights_u8 = AllocateSizeof<TWeights>();
TWeights* weights = reinterpret_cast<TWeights*>(weights_u8.get());
new (weights) TWeights(pool);
std::vector<MatPtr*> model_toc;
auto& model_storage = weights->model_storage;
TWeights::ForEachTensor(
{weights}, ForEachType::kInitNoToc,
[&model_toc, &model_storage](const char*, hwy::Span<MatPtr*> tensors) {
model_toc.push_back(tensors[0]);
model_storage.emplace_back(*tensors[0]);
});
// Allocate in parallel using the pool.
pool.Run(0, model_storage.size(),
[&model_toc, &model_storage](uint64_t task, size_t /*thread*/) {
model_storage[task].Allocate();
model_toc[task]->SetPtr(model_storage[task]);
});
return weights_u8;
}
};
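The pattern above is the core of the new scheme; a minimal stand-alone sketch (names and shapes are invented for illustration, not part of this commit):
// Sketch only: a MatPtrT carries the name/shape, a MatStorage owns the
// allocation, and SetPtr() points the former at the latter.
inline void ExampleMatStorage(hwy::ThreadPool& pool) {
  MatPtrT<float> bias("example_bias", /*rows=*/1, /*cols=*/2048);
  std::vector<MatStorage> storage;
  storage.emplace_back(bias);  // copies metadata; no memory allocated yet
  pool.Run(0, storage.size(), [&storage](uint64_t task, size_t /*thread*/) {
    storage[task].Allocate();  // allocation can run in parallel, as above
  });
  bias.SetPtr(storage[0]);     // the tensor now points at the owned memory
}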
@ -287,291 +556,11 @@ struct ReshapeCompressedWeights {
// TODO: also add RandInitCompressedWeights
template <class TConfig>
struct DeleteCompressedWeights {
void operator()(ByteStorageT& weights_u8) const {
CompressedWeights<TConfig>& weights =
*reinterpret_cast<CompressedWeights<TConfig>*>(weights_u8.get());
weights.~CompressedWeights<TConfig>();
}
};
ByteStorageT LoadCompressedWeights(const Path& weights, Model model_type,
Type weight_type, hwy::ThreadPool& pool);
void LogWeightStats(Model model, Type weight_type, const ByteStorageT& weights);
// ----------------------------------------------------------------------------
// Iterators
// We rely on `if constexpr` to ensure raw_weights->member is only compiled
// when valid, i.e., kHaveRaw == true. IDE analysis does not understand this,
// so we hide the member access from it.
#if HWY_IDE
#define GEMMA_MEMBER(aggregate, member) nullptr
#else
#define GEMMA_MEMBER(aggregate, member) aggregate->member
#endif
// Used by ForEachTensor for tensors that are not in a layer.
#define GEMMA_CALL_TOP_FUNC(name, member) \
{ \
const float* raw_tensor = nullptr; \
if constexpr (kHaveRaw) { \
raw_tensor = GEMMA_MEMBER(raw_weights, member.data()); \
} \
func(name, raw_tensor, c_weights.member); \
}
// Used by ForEachTensor for per-layer tensors. Writes into name_buf.
#define GEMMA_CALL_FUNC(name, member) \
snprintf(name_buf, sizeof(name_buf), name "_%d", layer_idx); \
{ \
const float* raw_tensor = nullptr; \
if constexpr (kHaveRaw) { \
raw_tensor = GEMMA_MEMBER(raw_layer, member.data()); \
} \
func(name_buf, raw_tensor, c_layer->member); \
}
// Calls func(name, float*, CompressedArray&) for each tensor. float* is
// null if raw_weights is nullptr, e.g., when loading weights from BlobStore.
// Otherwise, RawLayer must be specified and we pass a float* pointing to the
// raw float weights for that tensor for use by compress_weights.cc.
//
// This avoids repeating the list of tensors between loading and compressing,
// while also avoiding dependency on raw_weights.h.
//
// This only calls Func for tensors that TConfig requests/specifies, which means
// scale() is uninitialized for the other tensors, so their data_scale1() must
// not be called. (In other words, if the config doesn't specify a tensor, it
// shouldn't be used.)
template <class TConfig, class RawLayer = void, class RawWeightsPtr, class Func>
void ForEachTensor(RawWeightsPtr raw_weights,
CompressedWeights<TConfig>& c_weights, Func& func) {
constexpr bool kHaveRaw = !hwy::IsSame<RawWeightsPtr, std::nullptr_t>();
GEMMA_CALL_TOP_FUNC("c_embedding", embedder_input_embedding);
GEMMA_CALL_TOP_FUNC("c_final_norm", final_norm_scale);
if constexpr (TConfig::VitConfig::kLayers > 0 && !kHaveRaw) {
GEMMA_CALL_TOP_FUNC("enc_norm_bias", vit_encoder_norm_bias);
GEMMA_CALL_TOP_FUNC("enc_norm_scale", vit_encoder_norm_scale);
GEMMA_CALL_TOP_FUNC("img_emb_bias", vit_img_embedding_bias);
GEMMA_CALL_TOP_FUNC("img_emb_kernel", vit_img_embedding_kernel);
GEMMA_CALL_TOP_FUNC("img_head_bias", vit_img_head_bias);
GEMMA_CALL_TOP_FUNC("img_head_kernel", vit_img_head_kernel);
GEMMA_CALL_TOP_FUNC("img_pos_emb", vit_img_pos_embedding);
}
char name_buf[16];
for (int layer_idx = 0; layer_idx < TConfig::kLayers; ++layer_idx) {
auto type = TConfig::kLayerConfig[layer_idx];
const size_t idx = static_cast<size_t>(layer_idx);
const RawLayer* raw_layer = nullptr;
if constexpr (kHaveRaw) {
raw_layer = raw_weights->GetLayer(idx);
}
CompressedLayer<TConfig>* c_layer = c_weights.GetLayer(idx);
GEMMA_CALL_FUNC("pre_ff_ns", pre_ffw_norm_scale);
GEMMA_CALL_FUNC("gating_ein", gating_einsum_w);
GEMMA_CALL_FUNC("linear_w", linear_w);
if (type == LayerAttentionType::kGemma) {
GEMMA_CALL_FUNC("qkv_ein", qkv_einsum_w);
GEMMA_CALL_FUNC("att_ein", attn_vec_einsum_w);
} else {
GEMMA_CALL_FUNC("gr_lin_x_w", griffin.linear_x_w);
GEMMA_CALL_FUNC("gr_lin_x_b", griffin.linear_x_biases);
GEMMA_CALL_FUNC("gr_lin_y_w", griffin.linear_y_w);
GEMMA_CALL_FUNC("gr_lin_y_b", griffin.linear_y_biases);
GEMMA_CALL_FUNC("gr_lin_out_w", griffin.linear_out_w);
GEMMA_CALL_FUNC("gr_lin_out_b", griffin.linear_out_biases);
GEMMA_CALL_FUNC("gr_conv_w", griffin.conv_w);
GEMMA_CALL_FUNC("gr_conv_b", griffin.conv_biases);
GEMMA_CALL_FUNC("gr_gate_w", griffin.gate_w);
GEMMA_CALL_FUNC("gr_gate_b", griffin.gate_biases);
GEMMA_CALL_FUNC("gr_a", griffin.a);
}
GEMMA_CALL_FUNC("pre_att_ns", pre_attention_norm_scale);
if (TConfig::kPostNorm == PostNormType::Scale) {
GEMMA_CALL_FUNC("post_att_ns", post_attention_norm_scale);
GEMMA_CALL_FUNC("post_ff_ns", post_ffw_norm_scale);
}
if (TConfig::kFFBiases) {
GEMMA_CALL_FUNC("ffw_gat_b", ffw_gating_biases);
GEMMA_CALL_FUNC("ffw_out_b", ffw_output_biases);
}
if (TConfig::kSoftmaxAttnOutputBiases &&
type == LayerAttentionType::kGemma) {
GEMMA_CALL_FUNC("attn_ob", attention_output_biases);
}
}
// Vit layers. Not supported for compress_weights.
if constexpr (TConfig::VitConfig::kLayers > 0 && !kHaveRaw) {
for (int layer_idx = 0; layer_idx < TConfig::VitConfig::kLayers;
++layer_idx) {
auto type = TConfig::VitConfig::kLayerConfig[layer_idx];
HWY_ASSERT(type == LayerAttentionType::kVit);
const size_t idx = static_cast<size_t>(layer_idx);
const RawLayer* raw_layer = nullptr;
CompressedLayer<typename TConfig::VitConfig>* c_layer =
c_weights.GetVitLayer(idx);
// MHA.
GEMMA_CALL_FUNC("attn_out_w", vit.attn_out_w);
GEMMA_CALL_FUNC("attn_out_b", vit.attn_out_b);
GEMMA_CALL_FUNC("qkv_ein_w", vit.qkv_einsum_w);
GEMMA_CALL_FUNC("qkv_ein_b", vit.qkv_einsum_b);
// MlpBlock.
GEMMA_CALL_FUNC("linear_0_w", vit.linear_0_w);
GEMMA_CALL_FUNC("linear_0_b", vit.linear_0_b);
GEMMA_CALL_FUNC("linear_1_w", vit.linear_1_w);
GEMMA_CALL_FUNC("linear_1_b", vit.linear_1_b);
// LayerNorm.
GEMMA_CALL_FUNC("ln_0_bias", vit.layer_norm_0_bias);
GEMMA_CALL_FUNC("ln_0_scale", vit.layer_norm_0_scale);
GEMMA_CALL_FUNC("ln_1_bias", vit.layer_norm_1_bias);
GEMMA_CALL_FUNC("ln_1_scale", vit.layer_norm_1_scale);
}
}
#undef GEMMA_CALL_FUNC
#undef GEMMA_CALL_TOP_FUNC
} // ForEachTensor
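For reference, a sketch of a Func accepted by the iterator above (not from this commit; assumes <cstdio> is available):
// Sketch only: logs each tensor name and whether raw float data was supplied.
struct LogTensorNames {
  template <typename CompressedT>
  void operator()(const char* name, const float* raw,
                  CompressedT& /*compressed*/) const {
    fprintf(stderr, "%s (raw %s)\n", name, raw ? "present" : "absent");
  }
};
// Usage without raw weights: LogTensorNames logger;
//   ForEachTensor<SomeConfig>(nullptr, c_weights, logger);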
#define GEMMA_CALL_TOP_FUNC1(name, member) func(name, weights1.member)
#define GEMMA_CALL_TOP_FUNC2(name, member) \
func(name, weights1.member, weights2.member)
#define GEMMA_CALL_TOP_FUNC3(name, member) \
func(name, weights1.member, weights2.member, weights3.member)
#define GEMMA_CALL_TOP_FUNC4(name, member) \
func(name, weights1.member, weights2.member, \
weights3.member, weights4.member)
#define GEMMA_CALL_LAYER_FUNC1(name, member) \
snprintf(name_buf, sizeof(name_buf), name "_%d", layer_idx); \
func(name_buf, layer1.member)
#define GEMMA_CALL_LAYER_FUNC2(name, member) \
snprintf(name_buf, sizeof(name_buf), name "_%d", layer_idx); \
func(name_buf, layer1.member, layer2.member)
#define GEMMA_CALL_LAYER_FUNC3(name, member) \
snprintf(name_buf, sizeof(name_buf), name "_%d", layer_idx); \
func(name_buf, layer1.member, layer2.member, layer3.member)
#define GEMMA_CALL_LAYER_FUNC4(name, member) \
snprintf(name_buf, sizeof(name_buf), name "_%d", layer_idx); \
func(name_buf, layer1.member, layer2.member, layer3.member, layer4.member)
#define GEMMA_CALL_ALL_LAYER_FUNC(N) \
if (type == LayerAttentionType::kGemma) { \
GEMMA_CALL_LAYER_FUNC ## N("att_ein", attn_vec_einsum_w); \
GEMMA_CALL_LAYER_FUNC ## N("qkv_ein", qkv_einsum_w); \
} else { \
GEMMA_CALL_LAYER_FUNC ## N("gr_lin_x_w", griffin.linear_x_w); \
GEMMA_CALL_LAYER_FUNC ## N("gr_lin_x_b", griffin.linear_x_biases); \
GEMMA_CALL_LAYER_FUNC ## N("gr_lin_y_w", griffin.linear_y_w); \
GEMMA_CALL_LAYER_FUNC ## N("gr_lin_y_b", griffin.linear_y_biases); \
GEMMA_CALL_LAYER_FUNC ## N("gr_lin_out_w", griffin.linear_out_w); \
GEMMA_CALL_LAYER_FUNC ## N("gr_lin_out_b", griffin.linear_out_biases); \
GEMMA_CALL_LAYER_FUNC ## N("gr_conv_w", griffin.conv_w); \
GEMMA_CALL_LAYER_FUNC ## N("gr_conv_b", griffin.conv_biases); \
GEMMA_CALL_LAYER_FUNC ## N("gr_gate_w", griffin.gate_w); \
GEMMA_CALL_LAYER_FUNC ## N("gr_gate_b", griffin.gate_biases); \
GEMMA_CALL_LAYER_FUNC ## N("gr_a", griffin.a); \
} \
GEMMA_CALL_LAYER_FUNC ## N("gating_ein", gating_einsum_w); \
GEMMA_CALL_LAYER_FUNC ## N("linear_w", linear_w); \
GEMMA_CALL_LAYER_FUNC ## N("pre_att_ns", pre_attention_norm_scale); \
if (TConfig::kPostNorm == PostNormType::Scale) { \
GEMMA_CALL_LAYER_FUNC ## N("post_att_ns", post_attention_norm_scale); \
GEMMA_CALL_LAYER_FUNC ## N("post_ff_ns", post_ffw_norm_scale); \
} \
GEMMA_CALL_LAYER_FUNC ## N("pre_ff_ns", pre_ffw_norm_scale); \
if (TConfig::kFFBiases) { \
GEMMA_CALL_LAYER_FUNC ## N("ffw_gat_b", ffw_gating_biases); \
GEMMA_CALL_LAYER_FUNC ## N("ffw_out_b", ffw_output_biases); \
} \
if (TConfig::kSoftmaxAttnOutputBiases && \
type == LayerAttentionType::kGemma) { \
GEMMA_CALL_LAYER_FUNC ## N("attn_ob", attention_output_biases); \
}
template <typename TConfig, class Func>
void ForEachTensor1(Func& func, const CompressedWeights<TConfig>& weights1) {
GEMMA_CALL_TOP_FUNC1("embedding", embedder_input_embedding);
GEMMA_CALL_TOP_FUNC1("final_norm", final_norm_scale);
char name_buf[16];
for (int layer_idx = 0; layer_idx < TConfig::kLayers; ++layer_idx) {
auto type = TConfig::kLayerConfig[layer_idx];
const size_t idx = static_cast<size_t>(layer_idx);
const CompressedLayer<TConfig>& layer1 = *weights1.GetLayer(idx);
GEMMA_CALL_ALL_LAYER_FUNC(1)
}
}
template <typename TConfig, class Func>
void ForEachTensor1(Func& func, CompressedWeights<TConfig>& weights1) {
GEMMA_CALL_TOP_FUNC1("embedding", embedder_input_embedding);
GEMMA_CALL_TOP_FUNC1("final_norm", final_norm_scale);
char name_buf[16];
for (int layer_idx = 0; layer_idx < TConfig::kLayers; ++layer_idx) {
auto type = TConfig::kLayerConfig[layer_idx];
const size_t idx = static_cast<size_t>(layer_idx);
CompressedLayer<TConfig>& layer1 = *weights1.GetLayer(idx);
GEMMA_CALL_ALL_LAYER_FUNC(1)
}
}
template <typename TConfig, class Func>
void ForEachTensor2(Func& func, const CompressedWeights<TConfig>& weights1,
CompressedWeights<TConfig>& weights2) {
GEMMA_CALL_TOP_FUNC2("embedding", embedder_input_embedding);
GEMMA_CALL_TOP_FUNC2("final_norm", final_norm_scale);
char name_buf[16];
for (int layer_idx = 0; layer_idx < TConfig::kLayers; ++layer_idx) {
auto type = TConfig::kLayerConfig[layer_idx];
const size_t idx = static_cast<size_t>(layer_idx);
const CompressedLayer<TConfig>& layer1 = *weights1.GetLayer(idx);
CompressedLayer<TConfig>& layer2 = *weights2.GetLayer(idx);
GEMMA_CALL_ALL_LAYER_FUNC(2)
}
}
template <typename TConfig, class Func>
void ForEachTensor4(Func& func, const CompressedWeights<TConfig>& weights1,
CompressedWeights<TConfig>& weights2,
CompressedWeights<TConfig>& weights3,
CompressedWeights<TConfig>& weights4) {
GEMMA_CALL_TOP_FUNC4("embedding", embedder_input_embedding);
GEMMA_CALL_TOP_FUNC4("final_norm", final_norm_scale);
char name_buf[16];
for (int layer_idx = 0; layer_idx < TConfig::kLayers; ++layer_idx) {
auto type = TConfig::kLayerConfig[layer_idx];
const size_t idx = static_cast<size_t>(layer_idx);
const CompressedLayer<TConfig>& layer1 = *weights1.GetLayer(idx);
CompressedLayer<TConfig>& layer2 = *weights2.GetLayer(idx);
CompressedLayer<TConfig>& layer3 = *weights3.GetLayer(idx);
CompressedLayer<TConfig>& layer4 = *weights4.GetLayer(idx);
GEMMA_CALL_ALL_LAYER_FUNC(4)
}
}
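These N-ary iterators pass one tensor reference per weights object; a sketch of a compatible functor (not part of this commit):
// Sketch only: accumulates the total element count across a model's tensors.
struct CountElements {
  size_t total = 0;
  template <typename TensorT>
  void operator()(const char* /*name*/, const TensorT& tensor) {
    total += tensor.NumElements();
  }
};
// Usage: CountElements counter; ForEachTensor1(counter, weights);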
#undef GEMMA_CALL_TOP_FUNC1
#undef GEMMA_CALL_TOP_FUNC2
#undef GEMMA_CALL_TOP_FUNC3
#undef GEMMA_CALL_TOP_FUNC4
#undef GEMMA_CALL_LAYER_FUNC1
#undef GEMMA_CALL_LAYER_FUNC2
#undef GEMMA_CALL_LAYER_FUNC3
#undef GEMMA_CALL_LAYER_FUNC4
#undef GEMMA_CALL_ALL_LAYER_FUNC
} // namespace gcpp
#endif // THIRD_PARTY_GEMMA_CPP_GEMMA_WEIGHTS_H_

View File

@ -377,20 +377,23 @@ HWY_INLINE float Dot(const WT* HWY_RESTRICT w, const VT* vec, size_t num) {
}
// Adapter for use by matvec-inl.h. TODO: remove when that is no longer used.
template <size_t kCapacity, typename VT>
HWY_INLINE float Dot(const std::array<float, kCapacity>& w, size_t w_ofs,
const VT* vec, size_t num) {
template <typename MatT, size_t kCapacity, typename VT>
HWY_INLINE float Dot(const CompressedArray<MatT, kCapacity>& w, size_t w_ofs,
const VT* vec_aligned, size_t num) {
const hn::ScalableTag<VT> d;
return Dot(d, MakeConstSpan(w.data(), kCapacity), w_ofs, vec, num);
return w.scale() *
Dot(d, MakeConstSpan(w.data(), kCapacity), w_ofs, vec_aligned, num);
}
// Adapter for use by matvec-inl.h. TODO: remove when that is no longer used.
template <typename MatT, size_t kCapacity, typename VT>
HWY_INLINE float Dot(const CompressedArray<MatT, kCapacity>& w, size_t w_ofs,
const VT* vec, size_t num) {
template <typename MatT, typename VT>
HWY_INLINE float Dot(const MatPtrT<MatT>& w, size_t w_ofs,
const VT* vec_aligned, size_t num) {
const hn::ScalableTag<VT> d;
return w.scale() *
Dot(d, MakeConstSpan(w.data(), kCapacity), w_ofs, vec, num);
return w.scale() * Dot(d,
MakeConstSpan(reinterpret_cast<const MatT*>(w.Ptr()),
w.NumElements()),
w_ofs, vec_aligned, num);
}
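A small usage sketch of the adapter above (not from this commit; assumes the usual row-major layout with w.Cols() columns per row):
// Sketch only: dot of one matrix row with an aligned input vector.
template <typename MatT, typename VT>
HWY_INLINE float RowDot(const MatPtrT<MatT>& w, size_t row,
                        const VT* HWY_RESTRICT vec_aligned) {
  return Dot(w, /*w_ofs=*/row * w.Cols(), vec_aligned, w.Cols());
}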
// NOLINTNEXTLINE(google-readability-namespace-comments)

View File

@ -46,18 +46,18 @@ namespace HWY_NAMESPACE {
using FloatPtr = hwy::AlignedFreeUniquePtr<float[]>;
template <size_t kOuter, size_t kInner, size_t kNum = kOuter * kInner>
FloatPtr SimpleMatVecAdd(const CompressedArray<float, kNum>& mat,
const FloatPtr& vec, const FloatPtr& add) {
FloatPtr raw_mat = hwy::AllocateAligned<float>(kNum);
FloatPtr out = hwy::AllocateAligned<float>(kOuter);
FloatPtr SimpleMatVecAdd(const MatStorageT<float>& mat, const FloatPtr& vec,
const FloatPtr& add) {
FloatPtr raw_mat = hwy::AllocateAligned<float>(mat.NumElements());
FloatPtr out = hwy::AllocateAligned<float>(mat.Rows());
HWY_ASSERT(raw_mat && out);
const hn::ScalableTag<float> df;
DecompressAndZeroPad(df, MakeSpan(mat.data(), kNum), 0, raw_mat.get(), kNum);
for (size_t idx_row = 0; idx_row < kOuter; idx_row++) {
DecompressAndZeroPad(df, MakeSpan(mat.data(), mat.NumElements()), 0,
raw_mat.get(), mat.NumElements());
for (size_t idx_row = 0; idx_row < mat.Rows(); idx_row++) {
out[idx_row] = 0.0f;
for (size_t idx_col = 0; idx_col < kInner; idx_col++) {
out[idx_row] += raw_mat[kInner * idx_row + idx_col] * vec[idx_col];
for (size_t idx_col = 0; idx_col < mat.Cols(); idx_col++) {
out[idx_row] += raw_mat[mat.Cols() * idx_row + idx_col] * vec[idx_col];
}
out[idx_row] *= mat.scale();
out[idx_row] += add[idx_row];
@ -65,13 +65,12 @@ FloatPtr SimpleMatVecAdd(const CompressedArray<float, kNum>& mat,
return out;
}
template <typename MatT, size_t kOuter, size_t kInner,
size_t kNum = kOuter * kInner,
class MatPtr = std::unique_ptr<CompressedArray<MatT, kNum>>>
MatPtr GenerateMat(size_t offset, hwy::ThreadPool& pool) {
template <typename MatT, size_t kOuter, size_t kInner>
std::unique_ptr<MatStorageT<float>> GenerateMat(size_t offset,
hwy::ThreadPool& pool) {
gcpp::CompressWorkingSet ws;
MatPtr mat = std::make_unique<CompressedArray<MatT, kNum>>();
FloatPtr raw_mat = hwy::AllocateAligned<float>(kNum);
auto mat = std::make_unique<MatStorageT<float>>("TestMat", kOuter, kInner);
FloatPtr raw_mat = hwy::AllocateAligned<float>(mat->NumElements());
HWY_ASSERT(raw_mat);
const float scale = 1.0f / kInner;
pool.Run(0, kOuter, [&](const size_t i, size_t /*thread*/) {
@ -81,7 +80,7 @@ MatPtr GenerateMat(size_t offset, hwy::ThreadPool& pool) {
}
});
CompressScaled(raw_mat.get(), kNum, ws, *mat, pool);
CompressScaled(raw_mat.get(), mat->NumElements(), ws, *mat, pool);
mat->set_scale(1.9f); // Arbitrary value, different from 1.
return mat;
}
@ -113,7 +112,7 @@ void TestMatVecAdd() {
auto mat = GenerateMat<float, kOuter, kInner>(0, pool);
FloatPtr vec = GenerateVec<kInner>(0);
FloatPtr add = GenerateVec<kOuter>(0);
FloatPtr expected_out = SimpleMatVecAdd<kOuter, kInner>(*mat, vec, add);
FloatPtr expected_out = SimpleMatVecAdd(*mat, vec, add);
FloatPtr actual_out = hwy::AllocateAligned<float>(kOuter);
HWY_ASSERT(vec && add && expected_out && actual_out);
MatVecAdd<kOuter, kInner>(*mat, 0, vec.get(), add.get(), actual_out.get(),
@ -130,8 +129,8 @@ void TestTwoMatVecAdd() {
FloatPtr vec = GenerateVec<kInner>(0);
FloatPtr add0 = GenerateVec<kOuter>(0);
FloatPtr add1 = GenerateVec<kOuter>(1);
FloatPtr expected_out0 = SimpleMatVecAdd<kOuter, kInner>(*mat0, vec, add0);
FloatPtr expected_out1 = SimpleMatVecAdd<kOuter, kInner>(*mat1, vec, add1);
FloatPtr expected_out0 = SimpleMatVecAdd(*mat0, vec, add0);
FloatPtr expected_out1 = SimpleMatVecAdd(*mat1, vec, add1);
FloatPtr actual_out0 = hwy::AllocateAligned<float>(kOuter);
FloatPtr actual_out1 = hwy::AllocateAligned<float>(kOuter);
HWY_ASSERT(vec && add0 && add1 && expected_out0 && actual_out0 &&
@ -151,8 +150,8 @@ void TestTwoOfsMatVecAddLoop() {
FloatPtr vec = GenerateVec<kInner>(0);
FloatPtr add0 = GenerateVec<kOuter>(0);
FloatPtr add1 = GenerateVec<kOuter>(1);
FloatPtr expected_out0 = SimpleMatVecAdd<kOuter, kInner>(*mat, vec, add0);
FloatPtr expected_out1 = SimpleMatVecAdd<kOuter, kInner>(*mat, vec, add1);
FloatPtr expected_out0 = SimpleMatVecAdd(*mat, vec, add0);
FloatPtr expected_out1 = SimpleMatVecAdd(*mat, vec, add1);
FloatPtr actual_out0 = hwy::AllocateAligned<float>(kOuter);
FloatPtr actual_out1 = hwy::AllocateAligned<float>(kOuter);
HWY_ASSERT(vec && add0 && add1 && expected_out0 && actual_out0 &&

View File

@ -28,6 +28,8 @@ namespace gcpp {
// Bundles ptr/size/stride arguments to simplify MatMul call sites. T can be
// const or non-const. Create via ConstMat/MutableMat.
// TODO(rays): Replace with MatPtr and get rid of stride, which is only != cols
// in one place.
template <typename T>
struct Mat {
bool NotEmpty() const {

View File

@ -52,13 +52,13 @@ using FloatPtr = hwy::AlignedFreeUniquePtr<float[]>;
// Generates inputs: deterministic, within max SfpStream range.
template <typename MatT, size_t kRows, size_t kCols,
size_t kNum = kRows * kCols,
class MatPtr = std::unique_ptr<CompressedArray<MatT, kNum>>>
class MatPtr = std::unique_ptr<MatStorageT<MatT>>>
MatPtr GenerateMat(size_t offset, hwy::ThreadPool& pool) {
gcpp::CompressWorkingSet ws;
FloatPtr content = hwy::AllocateAligned<float>(kNum);
auto mat = std::make_unique<MatStorageT<MatT>>("test", kRows, kCols);
FloatPtr content = hwy::AllocateAligned<float>(mat->NumElements());
HWY_ASSERT(content);
const float scale = SfpStream::kMax / (kNum + offset);
const float scale = SfpStream::kMax / (mat->NumElements() + offset);
pool.Run(0, kRows, [&](const size_t i, size_t /*thread*/) {
for (size_t j = 0; j < kCols; j++) {
content[i * kCols + j] =
@ -66,19 +66,18 @@ MatPtr GenerateMat(size_t offset, hwy::ThreadPool& pool) {
}
});
MatPtr mat = std::make_unique<CompressedArray<MatT, kNum>>();
CompressScaled(content.get(), kNum, ws, *mat, pool);
CompressScaled(content.get(), mat->NumElements(), ws, *mat, pool);
mat->set_scale(0.6f); // Arbitrary value, different from 1.
return mat;
}
template <typename MatT, size_t kRows, size_t kCols,
size_t kNum = kRows * kCols,
class MatPtr = std::unique_ptr<CompressedArray<MatT, kNum>>>
class MatPtr = std::unique_ptr<MatStorageT<MatT>>>
MatPtr GenerateTransposedMat(size_t offset, hwy::ThreadPool& pool) {
gcpp::CompressWorkingSet ws;
FloatPtr content = hwy::AllocateAligned<float>(kNum);
const float scale = SfpStream::kMax / (kNum + offset);
MatPtr mat = std::make_unique<MatStorageT<MatT>>("test", kCols, kRows);
FloatPtr content = hwy::AllocateAligned<float>(mat->NumElements());
const float scale = SfpStream::kMax / (mat->NumElements() + offset);
pool.Run(0, kRows, [&](const size_t i, size_t /*thread*/) {
for (size_t j = 0; j < kCols; j++) {
content[j * kRows + i] =
@ -86,27 +85,25 @@ MatPtr GenerateTransposedMat(size_t offset, hwy::ThreadPool& pool) {
}
});
MatPtr mat = std::make_unique<CompressedArray<MatT, kNum>>();
CompressScaled(content.get(), kNum, ws, *mat, pool);
CompressScaled(content.get(), mat->NumElements(), ws, *mat, pool);
// Arbitrary value, different from 1, must match GenerateMatHeap.
mat->set_scale(0.6f);
return mat;
}
template <typename MatT, size_t kRows, size_t kCols,
size_t kNum = kRows * kCols,
class MatPtr = std::unique_ptr<CompressedArray<MatT, kNum>>>
class MatPtr = std::unique_ptr<MatStorageT<MatT>>>
MatPtr GenerateZeroMat(hwy::ThreadPool& pool) {
gcpp::CompressWorkingSet ws;
FloatPtr content = hwy::AllocateAligned<float>(kNum);
auto mat = std::make_unique<MatStorageT<MatT>>("Array", kRows, kCols);
FloatPtr content = hwy::AllocateAligned<float>(mat->NumElements());
HWY_ASSERT(content);
pool.Run(0, kRows, [&](const size_t i, size_t thread) {
hwy::ZeroBytes(&content[i * kCols], kCols * sizeof(content[0]));
});
MatPtr mat = std::make_unique<CompressedArray<MatT, kNum>>();
CompressScaled(content.get(), kNum, ws, *mat, pool);
CompressScaled(content.get(), mat->NumElements(), ws, *mat, pool);
mat->set_scale(1.2f); // Arbitrary value, different from 1.
return mat;
}
@ -216,21 +213,21 @@ void TestMatMul(MatMulEnv& env) {
kRowsAC, kColsARowsB, kColsBC, kAdd, TypeName<MatTA>(),
TypeName<MatTB>());
std::unique_ptr<CompressedArray<MatTA, kRowsAC * kColsARowsB>> a =
std::unique_ptr<MatStorageT<MatTA>> a =
GenerateMat<MatTA, kRowsAC, kColsARowsB>(0, pool);
std::unique_ptr<CompressedArray<MatTB, kColsARowsB * kColsBC>> b_trans =
std::unique_ptr<MatStorageT<MatTB>> b_trans =
GenerateTransposedMat<MatTB, kColsARowsB, kColsBC>(0, pool);
FloatPtr c = hwy::AllocateAligned<float>(kRowsAC * kColsBC);
HWY_ASSERT(c);
const float scale = a->scale() * b_trans->scale();
std::unique_ptr<CompressedArray<float, kColsBC>> add;
std::unique_ptr<MatStorageT<float>> add;
if (kAdd) {
add = GenerateMat<float, 1, kColsBC>(0, pool);
add->set_scale(1.0f);
}
std::unique_ptr<CompressedArray<float, kRowsAC * kColsBC>> c_slow =
std::unique_ptr<MatStorageT<float>> c_slow =
GenerateZeroMat<float, kRowsAC, kColsBC>(pool);
const double start_slow = hwy::platform::Now();
MatMulSlow(kRowsAC, kColsARowsB, kColsBC, a->data(), b_trans->data(), scale,

View File

@ -214,9 +214,9 @@ HWY_INLINE void MatVec(const ArrayT& mat, const size_t mat_ofs,
}
// Two matrices, same vector
template <bool kAdd, size_t kOuter, size_t kInner, typename ArrayT,
typename VecT, typename AddT>
HWY_NOINLINE void TwoMatVecT(const ArrayT& mat0, const ArrayT& mat1,
template <bool kAdd, size_t kOuter, size_t kInner, typename ArrayT1,
typename ArrayT2, typename VecT, typename AddT>
HWY_NOINLINE void TwoMatVecT(const ArrayT1& mat0, const ArrayT2& mat1,
const size_t mat_ofs,
const VecT* HWY_RESTRICT vec_aligned,
const AddT* HWY_RESTRICT add0,
@ -254,10 +254,10 @@ HWY_NOINLINE void TwoMatVecT(const ArrayT& mat0, const ArrayT& mat1,
}
// With addition
template <size_t kOuter, size_t kInner, typename ArrayT, typename VecT,
typename AddT>
template <size_t kOuter, size_t kInner, typename ArrayT1, typename ArrayT2,
typename VecT, typename AddT>
HWY_NOINLINE void TwoMatVecAdd(
const ArrayT& mat0, const ArrayT& mat1, const size_t mat_ofs,
const ArrayT1& mat0, const ArrayT2& mat1, const size_t mat_ofs,
const VecT* HWY_RESTRICT vec_aligned, const AddT* HWY_RESTRICT add0,
const AddT* HWY_RESTRICT add1, float* HWY_RESTRICT out0,
float* HWY_RESTRICT out1, hwy::ThreadPool& pool) {
@ -266,13 +266,14 @@ HWY_NOINLINE void TwoMatVecAdd(
}
// Without addition
template <size_t kOuter, size_t kInner, typename ArrayT, typename VecT>
HWY_NOINLINE void TwoMatVec(const ArrayT& mat0, const ArrayT& mat1,
template <size_t kOuter, size_t kInner, typename ArrayT1, typename ArrayT2,
typename VecT>
HWY_NOINLINE void TwoMatVec(const ArrayT1& mat0, const ArrayT2& mat1,
const size_t mat_ofs,
const VecT* HWY_RESTRICT vec_aligned,
float* HWY_RESTRICT out0, float* HWY_RESTRICT out1,
hwy::ThreadPool& pool) {
TwoMatVecT</*kAdd=*/false, kOuter, kInner, ArrayT, VecT, VecT>(
TwoMatVecT</*kAdd=*/false, kOuter, kInner, ArrayT1, ArrayT2, VecT, VecT>(
mat0, mat1, mat_ofs, vec_aligned, /*add0=*/nullptr, /*add1=*/nullptr,
out0, out1, pool);
}
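The split into ArrayT1/ArrayT2 lets the two matrices use different storage types; a sketch of such a call (not from this commit; assumes CompressedArray is still visible here):
// Sketch only: the two matrices no longer need the same array type.
template <size_t kOuter, size_t kInner, typename VT>
HWY_NOINLINE void ExampleMixedTwoMatVec(
    const MatPtrT<float>& mat0,
    const CompressedArray<float, kOuter * kInner>& mat1,
    const VT* HWY_RESTRICT vec_aligned, float* HWY_RESTRICT out0,
    float* HWY_RESTRICT out1, hwy::ThreadPool& pool) {
  TwoMatVec<kOuter, kInner>(mat0, mat1, /*mat_ofs=*/0, vec_aligned, out0, out1,
                            pool);
}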

View File

@ -146,7 +146,7 @@ bool Image::WriteBinary(const std::string& filename) const {
std::cerr << "Failed to open " << filename << "\n";
return false;
}
for (int i = 0; i < data_.size(); ++i) {
for (size_t i = 0; i < data_.size(); ++i) {
file.write(reinterpret_cast<const char*>(&data_[i]), sizeof(float));
}
file.close();