gemma.cpp/gemma/configs_test.cc

#include "gemma/configs.h"
#include <array>
#include <cstddef>
#include <cstdint>
#include <type_traits>
#include <vector>
#include "gtest/gtest.h"
#include "hwy/aligned_allocator.h"
namespace gcpp {
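
// The "Old*" helpers and structs below replicate the former compile-time
// (template-based) model configurations. Each test checks that the runtime
// ModelConfig produced by ConfigFromModel() matches the values those configs
// used to hard-code.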
template <size_t kNum>
constexpr std::array<LayerAttentionType, kNum> OldFixedLayerConfig(
    LayerAttentionType type) {
  std::array<LayerAttentionType, kNum> config = {};
  for (LayerAttentionType& l : config) {
    l = type;
  }
  return config;
}

template <size_t kNum>
constexpr std::array<size_t, kNum> OldFixedAttentionWindowSizes(
    size_t window_size) {
  std::array<size_t, kNum> window_size_configs = {};
  for (size_t& l : window_size_configs) {
    l = window_size;
  }
  return window_size_configs;
}

// Repeats window_size_pattern kNum / kPatternSize times.
template <size_t kNum, size_t kPatternSize>
constexpr std::array<size_t, kNum> OldRepeatedAttentionWindowSizes(
    const std::array<size_t, kPatternSize>& window_size_pattern) {
  static_assert(kNum % kPatternSize == 0,
                "kNum must be a multiple of kPatternSize");
  std::array<size_t, kNum> window_size_configs = {};
  for (size_t i = 0; i < kNum; ++i) {
    window_size_configs[i] = window_size_pattern[i % kPatternSize];
  }
  return window_size_configs;
}

template <size_t kNumLayers>
constexpr size_t OldNumLayersOfTypeBefore(
    const std::array<LayerAttentionType, kNumLayers>& layers,
    LayerAttentionType type, size_t num) {
  size_t count = 0;
  for (size_t i = 0; i < num; i++) {
    if (layers[i] == type) count++;
  }
  return count;
}
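
// Per-layer and per-position KV cache sizes derived from the old compile-time
// constants: each layer stores kKVHeads * kQKVDim values for both K and V,
// and a position spans kGemmaLayers layers.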
template <class TConfig, typename = void>
struct CacheLayerSize {
  constexpr size_t operator()() const {
    return TConfig::kKVHeads * TConfig::kQKVDim * 2;
  }
};

template <class TConfig, typename = void>
struct CachePosSize {
  constexpr size_t operator()() const {
    return TConfig::kGemmaLayers * CacheLayerSize<TConfig>()();
  }
};
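
// Base config for models without a Vision Transformer; provides an empty
// VitConfig so code that references TConfig::VitConfig still compiles.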
struct OldConfigNoVit {
  struct VitConfig {
    // Some of these are needed to make the compiler happy when trying to
    // generate code that will actually never be used.
    using Weight = float;
    static constexpr int kLayers = 0;
    static constexpr std::array<LayerAttentionType, 0> kLayerConfig =
        OldFixedLayerConfig<0>(LayerAttentionType::kVit);
    static constexpr int kModelDim = 0;
    static constexpr int kFFHiddenDim = 0;
    static constexpr int kHeads = 1;  // Avoid division by 0 in griffin gate_w.
    static constexpr int kKVHeads = 0;
    static constexpr int kQKVDim = 0;
    static constexpr int kSeqLen = 0;
    static constexpr ResidualType kResidual = ResidualType::Add;
    static constexpr int kGriffinLayers = 0;
    static constexpr int kConv1dWidth = 0;
    static constexpr bool kFFBiases = false;
    static constexpr bool kSoftmaxAttnOutputBiases = false;
    static constexpr PostNormType kPostNorm = PostNormType::None;
  };
};
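
// Base config for models without Griffin (SSM) recurrent layers.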
struct OldConfigNoSSM : OldConfigNoVit {
  static constexpr int kGriffinLayers = 0;
  static constexpr int kConv1dWidth = 0;
  static constexpr bool kFFBiases = false;
  static constexpr bool kSoftmaxAttnOutputBiases = false;
  static constexpr bool kUseHalfRope = false;
  static constexpr bool kUseLocalAttention = false;
  static constexpr bool kInterleaveQKV = true;
  static constexpr PostQKType kPostQK = PostQKType::Rope;
  static constexpr ActivationType kActivation = ActivationType::Gelu;
  static constexpr ResidualType kResidual = ResidualType::Add;
};

struct OldConfigBaseGemmaV1 : OldConfigNoSSM {
  static constexpr float kAttCap = 0.0f;
  static constexpr float kFinalCap = 0.0f;
  static constexpr PostNormType kPostNorm = PostNormType::None;
  static constexpr QueryScaleType kQueryScale = QueryScaleType::SqrtKeySize;
};

struct OldConfigBaseGemmaV2 : OldConfigNoSSM {
  static constexpr float kAttCap = 50.0f;
  static constexpr float kFinalCap = 30.0f;
  static constexpr PostNormType kPostNorm = PostNormType::Scale;
};

template <typename TWeight>
struct OldConfigGemma2_27B : public OldConfigBaseGemmaV2 {
  using Weight = TWeight;  // make accessible where we only have a TConfig
  static constexpr int kSeqLen = 8192;
  static constexpr int kVocabSize = gcpp::kVocabSize;
  static constexpr std::array<LayerAttentionType, 46> kLayerConfig =
      OldFixedLayerConfig<46>(LayerAttentionType::kGemma);
  static constexpr std::array<size_t, 46> kAttentionWindowSizes =
      OldRepeatedAttentionWindowSizes<46, 2>({4096, kSeqLen});
  static constexpr int kLayers = kLayerConfig.size();
  static constexpr int kNumTensorScales = 4 * kLayers;
  static constexpr int kGemmaLayers = kLayers;
  static constexpr int kModelDim = 4608;
  static constexpr int kFFHiddenDim = 16 * 4608 / 2;  // = 36864
  static constexpr int kHeads = 32;
  static constexpr int kKVHeads = 16;
  static constexpr int kQKVDim = 128;  // query size == key size == value size
  static constexpr int kTopK = gcpp::kTopK;
  static constexpr bool kAbsolutePE = false;
  static constexpr QueryScaleType kQueryScale =
      QueryScaleType::SqrtModelDimDivNumHeads;
};

template <typename TWeight>
struct OldConfigGemma2_9B : public OldConfigBaseGemmaV2 {
  using Weight = TWeight;  // make accessible where we only have a TConfig
  static constexpr int kSeqLen = 8192;
  static constexpr int kVocabSize = gcpp::kVocabSize;
  static constexpr std::array<LayerAttentionType, 42> kLayerConfig =
      OldFixedLayerConfig<42>(LayerAttentionType::kGemma);
  static constexpr std::array<size_t, 42> kAttentionWindowSizes =
      OldRepeatedAttentionWindowSizes<42, 2>({4096, kSeqLen});
  static constexpr int kLayers = kLayerConfig.size();
  static constexpr int kNumTensorScales = 4 * kLayers;
  static constexpr int kGemmaLayers = kLayers;
  static constexpr int kModelDim = 3584;
  static constexpr int kFFHiddenDim = 8 * 3584 / 2;  // = 14336
  static constexpr int kHeads = 16;
  static constexpr int kKVHeads = 8;
  static constexpr int kQKVDim = 256;  // query size == key size == value size
  static constexpr int kTopK = gcpp::kTopK;
  static constexpr bool kAbsolutePE = false;
  static constexpr QueryScaleType kQueryScale = QueryScaleType::SqrtKeySize;
};

template <typename TWeight>
struct OldConfigGemma7B : public OldConfigBaseGemmaV1 {
  using Weight = TWeight;  // make accessible where we only have a TConfig
  static constexpr int kSeqLen = gcpp::kSeqLen;
  static constexpr int kVocabSize = gcpp::kVocabSize;
  static constexpr std::array<LayerAttentionType, 28> kLayerConfig =
      OldFixedLayerConfig<28>(LayerAttentionType::kGemma);
  static constexpr std::array<size_t, 28> kAttentionWindowSizes =
      OldFixedAttentionWindowSizes<28>(kSeqLen);
  static constexpr int kLayers = kLayerConfig.size();
  static constexpr int kNumTensorScales = 4 * kLayers;
  static constexpr int kGemmaLayers = kLayers;
  static constexpr int kModelDim = 3072;
  static constexpr int kFFHiddenDim = 16 * 3072 / 2;  // = 24576
  static constexpr int kHeads = 16;
  static constexpr int kKVHeads = 16;  // standard MHA
  static constexpr int kQKVDim = 256;  // query size == key size == value size
  static constexpr int kTopK = gcpp::kTopK;
  static constexpr bool kAbsolutePE = false;
};

template <typename TWeight>
struct OldConfigGemma2B : public OldConfigBaseGemmaV1 {
  using Weight = TWeight;  // make accessible where we only have a TConfig
  static constexpr int kSeqLen = gcpp::kSeqLen;
  static constexpr int kVocabSize = gcpp::kVocabSize;
  static constexpr std::array<LayerAttentionType, 18> kLayerConfig =
      OldFixedLayerConfig<18>(LayerAttentionType::kGemma);
  static constexpr std::array<size_t, 18> kAttentionWindowSizes =
      OldFixedAttentionWindowSizes<18>(kSeqLen);
  static constexpr int kLayers = kLayerConfig.size();
  static constexpr int kNumTensorScales = 4 * kLayers;
  static constexpr int kGemmaLayers = kLayers;
  static constexpr int kModelDim = 2048;
  static constexpr int kFFHiddenDim = 16 * 2048 / 2;  // = 16384
  static constexpr int kHeads = 8;
  static constexpr int kKVHeads = 1;
  static constexpr int kQKVDim = 256;  // query size == key size == value size
  static constexpr int kTopK = gcpp::kTopK;
  static constexpr bool kAbsolutePE = false;
};

template <typename TWeight>
struct OldConfigPaliGemma_224 : public OldConfigGemma2B<TWeight> {
  // On the LM side, the vocab size is the one architectural difference from
  // Gemma1-2B. PaliGemma adds 1024 <locNNNN> and 128 <segNNN> tokens.
  static constexpr int kVocabSize = 256000 + 1024 + 128;  // = 257152
  // Sub-config for the Vision-Transformer part.
  struct VitConfig : public OldConfigNoSSM {
    using Weight = TWeight;
    // The ViT parts. https://arxiv.org/abs/2305.13035
    // "SoViT-400m/14 [...] has a width of 1152, depth 27, and MLP dim 4304."
    static constexpr std::array<LayerAttentionType, 27> kLayerConfig =
        OldFixedLayerConfig<27>(LayerAttentionType::kVit);
    static constexpr int kLayers = kLayerConfig.size();
    static constexpr int kNumTensorScales = 4 * kLayers;
    static constexpr int kModelDim = 1152;
    static constexpr int kFFHiddenDim = 4304;
    static constexpr int kHeads = 16;
    static constexpr int kKVHeads = 16;  // standard MHA
    static constexpr int kQKVDim = 72;
    static constexpr int kSeqLen = 16 * 16;  // 256
    static constexpr bool kFFBiases = true;
    // The ViT part does not have a vocabulary; the image patches are embedded
    // instead.
    static constexpr int kVocabSize = 0;
    // Dimensions related to image processing.
    static constexpr int kPatchWidth = 14;
    static constexpr int kImageSize = 224;
    // Necessary constant for the layer configuration.
    static constexpr PostNormType kPostNorm = PostNormType::None;
  };
};

template <typename TWeight>
struct OldConfigGemma2_2B : public OldConfigBaseGemmaV2 {
  using Weight = TWeight;  // make accessible where we only have a TConfig
  static constexpr int kSeqLen = 8192;
  static constexpr int kVocabSize = gcpp::kVocabSize;
  static constexpr std::array<LayerAttentionType, 26> kLayerConfig =
      OldFixedLayerConfig<26>(LayerAttentionType::kGemma);
  static constexpr std::array<size_t, 26> kAttentionWindowSizes =
      OldRepeatedAttentionWindowSizes<26, 2>({4096, kSeqLen});
  static constexpr int kLayers = kLayerConfig.size();
  static constexpr int kNumTensorScales = 4 * kLayers;
  static constexpr int kGemmaLayers = kLayers;
  static constexpr int kModelDim = 2304;
  static constexpr int kFFHiddenDim = 8 * 2304 / 2;  // = 9216
  static constexpr int kHeads = 8;
  static constexpr int kKVHeads = 4;
  static constexpr int kQKVDim = 256;  // query size == key size == value size
  static constexpr int kTopK = gcpp::kTopK;
  static constexpr bool kAbsolutePE = false;
  static constexpr QueryScaleType kQueryScale = QueryScaleType::SqrtKeySize;
};

template <typename TWeight>
struct OldConfigGemmaTiny : public OldConfigNoSSM {
  using Weight = TWeight;  // make accessible where we only have a TConfig
  static constexpr int kSeqLen = 32;
  static constexpr int kVocabSize = 64;
  static constexpr std::array<LayerAttentionType, 3> kLayerConfig =
      OldFixedLayerConfig<3>(LayerAttentionType::kGemma);
  static constexpr std::array<size_t, 3> kAttentionWindowSizes =
      OldFixedAttentionWindowSizes<3>(kSeqLen);
  static constexpr int kLayers = kLayerConfig.size();
  static constexpr int kNumTensorScales = 4 * kLayers;
  static constexpr int kGemmaLayers = kLayers;
  static constexpr int kModelDim = 128;
  static constexpr int kFFHiddenDim = 256;
  static constexpr int kHeads = 4;
  static constexpr int kKVHeads = 1;
  static constexpr int kQKVDim = 16;  // query size == key size == value size
  static constexpr int kTopK = gcpp::kTopK;
  static constexpr bool kAbsolutePE = false;
  static constexpr PostNormType kPostNorm = PostNormType::None;
  static constexpr QueryScaleType kQueryScale = QueryScaleType::SqrtKeySize;
  static constexpr float kAttCap = 0.0f;
  // This is required for optimize_test to pass.
  static constexpr float kFinalCap = 30.0f;
};

template <typename TWeight>
struct OldConfigGriffin2B : OldConfigNoVit {
  using Weight = TWeight;  // make accessible where we only have a TConfig
  // Griffin uses local attention, so kSeqLen is actually the local attention
  // window.
  static constexpr int kSeqLen = 2048;
  static constexpr int kVocabSize = gcpp::kVocabSize;
  static constexpr std::array<LayerAttentionType, 26> kLayerConfig = {
      LayerAttentionType::kGriffinRecurrentBlock,
      LayerAttentionType::kGriffinRecurrentBlock,
      LayerAttentionType::kGemma,
      LayerAttentionType::kGriffinRecurrentBlock,
      LayerAttentionType::kGriffinRecurrentBlock,
      LayerAttentionType::kGemma,
      LayerAttentionType::kGriffinRecurrentBlock,
      LayerAttentionType::kGriffinRecurrentBlock,
      LayerAttentionType::kGemma,
      LayerAttentionType::kGriffinRecurrentBlock,
      LayerAttentionType::kGriffinRecurrentBlock,
      LayerAttentionType::kGemma,
      LayerAttentionType::kGriffinRecurrentBlock,
      LayerAttentionType::kGriffinRecurrentBlock,
      LayerAttentionType::kGemma,
      LayerAttentionType::kGriffinRecurrentBlock,
      LayerAttentionType::kGriffinRecurrentBlock,
      LayerAttentionType::kGemma,
      LayerAttentionType::kGriffinRecurrentBlock,
      LayerAttentionType::kGriffinRecurrentBlock,
      LayerAttentionType::kGemma,
      LayerAttentionType::kGriffinRecurrentBlock,
      LayerAttentionType::kGriffinRecurrentBlock,
      LayerAttentionType::kGemma,
      LayerAttentionType::kGriffinRecurrentBlock,
      LayerAttentionType::kGriffinRecurrentBlock,
  };
  static constexpr std::array<size_t, 26> kAttentionWindowSizes =
      OldFixedAttentionWindowSizes<26>(kSeqLen);
  static constexpr int kLayers = kLayerConfig.size();
  static constexpr int kGemmaLayers = OldNumLayersOfTypeBefore(
      kLayerConfig, LayerAttentionType::kGemma, kLayers);
  static constexpr int kGriffinLayers = OldNumLayersOfTypeBefore(
      kLayerConfig, LayerAttentionType::kGriffinRecurrentBlock, kLayers);
  static constexpr int kModelDim = 2560;
  static constexpr int kFFHiddenDim = 7680;
  static constexpr int kHeads = 10;
  static constexpr int kKVHeads = 1;
  static constexpr int kQKVDim = 256;  // query size == key size == value size
  static constexpr int kTopK = gcpp::kTopK;
  static constexpr bool kAbsolutePE = false;
  static constexpr PostNormType kPostNorm = PostNormType::None;
  // No SoftCap.
  static constexpr float kAttCap = 0.0f;
  static constexpr float kFinalCap = 0.0f;
  // SSM config.
  static constexpr int kConv1dWidth = 4;
  static constexpr bool kFFBiases = true;
  static constexpr bool kSoftmaxAttnOutputBiases = true;
  static constexpr bool kUseHalfRope = true;
  static constexpr bool kUseLocalAttention = true;
  static constexpr bool kInterleaveQKV = false;
  static constexpr int kNumTensorScales = 140;
  static constexpr PostQKType kPostQK = PostQKType::Rope;
  static constexpr ActivationType kActivation = ActivationType::Gelu;
  static constexpr QueryScaleType kQueryScale = QueryScaleType::SqrtKeySize;
  static constexpr ResidualType kResidual = ResidualType::Add;
};
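
// Checks every field of the runtime `config` against the corresponding
// compile-time constant in the legacy TConfig.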
template <class TConfig>
void AssertMatch(const ModelConfig& config) {
  ASSERT_EQ(TConfig::kModelDim, config.model_dim);
  if constexpr (TConfig::VitConfig::kModelDim != 0) {
    ASSERT_EQ(TConfig::VitConfig::kModelDim, config.vit_config.model_dim);
    ASSERT_EQ(TConfig::VitConfig::kSeqLen, config.vit_config.seq_len);
    ASSERT_EQ(TConfig::VitConfig::kNumTensorScales,
              config.vit_config.num_scales);
    for (size_t i = 0; i < config.vit_config.layer_configs.size(); ++i) {
      ASSERT_EQ(TConfig::VitConfig::kLayerConfig[i],
                config.vit_config.layer_configs[i].type);
    }
  }
  ASSERT_EQ(TConfig::kVocabSize, config.vocab_size);
  ASSERT_EQ(TConfig::kSeqLen, config.seq_len);
  ASSERT_EQ(TConfig::kAttCap, config.att_cap);
  ASSERT_EQ(TConfig::kFinalCap, config.final_cap);
  ASSERT_EQ(TConfig::kAbsolutePE, config.absolute_pe);
  ASSERT_EQ(TConfig::kUseLocalAttention, config.use_local_attention);
  ASSERT_EQ(TConfig::kQueryScale, config.query_scale);
  ASSERT_EQ(TConfig::kGemmaLayers,
            config.NumLayersOfType(LayerAttentionType::kGemma));
  ASSERT_EQ(
      TConfig::kGriffinLayers,
      config.NumLayersOfType(LayerAttentionType::kGriffinRecurrentBlock));
  for (size_t i = 0; i < config.layer_configs.size(); ++i) {
    ASSERT_EQ(TConfig::kModelDim, config.layer_configs[i].model_dim);
    ASSERT_EQ(TConfig::kFFHiddenDim, config.layer_configs[i].ff_hidden_dim);
    ASSERT_EQ(TConfig::kHeads, config.layer_configs[i].heads);
    ASSERT_EQ(TConfig::kKVHeads, config.layer_configs[i].kv_heads);
    ASSERT_EQ(TConfig::kQKVDim, config.layer_configs[i].qkv_dim);
    ASSERT_EQ(TConfig::kConv1dWidth, config.layer_configs[i].conv1d_width);
    ASSERT_EQ(TConfig::kFFBiases, config.layer_configs[i].ff_biases);
    ASSERT_EQ(TConfig::kSoftmaxAttnOutputBiases,
              config.layer_configs[i].softmax_attn_output_biases);
    ASSERT_EQ(TConfig::kPostNorm, config.layer_configs[i].post_norm);
    ASSERT_EQ(TConfig::kLayerConfig[i], config.layer_configs[i].type);
    ASSERT_EQ(TConfig::kActivation, config.layer_configs[i].activation);
    PostQKType post_qk = TConfig::kPostQK;
    if (TConfig::kUseHalfRope) {
      post_qk = PostQKType::HalfRope;
    }
    ASSERT_EQ(post_qk, config.layer_configs[i].post_qk);
  }
  ASSERT_EQ(TConfig::kAttentionWindowSizes.size(),
            config.attention_window_sizes.size());
  for (size_t i = 0; i < config.attention_window_sizes.size(); ++i) {
    ASSERT_EQ(TConfig::kAttentionWindowSizes[i],
              config.attention_window_sizes[i]);
  }
  ASSERT_EQ(TConfig::kNumTensorScales, config.num_tensor_scales);
}
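
// Writes `config` to a uint32_t buffer and reads it back, so a test can
// verify that serialization preserves all fields.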
ModelConfig RoundTripSerialize(const ModelConfig& config) {
  std::vector<uint32_t> config_buffer = config.Write();
  ModelConfig deserialized;
  deserialized.Read(hwy::Span<const uint32_t>(config_buffer), 0);
  return deserialized;
}
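
// Each test compares the runtime config for a Model enum value against the
// matching legacy compile-time config defined above.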
TEST(ConfigsTest, OldConfigGemma2B) {
  AssertMatch<OldConfigGemma2B<float>>(ConfigFromModel(Model::GEMMA_2B));
  ModelConfig config = RoundTripSerialize(ConfigFromModel(Model::GEMMA_2B));
  AssertMatch<OldConfigGemma2B<float>>(config);
}

TEST(ConfigsTest, OldConfigGemma7B) {
  AssertMatch<OldConfigGemma7B<float>>(ConfigFromModel(Model::GEMMA_7B));
}

TEST(ConfigsTest, OldConfigGemma2_2B) {
  AssertMatch<OldConfigGemma2_2B<float>>(ConfigFromModel(Model::GEMMA2_2B));
}

TEST(ConfigsTest, OldConfigGemma2_9B) {
  AssertMatch<OldConfigGemma2_9B<float>>(ConfigFromModel(Model::GEMMA2_9B));
}

TEST(ConfigsTest, OldConfigGemma2_27B) {
  AssertMatch<OldConfigGemma2_27B<float>>(ConfigFromModel(Model::GEMMA2_27B));
}

TEST(ConfigsTest, OldConfigGriffin2B) {
  AssertMatch<OldConfigGriffin2B<float>>(ConfigFromModel(Model::GRIFFIN_2B));
}

TEST(ConfigsTest, OldConfigGemmaTiny) {
  AssertMatch<OldConfigGemmaTiny<float>>(ConfigFromModel(Model::GEMMA_TINY));
}

TEST(ConfigsTest, OldConfigPaliGemma_224) {
  AssertMatch<OldConfigPaliGemma_224<float>>(
      ConfigFromModel(Model::PALIGEMMA_224));
}

}  // namespace gcpp