gemma.cpp/gemma/configs_test.cc

#include "gemma/configs.h"
#include <array>
#include <cstddef>
#include <cstdint>
#include <type_traits>
#include <vector>
#include "gtest/gtest.h"
#include "hwy/aligned_allocator.h"
namespace gcpp {
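
// The "Old*" helpers and structs below replicate the former compile-time
// (template-based) model configurations. Each test checks that the runtime
// ModelConfig produced by ConfigFromModel() matches the values those configs
// used to hard-code.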
template <size_t kNum>
constexpr std::array<LayerAttentionType, kNum> OldFixedLayerConfig(
    LayerAttentionType type) {
  std::array<LayerAttentionType, kNum> config = {};
  for (LayerAttentionType& l : config) {
    l = type;
  }
  return config;
}

template <size_t kNum>
constexpr std::array<size_t, kNum> OldFixedAttentionWindowSizes(
    size_t window_size) {
  std::array<size_t, kNum> window_size_configs = {};
  for (size_t& l : window_size_configs) {
    l = window_size;
  }
  return window_size_configs;
}

// Repeats window_size_pattern kNum / kPatternSize times.
template <size_t kNum, size_t kPatternSize>
constexpr std::array<size_t, kNum> OldRepeatedAttentionWindowSizes(
    const std::array<size_t, kPatternSize>& window_size_pattern) {
  static_assert(kNum % kPatternSize == 0,
                "kNum must be a multiple of kPatternSize");
  std::array<size_t, kNum> window_size_configs = {};
  for (size_t i = 0; i < kNum; ++i) {
    window_size_configs[i] = window_size_pattern[i % kPatternSize];
  }
  return window_size_configs;
}

template <size_t kNumLayers>
constexpr size_t OldNumLayersOfTypeBefore(
    const std::array<LayerAttentionType, kNumLayers>& layers,
    LayerAttentionType type, size_t num) {
  size_t count = 0;
  for (size_t i = 0; i < num; i++) {
    if (layers[i] == type) count++;
  }
  return count;
}
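
// Per-layer and per-position KV cache sizes derived from the old compile-time
// constants: each layer stores kKVHeads * kQKVDim values for both K and V,
// and a position spans kGemmaLayers layers.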
template <class TConfig, typename = void>
struct CacheLayerSize {
  constexpr size_t operator()() const {
    return TConfig::kKVHeads * TConfig::kQKVDim * 2;
  }
};

template <class TConfig, typename = void>
struct CachePosSize {
  constexpr size_t operator()() const {
    return TConfig::kGemmaLayers * CacheLayerSize<TConfig>()();
  }
};
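
// Base config for models without a Vision Transformer; provides an empty
// VitConfig so code that references TConfig::VitConfig still compiles.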
struct OldConfigNoVit {
  struct VitConfig {
    // Some of these are needed to make the compiler happy when trying to
    // generate code that will actually never be used.
    using Weight = float;
    static constexpr int kLayers = 0;
    static constexpr std::array<LayerAttentionType, 0> kLayerConfig =
        OldFixedLayerConfig<0>(LayerAttentionType::kVit);
    static constexpr int kModelDim = 0;
    static constexpr int kFFHiddenDim = 0;
    static constexpr int kHeads = 1;  // Avoid division by 0 in griffin gate_w.
    static constexpr int kKVHeads = 0;
    static constexpr int kQKVDim = 0;
    static constexpr int kSeqLen = 0;
    static constexpr ResidualType kResidual = ResidualType::Add;
    static constexpr int kGriffinLayers = 0;
    static constexpr int kConv1dWidth = 0;
    static constexpr bool kFFBiases = false;
    static constexpr bool kSoftmaxAttnOutputBiases = false;
    static constexpr PostNormType kPostNorm = PostNormType::None;
  };
};
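
// Base config for models without Griffin (SSM) recurrent layers.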
struct OldConfigNoSSM : OldConfigNoVit {
  static constexpr int kGriffinLayers = 0;
  static constexpr int kConv1dWidth = 0;
  static constexpr bool kFFBiases = false;
  static constexpr bool kSoftmaxAttnOutputBiases = false;
  static constexpr bool kUseHalfRope = false;
  static constexpr bool kUseLocalAttention = false;
  static constexpr bool kInterleaveQKV = true;
  static constexpr PostQKType kPostQK = PostQKType::Rope;
  static constexpr ActivationType kActivation = ActivationType::Gelu;
  static constexpr ResidualType kResidual = ResidualType::Add;
};

struct OldConfigBaseGemmaV1 : OldConfigNoSSM {
  static constexpr float kAttCap = 0.0f;
  static constexpr float kFinalCap = 0.0f;
  static constexpr PostNormType kPostNorm = PostNormType::None;
  static constexpr QueryScaleType kQueryScale = QueryScaleType::SqrtKeySize;
};

struct OldConfigBaseGemmaV2 : OldConfigNoSSM {
  static constexpr float kAttCap = 50.0f;
  static constexpr float kFinalCap = 30.0f;
  static constexpr PostNormType kPostNorm = PostNormType::Scale;
};

template <typename TWeight>
struct OldConfigGemma2_27B : public OldConfigBaseGemmaV2 {
  using Weight = TWeight;  // make accessible where we only have a TConfig
  static constexpr int kSeqLen = 8192;
  static constexpr int kVocabSize = gcpp::kVocabSize;
  static constexpr std::array<LayerAttentionType, 46> kLayerConfig =
      OldFixedLayerConfig<46>(LayerAttentionType::kGemma);
  static constexpr std::array<size_t, 46> kAttentionWindowSizes =
      OldRepeatedAttentionWindowSizes<46, 2>({4096, kSeqLen});
  static constexpr int kLayers = kLayerConfig.size();
  static constexpr int kNumTensorScales = 4 * kLayers;
  static constexpr int kGemmaLayers = kLayers;
  static constexpr int kModelDim = 4608;
  static constexpr int kFFHiddenDim = 16 * 4608 / 2;  // = 36864
  static constexpr int kHeads = 32;
  static constexpr int kKVHeads = 16;
  static constexpr int kQKVDim = 128;  // query size == key size == value size
  static constexpr int kTopK = gcpp::kTopK;
  static constexpr bool kAbsolutePE = false;
  static constexpr QueryScaleType kQueryScale =
      QueryScaleType::SqrtModelDimDivNumHeads;
};

template <typename TWeight>
struct OldConfigGemma2_9B : public OldConfigBaseGemmaV2 {
  using Weight = TWeight;  // make accessible where we only have a TConfig
  static constexpr int kSeqLen = 8192;
  static constexpr int kVocabSize = gcpp::kVocabSize;
  static constexpr std::array<LayerAttentionType, 42> kLayerConfig =
      OldFixedLayerConfig<42>(LayerAttentionType::kGemma);
  static constexpr std::array<size_t, 42> kAttentionWindowSizes =
      OldRepeatedAttentionWindowSizes<42, 2>({4096, kSeqLen});
  static constexpr int kLayers = kLayerConfig.size();
  static constexpr int kNumTensorScales = 4 * kLayers;
  static constexpr int kGemmaLayers = kLayers;
  static constexpr int kModelDim = 3584;
  static constexpr int kFFHiddenDim = 8 * 3584 / 2;  // = 14336
  static constexpr int kHeads = 16;
  static constexpr int kKVHeads = 8;
  static constexpr int kQKVDim = 256;  // query size == key size == value size
  static constexpr int kTopK = gcpp::kTopK;
  static constexpr bool kAbsolutePE = false;
  static constexpr QueryScaleType kQueryScale = QueryScaleType::SqrtKeySize;
};

template <typename TWeight>
struct OldConfigGemma7B : public OldConfigBaseGemmaV1 {
  using Weight = TWeight;  // make accessible where we only have a TConfig
  static constexpr int kSeqLen = gcpp::kSeqLen;
  static constexpr int kVocabSize = gcpp::kVocabSize;
  static constexpr std::array<LayerAttentionType, 28> kLayerConfig =
      OldFixedLayerConfig<28>(LayerAttentionType::kGemma);
  static constexpr std::array<size_t, 28> kAttentionWindowSizes =
      OldFixedAttentionWindowSizes<28>(kSeqLen);
  static constexpr int kLayers = kLayerConfig.size();
  static constexpr int kNumTensorScales = 4 * kLayers;
  static constexpr int kGemmaLayers = kLayers;
  static constexpr int kModelDim = 3072;
  static constexpr int kFFHiddenDim = 16 * 3072 / 2;  // = 24576
  static constexpr int kHeads = 16;
  static constexpr int kKVHeads = 16;  // standard MHA
  static constexpr int kQKVDim = 256;  // query size == key size == value size
  static constexpr int kTopK = gcpp::kTopK;
  static constexpr bool kAbsolutePE = false;
};

template <typename TWeight>
struct OldConfigGemma2B : public OldConfigBaseGemmaV1 {
  using Weight = TWeight;  // make accessible where we only have a TConfig
  static constexpr int kSeqLen = gcpp::kSeqLen;
  static constexpr int kVocabSize = gcpp::kVocabSize;
  static constexpr std::array<LayerAttentionType, 18> kLayerConfig =
      OldFixedLayerConfig<18>(LayerAttentionType::kGemma);
  static constexpr std::array<size_t, 18> kAttentionWindowSizes =
      OldFixedAttentionWindowSizes<18>(kSeqLen);
  static constexpr int kLayers = kLayerConfig.size();
  static constexpr int kNumTensorScales = 4 * kLayers;
  static constexpr int kGemmaLayers = kLayers;
  static constexpr int kModelDim = 2048;
  static constexpr int kFFHiddenDim = 16 * 2048 / 2;  // = 16384
  static constexpr int kHeads = 8;
  static constexpr int kKVHeads = 1;
  static constexpr int kQKVDim = 256;  // query size == key size == value size
  static constexpr int kTopK = gcpp::kTopK;
  static constexpr bool kAbsolutePE = false;
};

template <typename TWeight>
struct OldConfigPaliGemma_224 : public OldConfigGemma2B<TWeight> {
  // On the LM side, the vocab size is the one architectural difference from
  // Gemma1-2B. PaliGemma adds 1024 <locNNNN> and 128 <segNNN> tokens.
  static constexpr int kVocabSize = 256000 + 1024 + 128;  // = 257152
  // Sub-config for the Vision-Transformer part.
  struct VitConfig : public OldConfigNoSSM {
    using Weight = TWeight;
    // The ViT parts. https://arxiv.org/abs/2305.13035
    // "SoViT-400m/14 [...] has a width of 1152, depth 27, and MLP dim 4304."
    static constexpr std::array<LayerAttentionType, 27> kLayerConfig =
        OldFixedLayerConfig<27>(LayerAttentionType::kVit);
    static constexpr int kLayers = kLayerConfig.size();
    static constexpr int kNumTensorScales = 4 * kLayers;
    static constexpr int kModelDim = 1152;
    static constexpr int kFFHiddenDim = 4304;
    static constexpr int kHeads = 16;
    static constexpr int kKVHeads = 16;  // standard MHA
    static constexpr int kQKVDim = 72;
    static constexpr int kSeqLen = 16 * 16;  // 256
    static constexpr bool kFFBiases = true;
    // The ViT part does not have a vocabulary; the image patches are embedded
    // instead.
    static constexpr int kVocabSize = 0;
    // Dimensions related to image processing.
    static constexpr int kPatchWidth = 14;
    static constexpr int kImageSize = 224;
    // Necessary constant for the layer configuration.
    static constexpr PostNormType kPostNorm = PostNormType::None;
  };
};

template <typename TWeight>
struct OldConfigGemma2_2B : public OldConfigBaseGemmaV2 {
  using Weight = TWeight;  // make accessible where we only have a TConfig
  static constexpr int kSeqLen = 8192;
  static constexpr int kVocabSize = gcpp::kVocabSize;
  static constexpr std::array<LayerAttentionType, 26> kLayerConfig =
      OldFixedLayerConfig<26>(LayerAttentionType::kGemma);
  static constexpr std::array<size_t, 26> kAttentionWindowSizes =
      OldRepeatedAttentionWindowSizes<26, 2>({4096, kSeqLen});
  static constexpr int kLayers = kLayerConfig.size();
  static constexpr int kNumTensorScales = 4 * kLayers;
  static constexpr int kGemmaLayers = kLayers;
  static constexpr int kModelDim = 2304;
  static constexpr int kFFHiddenDim = 8 * 2304 / 2;  // = 9216
  static constexpr int kHeads = 8;
  static constexpr int kKVHeads = 4;
  static constexpr int kQKVDim = 256;  // query size == key size == value size
  static constexpr int kTopK = gcpp::kTopK;
  static constexpr bool kAbsolutePE = false;
  static constexpr QueryScaleType kQueryScale = QueryScaleType::SqrtKeySize;
};

template <typename TWeight>
struct OldConfigGemmaTiny : public OldConfigNoSSM {
  using Weight = TWeight;  // make accessible where we only have a TConfig
  static constexpr int kSeqLen = 32;
  static constexpr int kVocabSize = 64;
  static constexpr std::array<LayerAttentionType, 3> kLayerConfig =
      OldFixedLayerConfig<3>(LayerAttentionType::kGemma);
  static constexpr std::array<size_t, 3> kAttentionWindowSizes =
      OldFixedAttentionWindowSizes<3>(kSeqLen);
  static constexpr int kLayers = kLayerConfig.size();
  static constexpr int kNumTensorScales = 4 * kLayers;
  static constexpr int kGemmaLayers = kLayers;
  static constexpr int kModelDim = 128;
  static constexpr int kFFHiddenDim = 256;
  static constexpr int kHeads = 4;
  static constexpr int kKVHeads = 1;
  static constexpr int kQKVDim = 16;  // query size == key size == value size
  static constexpr int kTopK = gcpp::kTopK;
  static constexpr bool kAbsolutePE = false;
  static constexpr PostNormType kPostNorm = PostNormType::None;
  static constexpr QueryScaleType kQueryScale = QueryScaleType::SqrtKeySize;
  static constexpr float kAttCap = 0.0f;
  // This is required for optimize_test to pass.
  static constexpr float kFinalCap = 30.0f;
};

template <typename TWeight>
struct OldConfigGriffin2B : OldConfigNoVit {
  using Weight = TWeight;  // make accessible where we only have a TConfig
  // Griffin uses local attention, so kSeqLen is actually the local attention
  // window.
  static constexpr int kSeqLen = 2048;
  static constexpr int kVocabSize = gcpp::kVocabSize;
  static constexpr std::array<LayerAttentionType, 26> kLayerConfig = {
      LayerAttentionType::kGriffinRecurrentBlock,
      LayerAttentionType::kGriffinRecurrentBlock,
      LayerAttentionType::kGemma,
      LayerAttentionType::kGriffinRecurrentBlock,
      LayerAttentionType::kGriffinRecurrentBlock,
      LayerAttentionType::kGemma,
      LayerAttentionType::kGriffinRecurrentBlock,
      LayerAttentionType::kGriffinRecurrentBlock,
      LayerAttentionType::kGemma,
      LayerAttentionType::kGriffinRecurrentBlock,
      LayerAttentionType::kGriffinRecurrentBlock,
      LayerAttentionType::kGemma,
      LayerAttentionType::kGriffinRecurrentBlock,
      LayerAttentionType::kGriffinRecurrentBlock,
      LayerAttentionType::kGemma,
      LayerAttentionType::kGriffinRecurrentBlock,
      LayerAttentionType::kGriffinRecurrentBlock,
      LayerAttentionType::kGemma,
      LayerAttentionType::kGriffinRecurrentBlock,
      LayerAttentionType::kGriffinRecurrentBlock,
      LayerAttentionType::kGemma,
      LayerAttentionType::kGriffinRecurrentBlock,
      LayerAttentionType::kGriffinRecurrentBlock,
      LayerAttentionType::kGemma,
      LayerAttentionType::kGriffinRecurrentBlock,
      LayerAttentionType::kGriffinRecurrentBlock,
  };
  static constexpr std::array<size_t, 26> kAttentionWindowSizes =
      OldFixedAttentionWindowSizes<26>(kSeqLen);
  static constexpr int kLayers = kLayerConfig.size();
  static constexpr int kGemmaLayers = OldNumLayersOfTypeBefore(
      kLayerConfig, LayerAttentionType::kGemma, kLayers);
  static constexpr int kGriffinLayers = OldNumLayersOfTypeBefore(
      kLayerConfig, LayerAttentionType::kGriffinRecurrentBlock, kLayers);
  static constexpr int kModelDim = 2560;
  static constexpr int kFFHiddenDim = 7680;
  static constexpr int kHeads = 10;
  static constexpr int kKVHeads = 1;
  static constexpr int kQKVDim = 256;  // query size == key size == value size
  static constexpr int kTopK = gcpp::kTopK;
  static constexpr bool kAbsolutePE = false;
  static constexpr PostNormType kPostNorm = PostNormType::None;
  // No SoftCap.
  static constexpr float kAttCap = 0.0f;
  static constexpr float kFinalCap = 0.0f;
  // SSM config.
  static constexpr int kConv1dWidth = 4;
  static constexpr bool kFFBiases = true;
  static constexpr bool kSoftmaxAttnOutputBiases = true;
  static constexpr bool kUseHalfRope = true;
  static constexpr bool kUseLocalAttention = true;
  static constexpr bool kInterleaveQKV = false;
  static constexpr int kNumTensorScales = 140;
  static constexpr PostQKType kPostQK = PostQKType::Rope;
  static constexpr ActivationType kActivation = ActivationType::Gelu;
  static constexpr QueryScaleType kQueryScale = QueryScaleType::SqrtKeySize;
  static constexpr ResidualType kResidual = ResidualType::Add;
};
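
// Checks every field of the runtime `config` against the corresponding
// compile-time constant in the legacy TConfig.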
template <class TConfig>
void AssertMatch(const ModelConfig& config) {
  ASSERT_EQ(TConfig::kModelDim, config.model_dim);
  if constexpr (TConfig::VitConfig::kModelDim != 0) {
    ASSERT_EQ(TConfig::VitConfig::kModelDim, config.vit_config.model_dim);
    ASSERT_EQ(TConfig::VitConfig::kSeqLen, config.vit_config.seq_len);
    ASSERT_EQ(TConfig::VitConfig::kNumTensorScales,
              config.vit_config.num_scales);
    for (size_t i = 0; i < config.vit_config.layer_configs.size(); ++i) {
      ASSERT_EQ(TConfig::VitConfig::kLayerConfig[i],
                config.vit_config.layer_configs[i].type);
    }
  }
  ASSERT_EQ(TConfig::kVocabSize, config.vocab_size);
  ASSERT_EQ(TConfig::kSeqLen, config.seq_len);
  ASSERT_EQ(TConfig::kAttCap, config.att_cap);
  ASSERT_EQ(TConfig::kFinalCap, config.final_cap);
  ASSERT_EQ(TConfig::kAbsolutePE, config.absolute_pe);
  ASSERT_EQ(TConfig::kUseLocalAttention, config.use_local_attention);
  ASSERT_EQ(TConfig::kQueryScale, config.query_scale);
  ASSERT_EQ(TConfig::kGemmaLayers,
            config.NumLayersOfType(LayerAttentionType::kGemma));
  ASSERT_EQ(
      TConfig::kGriffinLayers,
      config.NumLayersOfType(LayerAttentionType::kGriffinRecurrentBlock));
  for (size_t i = 0; i < config.layer_configs.size(); ++i) {
    ASSERT_EQ(TConfig::kModelDim, config.layer_configs[i].model_dim);
    ASSERT_EQ(TConfig::kFFHiddenDim, config.layer_configs[i].ff_hidden_dim);
    ASSERT_EQ(TConfig::kHeads, config.layer_configs[i].heads);
    ASSERT_EQ(TConfig::kKVHeads, config.layer_configs[i].kv_heads);
    ASSERT_EQ(TConfig::kQKVDim, config.layer_configs[i].qkv_dim);
    ASSERT_EQ(TConfig::kConv1dWidth, config.layer_configs[i].conv1d_width);
    ASSERT_EQ(TConfig::kFFBiases, config.layer_configs[i].ff_biases);
    ASSERT_EQ(TConfig::kSoftmaxAttnOutputBiases,
              config.layer_configs[i].softmax_attn_output_biases);
    ASSERT_EQ(TConfig::kPostNorm, config.layer_configs[i].post_norm);
    ASSERT_EQ(TConfig::kLayerConfig[i], config.layer_configs[i].type);
    ASSERT_EQ(TConfig::kActivation, config.layer_configs[i].activation);
    PostQKType post_qk = TConfig::kPostQK;
    if (TConfig::kUseHalfRope) {
      post_qk = PostQKType::HalfRope;
    }
    ASSERT_EQ(post_qk, config.layer_configs[i].post_qk);
  }
  ASSERT_EQ(TConfig::kAttentionWindowSizes.size(),
            config.attention_window_sizes.size());
  for (size_t i = 0; i < config.attention_window_sizes.size(); ++i) {
    ASSERT_EQ(TConfig::kAttentionWindowSizes[i],
              config.attention_window_sizes[i]);
  }
  ASSERT_EQ(TConfig::kNumTensorScales, config.num_tensor_scales);
}
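
// Writes `config` to a uint32_t buffer and reads it back, so a test can
// verify that serialization preserves all fields.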
ModelConfig RoundTripSerialize(const ModelConfig& config) {
  std::vector<uint32_t> config_buffer = config.Write();
  ModelConfig deserialized;
  deserialized.Read(hwy::Span<const uint32_t>(config_buffer), 0);
  return deserialized;
}
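
// Each test compares the runtime config for a Model enum value against the
// matching legacy compile-time config defined above.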
TEST(ConfigsTest, OldConfigGemma2B) {
  AssertMatch<OldConfigGemma2B<float>>(ConfigFromModel(Model::GEMMA_2B));
  ModelConfig config = RoundTripSerialize(ConfigFromModel(Model::GEMMA_2B));
  AssertMatch<OldConfigGemma2B<float>>(config);
}

TEST(ConfigsTest, OldConfigGemma7B) {
  AssertMatch<OldConfigGemma7B<float>>(ConfigFromModel(Model::GEMMA_7B));
}

TEST(ConfigsTest, OldConfigGemma2_2B) {
  AssertMatch<OldConfigGemma2_2B<float>>(ConfigFromModel(Model::GEMMA2_2B));
}

TEST(ConfigsTest, OldConfigGemma2_9B) {
  AssertMatch<OldConfigGemma2_9B<float>>(ConfigFromModel(Model::GEMMA2_9B));
}

TEST(ConfigsTest, OldConfigGemma2_27B) {
  AssertMatch<OldConfigGemma2_27B<float>>(ConfigFromModel(Model::GEMMA2_27B));
}

TEST(ConfigsTest, OldConfigGriffin2B) {
  AssertMatch<OldConfigGriffin2B<float>>(ConfigFromModel(Model::GRIFFIN_2B));
}

TEST(ConfigsTest, OldConfigGemmaTiny) {
  AssertMatch<OldConfigGemmaTiny<float>>(ConfigFromModel(Model::GEMMA_TINY));
}

TEST(ConfigsTest, OldConfigPaliGemma_224) {
  AssertMatch<OldConfigPaliGemma_224<float>>(
      ConfigFromModel(Model::PALIGEMMA_224));
}

}  // namespace gcpp