From b3239bf509e2ade75d92724a7bd1877af2782e50 Mon Sep 17 00:00:00 2001 From: Krzysztof Ostrowski Date: Wed, 2 Oct 2024 11:32:07 -0700 Subject: [PATCH] Internal change. PiperOrigin-RevId: 681530185 --- BUILD.bazel | 2 +- backprop/backward_scalar_test.cc | 1 + backprop/backward_test.cc | 1 + gemma/configs.h | 9 +++++++-- 4 files changed, 10 insertions(+), 3 deletions(-) diff --git a/BUILD.bazel b/BUILD.bazel index 558c4e7..f38fc0d 100644 --- a/BUILD.bazel +++ b/BUILD.bazel @@ -105,7 +105,7 @@ cc_test( cc_test( name = "ops_test", size = "small", - timeout = "long", + timeout = "eternal", srcs = ["ops/ops_test.cc"], local_defines = ["HWY_IS_TEST"], # for test_suite. diff --git a/backprop/backward_scalar_test.cc b/backprop/backward_scalar_test.cc index 121d6a8..15aa876 100644 --- a/backprop/backward_scalar_test.cc +++ b/backprop/backward_scalar_test.cc @@ -387,6 +387,7 @@ struct TestConfig : ConfigBaseGemmaV2 { static constexpr std::array kLayerConfig = FixedLayerConfig<2>(LayerAttentionType::kGemma); static constexpr int kLayers = kLayerConfig.size(); + static constexpr int kNumTensorScales = 4 * kLayers; static constexpr bool kAbsolutePE = false; static constexpr PostNormType kPostNorm = PostNormType::None; diff --git a/backprop/backward_test.cc b/backprop/backward_test.cc index b6c0780..decec20 100644 --- a/backprop/backward_test.cc +++ b/backprop/backward_test.cc @@ -195,6 +195,7 @@ struct TestConfig : public ConfigBaseGemmaV2 { static constexpr std::array kLayerConfig = FixedLayerConfig<2>(LayerAttentionType::kGemma); static constexpr int kLayers = kLayerConfig.size(); + static constexpr int kNumTensorScales = 4 * kLayers; static constexpr bool kAbsolutePE = false; static constexpr PostNormType kPostNorm = PostNormType::None; diff --git a/gemma/configs.h b/gemma/configs.h index c78ab87..b8eb4ea 100644 --- a/gemma/configs.h +++ b/gemma/configs.h @@ -164,8 +164,6 @@ struct ConfigNoSSM : ConfigNoVit { static constexpr bool kUseHalfRope = false; static constexpr bool kUseLocalAttention = false; static constexpr bool kInterleaveQKV = true; - static constexpr int kNumTensorScales = 0; - static constexpr PostQKType kPostQK = PostQKType::Rope; static constexpr ActivationType kActivation = ActivationType::Gelu; static constexpr ResidualType kResidual = ResidualType::Add; @@ -195,6 +193,7 @@ struct ConfigGemma2_27B : public ConfigBaseGemmaV2 { static constexpr std::array kAttentionWindowSizes = RepeatedAttentionWindowSizes<46, 2>({4096, kSeqLen}); static constexpr int kLayers = kLayerConfig.size(); + static constexpr int kNumTensorScales = 4 * kLayers; static constexpr int kGemmaLayers = kLayers; static constexpr int kModelDim = 4608; static constexpr int kFFHiddenDim = 16 * 4608 / 2; // = 36864 @@ -218,6 +217,7 @@ struct ConfigGemma2_9B : public ConfigBaseGemmaV2 { static constexpr std::array kAttentionWindowSizes = RepeatedAttentionWindowSizes<42, 2>({4096, kSeqLen}); static constexpr int kLayers = kLayerConfig.size(); + static constexpr int kNumTensorScales = 4 * kLayers; static constexpr int kGemmaLayers = kLayers; static constexpr int kModelDim = 3584; static constexpr int kFFHiddenDim = 8 * 3584 / 2; // = 14336 @@ -240,6 +240,7 @@ struct ConfigGemma7B : public ConfigBaseGemmaV1 { static constexpr std::array kAttentionWindowSizes = FixedAttentionWindowSizes<28>(kSeqLen); static constexpr int kLayers = kLayerConfig.size(); + static constexpr int kNumTensorScales = 4 * kLayers; static constexpr int kGemmaLayers = kLayers; static constexpr int kModelDim = 3072; static constexpr int kFFHiddenDim = 16 * 3072 / 2; // = 24576 @@ -261,6 +262,7 @@ struct ConfigGemma2B : public ConfigBaseGemmaV1 { static constexpr std::array kAttentionWindowSizes = FixedAttentionWindowSizes<18>(kSeqLen); static constexpr int kLayers = kLayerConfig.size(); + static constexpr int kNumTensorScales = 4 * kLayers; static constexpr int kGemmaLayers = kLayers; static constexpr int kModelDim = 2048; static constexpr int kFFHiddenDim = 16 * 2048 / 2; // = 16384 @@ -285,6 +287,7 @@ struct ConfigPaliGemma_224 : public ConfigGemma2B { static constexpr std::array kLayerConfig = FixedLayerConfig<27>(LayerAttentionType::kVit); static constexpr int kLayers = kLayerConfig.size(); + static constexpr int kNumTensorScales = 4 * kLayers; static constexpr int kModelDim = 1152; static constexpr int kFFHiddenDim = 4304; static constexpr int kHeads = 16; @@ -313,6 +316,7 @@ struct ConfigGemma2_2B : public ConfigBaseGemmaV2 { static constexpr std::array kAttentionWindowSizes = RepeatedAttentionWindowSizes<26, 2>({4096, kSeqLen}); static constexpr int kLayers = kLayerConfig.size(); + static constexpr int kNumTensorScales = 4 * kLayers; static constexpr int kGemmaLayers = kLayers; static constexpr int kModelDim = 2304; static constexpr int kFFHiddenDim = 8 * 2304 / 2; // = 9216 @@ -335,6 +339,7 @@ struct ConfigGemmaTiny : public ConfigNoSSM { static constexpr std::array kAttentionWindowSizes = FixedAttentionWindowSizes<3>(kSeqLen); static constexpr int kLayers = kLayerConfig.size(); + static constexpr int kNumTensorScales = 4 * kLayers; static constexpr int kGemmaLayers = kLayers; static constexpr int kModelDim = 128; static constexpr int kFFHiddenDim = 256;