From 0d68555f875d3b34d29e8c5c7290e10a8fae3609 Mon Sep 17 00:00:00 2001
From: Ray Smith
Date: Thu, 17 Oct 2024 05:03:35 -0700
Subject: [PATCH] Eliminated TConfig.

Changed CompressedLayer and CompressedWeights to be constructed from an
instance of LayerConfig and WeightsConfig respectively. Added CompressedModel
to remove ByteStorageT and eliminate most of the type casting, and to allow
the default destructor to be used and work properly. Adjusted WeightsWrapper,
ForwardLayer, etc. to match. The only remaining template argument is the
weight type, which allows all of the per-model instantiations to be deleted,
apart from one per weight type. It also makes it possible (not yet done) to
store the config in the blob file instead of having to specify it separately.
Reduces the size of the gemma_lib and weights shared libraries by factors of
4.3 and 3.2 respectively.

PiperOrigin-RevId: 686870060
---
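[Note: the essence of the change, as a minimal standalone sketch. Names are
simplified for illustration and are not the actual gemma.cpp declarations:
configuration moves from compile-time template constants to runtime struct
fields, so one compiled function can serve every model.]

#include <cstddef>
#include <vector>

// Before: one template instantiation per model config (and per weight type).
struct Gemma2BConfig {
  static constexpr size_t kModelDim = 2048;
  static constexpr size_t kLayers = 18;
};
template <typename TConfig>
size_t ParamCountOld() {  // Compiled separately for every TConfig.
  return TConfig::kModelDim * TConfig::kLayers;
}

// After: plain runtime data; a single compiled function handles all models.
struct LayerConfig { size_t model_dim; };
struct ModelConfig { std::vector<LayerConfig> layer_configs; };
size_t ParamCountNew(const ModelConfig& config) {
  size_t total = 0;
  for (const LayerConfig& layer : config.layer_configs) {
    total += layer.model_dim;
  }
  return total;
}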
 BUILD.bazel                                 |   58 +-
 CMakeLists.txt                              |   29 +-
 backprop/activations.h                      |   93 +-
 backprop/backward-inl.h                     |  258 ++---
 backprop/backward.cc                        |   54 +-
 backprop/backward.h                         |   14 +-
 backprop/backward_scalar.h                  |  176 ++-
 backprop/backward_scalar_test.cc            |  137 +--
 backprop/backward_test.cc                   |   90 +-
 backprop/forward-inl.h                      |  132 +--
 backprop/forward.cc                         |   46 +-
 backprop/forward.h                          |   11 +-
 backprop/forward_scalar.h                   |  154 ++-
 backprop/optimize_test.cc                   |   58 +-
 backprop/optimizer.cc                       |   85 +-
 backprop/optimizer.h                        |   17 +-
 backprop/test_util.h                        |   66 +-
 compression/blob_store.cc                   |    2 +-
 compression/compress.h                      |   47 +-
 compression/compress_weights.cc             |   59 +-
 compression/shared.h                        |   61 +-
 evals/benchmark.cc                          |    4 +-
 evals/benchmark_helper.cc                   |    6 +-
 evals/cross_entropy.cc                      |    3 +-
 examples/hello_world/run.cc                 |    4 +-
 gemma/activations.h                         |   82 +-
 gemma/common.cc                             |   19 +-
 gemma/common.h                              |  212 +---
 gemma/configs.cc                            |  246 ++++
 gemma/configs.h                             |  419 ++-----
 gemma/configs_test.cc                       |  445 ++++++++
 gemma/gemma-inl.h                           | 1117 ++++++++++---------
 gemma/gemma.cc                              |   92 +-
 gemma/gemma.h                               |   20 +-
 gemma/instantiations/27b_bf16.cc            |   21 -
 gemma/instantiations/27b_f32.cc             |   21 -
 gemma/instantiations/27b_sfp.cc             |   21 -
 gemma/instantiations/2b_bf16.cc             |   21 -
 gemma/instantiations/7b_bf16.cc             |   21 -
 gemma/instantiations/7b_sfp.cc              |   21 -
 gemma/instantiations/9b_bf16.cc             |   21 -
 gemma/instantiations/9b_sfp.cc              |   21 -
 gemma/instantiations/{2b_f32.cc => bf16.cc} |    5 +-
 gemma/instantiations/{7b_f32.cc => f32.cc}  |    5 +-
 gemma/instantiations/gemma2_2b_bf16.cc      |   21 -
 gemma/instantiations/gemma2_2b_f32.cc       |   21 -
 gemma/instantiations/gemma2_2b_sfp.cc       |   21 -
 gemma/instantiations/gr2b_bf16.cc           |   21 -
 gemma/instantiations/gr2b_f32.cc            |   21 -
 gemma/instantiations/gr2b_sfp.cc            |   21 -
 gemma/instantiations/{9b_f32.cc => nuq.cc}  |    5 +-
 gemma/instantiations/paligemma_224_bf16.cc  |   21 -
 gemma/instantiations/paligemma_224_f32.cc   |   21 -
 gemma/instantiations/paligemma_224_sfp.cc   |   21 -
 gemma/instantiations/{2b_sfp.cc => sfp.cc}  |    5 +-
 gemma/instantiations/tiny_bf16.cc           |   21 -
 gemma/instantiations/tiny_f32.cc            |   21 -
 gemma/instantiations/tiny_sfp.cc            |   21 -
 gemma/kv_cache.cc                           |   87 +-
 gemma/kv_cache.h                            |    3 +-
 gemma/run.cc                                |    2 +-
 gemma/weights.cc                            |  224 ++--
 gemma/weights.h                             |  459 ++++----
 ops/gemma_matvec_test.cc                    |   14 +-
 ops/matvec-inl.h                            |  118 +-
 ops/ops-inl.h                               |   21 +-
 ops/ops_test.cc                             |   26 +-
 util/app.h                                  |    2 +
 68 files changed, 2810 insertions(+), 2902 deletions(-)
 create mode 100644 gemma/configs.cc
 create mode 100644 gemma/configs_test.cc
 delete mode 100644 gemma/instantiations/27b_bf16.cc
 delete mode 100644 gemma/instantiations/27b_f32.cc
 delete mode 100644 gemma/instantiations/27b_sfp.cc
 delete mode 100644 gemma/instantiations/2b_bf16.cc
 delete mode 100644 gemma/instantiations/7b_bf16.cc
 delete mode 100644 gemma/instantiations/7b_sfp.cc
 delete mode 100644 gemma/instantiations/9b_bf16.cc
 delete mode 100644 gemma/instantiations/9b_sfp.cc
 rename gemma/instantiations/{2b_f32.cc => bf16.cc} (87%)
 rename gemma/instantiations/{7b_f32.cc => f32.cc} (87%)
 delete mode 100644 gemma/instantiations/gemma2_2b_bf16.cc
 delete mode 100644 gemma/instantiations/gemma2_2b_f32.cc
 delete mode 100644 gemma/instantiations/gemma2_2b_sfp.cc
 delete mode 100644 gemma/instantiations/gr2b_bf16.cc
 delete mode 100644 gemma/instantiations/gr2b_f32.cc
 delete mode 100644 gemma/instantiations/gr2b_sfp.cc
 rename gemma/instantiations/{9b_f32.cc => nuq.cc} (87%)
 delete mode 100644 gemma/instantiations/paligemma_224_bf16.cc
 delete mode 100644 gemma/instantiations/paligemma_224_f32.cc
 delete mode 100644 gemma/instantiations/paligemma_224_sfp.cc
 rename gemma/instantiations/{2b_sfp.cc => sfp.cc} (87%)
 delete mode 100644 gemma/instantiations/tiny_bf16.cc
 delete mode 100644 gemma/instantiations/tiny_f32.cc
 delete mode 100644 gemma/instantiations/tiny_sfp.cc

diff --git a/BUILD.bazel b/BUILD.bazel
index 1e0cc73..c480f23 100644
--- a/BUILD.bazel
+++ b/BUILD.bazel
@@ -104,8 +104,6 @@ cc_test(
     tags = ["hwy_ops_test"],
     deps = [
         ":allocator",
-        ":common",
-        ":gemma_lib",
        ":ops",
         ":test_util",
         ":threading",
@@ -183,7 +181,10 @@ cc_test(
 
 cc_library(
     name = "common",
-    srcs = ["gemma/common.cc"],
+    srcs = [
+        "gemma/common.cc",
+        "gemma/configs.cc",
+    ],
     hdrs = [
         "gemma/common.h",
         "gemma/configs.h",
@@ -195,12 +196,20 @@ cc_library(
     ],
 )
 
+cc_test(
+    name = "configs_test",
+    srcs = ["gemma/configs_test.cc"],
+    deps = [
+        ":common",
+        "@googletest//:gtest_main",
+    ],
+)
+
 cc_library(
     name = "weights",
     srcs = ["gemma/weights.cc"],
     hdrs = ["gemma/weights.h"],
     deps = [
-        ":allocator",
         ":common",
         "//compression:compress",
         "//compression:io",
@@ -219,7 +228,6 @@ cc_library(
         ":common",
         "//compression:io",
         "@highway//:hwy",
-        "@highway//:nanobenchmark",  # timer
         "@highway//:profiler",
         "@com_google_sentencepiece//:sentencepiece_processor",
     ],
@@ -239,30 +247,10 @@ cc_library(
     name = "gemma_lib",
     srcs = [
         "gemma/gemma.cc",
-        "gemma/instantiations/27b_bf16.cc",
-        "gemma/instantiations/27b_f32.cc",
-        "gemma/instantiations/27b_sfp.cc",
-        "gemma/instantiations/2b_bf16.cc",
-        "gemma/instantiations/2b_f32.cc",
-        "gemma/instantiations/2b_sfp.cc",
-        "gemma/instantiations/7b_bf16.cc",
-        "gemma/instantiations/7b_f32.cc",
-        "gemma/instantiations/7b_sfp.cc",
-        "gemma/instantiations/9b_bf16.cc",
-        "gemma/instantiations/9b_f32.cc",
-        "gemma/instantiations/9b_sfp.cc",
-        "gemma/instantiations/tiny_bf16.cc",
-        "gemma/instantiations/tiny_f32.cc",
-        "gemma/instantiations/tiny_sfp.cc",
-        "gemma/instantiations/gr2b_bf16.cc",
-        "gemma/instantiations/gr2b_f32.cc",
-        "gemma/instantiations/gr2b_sfp.cc",
-        "gemma/instantiations/gemma2_2b_bf16.cc",
-        "gemma/instantiations/gemma2_2b_f32.cc",
-        "gemma/instantiations/gemma2_2b_sfp.cc",
-        "gemma/instantiations/paligemma_224_bf16.cc",
-        "gemma/instantiations/paligemma_224_f32.cc",
-        "gemma/instantiations/paligemma_224_sfp.cc",
+        "gemma/instantiations/bf16.cc",
+        "gemma/instantiations/f32.cc",
+        "gemma/instantiations/nuq.cc",
+        "gemma/instantiations/sfp.cc",
     ],
     hdrs = [
         "gemma/activations.h",
@@ -327,8 +315,6 @@ cc_library(
         ":threading",
         "//compression:io",
         "@highway//:hwy",
-        "@highway//:thread_pool",
-        "@highway//:topology",
     ],
 )
 
@@ -367,7 +353,6 @@ cc_test(
         ":benchmark_helper",
         ":common",
         ":gemma_lib",
-        ":tokenizer",
         "@googletest//:gtest_main",
         "@highway//:hwy",
         "@highway//:hwy_test_util",
@@ -396,7 +381,6 @@ cc_binary(
     name = "single_benchmark",
     srcs = ["evals/benchmark.cc"],
     deps = [
-        ":app",
         ":args",
         ":benchmark_helper",
         ":common",
@@ -405,7 +389,6 @@ cc_binary(
         "//compression:io",
         "@highway//:hwy",
         "@highway//:nanobenchmark",
-        "@highway//:thread_pool",
         "@nlohmann_json//:json",
     ],
 )
@@ -429,13 +412,11 @@ cc_binary(
         "evals/debug_prompt.cc",
     ],
     deps = [
-        ":app",
         ":args",
         ":benchmark_helper",
         ":gemma_lib",
         "//compression:io",
         "@highway//:hwy",
-        "@highway//:thread_pool",
         "@nlohmann_json//:json",
     ],
 )
@@ -444,7 +425,6 @@ cc_binary(
     name = "gemma_mmlu",
     srcs = ["evals/run_mmlu.cc"],
     deps = [
-        ":app",
         ":args",
         ":benchmark_helper",
         ":gemma_lib",
@@ -488,7 +468,6 @@ cc_library(
     deps = [
         ":allocator",
         ":common",
-        ":gemma_lib",
         ":ops",
         ":prompt",
         ":weights",
@@ -508,7 +487,6 @@ cc_library(
         "backprop/forward_scalar.h",
     ],
     deps = [
-        ":allocator",
         ":common",
         ":prompt",
         ":weights",
@@ -525,7 +503,6 @@ cc_test(
         "backprop/test_util.h",
     ],
     deps = [
-        ":allocator",
         ":backprop_scalar",
         ":common",
         ":prompt",
@@ -599,6 +576,7 @@ cc_test(
         ":threading",
         ":weights",
         "@googletest//:gtest_main",
+        "//compression:sfp",
         "@highway//:thread_pool",
     ],
 )
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 51ab2e4..bade5de 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -68,34 +68,15 @@ set(SOURCES
   gemma/activations.h
   gemma/common.cc
   gemma/common.h
+  gemma/configs.cc
   gemma/configs.h
   gemma/gemma-inl.h
   gemma/gemma.cc
   gemma/gemma.h
-  gemma/instantiations/27b_bf16.cc
-  gemma/instantiations/27b_f32.cc
-  gemma/instantiations/27b_sfp.cc
-  gemma/instantiations/2b_bf16.cc
-  gemma/instantiations/2b_f32.cc
-  gemma/instantiations/2b_sfp.cc
-  gemma/instantiations/7b_bf16.cc
-  gemma/instantiations/7b_f32.cc
-  gemma/instantiations/7b_sfp.cc
-  gemma/instantiations/9b_bf16.cc
-  gemma/instantiations/9b_f32.cc
-  gemma/instantiations/9b_sfp.cc
-  gemma/instantiations/gr2b_bf16.cc
-  gemma/instantiations/gr2b_f32.cc
-  gemma/instantiations/gr2b_sfp.cc
-  gemma/instantiations/tiny_bf16.cc
-  gemma/instantiations/tiny_f32.cc
-  gemma/instantiations/tiny_sfp.cc
-  gemma/instantiations/gemma2_2b_bf16.cc
-  gemma/instantiations/gemma2_2b_f32.cc
-  gemma/instantiations/gemma2_2b_sfp.cc
-  gemma/instantiations/paligemma_224_bf16.cc
-  gemma/instantiations/paligemma_224_f32.cc
-  gemma/instantiations/paligemma_224_sfp.cc
+  gemma/instantiations/bf16.cc
+  gemma/instantiations/f32.cc
+  gemma/instantiations/nuq.cc
+  gemma/instantiations/sfp.cc
   gemma/kv_cache.cc
   gemma/kv_cache.h
   gemma/tokenizer.cc
diff --git a/backprop/activations.h b/backprop/activations.h
index 4f2e821..c616759 100644
--- a/backprop/activations.h
+++ b/backprop/activations.h
@@ -18,32 +18,27 @@
 
 #include <stddef.h>
 
-#include <array>
+#include <vector>
 
 #include "compression/compress.h"  // MatStorageT
-#include "util/allocator.h"  // ByteStorageT
+#include "gemma/configs.h"  // ModelConfig
 
 namespace gcpp {
 
-template <typename T, typename TConfig>
+template <typename T>
 struct ForwardLayer {
-  ForwardLayer()
-      : input("input", kSeqLen, kModelDim),
-        pre_att_rms_out("pre_att_rms_out", kSeqLen, kModelDim),
-        qkv("qkv", kSeqLen * (kHeads + 2), kQKVDim),
-        att("att", kSeqLen * kHeads, kSeqLen),
-        att_out("att_out", kSeqLen * kHeads, kQKVDim),
-        att_post1("att_post1", kSeqLen, kModelDim),
-        attention_out("attention_out", kSeqLen, kModelDim),
-        bf_pre_ffw_rms_out("bf_pre_ffw_rms_out", kSeqLen, kModelDim),
-        ffw_hidden("ffw_hidden", kSeqLen, kFFHiddenDim * 2),
-        ffw_hidden_gated("ffw_hidden_gated", kSeqLen, kFFHiddenDim) {}
-
-  static constexpr size_t kSeqLen = TConfig::kSeqLen;
-  static constexpr size_t kModelDim = TConfig::kModelDim;
-  static constexpr size_t kQKVDim = TConfig::kQKVDim;
-  static constexpr size_t kHeads = TConfig::kHeads;
-  static constexpr size_t kFFHiddenDim = TConfig::kFFHiddenDim;
+  ForwardLayer(const LayerConfig& config, size_t seq_len)
+      : input("input", seq_len, config.model_dim),
+        pre_att_rms_out("pre_att_rms_out", seq_len, config.model_dim),
+        qkv("qkv", seq_len * (config.heads + 2), config.qkv_dim),
+        att("att", seq_len * config.heads, seq_len),
+        att_out("att_out", seq_len * config.heads, config.qkv_dim),
+        att_post1("att_post1", seq_len, config.model_dim),
+        attention_out("attention_out", seq_len, config.model_dim),
+        bf_pre_ffw_rms_out("bf_pre_ffw_rms_out", seq_len, config.model_dim),
+        ffw_hidden("ffw_hidden", seq_len, config.ff_hidden_dim * 2),
+        ffw_hidden_gated("ffw_hidden_gated", seq_len, config.ff_hidden_dim),
+        layer_config(config) {}
 
   MatStorageT<T> input;
   MatStorageT<T> pre_att_rms_out;
@@ -55,56 +50,30 @@ struct ForwardLayer {
   MatStorageT<T> bf_pre_ffw_rms_out;
   MatStorageT<T> ffw_hidden;
   MatStorageT<T> ffw_hidden_gated;
+  const LayerConfig& layer_config;
 };
 
-template <typename T, typename TConfig>
+template <typename T>
 struct ForwardPass {
-  ForwardPass()
-      : final_layer_output("final_layer_output", kSeqLen, kModelDim),
-        final_norm_output("final_norm_output", kSeqLen, kModelDim),
-        logits("logits", kSeqLen, kVocabSize),
-        probs("probs", kSeqLen, kVocabSize) {
-  }  // prevents placement-new calling memset
+  ForwardPass(const ModelConfig& config)
+      : final_layer_output("final_layer_output", config.seq_len,
+                           config.model_dim),
+        final_norm_output("final_norm_output", config.seq_len,
+                          config.model_dim),
+        logits("logits", config.seq_len, config.vocab_size),
+        probs("probs", config.seq_len, config.vocab_size),
+        weights_config(config) {
+    for (const auto& layer_config : config.layer_configs) {
+      layers.emplace_back(layer_config, config.seq_len);
+    }
+  }
 
-  static constexpr size_t kSeqLen = TConfig::kSeqLen;
-  static constexpr size_t kModelDim = TConfig::kModelDim;
-  static constexpr size_t kVocabSize = TConfig::kVocabSize;
-  static constexpr size_t kLayers = TConfig::kLayers;
-
-  std::array<ForwardLayer<T, TConfig>, kLayers> layers;
+  std::vector<ForwardLayer<T>> layers;
   MatStorageT<T> final_layer_output;
   MatStorageT<T> final_norm_output;
   MatStorageT<T> logits;
   MatStorageT<T> probs;
-};
-
-template <typename TConfig>
-struct AllocateForwardPass {
-  ByteStorageT operator()() const {
-    ByteStorageT c_weights_u8 = AllocateSizeof<ForwardPass<float, TConfig>>();
-    auto* c_weights =
-        reinterpret_cast<ForwardPass<float, TConfig>*>(c_weights_u8.get());
-    new (c_weights) ForwardPass<float, TConfig>();
-    return c_weights_u8;
-  }
-};
-
-// Owns activations and undoes the type erasure of AllocateAligned.
-template <typename T, typename TConfig>
-class ActivationsWrapper {
-  using WrappedT = ForwardPass<T, TConfig>;
-
- public:
-  ActivationsWrapper()
-      : data_(AllocateSizeof<WrappedT>()),
-        activations_(*(new(data_.get()) WrappedT())) {}
-
-  const WrappedT& get() const { return activations_; }
-  WrappedT& get() { return activations_; }
-
- private:
-  ByteStorageT data_;
-  WrappedT& activations_;
+  const ModelConfig& weights_config;
 };
 
 }  // namespace gcpp
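[Note: a hedged usage sketch of the new constructors. The config values are
invented for illustration; compare the TestConfig() helpers later in this
patch.]

#include "backprop/activations.h"  // ForwardPass
#include "gemma/configs.h"         // ModelConfig, LayerConfig

// Build a toy ModelConfig at runtime and size a ForwardPass from it.
// ForwardPass stores a const ModelConfig&, so config must outlive it.
void ToyForwardPassDemo() {
  gcpp::ModelConfig config;
  config.model_dim = 32;
  config.vocab_size = 12;
  config.seq_len = 18;
  gcpp::LayerConfig layer = {.model_dim = config.model_dim,
                             .ff_hidden_dim = 48,
                             .heads = 3,
                             .kv_heads = 1,
                             .qkv_dim = 12};
  config.layer_configs = {2, layer};          // Two identical layers.
  gcpp::ForwardPass<float> forward(config);   // Buffers sized at runtime.
  (void)forward;
}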
diff --git a/backprop/backward-inl.h b/backprop/backward-inl.h
index f765a5a..2a0f330 100644
--- a/backprop/backward-inl.h
+++ b/backprop/backward-inl.h
@@ -28,6 +28,7 @@
 #include "backprop/activations.h"
 #include "backprop/prompt.h"
 #include "gemma/common.h"
+#include "gemma/weights.h"
 #include "util/allocator.h"
 #include "hwy/base.h"
 #include "hwy/contrib/thread_pool/thread_pool.h"
@@ -53,45 +54,41 @@ namespace gcpp {
 namespace HWY_NAMESPACE {
 namespace hn = hwy::HWY_NAMESPACE;
 
-template <size_t kCols, size_t kRows>
-void MatMulVJP(const float* HWY_RESTRICT weights,  // kRows * kCols,
-               const float* HWY_RESTRICT x,        // num_tokens * kCols
-               const float* HWY_RESTRICT v,        // num_tokens * kRows
-               size_t num_tokens,
-               float* HWY_RESTRICT grad_w,  // kRows * kCols,
-               float* HWY_RESTRICT grad_x,  // num_tokens * kCols
-               hwy::ThreadPool& pool) {
-  hwy::ZeroBytes(grad_x, num_tokens * kCols * sizeof(grad_x[0]));
+HWY_INLINE void MatMulVJP(const float* HWY_RESTRICT weights,  // kRows * kCols,
+                          const float* HWY_RESTRICT x,   // num_tokens * kCols
+                          const float* HWY_RESTRICT v,   // num_tokens * kRows
+                          size_t cols, size_t rows, size_t num_tokens,
+                          float* HWY_RESTRICT grad_w,  // kRows * kCols,
+                          float* HWY_RESTRICT grad_x,  // num_tokens * kCols
+                          hwy::ThreadPool& pool) {
+  hwy::ZeroBytes(grad_x, num_tokens * cols * sizeof(grad_x[0]));
   for (size_t pos = 0; pos < num_tokens; ++pos) {
-    const size_t voffs = pos * kRows;
-    const size_t xoffs = pos * kCols;
-    for (size_t j = 0; j < kRows; ++j) {
-      MulByConstAndAdd(v[voffs + j], &x[xoffs], &grad_w[j * kCols], kCols);
-      MulByConstAndAdd(v[voffs + j], &weights[j * kCols], &grad_x[xoffs],
-                       kCols);
+    const size_t voffs = pos * rows;
+    const size_t xoffs = pos * cols;
+    for (size_t j = 0; j < rows; ++j) {
+      MulByConstAndAdd(v[voffs + j], &x[xoffs], &grad_w[j * cols], cols);
+      MulByConstAndAdd(v[voffs + j], &weights[j * cols], &grad_x[xoffs], cols);
     }
   }
 }
 
-template <size_t kHeads, size_t kCols, size_t kRows>
-void MultiHeadMatMulVJP(
-    const float* HWY_RESTRICT weights,  // kHeads * kRows * kCols
-    const float* HWY_RESTRICT x,        // num_tokens * kHeads * kCols
+HWY_INLINE void MultiHeadMatMulVJP(
+    const float* HWY_RESTRICT weights,  // heads * kRows * kCols
+    const float* HWY_RESTRICT x,        // num_tokens * heads * kCols
     const float* HWY_RESTRICT v,        // num_tokens * kRows
-    size_t num_tokens,
-    float* HWY_RESTRICT grad_w,  // kHeads * kRows * kCols
-    float* HWY_RESTRICT grad_x,  // num_tokens * kHeads * kCols
+    size_t heads, size_t cols, size_t rows, size_t num_tokens,
+    float* HWY_RESTRICT grad_w,  // heads * kRows * kCols
+    float* HWY_RESTRICT grad_x,  // num_tokens * heads * kCols
     hwy::ThreadPool& pool) {
-  hwy::ZeroBytes(grad_x, num_tokens * kHeads * kCols * sizeof(grad_x[0]));
+  hwy::ZeroBytes(grad_x, num_tokens * heads * cols * sizeof(grad_x[0]));
   for (size_t pos = 0; pos < num_tokens; ++pos) {
-    for (size_t j = 0; j < kRows; ++j) {
-      for (size_t h = 0; h < kHeads; ++h) {
-        MulByConstAndAdd(v[pos * kRows + j],
-                         &x[pos * kHeads * kCols + h * kCols],
-                         &grad_w[h * kRows * kCols + j * kCols], kCols);
-        MulByConstAndAdd(v[pos * kRows + j],
-                         &weights[h * kRows * kCols + j * kCols],
-                         &grad_x[pos * kHeads * kCols + h * kCols], kCols);
+    for (size_t j = 0; j < rows; ++j) {
+      for (size_t h = 0; h < heads; ++h) {
+        MulByConstAndAdd(v[pos * rows + j], &x[pos * heads * cols + h * cols],
+                         &grad_w[h * rows * cols + j * cols], cols);
+        MulByConstAndAdd(v[pos * rows + j],
+                         &weights[h * rows * cols + j * cols],
+                         &grad_x[pos * heads * cols + h * cols], cols);
       }
     }
  }
 }
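[Note: the math these functions implement, as a minimal scalar sketch. For
y = W x per token (W is rows x cols) with upstream gradient v, the VJP
accumulates dW += v x^T and writes dx = W^T v. Names below are ours, not the
gemma.cpp API.]

#include <cstddef>

void MatMulVJPScalar(const float* W, const float* x, const float* v,
                     size_t cols, size_t rows, float* grad_w, float* grad_x) {
  for (size_t i = 0; i < cols; ++i) grad_x[i] = 0.0f;
  for (size_t j = 0; j < rows; ++j) {
    for (size_t i = 0; i < cols; ++i) {
      grad_w[j * cols + i] += v[j] * x[i];  // dW = v x^T (accumulated)
      grad_x[i] += v[j] * W[j * cols + i];  // dx = W^T v
    }
  }
}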
@@ -168,39 +165,39 @@ static HWY_NOINLINE void InputEmbeddingVJP(
   }
 }
 
-template <typename TConfig, typename LayerT>
-void LayerVJP(const LayerT& weights,
-              const ForwardLayer<float, TConfig>& forward,
+template <typename T>
+void LayerVJP(const LayerWeightsPtrs<T>& weights,
+              const ForwardLayer<T>& forward,
               const float* HWY_RESTRICT next_layer_grad, size_t num_tokens,
-              LayerT& grad, ForwardLayer<float, TConfig>& backward,
+              LayerWeightsPtrs<T>& grad, ForwardLayer<T>& backward,
               const RowVectorBatch<float>& inv_timescale,
               hwy::ThreadPool& pool) {
-  static constexpr size_t kModelDim = TConfig::kModelDim;
-  static constexpr size_t kQKVDim = TConfig::kQKVDim;
-  static constexpr size_t kHeads = TConfig::kHeads;
-  static constexpr size_t kSeqLen = TConfig::kSeqLen;
-  static constexpr size_t kFFHiddenDim = TConfig::kFFHiddenDim;
-  static const float kQueryScale =
-      static_cast<float>(1.0 / sqrt(static_cast<double>(kQKVDim)));
-  HWY_ASSERT(num_tokens <= kSeqLen);
+  const LayerConfig& config = weights.layer_config;
+  const size_t model_dim = config.model_dim;
+  const size_t qkv_dim = config.qkv_dim;
+  const size_t heads = config.heads;
+  const size_t seq_len = forward.input.Rows();
+  const size_t ff_hidden_dim = config.ff_hidden_dim;
+  const float query_scale =
+      static_cast<float>(1.0 / sqrt(static_cast<double>(qkv_dim)));
+  HWY_ASSERT(num_tokens <= seq_len);
 
-  MatMulVJP<kFFHiddenDim, kModelDim>(
-      weights.linear_w.data(), forward.ffw_hidden_gated.data(), next_layer_grad,
-      num_tokens, grad.linear_w.data(), backward.ffw_hidden_gated.data(),
-      pool);
+  MatMulVJP(weights.linear_w.data(), forward.ffw_hidden_gated.data(),
+            next_layer_grad, ff_hidden_dim, model_dim, num_tokens,
+            grad.linear_w.data(), backward.ffw_hidden_gated.data(), pool);
   for (size_t pos = 0; pos < num_tokens; ++pos) {
-    const size_t hidden_offset = pos * kFFHiddenDim * 2;
+    const size_t hidden_offset = pos * ff_hidden_dim * 2;
     const float* HWY_RESTRICT f_out = forward.ffw_hidden.data() + hidden_offset;
-    const float* HWY_RESTRICT f_out_mul = f_out + kFFHiddenDim;
+    const float* HWY_RESTRICT f_out_mul = f_out + ff_hidden_dim;
     const float* HWY_RESTRICT b_out_gated =
-        backward.ffw_hidden_gated.data() + pos * kFFHiddenDim;
+        backward.ffw_hidden_gated.data() + pos * ff_hidden_dim;
     float* HWY_RESTRICT b_out = backward.ffw_hidden.data() + hidden_offset;
-    float* HWY_RESTRICT b_out_mul = b_out + kFFHiddenDim;
+    float* HWY_RESTRICT b_out_mul = b_out + ff_hidden_dim;
     namespace hn = hwy::HWY_NAMESPACE;
     using DF = hn::ScalableTag<float>;
     DF df;
-    for (size_t i = 0; i < kFFHiddenDim; i += Lanes(df)) {
+    for (size_t i = 0; i < ff_hidden_dim; i += Lanes(df)) {
       const auto y = Load(df, f_out + i);
       const auto x = Load(df, f_out_mul + i);
       const auto v = Load(df, b_out_gated + i);
@@ -209,101 +206,94 @@ void LayerVJP(const LayerT& weights,
     }
   }
 
-  MatMulVJP<kModelDim, kFFHiddenDim * 2>(
-      weights.gating_einsum_w.data(),
-      forward.bf_pre_ffw_rms_out.data(), backward.ffw_hidden.data(),
-      num_tokens, grad.gating_einsum_w.data(),
-      backward.bf_pre_ffw_rms_out.data(), pool);
-  RMSNormVJP(weights.pre_ffw_norm_scale.data(),
-             forward.attention_out.data(),
-             backward.bf_pre_ffw_rms_out.data(),
-             kModelDim, num_tokens,
-             grad.pre_ffw_norm_scale.data(),
-             backward.attention_out.data(), pool);
+  MatMulVJP(weights.gating_einsum_w.data(), forward.bf_pre_ffw_rms_out.data(),
+            backward.ffw_hidden.data(), model_dim, ff_hidden_dim * 2,
+            num_tokens, grad.gating_einsum_w.data(),
+            backward.bf_pre_ffw_rms_out.data(), pool);
+  RMSNormVJP(weights.pre_ffw_norm_scale.data(), forward.attention_out.data(),
+             backward.bf_pre_ffw_rms_out.data(), model_dim, num_tokens,
+             grad.pre_ffw_norm_scale.data(), backward.attention_out.data(),
+             pool);
   for (size_t pos = 0; pos < num_tokens; ++pos) {
-    AddFrom(next_layer_grad + pos * kModelDim,
-            backward.attention_out.data() + pos * kModelDim, kModelDim);
+    AddFrom(next_layer_grad + pos * model_dim,
+            backward.attention_out.data() + pos * model_dim, model_dim);
   }
 
   backward.qkv.ZeroInit();
 
-  MultiHeadMatMulVJP<kHeads, kQKVDim, kModelDim>(
-      weights.attn_vec_einsum_w.data(), forward.att_out.data(),
-      backward.attention_out.data(), num_tokens,
-      grad.attn_vec_einsum_w.data(), backward.att_out.data(), pool);
+  MultiHeadMatMulVJP(weights.attn_vec_einsum_w.data(), forward.att_out.data(),
+                     backward.attention_out.data(), heads, qkv_dim, model_dim,
+                     num_tokens, grad.attn_vec_einsum_w.data(),
+                     backward.att_out.data(), pool);
 
-  for (size_t head = 0; head < kHeads; ++head) {
+  for (size_t head = 0; head < heads; ++head) {
     for (size_t pos = 0; pos < num_tokens; ++pos) {
-      const size_t aoffset = head * kSeqLen + pos * kHeads * kSeqLen;
+      const size_t aoffset = head * seq_len + pos * heads * seq_len;
       const float* HWY_RESTRICT f_head_att = forward.att.data() + aoffset;
       const float* HWY_RESTRICT b_att_out =
-          backward.att_out.data() + (pos * kHeads + head) * kQKVDim;
+          backward.att_out.data() + (pos * heads + head) * qkv_dim;
       float* HWY_RESTRICT b_head_att = backward.att.data() + aoffset;
       for (size_t pos2 = 0; pos2 <= pos; ++pos2) {
-        const size_t v2offs = (pos2 * (kHeads + 2) + kHeads + 1) * kQKVDim;
+        const size_t v2offs = (pos2 * (heads + 2) + heads + 1) * qkv_dim;
         const float* HWY_RESTRICT f_v2 = forward.qkv.data() + v2offs;
         float* HWY_RESTRICT b_v2 = backward.qkv.data() + v2offs;
-        b_head_att[pos2] = Dot(b_att_out, f_v2, kQKVDim);
-        MulByConstAndAdd(f_head_att[pos2], b_att_out, b_v2, kQKVDim);
+        b_head_att[pos2] = Dot(b_att_out, f_v2, qkv_dim);
+        MulByConstAndAdd(f_head_att[pos2], b_att_out, b_v2, qkv_dim);
       }
     }
   }
 
-  for (size_t head = 0; head < kHeads; ++head) {
+  for (size_t head = 0; head < heads; ++head) {
     for (size_t pos = 0; pos < num_tokens; ++pos) {
-      const size_t aoffset = head * kSeqLen + pos * kHeads * kSeqLen;
+      const size_t aoffset = head * seq_len + pos * heads * seq_len;
       const float* HWY_RESTRICT f_head_att = forward.att.data() + aoffset;
       float* HWY_RESTRICT b_head_att = backward.att.data() + aoffset;
       SoftmaxVJP(f_head_att, b_head_att, pos + 1);
     }
   }
 
-  for (size_t head = 0; head < kHeads; ++head) {
+  for (size_t head = 0; head < heads; ++head) {
     for (size_t pos = 0; pos < num_tokens; ++pos) {
-      const size_t qoffs = (pos * (kHeads + 2) + head) * kQKVDim;
-      const size_t aoffs = head * kSeqLen + pos * kHeads * kSeqLen;
+      const size_t qoffs = (pos * (heads + 2) + head) * qkv_dim;
+      const size_t aoffs = head * seq_len + pos * heads * seq_len;
       const float* HWY_RESTRICT f_q = forward.qkv.data() + qoffs;
       const float* HWY_RESTRICT b_head_att = backward.att.data() + aoffs;
       float* HWY_RESTRICT b_q = backward.qkv.data() + qoffs;
       for (size_t pos2 = 0; pos2 <= pos; ++pos2) {
-        const size_t k2offs = (pos2 * (kHeads + 2) + kHeads) * kQKVDim;
+        const size_t k2offs = (pos2 * (heads + 2) + heads) * qkv_dim;
         const float* HWY_RESTRICT f_k2 = forward.qkv.data() + k2offs;
         float* HWY_RESTRICT b_k2 = backward.qkv.data() + k2offs;
-        MulByConstAndAdd(b_head_att[pos2], f_k2, b_q, kQKVDim);
-        MulByConstAndAdd(b_head_att[pos2], f_q, b_k2, kQKVDim);
+        MulByConstAndAdd(b_head_att[pos2], f_k2, b_q, qkv_dim);
+        MulByConstAndAdd(b_head_att[pos2], f_q, b_k2, qkv_dim);
       }
     }
   }
 
   for (int pos = 0; pos < static_cast<int>(num_tokens); ++pos) {
     float* HWY_RESTRICT b_kv =
-        backward.qkv.data() + (pos * (kHeads + 2) + kHeads) * kQKVDim;
-    Rope(b_kv, kQKVDim, inv_timescale.Const(), -pos);
+        backward.qkv.data() + (pos * (heads + 2) + heads) * qkv_dim;
+    Rope(b_kv, qkv_dim, inv_timescale.Const(), -pos);
   }
 
-  for (size_t head = 0; head < kHeads; ++head) {
+  for (size_t head = 0; head < heads; ++head) {
     for (size_t pos = 0; pos < num_tokens; ++pos) {
       float* HWY_RESTRICT b_q =
-          backward.qkv.data() + (pos * (kHeads + 2) + head) * kQKVDim;
-      MulByConst(kQueryScale, b_q, kQKVDim);
-      Rope(b_q, kQKVDim, inv_timescale.Const(), -pos);
+          backward.qkv.data() + (pos * (heads + 2) + head) * qkv_dim;
+      MulByConst(query_scale, b_q, qkv_dim);
+      Rope(b_q, qkv_dim, inv_timescale.Const(), -pos);
     }
   }
 
-  MatMulVJP<kModelDim, (kHeads + 2) * kQKVDim>(
-      weights.qkv_einsum_w.data(), forward.pre_att_rms_out.data(),
-      backward.qkv.data(), num_tokens,
-      grad.qkv_einsum_w.data(), backward.pre_att_rms_out.data(), pool);
-  RMSNormVJP(weights.pre_attention_norm_scale.data(),
-             forward.input.data(),
-             backward.pre_att_rms_out.data(),
-             kModelDim, num_tokens,
-             grad.pre_attention_norm_scale.data(),
-             backward.input.data(), pool);
+  MatMulVJP(weights.qkv_einsum_w.data(), forward.pre_att_rms_out.data(),
+            backward.qkv.data(), model_dim, (heads + 2) * qkv_dim, num_tokens,
+            grad.qkv_einsum_w.data(), backward.pre_att_rms_out.data(), pool);
+  RMSNormVJP(weights.pre_attention_norm_scale.data(), forward.input.data(),
+             backward.pre_att_rms_out.data(), model_dim, num_tokens,
+             grad.pre_attention_norm_scale.data(), backward.input.data(), pool);
 
   for (size_t pos = 0; pos < num_tokens; ++pos) {
-    AddFrom(backward.attention_out.data() + pos * kModelDim,
-            backward.input.data() + pos * kModelDim, kModelDim);
+    AddFrom(backward.attention_out.data() + pos * model_dim,
+            backward.input.data() + pos * model_dim, model_dim);
   }
 }
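[Note: the backward pass above undoes the forward rotary embedding by calling
Rope with -pos; since rotations are orthogonal, R(-theta) = R(theta)^-1. A
scalar sketch of the rotation; the half-split pairing and base 10000 are
assumptions based on common RoPE implementations, not copied from gemma.cpp.]

#include <cmath>
#include <cstddef>

// Rotates each (x[i], x[i + dim/2]) pair by pos * inv_timescale(i).
// Calling with a negated pos applies the exact inverse rotation.
void RopeScalar(float* x, size_t dim, int pos) {
  const size_t half = dim / 2;
  for (size_t i = 0; i < half; ++i) {
    const double inv_timescale = std::pow(10000.0, -2.0 * i / dim);
    const double theta = pos * inv_timescale;
    const float c = std::cos(theta), s = std::sin(theta);
    const float x0 = x[i], x1 = x[i + half];
    x[i] = x0 * c - x1 * s;
    x[i + half] = x0 * s + x1 * c;
  }
}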
@@ -342,20 +332,22 @@ static HWY_NOINLINE void CrossEntropyLossGrad(
   }
 }
 
-template <typename TConfig, typename WeightsT>
-void CrossEntropyLossBackwardPass(const Prompt& prompt, const WeightsT& weights,
-                                  const ForwardPass<float, TConfig>& forward,
-                                  WeightsT& grad,
-                                  ForwardPass<float, TConfig>& backward,
-                                  RowVectorBatch<float>& inv_timescale,
-                                  hwy::ThreadPool& pool) {
-  static constexpr size_t kVocabSize = TConfig::kVocabSize;
-  static constexpr size_t kModelDim = TConfig::kModelDim;
-  static constexpr size_t kLayers = TConfig::kLayers;
-  const float kEmbScaling = EmbeddingScaling<TConfig>();
-  static_assert(!TConfig::kAbsolutePE);
-  static_assert(TConfig::kPostNorm == PostNormType::None);
-  static_assert(TConfig::kKVHeads == 1);
+template <typename T>
+void CrossEntropyLossBackwardPassInl(const Prompt& prompt,
+                                     const ModelWeightsPtrs<T>& weights,
+                                     const ForwardPass<T>& forward,
+                                     ModelWeightsPtrs<T>& grad,
+                                     ForwardPass<T>& backward,
+                                     RowVectorBatch<float>& inv_timescale,
+                                     hwy::ThreadPool& pool) {
+  const ModelConfig& config = weights.weights_config;
+  const size_t kVocabSize = config.vocab_size;
+  const size_t model_dim = config.model_dim;
+  const size_t kLayers = config.layer_configs.size();
+  const float kEmbScaling = EmbeddingScaling(model_dim);
+  HWY_ASSERT(!config.absolute_pe);
+  HWY_ASSERT(config.layer_configs[0].post_norm == PostNormType::None);
+  HWY_ASSERT(config.layer_configs[0].kv_heads == 1);
 
   HWY_DASSERT(prompt.context_size > 0);
   HWY_DASSERT(prompt.context_size < prompt.tokens.size());
@@ -370,42 +362,38 @@ void CrossEntropyLossBackwardPass(const Prompt& prompt,
         kVocabSize);
   }
 
-  if constexpr (TConfig::kFinalCap > 0.0f) {
+  if (config.final_cap > 0.0f) {
     for (size_t pos = 0; pos < num_tokens; ++pos) {
-      SoftcapVJP(TConfig::kFinalCap, forward.logits.data() + pos * kVocabSize,
+      SoftcapVJP(config.final_cap, forward.logits.data() + pos * kVocabSize,
                  backward.logits.data() + pos * kVocabSize, kVocabSize);
     }
   }
 
-  MatMulVJP<kModelDim, kVocabSize>(
-      weights.embedder_input_embedding.data(), forward.final_norm_output.data(),
-      backward.logits.data(), num_tokens,
-      grad.embedder_input_embedding.data(), backward.final_norm_output.data(),
-      pool);
+  MatMulVJP(weights.embedder_input_embedding.data(),
+            forward.final_norm_output.data(), backward.logits.data(), model_dim,
+            kVocabSize, num_tokens, grad.embedder_input_embedding.data(),
+            backward.final_norm_output.data(), pool);
 
-  RMSNormVJP(weights.final_norm_scale.data(),
-             forward.final_layer_output.data(),
-             backward.final_norm_output.data(),
-             kModelDim, num_tokens,
-             grad.final_norm_scale.data(),
-             backward.final_layer_output.data(), pool);
+  RMSNormVJP(weights.final_norm_scale.data(), forward.final_layer_output.data(),
+             backward.final_norm_output.data(), model_dim, num_tokens,
+             grad.final_norm_scale.data(), backward.final_layer_output.data(),
+             pool);
 
   for (int layer = static_cast<int>(kLayers) - 1; layer >= 0; --layer) {
-    auto type = TConfig::kLayerConfig[layer];
+    auto layer_config = config.layer_configs[layer];
     // TODO(szabadka) Implement Griffin layer vjp.
-    HWY_ASSERT(type == LayerAttentionType::kGemma);
+    HWY_ASSERT(layer_config.type == LayerAttentionType::kGemma);
     float* next_layer_grad = layer + 1 < kLayers
                                  ? backward.layers[layer + 1].input.data()
                                  : backward.final_layer_output.data();
-    LayerVJP(*weights.GetLayer(layer), forward.layers[layer],
-             next_layer_grad, num_tokens,
-             *grad.GetLayer(layer), backward.layers[layer],
-             inv_timescale, pool);
+    LayerVJP(*weights.GetLayer(layer), forward.layers[layer], next_layer_grad,
+             num_tokens, *grad.GetLayer(layer), backward.layers[layer],
+             inv_timescale, pool);
   }
 
   InputEmbeddingVJP(weights.embedder_input_embedding.data(), prompt.tokens,
                     kEmbScaling, backward.layers[0].input.data(),
-                    grad.embedder_input_embedding.data(), kModelDim);
+                    grad.embedder_input_embedding.data(), model_dim);
 }
 
 // NOLINTNEXTLINE(google-readability-namespace-comments)
diff --git a/backprop/backward.cc b/backprop/backward.cc
index c186952..868b391 100644
--- a/backprop/backward.cc
+++ b/backprop/backward.cc
@@ -38,44 +38,15 @@ HWY_BEFORE_NAMESPACE();
 namespace gcpp {
 namespace HWY_NAMESPACE {
 
-template <typename TConfig>
-void CrossEntropyLossBackwardPass(const Prompt& prompt,
-                                  const ByteStorageT& weights_u8,
-                                  const ByteStorageT& forward_u8,
-                                  ByteStorageT& grad_u8,
-                                  ByteStorageT& backward_u8,
-                                  RowVectorBatch<float>& inv_timescale,
-                                  hwy::ThreadPool& pool) {
-  using TWeights = CompressedWeights<TConfig>;
-  const auto& weights = *reinterpret_cast<const TWeights*>(weights_u8.get());
-  auto& grad = *reinterpret_cast<TWeights*>(grad_u8.get());
-  using TAct = ForwardPass<float, TConfig>;
-  const auto& forward = *reinterpret_cast<const TAct*>(forward_u8.get());
-  auto& backward = *reinterpret_cast<TAct*>(backward_u8.get());
-  CrossEntropyLossBackwardPass<TConfig, CompressedWeights<TConfig>,
-                               CompressedLayer<TConfig>>(
-      prompt, weights, forward, grad, backward, inv_timescale, pool);
-}
-
-void CrossEntropyLossBackwardPassT(Model model, const Prompt& prompt,
-                                   const ByteStorageT& weights,
-                                   const ByteStorageT& forward,
-                                   ByteStorageT& grad, ByteStorageT& backward,
+void CrossEntropyLossBackwardPassT(const Prompt& prompt,
+                                   const ModelWeightsPtrs<float>& weights,
+                                   const ForwardPass<float>& forward,
+                                   ModelWeightsPtrs<float>& grad,
+                                   ForwardPass<float>& backward,
                                    RowVectorBatch<float>& inv_timescale,
                                    hwy::ThreadPool& pool) {
-  // TODO(janwas): use CallFunctorForModel
-  switch (model) {
-    case Model::GEMMA_2B:
-      CrossEntropyLossBackwardPass<ConfigGemma2B<float>>(
-          prompt, weights, forward, grad, backward, inv_timescale, pool);
-      break;
-    case Model::GEMMA_TINY:
-      CrossEntropyLossBackwardPass<ConfigGemmaTiny<float>>(
-          prompt, weights, forward, grad, backward, inv_timescale, pool);
-      break;
-    default:
-      HWY_ABORT("Model type %d unknown.", static_cast<int>(model));
-  }
+  CrossEntropyLossBackwardPassInl(prompt, weights, forward, grad, backward,
+                                  inv_timescale, pool);
 }
 
 }  // namespace HWY_NAMESPACE
@@ -87,14 +58,15 @@ namespace gcpp {
 
 HWY_EXPORT(CrossEntropyLossBackwardPassT);
 
-void CrossEntropyLossBackwardPass(const Model& model, const Prompt& prompt,
-                                  const ByteStorageT& weights,
-                                  const ByteStorageT& forward,
-                                  ByteStorageT& grad, ByteStorageT& backward,
+void CrossEntropyLossBackwardPass(const Prompt& prompt,
+                                  const ModelWeightsPtrs<float>& weights,
+                                  const ForwardPass<float>& forward,
+                                  ModelWeightsPtrs<float>& grad,
+                                  ForwardPass<float>& backward,
                                   RowVectorBatch<float>& inv_timescale,
                                   hwy::ThreadPool& pool) {
   return HWY_DYNAMIC_DISPATCH(CrossEntropyLossBackwardPassT)(
-      model, prompt, weights, forward, grad, backward, inv_timescale, pool);
+      prompt, weights, forward, grad, backward, inv_timescale, pool);
 }
 
 }  // namespace gcpp
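[Note: backward.cc keeps Highway's per-target compilation and runtime
dispatch; only the per-model switch disappears. A minimal sketch of that
pattern under our own names (file name and namespace are hypothetical; see
the Highway documentation for the macros' exact semantics):]

#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "example_dispatch.cc"
#include <cstddef>

#include "hwy/foreach_target.h"  // re-includes this file per SIMD target
#include "hwy/highway.h"

HWY_BEFORE_NAMESPACE();
namespace example {
namespace HWY_NAMESPACE {
float SumT(const float* x, size_t n) {  // One copy compiled per target.
  float sum = 0.0f;
  for (size_t i = 0; i < n; ++i) sum += x[i];
  return sum;
}
}  // namespace HWY_NAMESPACE
}  // namespace example
HWY_AFTER_NAMESPACE();

#if HWY_ONCE
namespace example {
HWY_EXPORT(SumT);  // Table of per-target entry points.
float Sum(const float* x, size_t n) {
  return HWY_DYNAMIC_DISPATCH(SumT)(x, n);  // Picks best target at runtime.
}
}  // namespace example
#endif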
diff --git a/backprop/backward.h b/backprop/backward.h
index 0ac218a..d8e50c7 100644
--- a/backprop/backward.h
+++ b/backprop/backward.h
@@ -16,17 +16,19 @@
 #ifndef THIRD_PARTY_GEMMA_CPP_GEMMA_BACKWARD_H_
 #define THIRD_PARTY_GEMMA_CPP_GEMMA_BACKWARD_H_
 
+#include "backprop/activations.h"
 #include "backprop/prompt.h"
-#include "gemma/activations.h"
-#include "gemma/common.h"
+#include "gemma/weights.h"
+#include "util/allocator.h"
 #include "hwy/contrib/thread_pool/thread_pool.h"
 
 namespace gcpp {
 
-void CrossEntropyLossBackwardPass(const Model& model, const Prompt& prompt,
-                                  const ByteStorageT& weights,
-                                  const ByteStorageT& forward,
-                                  ByteStorageT& grad, ByteStorageT& backward,
+void CrossEntropyLossBackwardPass(const Prompt& prompt,
+                                  const ModelWeightsPtrs<float>& weights,
+                                  const ForwardPass<float>& forward,
+                                  ModelWeightsPtrs<float>& grad,
+                                  ForwardPass<float>& backward,
                                   RowVectorBatch<float>& inv_timescale,
                                   hwy::ThreadPool& pool);
 
diff --git a/backprop/backward_scalar.h b/backprop/backward_scalar.h
index a804cd3..b0a37b3 100644
--- a/backprop/backward_scalar.h
+++ b/backprop/backward_scalar.h
@@ -125,65 +125,64 @@ void GatedGeluVJP(const T* in, const T* d_out, T* d_in, size_t N, size_t K) {
   }
 }
 
-
-template <typename T>
+template <typename T>
 void MaskedAttentionVJP(const T* qkv, const T* doutput, T* dqkv,
-                        size_t num_tokens, size_t kHeads, size_t kQKVDim,
-                        size_t kSeqLen) {
+                        size_t num_tokens, size_t kHeads, size_t qkv_dim,
+                        size_t seq_len) {
   for (size_t pos = 0; pos < num_tokens; ++pos) {
-    const size_t offset = pos * (kHeads + 2) * kQKVDim;
-    memset(dqkv + offset, 0, (kHeads + 1) * kQKVDim * sizeof(qkv[0]));
+    const size_t offset = pos * (kHeads + 2) * qkv_dim;
+    memset(dqkv + offset, 0, (kHeads + 1) * qkv_dim * sizeof(qkv[0]));
   }
   for (size_t head = 0; head < kHeads; ++head) {
     for (size_t pos = 0; pos < num_tokens; ++pos) {
-      const size_t qoffs = (pos * (kHeads + 2) + head) * kQKVDim;
-      const size_t aoffs = head * kSeqLen + pos * kHeads * kSeqLen;
+      const size_t qoffs = (pos * (kHeads + 2) + head) * qkv_dim;
+      const size_t aoffs = head * seq_len + pos * kHeads * seq_len;
       const T* q = qkv + qoffs;
       const T* dout = doutput + aoffs;
       T* dq = dqkv + qoffs;
       for (size_t pos2 = 0; pos2 <= pos; ++pos2) {
-        const size_t koffs = (pos2 * (kHeads + 2) + kHeads) * kQKVDim;
+        const size_t koffs = (pos2 * (kHeads + 2) + kHeads) * qkv_dim;
         const T* k = qkv + koffs;
         T* dk = dqkv + koffs;
-        MulByConstAndAddT(dout[pos2], k, dq, kQKVDim);
-        MulByConstAndAddT(dout[pos2], q, dk, kQKVDim);
+        MulByConstAndAddT(dout[pos2], k, dq, qkv_dim);
+        MulByConstAndAddT(dout[pos2], q, dk, qkv_dim);
       }
     }
   }
 }
 
-template <typename T>
-void MaskedSoftmaxVJPT(const T* y, T* dy, size_t num_tokens,
-                       size_t kHeads, size_t kSeqLen) {
+template <typename T>
+void MaskedSoftmaxVJPT(const T* y, T* dy, size_t num_tokens, size_t kHeads,
+                       size_t seq_len) {
   for (size_t head = 0; head < kHeads; ++head) {
     for (size_t pos = 0; pos < num_tokens; ++pos) {
-      size_t offset = pos * kHeads * kSeqLen + head * kSeqLen;
+      size_t offset = pos * kHeads * seq_len + head * seq_len;
       SoftmaxVJPT(y + offset, dy + offset, pos + 1);
-      memset(dy + offset + pos + 1, 0, (kSeqLen - pos - 1) * sizeof(T));
+      memset(dy + offset + pos + 1, 0, (seq_len - pos - 1) * sizeof(T));
     }
   }
 }
 
-template <typename T>
+template <typename T>
 void MixByAttentionVJP(const T* qkv, const T* attention, const T* doutput,
-                       T* dqkv, T* dattention, size_t num_tokens,
-                       size_t kHeads, size_t kQKVDim, size_t kSeqLen) {
+                       T* dqkv, T* dattention, size_t num_tokens, size_t kHeads,
+                       size_t qkv_dim, size_t seq_len) {
   auto v_offset = [&](size_t pos) {
-    return (pos * (kHeads + 2) + kHeads + 1) * kQKVDim;
+    return (pos * (kHeads + 2) + kHeads + 1) * qkv_dim;
   };
   for (size_t pos = 0; pos < num_tokens; ++pos) {
-    memset(&dqkv[v_offset(pos)], 0, kQKVDim * sizeof(qkv[0]));
+    memset(&dqkv[v_offset(pos)], 0, qkv_dim * sizeof(qkv[0]));
   }
   for (size_t head = 0; head < kHeads; ++head) {
     for (size_t pos = 0; pos < num_tokens; ++pos) {
-      const size_t offset = head * kQKVDim + pos * kHeads * kQKVDim;
-      const size_t aoffset = head * kSeqLen + pos * kHeads * kSeqLen;
+      const size_t offset = head * qkv_dim + pos * kHeads * qkv_dim;
+      const size_t aoffset = head * seq_len + pos * kHeads * seq_len;
       const T* att = &attention[aoffset];
       const T* dout = &doutput[offset];
       T* datt = &dattention[aoffset];
       for (size_t pos2 = 0; pos2 <= pos; ++pos2) {
-        datt[pos2] = DotT(dout, &qkv[v_offset(pos2)], kQKVDim);
-        MulByConstAndAddT(att[pos2], dout, &dqkv[v_offset(pos2)], kQKVDim);
+        datt[pos2] = DotT(dout, &qkv[v_offset(pos2)], qkv_dim);
+        MulByConstAndAddT(att[pos2], dout, &dqkv[v_offset(pos2)], qkv_dim);
       }
     }
   }
@@ -199,77 +198,76 @@ void InputEmbeddingVJPT(const T* w, const std::vector<int>& tokens, T scaling,
   }
 }
 
-template <typename T, typename TConfig>
-void LayerVJP(const CompressedLayer<TConfig>& weights,
-              const ForwardLayer<T, TConfig>& forward, const T* dy,
-              CompressedLayer<TConfig>& grad,
-              ForwardLayer<T, TConfig>& backward, size_t num_tokens) {
-  static constexpr size_t kModelDim = TConfig::kModelDim;
-  static constexpr size_t kSeqLen = TConfig::kSeqLen;
-  static constexpr size_t kQKVDim = TConfig::kQKVDim;
-  static constexpr size_t kHeads = TConfig::kHeads;
-  static constexpr size_t kFFHiddenDim = TConfig::kFFHiddenDim;
-  static const T kQueryScale = 1.0 / std::sqrt(T(kQKVDim));
+template <typename T>
+void LayerVJP(const LayerWeightsPtrs<T>& weights,
+              const ForwardLayer<T>& forward, const T* dy,
+              LayerWeightsPtrs<T>& grad, ForwardLayer<T>& backward,
+              size_t num_tokens) {
+  const LayerConfig& layer_config = weights.layer_config;
+  const size_t model_dim = layer_config.model_dim;
+  const size_t seq_len = forward.input.Rows();
+  const size_t qkv_dim = layer_config.qkv_dim;
+  const size_t kHeads = layer_config.heads;
+  const size_t kFFHiddenDim = layer_config.ff_hidden_dim;
+  const T kQueryScale = 1.0 / std::sqrt(T(qkv_dim));
 
-  MatMulVJPT(weights.linear_w.data(), forward.ffw_hidden_gated.data(),
-             dy, grad.linear_w.data(), backward.ffw_hidden_gated.data(),
-             kModelDim, kFFHiddenDim, num_tokens);
+  MatMulVJPT(weights.linear_w.data(), forward.ffw_hidden_gated.data(), dy,
+             grad.linear_w.data(), backward.ffw_hidden_gated.data(), model_dim,
+             kFFHiddenDim, num_tokens);
   GatedGeluVJP(forward.ffw_hidden.data(), backward.ffw_hidden_gated.data(),
                backward.ffw_hidden.data(), kFFHiddenDim, num_tokens);
 
   MatMulVJPT(weights.gating_einsum_w.data(), forward.bf_pre_ffw_rms_out.data(),
              backward.ffw_hidden.data(), grad.gating_einsum_w.data(),
-             backward.bf_pre_ffw_rms_out.data(), kFFHiddenDim * 2, kModelDim,
+             backward.bf_pre_ffw_rms_out.data(), kFFHiddenDim * 2, model_dim,
              num_tokens);
   RMSNormVJPT(weights.pre_ffw_norm_scale.data(), forward.attention_out.data(),
               backward.bf_pre_ffw_rms_out.data(),
               grad.pre_ffw_norm_scale.data(), backward.attention_out.data(),
-              kModelDim, num_tokens);
+              model_dim, num_tokens);
 
-  AddFromT(dy, backward.attention_out.data(), num_tokens * kModelDim);
+  AddFromT(dy, backward.attention_out.data(), num_tokens * model_dim);
 
   MultiHeadMatMulVJPT(weights.attn_vec_einsum_w.data(), forward.att_out.data(),
                       backward.attention_out.data(),
-                      grad.attn_vec_einsum_w.data(),
-                      backward.att_out.data(),
-                      kHeads, kModelDim, kQKVDim, num_tokens);
+                      grad.attn_vec_einsum_w.data(), backward.att_out.data(),
+                      kHeads, model_dim, qkv_dim, num_tokens);
 
   MixByAttentionVJP(forward.qkv.data(), forward.att.data(),
                     backward.att_out.data(), backward.qkv.data(),
-                    backward.att.data(), num_tokens, kHeads, kQKVDim,
-                    kSeqLen);
+                    backward.att.data(), num_tokens, kHeads, qkv_dim, seq_len);
 
-  MaskedSoftmaxVJPT(forward.att.data(), backward.att.data(),
-                    num_tokens, kHeads, kSeqLen);
+  MaskedSoftmaxVJPT(forward.att.data(), backward.att.data(), num_tokens, kHeads,
+                    seq_len);
 
   MaskedAttentionVJP(forward.qkv.data(), backward.att.data(),
-                     backward.qkv.data(), num_tokens, kHeads, kQKVDim, kSeqLen);
+                     backward.qkv.data(), num_tokens, kHeads, qkv_dim, seq_len);
 
   for (size_t pos = 0; pos < num_tokens; ++pos) {
-    T* qkv = backward.qkv.data() + pos * (kHeads + 2) * kQKVDim;
-    MulByConstT(kQueryScale, qkv, kHeads * kQKVDim);
+    T* qkv = backward.qkv.data() + pos * (kHeads + 2) * qkv_dim;
+    MulByConstT(kQueryScale, qkv, kHeads * qkv_dim);
   }
 
   for (int pos = 0; pos < num_tokens; ++pos) {
-    T* qkv = backward.qkv.data() + pos * (kHeads + 2) * kQKVDim;
+    T* qkv = backward.qkv.data() + pos * (kHeads + 2) * qkv_dim;
     for (size_t h = 0; h <= kHeads; ++h) {
-      Rope(qkv + h * kQKVDim, kQKVDim, -pos);
+      Rope(qkv + h * qkv_dim, qkv_dim, -pos);
     }
   }
 
   MatMulVJPT(weights.qkv_einsum_w.data(), forward.pre_att_rms_out.data(),
              backward.qkv.data(), grad.qkv_einsum_w.data(),
-             backward.pre_att_rms_out.data(),
-             (kHeads + 2) * kQKVDim, kModelDim, num_tokens);
+             backward.pre_att_rms_out.data(), (kHeads + 2) * qkv_dim, model_dim,
+             num_tokens);
   RMSNormVJPT(weights.pre_attention_norm_scale.data(), forward.input.data(),
               backward.pre_att_rms_out.data(),
-              grad.pre_attention_norm_scale.data(),
-              backward.input.data(), kModelDim, num_tokens);
+              grad.pre_attention_norm_scale.data(), backward.input.data(),
+              model_dim, num_tokens);
 
   AddFromT(backward.attention_out.data(), backward.input.data(),
-           num_tokens * kModelDim);
+           num_tokens * model_dim);
 }
 
 template <typename T>
@@ -296,56 +294,54 @@ void CrossEntropyLossGrad(const T* x, T* dx, const Prompt& prompt, size_t V) {
   }
 }
 
-template <typename T, typename TConfig>
+template <typename T>
 void CrossEntropyLossBackwardPass(const Prompt& prompt,
-                                  const CompressedWeights<TConfig>& weights,
-                                  const ForwardPass<T, TConfig>& forward,
-                                  CompressedWeights<TConfig>& grad,
-                                  ForwardPass<T, TConfig>& backward) {
-  static constexpr size_t kModelDim = TConfig::kModelDim;
-  static constexpr size_t kVocabSize = TConfig::kVocabSize;
-  static constexpr size_t kLayers = TConfig::kLayers;
+                                  const ModelWeightsPtrs<T>& weights,
+                                  const ForwardPass<T>& forward,
+                                  ModelWeightsPtrs<T>& grad,
+                                  ForwardPass<T>& backward) {
+  const ModelConfig& config = weights.weights_config;
+  const size_t model_dim = config.model_dim;
+  const size_t vocab_size = config.vocab_size;
+  const size_t layers = config.layer_configs.size();
   const std::vector<int> tokens = prompt.tokens;
   const size_t num_tokens = tokens.empty() ? 0 : tokens.size() - 1;
 
   CrossEntropyLossGrad(forward.probs.data(), backward.logits.data(), prompt,
-                       kVocabSize);
+                       vocab_size);
 
-  SoftmaxVJPT(forward.probs.data(), backward.logits.data(),
-              kVocabSize, num_tokens);
+  SoftmaxVJPT(forward.probs.data(), backward.logits.data(), vocab_size,
+              num_tokens);
 
-  if constexpr (TConfig::kFinalCap > 0.0f) {
+  if (config.final_cap > 0.0f) {
     for (size_t i = 0; i < num_tokens; ++i) {
-      SoftcapVJPT(TConfig::kFinalCap, forward.logits.data() + i * kVocabSize,
-                  backward.logits.data() + i * kVocabSize, kVocabSize);
+      SoftcapVJPT(config.final_cap, forward.logits.data() + i * vocab_size,
+                  backward.logits.data() + i * vocab_size, vocab_size);
     }
   }
 
-  MatMulVJPT(weights.embedder_input_embedding.data(),
-             forward.final_norm_output.data(),
-             backward.logits.data(),
-             grad.embedder_input_embedding.data(),
-             backward.final_norm_output.data(),
-             kVocabSize, kModelDim, num_tokens);
+  MatMulVJPT(
+      weights.embedder_input_embedding.data(), forward.final_norm_output.data(),
+      backward.logits.data(), grad.embedder_input_embedding.data(),
+      backward.final_norm_output.data(), vocab_size, model_dim, num_tokens);
   RMSNormVJPT(weights.final_norm_scale.data(),
               forward.final_layer_output.data(),
-              backward.final_norm_output.data(),
-              grad.final_norm_scale.data(),
-              backward.final_layer_output.data(), kModelDim, num_tokens);
+              backward.final_norm_output.data(), grad.final_norm_scale.data(),
+              backward.final_layer_output.data(), model_dim, num_tokens);
 
-  for (int layer = static_cast<int>(kLayers) - 1; layer >= 0; --layer) {
-    T* next_layer_grad = layer + 1 < kLayers
-                             ? backward.layers[layer + 1].input.data()
-                             : backward.final_layer_output.data();
+  for (int layer = static_cast<int>(layers) - 1; layer >= 0; --layer) {
+    T* next_layer_grad = layer + 1 < layers
                             ? backward.layers[layer + 1].input.data()
                             : backward.final_layer_output.data();
     LayerVJP(*weights.GetLayer(layer), forward.layers[layer], next_layer_grad,
             *grad.GetLayer(layer), backward.layers[layer], num_tokens);
   }
 
-  const T kEmbScaling = EmbeddingScaling(kModelDim);
-  InputEmbeddingVJPT(weights.embedder_input_embedding.data(),
-                     tokens, kEmbScaling, backward.layers[0].input.data(),
-                     grad.embedder_input_embedding.data(), kModelDim);
+  const T kEmbScaling = EmbeddingScaling(model_dim);
+  InputEmbeddingVJPT(weights.embedder_input_embedding.data(), tokens,
+                     kEmbScaling, backward.layers[0].input.data(),
+                     grad.embedder_input_embedding.data(), model_dim);
 }
 
 }  // namespace gcpp
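[Note: the tests below verify these scalar gradients via complex-step
differentiation (the Complexify/TestGradient helpers): evaluating f at
x + i*h gives f'(x) ~= Im(f(x + i*h)) / h with no subtractive cancellation.
A self-contained sketch of the idea, using our own toy function rather than
the gemma.cpp helpers:]

#include <complex>
#include <cstdio>

// f(x) = x^3, templated so it accepts real or complex arguments, mirroring
// how the tests run the scalar forward pass on complexified weights.
template <typename T>
T F(T x) { return x * x * x; }

int main() {
  const double x = 0.7, h = 1e-20;  // Tiny h is safe: no cancellation.
  const std::complex<double> fx = F(std::complex<double>(x, h));
  const double grad = fx.imag() / h;  // ~ 3 * x^2
  std::printf("f'(%g) = %g (exact %g)\n", x, grad, 3 * x * x);
}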
diff --git a/backprop/backward_scalar_test.cc b/backprop/backward_scalar_test.cc
index 262a121..b5e39db 100644
--- a/backprop/backward_scalar_test.cc
+++ b/backprop/backward_scalar_test.cc
@@ -19,7 +19,6 @@
 #include <stddef.h>
 #include <string.h>  // memcpy
 
-#include <array>
 #include <complex>
 #include <random>
 #include <vector>
@@ -384,44 +383,49 @@ TEST(BackPropTest, InputEmbeddingVJP) {
   }
 }
 
-template <typename T>
-struct TestConfig : ConfigBaseGemmaV2 {
-  using Weight = T;
-  static constexpr int kSeqLen = 18;
-  static constexpr int kVocabSize = 12;
-  static constexpr int kModelDim = 32;
-  static constexpr int kHeads = 3;
-  static constexpr int kQKVDim = 12;
-  static constexpr int kFFHiddenDim = 48;
-  static constexpr std::array<LayerAttentionType, 2> kLayerConfig =
-      FixedLayerConfig<2>(LayerAttentionType::kGemma);
-  static constexpr int kLayers = kLayerConfig.size();
-  static constexpr int kNumTensorScales = 4 * kLayers;
-  static constexpr bool kAbsolutePE = false;
-  static constexpr PostNormType kPostNorm = PostNormType::None;
-
-  static constexpr int kKVHeads = 1;
-  static constexpr int kGemmaLayers = kLayers;
-};
+static ModelConfig TestConfig() {
+  ModelConfig config;
+  config.scale_names = {"att_ein", "qkv_ein", "gr_lin_x_w", "gr_lin_y_w",
+                        "gr_lin_out_w", "gr_gate_w", "gating_ein", "linear_w"};
+  config.model_dim = 32;
+  config.vocab_size = 12;
+  config.seq_len = 18;
+  LayerConfig layer_config = {
+      .model_dim = config.model_dim,
+      .ff_hidden_dim = 48,
+      .heads = 3,
+      .kv_heads = 1,
+      .qkv_dim = 12,
+  };
+  config.layer_configs = {2, layer_config};
+  config.num_tensor_scales = 4 * config.layer_configs.size();
+  config.query_scale = QueryScaleType::SqrtKeySize;
+  config.attention_window_sizes = FixedAttentionWindowSizes<2>(32);
+  // This is required for optimize_test to pass.
+  config.final_cap = 30.0f;
+  return config;
+}
 
 TEST(BackPropTest, LayerVJP) {
   std::mt19937 gen(42);
   using T = double;
   using TC = std::complex<T>;
-  const size_t kOutputSize = TestConfig<T>::kSeqLen * TestConfig<T>::kModelDim;
-  CompressedLayer<TestConfig<T>> weights;
-  CompressedLayer<TestConfig<T>> grad;
-  ForwardLayer<T, TestConfig<T>> forward;
-  ForwardLayer<T, TestConfig<T>> backward = {};
-  CompressedLayer<TestConfig<TC>> c_weights;
-  ForwardLayer<TC, TestConfig<TC>> c_forward;
-  std::array<T, kOutputSize> y;
+  ModelConfig config = TestConfig();
+  const size_t kOutputSize = config.seq_len * config.model_dim;
+  LayerWeightsPtrs<T> weights(config.layer_configs[0]);
+  LayerWeightsPtrs<T> grad(config.layer_configs[0]);
+  ForwardLayer<T> forward(config.layer_configs[0], config.seq_len);
+  ForwardLayer<T> backward(config.layer_configs[0], config.seq_len);
+  LayerWeightsPtrs<TC> c_weights(config.layer_configs[0]);
+  ForwardLayer<TC> c_forward(config.layer_configs[0], config.seq_len);
+  MatStorageT<T> y("y", kOutputSize, 1);
   MatStorageT<T> dy("dy", kOutputSize, 1);
-  std::array<TC, kOutputSize> c_y;
+  MatStorageT<TC> c_y("c_y", kOutputSize, 1);
   const size_t num_tokens = 3;
-  weights.Allocate();
-  grad.Allocate();
-  c_weights.Allocate();
+  std::vector<MatStorage> layer_storage;
+  weights.Allocate(layer_storage);
+  grad.Allocate(layer_storage);
+  c_weights.Allocate(layer_storage);
   backward.input.ZeroInit();
 
   for (size_t iter = 0; iter < 10; ++iter) {
@@ -432,7 +436,7 @@ TEST(BackPropTest, LayerVJP) {
     Complexify(forward.input, c_forward.input);
     auto func = [&]() {
       ApplyLayer(c_weights, c_forward, num_tokens, c_y.data());
-      return DotT(dy.data(), c_y.data(), num_tokens * TestConfig<T>::kModelDim);
+      return DotT(dy.data(), c_y.data(), num_tokens * config.model_dim);
     };
     grad.ZeroInit(/*layer_idx=*/0);
     ApplyLayer(weights, forward, num_tokens, y.data());
@@ -447,12 +451,13 @@ TEST(BackPropTest, EndToEnd) {
   std::mt19937 gen(42);
   using T = double;
   using TC = std::complex<T>;
-  WeightsWrapper<T, TestConfig<T>> weights;
-  WeightsWrapper<T, TestConfig<T>> grad;
-  ForwardPass<T, TestConfig<T>> forward;
-  ForwardPass<T, TestConfig<T>> backward;
-  WeightsWrapper<TC, TestConfig<TC>> c_weights;
-  ForwardPass<TC, TestConfig<TC>> c_forward;
+  ModelConfig config = TestConfig();
+  WeightsWrapper<T> weights(config);
+  WeightsWrapper<T> grad(config);
+  ForwardPass<T> forward(config);
+  ForwardPass<T> backward(config);
+  WeightsWrapper<TC> c_weights(config);
+  ForwardPass<TC> c_forward(config);
 
   ReverseSequenceSampler training_task({0, 0, 1, 1});
   std::vector<Prompt> batch = training_task.SampleBatch(3, gen);
@@ -474,9 +479,9 @@ TEST(BackPropTest, EndToEnd) {
   }
 }
 
-template <typename T, typename TConfig>
-void MulByConstAndAddT(T c, const CompressedLayer<TConfig>& x,
-                       CompressedLayer<TConfig>& out) {
+template <typename T>
+void MulByConstAndAddT(T c, const LayerWeightsPtrs<T>& x,
+                       LayerWeightsPtrs<T>& out) {
   MulByConstAndAddT(c, x.pre_attention_norm_scale,
                     out.pre_attention_norm_scale);
   MulByConstAndAddT(c, x.attn_vec_einsum_w, out.attn_vec_einsum_w);
@@ -486,23 +491,23 @@ void MulByConstAndAddT(T c, const CompressedLayer<TConfig>& x,
   MulByConstAndAddT(c, x.linear_w, out.linear_w);
 }
 
-template <typename T, typename TConfig>
-void MulByConstAndAddT(T c, const CompressedWeights<TConfig>& x,
-                       CompressedWeights<TConfig>& out) {
-  static constexpr size_t kLayers = TConfig::kLayers;
+template <typename T>
+void MulByConstAndAddT(T c, const ModelWeightsPtrs<T>& x,
+                       ModelWeightsPtrs<T>& out) {
+  const size_t layers = x.c_layers.size();
   MulByConstAndAddT(c, x.embedder_input_embedding,
                     out.embedder_input_embedding);
   MulByConstAndAddT(c, x.final_norm_scale, out.final_norm_scale);
-  for (size_t i = 0; i < kLayers; ++i) {
+  for (size_t i = 0; i < layers; ++i) {
     MulByConstAndAddT(c, *x.GetLayer(i), *out.GetLayer(i));
   }
 }
 
 // Evaluates forward pass on a batch.
-template <typename T, typename TConfig>
+template <typename T>
 T CrossEntropyLossForwardPass(const std::vector<Prompt>& batch,
-                              const WeightsWrapper<T, TConfig>& weights,
-                              ForwardPass<T, TConfig>& forward) {
+                              const WeightsWrapper<T>& weights,
+                              ForwardPass<T>& forward) {
   T loss = 0.0;
   for (const Prompt& prompt : batch) {
     loss += CrossEntropyLossForwardPass(prompt, weights.get(), forward);
@@ -514,12 +519,11 @@ T CrossEntropyLossForwardPass(const std::vector<Prompt>& batch,
 // Evaluates forward pass on a batch by applying gradient with the given
 // learning rate. Does not update weights, but uses the given tmp weights
 // instead.
-template <typename T, typename TConfig>
+template <typename T>
 T CrossEntropyLossForwardPass(T learning_rate, const std::vector<Prompt>& batch,
-                              const WeightsWrapper<T, TConfig>& weights,
-                              const WeightsWrapper<T, TConfig>& grad,
-                              WeightsWrapper<T, TConfig>& tmp,
-                              ForwardPass<T, TConfig>& forward) {
+                              const WeightsWrapper<T>& weights,
+                              const WeightsWrapper<T>& grad,
+                              WeightsWrapper<T>& tmp, ForwardPass<T>& forward) {
   tmp.CopyFrom(weights);
   const T scale = -learning_rate / batch.size();
   MulByConstAndAddT(scale, grad.get(), tmp.get());
@@ -529,11 +533,9 @@ T CrossEntropyLossForwardPass(T learning_rate, const std::vector<Prompt>& batch,
 // Uses line search in the negative gradient direction to update weights. We do
 // this so that we can test that each step during the gradient descent can
 // decrease the objective function value.
-template <typename T, typename TConfig>
-T FindOptimalUpdate(const WeightsWrapper<T, TConfig>& grad,
-                    WeightsWrapper<T, TConfig>& weights,
-                    WeightsWrapper<T, TConfig>& tmp,
-                    ForwardPass<T, TConfig>& forward,
+template <typename T>
+T FindOptimalUpdate(const WeightsWrapper<T>& grad, WeightsWrapper<T>& weights,
+                    WeightsWrapper<T>& tmp, ForwardPass<T>& forward,
                     const std::vector<Prompt>& batch, T loss,
                     T initial_learning_rate) {
   T lr0 = initial_learning_rate;
@@ -568,13 +570,14 @@ TEST(BackProptest, Convergence) {
   std::mt19937 gen(42);
   using T = float;
   using TC = std::complex<double>;
-  WeightsWrapper<T, TestConfig<T>> weights;
-  WeightsWrapper<T, TestConfig<T>> grad;
-  WeightsWrapper<T, TestConfig<T>> tmp;
-  ForwardPass<T, TestConfig<T>> forward;
-  ForwardPass<T, TestConfig<T>> backward;
-  WeightsWrapper<TC, TestConfig<TC>> c_weights;
-  ForwardPass<TC, TestConfig<TC>> c_forward;
+  ModelConfig config = TestConfig();
+  WeightsWrapper<T> weights(config);
+  WeightsWrapper<T> grad(config);
+  WeightsWrapper<T> tmp(config);
+  ForwardPass<T> forward(config);
+  ForwardPass<T> backward(config);
+  WeightsWrapper<TC> c_weights(config);
+  ForwardPass<TC> c_forward(config);
   constexpr size_t kBatchSize = 5;
   ReverseSequenceSampler training_task({0, 0, 0, 1, 1});
   T learning_rate = 0.01;
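[Note: FindOptimalUpdate above line-searches the step size so each descent
step provably decreases the loss. A standalone sketch of that strategy with
our own simplified loss and helper names, not the test's actual code:]

#include <cstdio>
#include <functional>

// Keep doubling the learning rate while the trial loss improves; if even the
// initial rate overshoots, halve it until some step improves on loss0.
// f maps a learning rate to the loss after a trial step from fixed weights.
double FindStepSize(const std::function<double(double)>& f, double loss0,
                    double lr0) {
  double best_lr = 0.0, best_loss = loss0;
  for (double lr = lr0; f(lr) < best_loss; lr *= 2.0) {
    best_loss = f(lr);
    best_lr = lr;
  }
  if (best_lr == 0.0) {
    for (double lr = lr0 * 0.5; lr > 1e-12; lr *= 0.5) {
      if (f(lr) < loss0) return lr;
    }
  }
  return best_lr;
}

int main() {
  // Toy quadratic loss in 1D: step from x=3 along -grad with rate lr.
  const double x = 3.0, grad = 2.0 * x;
  auto trial = [&](double lr) { double nx = x - lr * grad; return nx * nx; };
  std::printf("chosen lr = %g\n", FindStepSize(trial, x * x, 0.01));
}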
diff --git a/backprop/backward_test.cc b/backprop/backward_test.cc
index 01c5e73..2b82c12 100644
--- a/backprop/backward_test.cc
+++ b/backprop/backward_test.cc
@@ -19,7 +19,6 @@
 
 #include <stddef.h>
 
-#include <array>
 #include <complex>
 #include <cmath>  // std::abs
 #include <random>
@@ -34,7 +33,6 @@
 #include "backprop/test_util.h"
 #include "gemma/activations.h"
 #include "gemma/configs.h"
-#include "gemma/weights.h"
 #include "hwy/base.h"
 #include "hwy/contrib/thread_pool/thread_pool.h"
 
@@ -50,6 +48,7 @@
 #include "backprop/forward-inl.h"
 #include "compression/compress.h"
 #include "ops/ops-inl.h"
+#include "util/allocator.h"
 
 HWY_BEFORE_NAMESPACE();
 namespace gcpp {
@@ -85,8 +84,8 @@ void TestMatMulVJP() {
     };
 
     grad.ZeroInit();
-    MatMulVJP<kCols, kRows>(weights.data(), x.data(), dy.data(), kTokens,
-                            grad.data(), dx.data(), pool);
+    MatMulVJP(weights.data(), x.data(), dy.data(), kCols, kRows, kTokens,
+              grad.data(), dx.data(), pool);
 
     TestGradient(dx, c_x, func, 5e-5f, 5e-5f, __LINE__);
     TestGradient(grad, c_weights, func, 5e-5f, 5e-5f, __LINE__);
@@ -130,9 +129,8 @@ void TestMultiHeadMatMulVJP() {
     };
 
     grad.ZeroInit();
-    MultiHeadMatMulVJP<kHeads, kCols, kRows>(
-        weights.data(), x.data(), dy.data(), kTokens, grad.data(), dx.data(),
-        pool);
+    MultiHeadMatMulVJP(weights.data(), x.data(), dy.data(), kHeads, kCols,
+                       kRows, kTokens, grad.data(), dx.data(), pool);
 
     TestGradient(dx, c_x, func, 5e-5f, 5e-5f, __LINE__);
     TestGradient(grad, c_weights, func, 5e-5f, 5e-5f, __LINE__);
@@ -186,63 +184,63 @@ void TestRMSNormVJP() {
   }
 }
 
-template <typename T>
-struct TestConfig : ConfigBaseGemmaV2 {
-  using Weight = T;
-  static constexpr int kSeqLen = 24;
-  static constexpr int kVocabSize = 16;
-  static constexpr int kModelDim = 32;
-  static constexpr int kHeads = 3;
-  static constexpr int kQKVDim = 16;
-  static constexpr int kFFHiddenDim = 64;
-  static constexpr std::array<LayerAttentionType, 2> kLayerConfig =
-      FixedLayerConfig<2>(LayerAttentionType::kGemma);
-  static constexpr int kLayers = kLayerConfig.size();
-  static constexpr int kNumTensorScales = 4 * kLayers;
-  static constexpr bool kAbsolutePE = false;
-  static constexpr PostNormType kPostNorm = PostNormType::None;
-
-  static constexpr int kKVHeads = 1;
-  static constexpr int kGemmaLayers = kLayers;
-};
+static ModelConfig TestConfig() {
+  ModelConfig config;
+  config.scale_names = {"att_ein", "qkv_ein", "gr_lin_x_w", "gr_lin_y_w",
+                        "gr_lin_out_w", "gr_gate_w", "gating_ein", "linear_w"};
+  config.model_dim = 32;
+  config.vocab_size = 16;
+  config.seq_len = 24;
+  LayerConfig layer_config = {
+      .model_dim = config.model_dim,
+      .ff_hidden_dim = 64,
+      .heads = 3,
+      .kv_heads = 1,
+      .qkv_dim = 16,
+  };
+  config.layer_configs = {2, layer_config};
+  config.num_tensor_scales = 4 * config.layer_configs.size();
+  config.query_scale = QueryScaleType::SqrtKeySize;
+  config.attention_window_sizes = FixedAttentionWindowSizes<2>(32);
+  // This is required for optimize_test to pass.
+  config.att_cap = 50.0f;
+  config.final_cap = 30.0f;
+  return config;
+}
 
 void TestEndToEnd() {
   std::mt19937 gen(42);
   hwy::ThreadPool pool(0);
-  using WeightsF = CompressedWeights<TestConfig<float>>;
-  using LayerF = CompressedLayer<TestConfig<float>>;
-  WeightsWrapper<float, TestConfig<float>> weights;
-  WeightsWrapper<float, TestConfig<float>> grad;
-  ActivationsWrapper<float, TestConfig<float>> forward0;
-  ActivationsWrapper<float, TestConfig<float>> forward1;
-  ActivationsWrapper<float, TestConfig<float>> backward;
+  ModelConfig config = TestConfig();
+  WeightsWrapper<float> weights(config);
+  WeightsWrapper<float> grad(config);
+  ForwardPass<float> forward0(config);
+  ForwardPass<float> forward1(config);
+  ForwardPass<float> backward(config);
   using TC = std::complex<double>;
-  WeightsWrapper<TC, TestConfig<TC>> c_weights;
-  ForwardPass<TC, TestConfig<TC>> c_forward;
+  WeightsWrapper<TC> c_weights(config);
+  ForwardPass<TC> c_forward(config);
 
   ReverseSequenceSampler training_task({0, 0, 1, 1});
   std::vector<Prompt> batch = training_task.SampleBatch(3, gen);
 
-  RowVectorBatch<float> inv_timescale =
-      Activations::CreateInvTimescale<TestConfig<float>>();
+  RowVectorBatch<float> inv_timescale = Activations::CreateInvTimescale(
+      config.layer_configs[0].qkv_dim, config.layer_configs[0].post_qk);
 
   for (const Prompt& prompt : batch) {
     ReverseSequenceSampler::LogPrompt(prompt);
     RandInit(weights.get(), 1.0f, gen);
-    float loss0 = CrossEntropyLossForwardPass<float, TestConfig<float>>(
-        prompt, weights.get(), forward0.get());
+    float loss0 = CrossEntropyLossForwardPass(prompt, weights.get(), forward0);
 
-    float loss1 =
-        CrossEntropyLossForwardPass<TestConfig<float>, WeightsF, LayerF>(
-            prompt.tokens, prompt.context_size, weights.get(), forward1.get(),
-            inv_timescale, pool);
+    float loss1 = CrossEntropyLossForwardPass(
+        prompt.tokens, prompt.context_size, weights.get(), forward1,
+        inv_timescale, pool);
 
     EXPECT_NEAR(loss1, loss0, std::abs(loss0) * 2e-5);
 
     grad.ZeroInit();
-    CrossEntropyLossBackwardPass<TestConfig<float>, WeightsF, LayerF>(
-        prompt, weights.get(), forward1.get(), grad.get(), backward.get(),
-        inv_timescale, pool);
+    CrossEntropyLossBackwardPassInl(prompt, weights.get(), forward1, grad.get(),
+                                    backward, inv_timescale, pool);
 
     Complexify(weights.get(), c_weights.get());
 
     auto func = [&]() {
diff --git a/backprop/forward-inl.h b/backprop/forward-inl.h
index b6b1dc0..ca969c4 100644
--- a/backprop/forward-inl.h
+++ b/backprop/forward-inl.h
@@ -26,6 +26,7 @@
 #include "backprop/activations.h"
 #include "gemma/common.h"
 #include "gemma/configs.h"
+#include "gemma/weights.h"
 #include "util/allocator.h"
 #include "hwy/base.h"
 #include "hwy/contrib/thread_pool/thread_pool.h"
@@ -93,29 +94,29 @@ static HWY_NOINLINE float CrossEntropyLoss(const float* HWY_RESTRICT probs,
   return loss * scaling;
 }
 
-template <typename TConfig, typename LayerT>
-void ApplyForwardLayer(const LayerT& weights,
-                       ForwardLayer<float, TConfig>& activations,
-                       size_t num_tokens, float* HWY_RESTRICT output,
+template <typename T>
+void ApplyForwardLayer(const LayerWeightsPtrs<T>& weights,
+                       ForwardLayer<T>& activations, size_t num_tokens,
+                       float* HWY_RESTRICT output,
                        const RowVectorBatch<float>& inv_timescale,
                        hwy::ThreadPool& pool) {
-  static constexpr size_t kModelDim = TConfig::kModelDim;
-  static constexpr size_t kSeqLen = TConfig::kSeqLen;
-  static constexpr size_t kQKVDim = TConfig::kQKVDim;
-  static constexpr size_t kHeads = TConfig::kHeads;
-  static const float kQueryScale =
+  const LayerConfig& config = weights.layer_config;
+  const size_t model_dim = config.model_dim;
+  const size_t kSeqLen = activations.input.Rows();
+  const size_t kQKVDim = config.qkv_dim;
+  const size_t kHeads = config.heads;
+  static const float query_scale =
       static_cast<float>(1.0 / sqrt(static_cast<double>(kQKVDim)));
   HWY_ASSERT(num_tokens <= kSeqLen);
 
   ApplyRMSNorm(weights.pre_attention_norm_scale.data(),
-               activations.input.data(), kModelDim, num_tokens,
+               activations.input.data(), model_dim, num_tokens,
                activations.pre_att_rms_out.data(), pool);
 
   for (size_t pos = 0; pos < num_tokens; ++pos) {
-    MatVec<(kHeads + 2) * kQKVDim, kModelDim>(
-        weights.qkv_einsum_w, 0,
-        activations.pre_att_rms_out.data() + pos * kModelDim,
-        activations.qkv.data() + pos * (kHeads + 2) * kQKVDim, pool);
+    MatVec(weights.qkv_einsum_w, 0, (kHeads + 2) * kQKVDim, model_dim,
+           activations.pre_att_rms_out.data() + pos * model_dim,
+           activations.qkv.data() + pos * (kHeads + 2) * kQKVDim, pool);
   }
 
   const size_t num_tasks = kHeads * num_tokens;
@@ -130,7 +131,7 @@ void ApplyForwardLayer(const LayerT& weights,
     float* HWY_RESTRICT q =
         activations.qkv.data() + (pos * (kHeads + 2) + head) * kQKVDim;
     Rope(q, kQKVDim, inv_timescale.Const(), pos);
-    MulByConst(kQueryScale, q, kQKVDim);
+    MulByConst(query_scale, q, kQKVDim);
   });
 
   pool.Run(0, num_tasks, [&](const uint64_t task, size_t thread) HWY_ATTR {
@@ -174,29 +175,29 @@ void ApplyForwardLayer(const LayerT& weights,
   activations.attention_out.ZeroInit();
   for (size_t pos = 0; pos < num_tokens; ++pos) {
     for (size_t head = 0; head < kHeads; ++head) {
-      MatVec<kModelDim, kQKVDim>(
-          weights.attn_vec_einsum_w, head * kModelDim * kQKVDim,
+      MatVec(
+          weights.attn_vec_einsum_w, head * model_dim * kQKVDim, model_dim,
+          kQKVDim,
          activations.att_out.data() + pos * kHeads * kQKVDim + head * kQKVDim,
-          activations.att_post1.data() + pos * kModelDim, pool);
-      AddFrom(activations.att_post1.data() + pos * kModelDim,
-              activations.attention_out.data() + pos * kModelDim, kModelDim);
+          activations.att_post1.data() + pos * model_dim, pool);
+      AddFrom(activations.att_post1.data() + pos * model_dim,
+              activations.attention_out.data() + pos * model_dim, model_dim);
     }
   }
 
   for (size_t pos = 0; pos < num_tokens; ++pos) {
-    AddFrom(activations.input.data() + pos * kModelDim,
-            activations.attention_out.data() + pos * kModelDim, kModelDim);
+    AddFrom(activations.input.data() + pos * model_dim,
activations.attention_out.data() + pos * model_dim, model_dim); } ApplyRMSNorm(weights.pre_ffw_norm_scale.data(), - activations.attention_out.data(), kModelDim, num_tokens, + activations.attention_out.data(), model_dim, num_tokens, activations.bf_pre_ffw_rms_out.data(), pool); - static constexpr size_t kFFHiddenDim = TConfig::kFFHiddenDim; + const size_t kFFHiddenDim = config.ff_hidden_dim; for (size_t pos = 0; pos < num_tokens; ++pos) { - MatVec( - weights.gating_einsum_w, 0, - activations.bf_pre_ffw_rms_out.data() + pos * kModelDim, - activations.ffw_hidden.data() + pos * kFFHiddenDim * 2, pool); + MatVec(weights.gating_einsum_w, 0, kFFHiddenDim * 2, model_dim, + activations.bf_pre_ffw_rms_out.data() + pos * model_dim, + activations.ffw_hidden.data() + pos * kFFHiddenDim * 2, pool); } for (size_t pos = 0; pos < num_tokens; ++pos) { const size_t hidden_offset = pos * kFFHiddenDim * 2; @@ -215,77 +216,76 @@ void ApplyForwardLayer(const LayerT& weights, } } for (size_t pos = 0; pos < num_tokens; ++pos) { - MatVec( - weights.linear_w, 0, - activations.ffw_hidden_gated.data() + pos * kFFHiddenDim, - output + pos * kModelDim, pool); + MatVec(weights.linear_w, 0, model_dim, kFFHiddenDim, + activations.ffw_hidden_gated.data() + pos * kFFHiddenDim, + output + pos * model_dim, pool); } for (size_t pos = 0; pos < num_tokens; ++pos) { - AddFrom(activations.attention_out.data() + pos * kModelDim, - output + pos * kModelDim, kModelDim); + AddFrom(activations.attention_out.data() + pos * model_dim, + output + pos * model_dim, model_dim); } } -template +template float CrossEntropyLossForwardPass(const std::vector& prompt, - size_t context_size, const WeightsT& weights, - ForwardPass& forward, + size_t context_size, + const ModelWeightsPtrs& weights, + ForwardPass& forward, const RowVectorBatch& inv_timescale, hwy::ThreadPool& pool) { - static constexpr size_t kVocabSize = TConfig::kVocabSize; - static constexpr size_t kModelDim = TConfig::kModelDim; - static constexpr size_t kLayers = TConfig::kLayers; - const float kEmbScaling = EmbeddingScaling(); - static_assert(!TConfig::kAbsolutePE); - static_assert(TConfig::kPostNorm == PostNormType::None); - static_assert(TConfig::kKVHeads == 1); + const ModelConfig& config = weights.weights_config; + const size_t vocab_size = config.vocab_size; + const size_t model_dim = config.model_dim; + const size_t layers = config.layer_configs.size(); + const float emb_scaling = EmbeddingScaling(model_dim); + HWY_ASSERT(!config.absolute_pe); + HWY_ASSERT(config.layer_configs[0].post_norm == PostNormType::None); + HWY_ASSERT(config.layer_configs[0].kv_heads == 1); HWY_DASSERT(context_size > 0); HWY_DASSERT(context_size < prompt.size()); const size_t num_tokens = prompt.size() - 1; - InputEmbedding(weights.embedder_input_embedding, prompt, kEmbScaling, - forward.layers[0].input.data(), kModelDim, kVocabSize); + InputEmbedding(weights.embedder_input_embedding, prompt, emb_scaling, + forward.layers[0].input.data(), model_dim, vocab_size); - for (size_t layer = 0; layer < kLayers; ++layer) { - auto type = TConfig::kLayerConfig[layer]; + for (size_t layer = 0; layer < config.layer_configs.size(); ++layer) { + auto type = config.layer_configs[layer].type; // TODO(szabadka) Implement Griffin layer. HWY_ASSERT(type == LayerAttentionType::kGemma); - float* HWY_RESTRICT output = layer + 1 < kLayers ? 
- forward.layers[layer + 1].input.data() : - forward.final_layer_output.data(); - ApplyForwardLayer(*weights.GetLayer(layer), - forward.layers[layer], num_tokens, - output, inv_timescale, pool); + float* HWY_RESTRICT output = layer + 1 < layers + ? forward.layers[layer + 1].input.data() + : forward.final_layer_output.data(); + ApplyForwardLayer(*weights.GetLayer(layer), forward.layers[layer], + num_tokens, output, inv_timescale, pool); } ApplyRMSNorm(weights.final_norm_scale.data(), - forward.final_layer_output.data(), - kModelDim, num_tokens, forward.final_norm_output.data(), pool); + forward.final_layer_output.data(), model_dim, num_tokens, + forward.final_norm_output.data(), pool); for (size_t pos = 0; pos < num_tokens; ++pos) { - MatVec( - weights.embedder_input_embedding, 0, - forward.final_norm_output.data() + pos * kModelDim, - forward.logits.data() + pos * kVocabSize, pool); + MatVec(weights.embedder_input_embedding, 0, vocab_size, model_dim, + forward.final_norm_output.data() + pos * model_dim, + forward.logits.data() + pos * vocab_size, pool); } - if constexpr (TConfig::kFinalCap > 0.0f) { + if (config.final_cap > 0.0f) { for (size_t pos = 0; pos < num_tokens; ++pos) { - LogitsSoftCap(TConfig::kFinalCap, - forward.logits.data() + pos * kVocabSize, kVocabSize); + LogitsSoftCap(config.final_cap, forward.logits.data() + pos * vocab_size, + vocab_size); } } hwy::CopyBytes(forward.logits.data(), forward.probs.data(), - num_tokens * kVocabSize * sizeof(forward.logits.At(0))); + num_tokens * vocab_size * sizeof(forward.logits.At(0))); for (size_t pos = 0; pos < num_tokens; ++pos) { - Softmax(forward.probs.data() + pos * kVocabSize, kVocabSize); + Softmax(forward.probs.data() + pos * vocab_size, vocab_size); } return CrossEntropyLoss(forward.probs.data(), prompt, context_size, - kVocabSize, pool); + vocab_size, pool); } // NOLINTNEXTLINE(google-readability-namespace-comments) diff --git a/backprop/forward.cc b/backprop/forward.cc index 5b2cf1a..0c6cc5c 100644 --- a/backprop/forward.cc +++ b/backprop/forward.cc @@ -17,8 +17,9 @@ #include "backprop/activations.h" #include "backprop/prompt.h" -#include "gemma/activations.h" #include "gemma/common.h" +#include "gemma/configs.h" +#include "util/allocator.h" #include "hwy/contrib/thread_pool/thread_pool.h" // Compiles this file for multiple architectures via "foreach_target.h", to @@ -36,38 +37,13 @@ HWY_BEFORE_NAMESPACE(); namespace gcpp { namespace HWY_NAMESPACE { -template -float CrossEntropyLossForwardPass(const Prompt& prompt, - const ByteStorageT& weights_u8, - ByteStorageT& forward_u8, - RowVectorBatch& inv_timescale, - hwy::ThreadPool& pool) { - const auto& weights = - *reinterpret_cast*>(weights_u8.get()); - auto& forward = - *reinterpret_cast*>(forward_u8.get()); - return CrossEntropyLossForwardPass, - CompressedLayer>( - prompt.tokens, prompt.context_size, weights, forward, inv_timescale, - pool); -} - -float CrossEntropyLossForwardPassT(Model model, const Prompt& prompt, - const ByteStorageT& weights, - ByteStorageT& forward, +float CrossEntropyLossForwardPassT(const Prompt& prompt, + const ModelWeightsPtrs& weights, + ForwardPass& forward, RowVectorBatch& inv_timescale, hwy::ThreadPool& pool) { - // TODO(janwas): use CallFunctorForModel - switch (model) { - case Model::GEMMA_2B: - return CrossEntropyLossForwardPass>( - prompt, weights, forward, inv_timescale, pool); - case Model::GEMMA_TINY: - return CrossEntropyLossForwardPass>( - prompt, weights, forward, inv_timescale, pool); - default: - HWY_ABORT("Model type %d unknown.", 
static_cast(model)); - } + return CrossEntropyLossForwardPass(prompt.tokens, prompt.context_size, + weights, forward, inv_timescale, pool); } } // namespace HWY_NAMESPACE @@ -79,13 +55,13 @@ namespace gcpp { HWY_EXPORT(CrossEntropyLossForwardPassT); -float CrossEntropyLossForwardPass(const Model& model, const Prompt& prompt, - const ByteStorageT& weights, - ByteStorageT& forward, +float CrossEntropyLossForwardPass(const Prompt& prompt, + const ModelWeightsPtrs& weights, + ForwardPass& forward, RowVectorBatch& inv_timescale, hwy::ThreadPool& pool) { return HWY_DYNAMIC_DISPATCH(CrossEntropyLossForwardPassT)( - model, prompt, weights, forward, inv_timescale, pool); + prompt, weights, forward, inv_timescale, pool); } } // namespace gcpp diff --git a/backprop/forward.h b/backprop/forward.h index 92ca371..3b42298 100644 --- a/backprop/forward.h +++ b/backprop/forward.h @@ -16,16 +16,17 @@ #ifndef THIRD_PARTY_GEMMA_CPP_GEMMA_FORWARD_H_ #define THIRD_PARTY_GEMMA_CPP_GEMMA_FORWARD_H_ +#include "backprop/activations.h" #include "backprop/prompt.h" -#include "gemma/activations.h" -#include "gemma/common.h" +#include "gemma/weights.h" +#include "util/allocator.h" #include "hwy/contrib/thread_pool/thread_pool.h" namespace gcpp { -float CrossEntropyLossForwardPass(const Model& model, const Prompt& prompt, - const ByteStorageT& weights, - ByteStorageT& forward, +float CrossEntropyLossForwardPass(const Prompt& prompt, + const ModelWeightsPtrs& weights, + ForwardPass& forward, RowVectorBatch& inv_timescale, hwy::ThreadPool& pool); diff --git a/backprop/forward_scalar.h b/backprop/forward_scalar.h index 064112b..617d0c3 100644 --- a/backprop/forward_scalar.h +++ b/backprop/forward_scalar.h @@ -127,108 +127,107 @@ void InputEmbedding(const T* w, const std::vector& tokens, T scaling, } } -template -void MaskedAttention(const T* qkv, T* output, size_t num_tokens, - size_t kHeads, size_t kQKVDim, size_t kSeqLen) { +template +void MaskedAttention(const T* qkv, T* output, size_t num_tokens, size_t heads, + size_t qkv_dim, size_t seq_len) { for (size_t pos = 0; pos < num_tokens; ++pos) { - for (size_t head = 0; head < kHeads; ++head) { - const size_t qoffset = pos * (kHeads + 2) * kQKVDim; - const size_t aoffset = pos * kHeads * kSeqLen + head * kSeqLen; - const T* q = qkv + qoffset + head * kQKVDim; + for (size_t head = 0; head < heads; ++head) { + const size_t qoffset = pos * (heads + 2) * qkv_dim; + const size_t aoffset = pos * heads * seq_len + head * seq_len; + const T* q = qkv + qoffset + head * qkv_dim; for (size_t pos2 = 0; pos2 <= pos; ++pos2) { - const T* k = qkv + (pos2 * (kHeads + 2) + kHeads) * kQKVDim; - output[aoffset + pos2] = DotT(q, k, kQKVDim); + const T* k = qkv + (pos2 * (heads + 2) + heads) * qkv_dim; + output[aoffset + pos2] = DotT(q, k, qkv_dim); } } } } -template -void MaskedSoftmax(T* x, size_t num_tokens, size_t kHeads, size_t kSeqLen) { +template +void MaskedSoftmax(T* x, size_t num_tokens, size_t heads, size_t seq_len) { for (size_t pos = 0; pos < num_tokens; ++pos) { - for (size_t head = 0; head < kHeads; ++head) { - size_t offset = pos * kHeads * kSeqLen + head * kSeqLen; + for (size_t head = 0; head < heads; ++head) { + size_t offset = pos * heads * seq_len + head * seq_len; Softmax(x + offset, pos + 1); - memset(x + offset + pos + 1, 0, (kSeqLen - pos - 1) * sizeof(T)); + memset(x + offset + pos + 1, 0, (seq_len - pos - 1) * sizeof(T)); } } } -template +template void MixByAttention(const T* qkv, const T* attention, T* output, - size_t num_tokens, size_t kHeads, size_t kQKVDim, - 
size_t kSeqLen) { + size_t num_tokens, size_t heads, size_t qkv_dim, + size_t seq_len) { for (size_t pos = 0; pos < num_tokens; ++pos) { - for (size_t head = 0; head < kHeads; ++head) { - const T* att = &attention[pos * kHeads * kSeqLen + head * kSeqLen]; - T* out = &output[head * kQKVDim + pos * kHeads * kQKVDim]; - memset(out, 0, kQKVDim * sizeof(out[0])); + for (size_t head = 0; head < heads; ++head) { + const T* att = &attention[pos * heads * seq_len + head * seq_len]; + T* out = &output[head * qkv_dim + pos * heads * qkv_dim]; + memset(out, 0, qkv_dim * sizeof(out[0])); for (size_t pos2 = 0; pos2 <= pos; ++pos2) { - size_t v_offset = (pos2 * (kHeads + 2) + kHeads + 1) * kQKVDim; + size_t v_offset = (pos2 * (heads + 2) + heads + 1) * qkv_dim; const T* v = &qkv[v_offset]; - MulByConstAndAddT(att[pos2], v, out, kQKVDim); + MulByConstAndAddT(att[pos2], v, out, qkv_dim); } } } } -template -void ApplyLayer(const CompressedLayer& weights, - ForwardLayer& activations, size_t num_tokens, - T* output) { - static constexpr size_t kModelDim = TConfig::kModelDim; - static constexpr size_t kSeqLen = TConfig::kSeqLen; - static constexpr size_t kQKVDim = TConfig::kQKVDim; - static constexpr size_t kHeads = TConfig::kHeads; - static constexpr size_t kFFHiddenDim = TConfig::kFFHiddenDim; - static const T kQueryScale = T(1.0) / std::sqrt(T(kQKVDim)); +template +void ApplyLayer(const LayerWeightsPtrs& weights, + ForwardLayer& activations, size_t num_tokens, T* output) { + const LayerConfig& layer_config = weights.layer_config; + const size_t model_dim = layer_config.model_dim; + const size_t seq_len = activations.input.Rows(); + const size_t qkv_dim = layer_config.qkv_dim; + const size_t heads = layer_config.heads; + const size_t ff_hidden_dim = layer_config.ff_hidden_dim; + static const T query_scale = T(1.0) / std::sqrt(T(qkv_dim)); RMSNormT(weights.pre_attention_norm_scale.data(), activations.input.data(), - activations.pre_att_rms_out.data(), kModelDim, num_tokens); + activations.pre_att_rms_out.data(), model_dim, num_tokens); MatMulT(weights.qkv_einsum_w.data(), activations.pre_att_rms_out.data(), - activations.qkv.data(), (kHeads + 2) * kQKVDim, kModelDim, - num_tokens); + activations.qkv.data(), (heads + 2) * qkv_dim, model_dim, num_tokens); for (size_t pos = 0; pos < num_tokens; ++pos) { - T* qkv = activations.qkv.data() + pos * (kHeads + 2) * kQKVDim; - for (size_t h = 0; h <= kHeads; ++h) { - Rope(qkv + h * kQKVDim, kQKVDim, pos); + T* qkv = activations.qkv.data() + pos * (heads + 2) * qkv_dim; + for (size_t h = 0; h <= heads; ++h) { + Rope(qkv + h * qkv_dim, qkv_dim, pos); } } for (size_t pos = 0; pos < num_tokens; ++pos) { - T* qkv = activations.qkv.data() + pos * (kHeads + 2) * kQKVDim; - MulByConstT(kQueryScale, qkv, kHeads * kQKVDim); + T* qkv = activations.qkv.data() + pos * (heads + 2) * qkv_dim; + MulByConstT(query_scale, qkv, heads * qkv_dim); } - MaskedAttention(activations.qkv.data(), activations.att.data(), - num_tokens, kHeads, kQKVDim, kSeqLen); + MaskedAttention(activations.qkv.data(), activations.att.data(), num_tokens, + heads, qkv_dim, seq_len); - MaskedSoftmax(activations.att.data(), num_tokens, kHeads, kSeqLen); + MaskedSoftmax(activations.att.data(), num_tokens, heads, seq_len); MixByAttention(activations.qkv.data(), activations.att.data(), - activations.att_out.data(), num_tokens, kHeads, kQKVDim, - kSeqLen); + activations.att_out.data(), num_tokens, heads, qkv_dim, + seq_len); MultiHeadMatMul(weights.attn_vec_einsum_w.data(), activations.att_out.data(), - 
activations.attention_out.data(), kHeads, kModelDim, kQKVDim, + activations.attention_out.data(), heads, model_dim, qkv_dim, num_tokens); AddFromT(activations.input.data(), activations.attention_out.data(), - num_tokens * kModelDim); + num_tokens * model_dim); RMSNormT(weights.pre_ffw_norm_scale.data(), activations.attention_out.data(), - activations.bf_pre_ffw_rms_out.data(), kModelDim, num_tokens); + activations.bf_pre_ffw_rms_out.data(), model_dim, num_tokens); MatMulT(weights.gating_einsum_w.data(), activations.bf_pre_ffw_rms_out.data(), - activations.ffw_hidden.data(), kFFHiddenDim * 2, kModelDim, + activations.ffw_hidden.data(), ff_hidden_dim * 2, model_dim, num_tokens); GatedGelu(activations.ffw_hidden.data(), activations.ffw_hidden_gated.data(), - kFFHiddenDim, num_tokens); + ff_hidden_dim, num_tokens); - MatMulT(weights.linear_w.data(), activations.ffw_hidden_gated.data(), - output, kModelDim, kFFHiddenDim, num_tokens); + MatMulT(weights.linear_w.data(), activations.ffw_hidden_gated.data(), output, + model_dim, ff_hidden_dim, num_tokens); - AddFromT(activations.attention_out.data(), output, num_tokens * kModelDim); + AddFromT(activations.attention_out.data(), output, num_tokens * model_dim); } template @@ -247,48 +246,47 @@ T CrossEntropyLoss(const T* x, const Prompt& prompt, size_t V) { return loss * scaling; } -template +template T CrossEntropyLossForwardPass(const Prompt& prompt, - const CompressedWeights& weights, - ForwardPass& forward) { - static constexpr size_t kModelDim = TConfig::kModelDim; - static constexpr size_t kVocabSize = TConfig::kVocabSize; - static constexpr size_t kLayers = TConfig::kLayers; + const ModelWeightsPtrs& weights, + ForwardPass& forward) { + const ModelConfig& config = weights.weights_config; + const size_t model_dim = config.model_dim; + const size_t vocab_size = config.vocab_size; + const size_t layers = config.layer_configs.size(); const std::vector tokens = prompt.tokens; const size_t num_tokens = tokens.empty() ? 0 : tokens.size() - 1; - const T kEmbScaling = EmbeddingScaling(kModelDim); - InputEmbedding(weights.embedder_input_embedding.data(), tokens, - kEmbScaling, forward.layers[0].input.data(), kModelDim); + const T kEmbScaling = EmbeddingScaling(model_dim); + InputEmbedding(weights.embedder_input_embedding.data(), tokens, kEmbScaling, + forward.layers[0].input.data(), model_dim); - for (size_t layer = 0; layer < kLayers; ++layer) { - T* output = layer + 1 < kLayers ? - forward.layers[layer + 1].input.data() : - forward.final_layer_output.data(); + for (size_t layer = 0; layer < layers; ++layer) { + T* output = layer + 1 < layers ? 
forward.layers[layer + 1].input.data() + : forward.final_layer_output.data(); ApplyLayer(*weights.GetLayer(layer), forward.layers[layer], num_tokens, output); } - RMSNormT(weights.final_norm_scale.data(), - forward.final_layer_output.data(), - forward.final_norm_output.data(), kModelDim, num_tokens); + RMSNormT(weights.final_norm_scale.data(), forward.final_layer_output.data(), + forward.final_norm_output.data(), model_dim, num_tokens); MatMulT(weights.embedder_input_embedding.data(), - forward.final_norm_output.data(), - forward.logits.data(), kVocabSize, kModelDim, num_tokens); + forward.final_norm_output.data(), forward.logits.data(), vocab_size, + model_dim, num_tokens); for (size_t pos = 0; pos < num_tokens; ++pos) { - if constexpr (TConfig::kFinalCap > 0.0f) { - Softcap(TConfig::kFinalCap, forward.logits.data() + pos * kVocabSize, - kVocabSize); + if (config.final_cap > 0.0f) { + Softcap(config.final_cap, forward.logits.data() + pos * vocab_size, + vocab_size); } } memcpy(forward.probs.data(), forward.logits.data(), - num_tokens * kVocabSize * sizeof(forward.logits.At(0))); - Softmax(forward.probs.data(), kVocabSize, num_tokens); + num_tokens * vocab_size * sizeof(forward.logits.At(0))); + Softmax(forward.probs.data(), vocab_size, num_tokens); - return CrossEntropyLoss(forward.probs.data(), prompt, kVocabSize); + return CrossEntropyLoss(forward.probs.data(), prompt, vocab_size); } } // namespace gcpp diff --git a/backprop/optimize_test.cc b/backprop/optimize_test.cc index 26698c6..b47a48d 100644 --- a/backprop/optimize_test.cc +++ b/backprop/optimize_test.cc @@ -16,6 +16,7 @@ #include #include +#include #include #include @@ -26,8 +27,10 @@ #include "backprop/optimizer.h" #include "backprop/prompt.h" #include "backprop/sampler.h" +#include "compression/shared.h" #include "gemma/activations.h" #include "gemma/common.h" +#include "gemma/configs.h" #include "gemma/gemma.h" #include "gemma/weights.h" #include "util/threading.h" @@ -45,20 +48,18 @@ TEST(OptimizeTest, GradientDescent) { .training = ModelTraining::GEMMA_IT, .weight = Type::kF32, }; - ByteStorageT grad = CallForModelAndWeight( - info.model, info.weight, pool); - ByteStorageT grad_m = CallForModelAndWeight( - info.model, info.weight, pool); - ByteStorageT grad_v = CallForModelAndWeight( - info.model, info.weight, pool); - ByteStorageT forward = - CallForModelAndWeight(info.model, info.weight); - ByteStorageT backward = - CallForModelAndWeight(info.model, info.weight); - KVCache kv_cache = KVCache::Create(info.model, /*prefill_tbatch_size=*/16); + ModelConfig config = ConfigFromModel(info.model); + ModelWeightsStorage grad, grad_m, grad_v; + grad.Allocate(info.model, info.weight, pool); + grad_m.Allocate(info.model, info.weight, pool); + grad_v.Allocate(info.model, info.weight, pool); + grad_m.ZeroInit(); + grad_v.ZeroInit(); + ForwardPass forward(config), backward(config); + KVCache kv_cache = KVCache::Create(config, /*prefill_tbatch_size=*/16); - RowVectorBatch inv_timescale = - Activations::CreateInvTimescale>(); + RowVectorBatch inv_timescale = Activations::CreateInvTimescale( + config.layer_configs[0].qkv_dim, config.layer_configs[0].post_qk); Gemma gemma(GemmaTokenizer(), info, pools); @@ -92,14 +93,11 @@ TEST(OptimizeTest, GradientDescent) { reply.begin() + context.size()); }; - RandInitWeights(info.model, info.weight, gemma.Weights(), pool, gen); - CallForModelAndWeight(info.model, info.weight, - grad_m, pool); - CallForModelAndWeight(info.model, info.weight, - grad_v, pool); + gemma.MutableWeights().RandInit(gen); + 
gemma.MutableWeights().AllocAndCopyWithTranspose(pool); printf("Initial weights:\n"); - LogWeightStats(info.model, info.weight, gemma.Weights()); + gemma.MutableWeights().LogWeightStats(); constexpr size_t kBatchSize = 8; const float alpha = 0.001f; @@ -113,29 +111,29 @@ TEST(OptimizeTest, GradientDescent) { size_t num_ok; for (; steps < 1000000; ++steps) { std::mt19937 sgen(42); - CallForModelAndWeight(info.model, info.weight, - grad, pool); + grad.ZeroInit(); float total_loss = 0.0f; num_ok = 0; for (size_t i = 0; i < kBatchSize; ++i) { Prompt prompt = training_task.Sample(sgen); total_loss += CrossEntropyLossForwardPass( - info.model, prompt, gemma.Weights(), forward, inv_timescale, pool); - CrossEntropyLossBackwardPass(info.model, prompt, gemma.Weights(), forward, - grad, backward, inv_timescale, pool); - CallForModelAndWeight( - info.model, info.weight, gemma.MutableWeights(), pool); + prompt, *gemma.Weights().GetWeightsOfType(), forward, + inv_timescale, pool); + CrossEntropyLossBackwardPass( + prompt, *gemma.Weights().GetWeightsOfType(), forward, + *grad.GetWeightsOfType(), backward, inv_timescale, pool); + gemma.MutableWeights().CopyWithTranspose(pool); num_ok += verify(prompt) ? 1 : 0; } total_loss /= kBatchSize; - AdamUpdate(info.model, info.weight, grad, alpha, beta1, beta2, epsilon, - steps + 1, gemma.Weights(), grad_m, grad_v, pool); + AdamUpdate(info.weight, grad, alpha, beta1, beta2, epsilon, steps + 1, + gemma.Weights(), grad_m, grad_v, pool); printf("step: %zu total_loss: %.15f num_ok: %zu/%zu\n", steps, total_loss, num_ok, kBatchSize); if (steps % 100 == 0) { printf("Batch gradient:\n"); - LogWeightStats(info.model, info.weight, grad); + grad.LogWeightStats(); } if (total_loss < 0.5f) { break; @@ -143,7 +141,7 @@ TEST(OptimizeTest, GradientDescent) { } printf("Num steps: %zu\n", steps); printf("Final weights:\n"); - LogWeightStats(info.model, info.weight, gemma.Weights()); + gemma.MutableWeights().LogWeightStats(); EXPECT_LT(steps, 300); EXPECT_EQ(num_ok, kBatchSize); } diff --git a/backprop/optimizer.cc b/backprop/optimizer.cc index 800f2fa..9187bf7 100644 --- a/backprop/optimizer.cc +++ b/backprop/optimizer.cc @@ -16,7 +16,6 @@ #include "backprop/optimizer.h" #include -#include #include "compression/compress.h" #include "gemma/common.h" @@ -30,37 +29,6 @@ namespace gcpp { namespace { -class WeightInitializer { - public: - WeightInitializer(std::mt19937& gen) : dist_(0.0f, 1.0f), gen_(gen) {} - - void operator()(const char* name, hwy::Span tensors) { - float* data = tensors[0]->data(); - for (size_t i = 0; i < tensors[0]->NumElements(); ++i) { - data[i] = dist_(gen_); - } - tensors[0]->set_scale(1.0f); - } - - private: - std::normal_distribution dist_; - std::mt19937& gen_; -}; - -template -struct RandInitWeightsT { - void operator()(const ByteStorageT& weights_u8, hwy::ThreadPool& pool, - std::mt19937& gen) const { - auto& weights = - *reinterpret_cast*>(weights_u8.get()); - // TODO(szabadka) Use the same weight initialization method as in the python - // version. 
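
The WeightInitializer being deleted here had one job: fill each tensor with N(0, 1) samples via ForEachTensor. That responsibility moves into ModelWeightsStorage, so the caller-side end state of the test above looks roughly like this (a sketch; `info`, `pool`, and `gen` are the surrounding test fixtures):

    ModelWeightsStorage grad;
    grad.Allocate(info.model, info.weight, pool);  // was a CallForModelAndWeight functor
    grad.ZeroInit();                               // was a CallForModelAndWeight functor
    gemma.MutableWeights().RandInit(gen);          // was RandInitWeights(model, type, ...)
    gemma.MutableWeights().LogWeightStats();       // was LogWeightStats(model, type, ...)
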
- WeightInitializer init(gen); - CompressedWeights::ForEachTensor({&weights}, - ForEachType::kLoadNoToc, init); - } -}; - class AdamUpdater { public: explicit AdamUpdater(float alpha, float beta1, float beta2, float epsilon, @@ -97,42 +65,31 @@ class AdamUpdater { float epsilon_; }; -template -struct AdamUpdateT { - void operator()(const ByteStorageT& grad_u8, float alpha, float beta1, - float beta2, float epsilon, size_t t, - const ByteStorageT& weights_u8, const ByteStorageT& grad_m_u8, - const ByteStorageT& grad_v_u8, hwy::ThreadPool& pool) const { - using TWeights = CompressedWeights; - auto& grad = *reinterpret_cast(grad_u8.get()); - auto& weights = *reinterpret_cast(weights_u8.get()); - auto& grad_m = *reinterpret_cast(grad_m_u8.get()); - auto& grad_v = *reinterpret_cast(grad_v_u8.get()); - AdamUpdater updater(alpha, beta1, beta2, epsilon, t); - TWeights::ForEachTensor( - {&grad, &weights, &grad_m, &grad_v}, ForEachType::kLoadNoToc, - [&updater](const char* name, hwy::Span tensors) { - updater(name, *tensors[0], *tensors[1], *tensors[2], *tensors[3]); - }); - } -}; +void AdamUpdate(ModelWeightsPtrs* grad, float alpha, float beta1, + float beta2, float epsilon, size_t t, + ModelWeightsPtrs* weights, + ModelWeightsPtrs* grad_m, + ModelWeightsPtrs* grad_v, hwy::ThreadPool& pool) { + AdamUpdater updater(alpha, beta1, beta2, epsilon, t); + ModelWeightsPtrs::ForEachTensor( + {grad, weights, grad_m, grad_v}, ForEachType::kLoadNoToc, + [&updater](const char* name, hwy::Span tensors) { + updater(name, *tensors[0], *tensors[1], *tensors[2], *tensors[3]); + }); +} } // namespace -void RandInitWeights(Model model_type, Type weight_type, - const ByteStorageT& weights, hwy::ThreadPool& pool, - std::mt19937& gen) { +void AdamUpdate(Type weight_type, const ModelWeightsStorage& grad, float alpha, + float beta1, float beta2, float epsilon, size_t t, + const ModelWeightsStorage& weights, + const ModelWeightsStorage& grad_m, + const ModelWeightsStorage& grad_v, hwy::ThreadPool& pool) { HWY_ASSERT(weight_type == Type::kF32); - CallForModel(model_type, weights, pool, gen); -} - -void AdamUpdate(Model model_type, Type weight_type, const ByteStorageT& grad, - float alpha, float beta1, float beta2, float epsilon, size_t t, - const ByteStorageT& weights, const ByteStorageT& grad_m, - const ByteStorageT& grad_v, hwy::ThreadPool& pool) { - HWY_ASSERT(weight_type == Type::kF32); - CallForModel(model_type, grad, alpha, beta1, beta2, - epsilon, t, weights, grad_m, grad_v, pool); + AdamUpdate(grad.GetWeightsOfType(), alpha, beta1, beta2, epsilon, t, + weights.GetWeightsOfType(), + grad_m.GetWeightsOfType(), grad_v.GetWeightsOfType(), + pool); } } // namespace gcpp diff --git a/backprop/optimizer.h b/backprop/optimizer.h index b42f311..8b25c52 100644 --- a/backprop/optimizer.h +++ b/backprop/optimizer.h @@ -16,22 +16,17 @@ #ifndef THIRD_PARTY_GEMMA_CPP_GEMMA_OPTIMIZER_H_ #define THIRD_PARTY_GEMMA_CPP_GEMMA_OPTIMIZER_H_ -#include - #include "gemma/common.h" -#include "util/allocator.h" +#include "gemma/weights.h" #include "hwy/contrib/thread_pool/thread_pool.h" namespace gcpp { -void RandInitWeights(Model model_type, Type weight_type, - const ByteStorageT& weights, hwy::ThreadPool& pool, - std::mt19937& gen); - -void AdamUpdate(Model model_type, Type weight_type, const ByteStorageT& grad, - float alpha, float beta1, float beta2, float epsilon, size_t t, - const ByteStorageT& weights, const ByteStorageT& grad_m, - const ByteStorageT& grad_v, hwy::ThreadPool& pool); +void AdamUpdate(Type weight_type, const 
ModelWeightsStorage& grad, float alpha, + float beta1, float beta2, float epsilon, size_t t, + const ModelWeightsStorage& weights, + const ModelWeightsStorage& grad_m, + const ModelWeightsStorage& grad_v, hwy::ThreadPool& pool); } // namespace gcpp diff --git a/backprop/test_util.h b/backprop/test_util.h index bfa2cc5..86f99b1 100644 --- a/backprop/test_util.h +++ b/backprop/test_util.h @@ -21,11 +21,12 @@ #include #include #include +#include #include "gtest/gtest.h" #include "compression/compress.h" +#include "gemma/configs.h" #include "gemma/weights.h" -#include "util/allocator.h" #include "hwy/contrib/thread_pool/thread_pool.h" namespace gcpp { @@ -39,8 +40,8 @@ void RandInit(MatPtrT& x, T stddev, std::mt19937& gen) { } // TODO: make a member of Layer. -template -void RandInit(CompressedLayer& w, T stddev, std::mt19937& gen) { +template +void RandInit(LayerWeightsPtrs& w, T stddev, std::mt19937& gen) { RandInit(w.pre_attention_norm_scale, stddev, gen); RandInit(w.attn_vec_einsum_w, stddev, gen); RandInit(w.qkv_einsum_w, stddev, gen); @@ -49,9 +50,9 @@ void RandInit(CompressedLayer& w, T stddev, std::mt19937& gen) { RandInit(w.linear_w, stddev, gen); } -template -void RandInit(CompressedWeights& w, T stddev, std::mt19937& gen) { - static constexpr size_t kLayers = TConfig::kLayers; +template +void RandInit(ModelWeightsPtrs& w, T stddev, std::mt19937& gen) { + const size_t kLayers = w.c_layers.size(); RandInit(w.embedder_input_embedding, stddev, gen); RandInit(w.final_norm_scale, stddev, gen); for (size_t i = 0; i < kLayers; ++i) { @@ -66,9 +67,8 @@ void Complexify(const MatPtrT& x, MatPtrT>& c_x) { } } -template -void Complexify(const CompressedLayer& w, - CompressedLayer& c_w) { +template +void Complexify(const LayerWeightsPtrs& w, LayerWeightsPtrs& c_w) { Complexify(w.pre_attention_norm_scale, c_w.pre_attention_norm_scale); Complexify(w.attn_vec_einsum_w, c_w.attn_vec_einsum_w); Complexify(w.qkv_einsum_w, c_w.qkv_einsum_w); @@ -77,10 +77,9 @@ void Complexify(const CompressedLayer& w, Complexify(w.linear_w, c_w.linear_w); } -template -void Complexify(const CompressedWeights& w, - CompressedWeights& c_w) { - static constexpr size_t kLayers = TConfig::kLayers; +template +void Complexify(const ModelWeightsPtrs& w, ModelWeightsPtrs& c_w) { + const size_t kLayers = w.c_layers.size(); Complexify(w.embedder_input_embedding, c_w.embedder_input_embedding); Complexify(w.final_norm_scale, c_w.final_norm_scale); for (size_t i = 0; i < kLayers; ++i) { @@ -88,26 +87,27 @@ void Complexify(const CompressedWeights& w, } } -// Owns weights and provides access to TConfig. -template +// Somewhat duplicates ModelWeightsStorage, but that has neither double nor +// complex types allowed and it would cause code bloat to add them there. 
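
The double and complex instantiations exist because the gradient tests use complex-step differentiation: for an analytic f, Im(f(x + ih)) / h approximates f'(x) with no cancellation error, which is why the TestGradient overloads below can pass a step as small as 1e-50. A self-contained sketch of the technique (illustrative, not code from this patch):

    #include <cmath>
    #include <complex>

    // Complex-step derivative: Im(sin(x + i*h)) / h ~= cos(x). Because no
    // subtraction is involved, h can be far below machine epsilon.
    double ComplexStepDerivative(double x) {
      const double h = 1e-50;
      const std::complex<double> fx = std::sin(std::complex<double>(x, h));
      return fx.imag() / h;
    }
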
+template class WeightsWrapper { public: - WeightsWrapper() - : pool_(0), - data_(AllocateCompressedWeights()(pool_)), - weights_(reinterpret_cast*>(data_.get())) {} + explicit WeightsWrapper(const ModelConfig& config) + : pool_(0), weights_(config, pool_) { + weights_.Allocate(data_, pool_); + } - const CompressedWeights& get() const { return *weights_; } - CompressedWeights& get() { return *weights_; } - void ZeroInit() { weights_->ZeroInit(); } - void CopyFrom(const WeightsWrapper& other) { - get().CopyFrom(other.get()); + const ModelWeightsPtrs& get() const { return weights_; } + ModelWeightsPtrs& get() { return weights_; } + void ZeroInit() { weights_.ZeroInit(); } + void CopyFrom(const WeightsWrapper& other) { + weights_.CopyFrom(other.weights_); } private: hwy::ThreadPool pool_; - ByteStorageT data_; - CompressedWeights* weights_; + std::vector data_; + ModelWeightsPtrs weights_; }; template @@ -173,9 +173,9 @@ void TestGradient(const MatPtrT& grad, MatPtrT>& x, TestGradient(grad, x, func, 1e-50, max_abs_err, max_rel_error, line); } -template -void TestGradient(const CompressedLayer& grad, - CompressedLayer& c_weights, FUNC func, T max_err) { +template +void TestGradient(const LayerWeightsPtrs& grad, + LayerWeightsPtrs& c_weights, FUNC func, T max_err) { TestGradient(grad.pre_attention_norm_scale, c_weights.pre_attention_norm_scale, func, max_err, max_err, __LINE__); @@ -191,15 +191,15 @@ void TestGradient(const CompressedLayer& grad, func, max_err, max_err, __LINE__); } -template -void TestGradient(const CompressedWeights& grad, - CompressedWeights& c_weights, FUNC func, T max_err) { +template +void TestGradient(const ModelWeightsPtrs& grad, + ModelWeightsPtrs& c_weights, FUNC func, T max_err) { TestGradient(grad.embedder_input_embedding, c_weights.embedder_input_embedding, func, 2 * max_err, max_err, __LINE__); TestGradient(grad.final_norm_scale, c_weights.final_norm_scale, func, max_err, max_err, __LINE__); - for (int i = 0; i < TConfig::kLayers; ++i) { + for (size_t i = 0; i < grad.c_layers.size(); ++i) { TestGradient(*grad.GetLayer(i), *c_weights.GetLayer(i), func, max_err); } } diff --git a/compression/blob_store.cc b/compression/blob_store.cc index 24248a1..57f50f5 100644 --- a/compression/blob_store.cc +++ b/compression/blob_store.cc @@ -21,7 +21,6 @@ #include #include #include -#include #include #include @@ -276,6 +275,7 @@ BlobError BlobReader::ReadAll(hwy::ThreadPool& pool) { [pfile, &requests, &err](uint64_t i, size_t /*thread*/) { if (!pfile->Read(requests[i].offset, requests[i].size, requests[i].data)) { + fprintf(stderr, "Failed to read blob %zu\n", i); err.test_and_set(); } }); diff --git a/compression/compress.h b/compression/compress.h index e0ea0d7..adb35a1 100644 --- a/compression/compress.h +++ b/compression/compress.h @@ -102,8 +102,8 @@ class CompressedArray { class MatPtr { public: // Full constructor for dynamic sizing. 
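
In the hunks that follow, MatPtr's string type tag becomes the Type enum, so element types can be switched on without string compares. The observable behavior, sketched with names from those hunks (the tensor name is illustrative):

    MatPtrT<SfpStream> tensor("qkv_ein_w", /*rows=*/16, /*cols=*/32);
    Type t = tensor.GetType();            // Type::kSFP; previously the string "sfp"
    size_t bytes = tensor.ElementSize();  // sizeof(SfpStream)
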
- MatPtr(const std::string& name, const std::string& type, size_t element_size, - size_t rows, size_t cols) + MatPtr(const std::string& name, Type type, size_t element_size, size_t rows, + size_t cols) : name_(name), type_(type), element_size_(element_size), @@ -129,7 +129,7 @@ class MatPtr { MatPtr(const hwy::uint128_t& key0, const hwy::uint128_t& key1, const hwy::uint128_t& key2, const hwy::uint128_t& key3) : name_(StringFromKey(key0)), - type_(StringFromKey(key1)), + type_(static_cast(key1.lo)), element_size_(key2.hi), num_elements_(key2.lo), rows_(key3.lo), @@ -138,7 +138,7 @@ class MatPtr { // Adds the contents entry to the table of contents. void AddToToc(std::vector& toc) const { toc.push_back(MakeKey(name_.c_str())); - toc.push_back(MakeKey(type_.c_str())); + toc.push_back({static_cast(type_), 0}); toc.push_back({num_elements_, element_size_}); toc.push_back({rows_, cols_}); } @@ -167,7 +167,7 @@ class MatPtr { void SetName(const std::string& name) { name_ = name; } // Returns the type of the blob. - const std::string& Type() const { return type_; } + Type GetType() const { return type_; } // Returns the size of each element in bytes. size_t ElementSize() const { return element_size_; } @@ -219,8 +219,8 @@ class MatPtr { protected: // Arbitrary name for the array of preferably <= 16 characters. std::string name_; - // Should be the result of TypeName for CallUpcasted() to work. - std::string type_; + // Should be the result of TypeEnum for CallUpcasted() to work. + Type type_; // sizeof(T) size_t element_size_ = 0; // Number of elements in the array. @@ -247,7 +247,7 @@ class MatPtrT : public MatPtr { // Full constructor for dynamic sizing. MatPtrT(const std::string& name, size_t rows, size_t cols) - : MatPtr(name, TypeName(), sizeof(MatT), rows, cols) {} + : MatPtr(name, TypeEnum(), sizeof(MatT), rows, cols) {} // Copying allowed as the metadata is small. MatPtrT(const MatPtr& other) : MatPtr(other) {} @@ -330,17 +330,20 @@ class MatPtrT : public MatPtr { template decltype(auto) MatPtr::CallUpcasted(FuncT& func, TArgs&&... args) { - if (type_ == TypeName()) { + if (type_ == TypeEnum()) { return func(dynamic_cast*>(this), std::forward(args)...); - } else if (type_ == TypeName()) { + } else if (type_ == TypeEnum()) { return func(dynamic_cast*>(this), std::forward(args)...); - } else if (type_ == TypeName()) { + } else if (type_ == TypeEnum()) { return func(dynamic_cast*>(this), std::forward(args)...); + } else if (type_ == TypeEnum()) { + return func(dynamic_cast*>(this), + std::forward(args)...); } else { - HWY_ABORT("Type %s unknown.", type_.c_str()); + HWY_ABORT("Type %d unknown.", type_); } } @@ -563,9 +566,10 @@ class CacheLoader { } // Returns whether all tensors are successfully loaded from cache. - bool ReadAll(hwy::ThreadPool& pool, std::vector& model_memory) { + BlobError ReadAll(hwy::ThreadPool& pool, + std::vector& model_memory) { // reader_ invalid or any Enqueue failed - if (err_ != 0) return false; + if (err_ != 0) return err_; // Setup the model_memory. 
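
CacheLoader::ReadAll now returns the BlobError itself rather than collapsing it to bool, so callers can report what failed. Caller-side sketch (zero means success, per the checks above; `loader` and `pool` are assumed context):

    std::vector<MatStorage> model_memory;
    const BlobError err = loader.ReadAll(pool, model_memory);
    if (err != 0) {
      // __LINE__-valued sentinels flag TOC mismatches; any other value is the
      // underlying reader's error code.
      fprintf(stderr, "Weight load failed (error %d)\n", err);
    }
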
for (int b = 0; b < model_toc_.size(); ++b) { const std::string& file_key = file_keys_[b]; @@ -574,12 +578,12 @@ class CacheLoader { const MatPtr* toc_blob = file_toc_.Get(file_key); if (toc_blob == nullptr) { fprintf(stderr, "Blob %s not found in TOC\n", file_key.c_str()); - return false; + return __LINE__; } if (toc_blob->Rows() != blob->Rows() || toc_blob->Cols() != blob->Cols()) { fprintf(stderr, "Blob %s has size mismatch TOC\n", file_key.c_str()); - return false; + return __LINE__; } MatStorage toc_blob_array(*toc_blob); model_memory.push_back(std::move(toc_blob_array)); @@ -603,17 +607,10 @@ class CacheLoader { "Failed to read blob %s (error %d) of size %zu x %zu x %zu\n", blob.Name().c_str(), err_, blob.Rows(), blob.Cols(), blob.ElementSize()); - return false; + return err_; } } - - err_ = reader_.ReadAll(pool); - if (err_ != 0) { - fprintf(stderr, "Failed to read all tensors (error %d)\n", err_); - return false; - } - - return true; + return reader_.ReadAll(pool); } private: diff --git a/compression/compress_weights.cc b/compression/compress_weights.cc index 51897af..1a4fc52 100644 --- a/compression/compress_weights.cc +++ b/compression/compress_weights.cc @@ -24,6 +24,7 @@ #include "hwy/highway.h" // After highway.h #include "compression/compress-inl.h" +#include "gemma/configs.h" #ifndef GEMMA_COMPRESS_WEIGHTS_ONCE #define GEMMA_COMPRESS_WEIGHTS_ONCE @@ -150,29 +151,22 @@ HWY_BEFORE_NAMESPACE(); namespace gcpp { namespace HWY_NAMESPACE { -template +template void CompressWeights(const Path& weights_path, const Path& compressed_weights_path, Model model_type, - Type weight_type, hwy::ThreadPool& pool) { + hwy::ThreadPool& pool) { if (!weights_path.Exists()) { HWY_ABORT("The model weights file '%s' does not exist.", weights_path.path.c_str()); } printf("Compressing weights from %s to %s\n", weights_path.path.c_str(), compressed_weights_path.path.c_str()); - - using CConfig = typename Configs::c; - using UCConfig = typename Configs::uc; - // Allocate compressed weights. - using CWeights = CompressedWeights; - ByteStorageT c_weights_u8 = AllocateCompressedWeights()(pool); - CWeights* c_weights = reinterpret_cast(c_weights_u8.get()); - - // Allocate uncompressed weights. - using UCWeights = CompressedWeights; - ByteStorageT uc_weights_u8 = AllocateCompressedWeights()(pool); - UCWeights* uc_weights = reinterpret_cast(uc_weights_u8.get()); - + ModelConfig config = ConfigFromModel(model_type); + std::vector model_storage; + ModelWeightsPtrs c_weights(config, pool); + c_weights.Allocate(model_storage, pool); + ModelWeightsPtrs uc_weights(config, pool); + uc_weights.Allocate(model_storage, pool); // Get uncompressed weights, compress, and store. 
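
Both pointer sets above allocate their tensors into the same model_storage vector, which owns the memory for the duration of the conversion. Assembled end state of this hunk (a sketch; `Weight` is the sole remaining template parameter):

    ModelConfig config = ConfigFromModel(model_type);
    std::vector<MatStorage> model_storage;
    ModelWeightsPtrs<Weight> c_weights(config, pool);  // compressed output
    c_weights.Allocate(model_storage, pool);
    ModelWeightsPtrs<float> uc_weights(config, pool);  // uncompressed input
    uc_weights.Allocate(model_storage, pool);
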
FILE* fptr = fopen(weights_path.path.c_str(), "rb"); if (fptr == nullptr) { @@ -181,22 +175,22 @@ void CompressWeights(const Path& weights_path, } bool ok = true; uint64_t total_size = 0; - CompressedWeights::ForEachTensor( - {uc_weights}, ForEachType::kLoadNoToc, + ModelWeightsPtrs::ForEachTensor( + {&uc_weights}, ForEachType::kLoadNoToc, [&](const char* name, hwy::Span tensors) { fprintf(stderr, "Loading Parameters (size %zu): %s\n", tensors[0]->SizeBytes(), name); ok &= 1 == fread(tensors[0]->Ptr(), tensors[0]->SizeBytes(), 1, fptr); total_size += tensors[0]->SizeBytes(); }); - const bool scale_for_compression = UCConfig::kNumTensorScales > 0; + const bool scale_for_compression = config.num_tensor_scales > 0; std::vector scales; if (scale_for_compression) { - uc_weights->GetOrApplyScales(scales); + uc_weights.GetOrApplyScales(scales); } Compressor compressor(pool); - CompressedWeights::ForEachTensor( - {reinterpret_cast*>(uc_weights), c_weights}, + ModelWeightsPtrs::ForEachTensor( + {reinterpret_cast*>(&uc_weights), &c_weights}, ForEachType::kLoadNoToc, [&compressor](const char* name, hwy::Span tensors) { tensors[1]->CallUpcasted( @@ -221,9 +215,26 @@ void Run(Args& args) { HWY_ABORT("PaliGemma is not supported in compress_weights."); } const Type weight_type = args.WeightType(); - GEMMA_EXPORT_AND_DISPATCH( - model_type, weight_type, CompressWeights, - (args.weights, args.compressed_weights, model_type, weight_type, pool)); + switch (weight_type) { + case Type::kF32: + HWY_EXPORT_AND_DYNAMIC_DISPATCH_T(CompressWeights) + (args.weights, args.compressed_weights, model_type, pool); + break; + case Type::kBF16: + HWY_EXPORT_AND_DYNAMIC_DISPATCH_T(CompressWeights) + (args.weights, args.compressed_weights, model_type, pool); + break; + case Type::kSFP: + HWY_EXPORT_AND_DYNAMIC_DISPATCH_T(CompressWeights) + (args.weights, args.compressed_weights, model_type, pool); + break; + case Type::kNUQ: + HWY_EXPORT_AND_DYNAMIC_DISPATCH_T(CompressWeights) + (args.weights, args.compressed_weights, model_type, pool); + break; + default: + HWY_ABORT("Weight type %d unsupported.", static_cast(weight_type)); + } } } // namespace gcpp diff --git a/compression/shared.h b/compression/shared.h index c216d24..74b7454 100644 --- a/compression/shared.h +++ b/compression/shared.h @@ -32,11 +32,6 @@ namespace gcpp { using BF16 = hwy::bfloat16_t; -template -constexpr bool IsF32() { - return hwy::IsSame, float>(); -} - // Switching Floating Point: a hybrid 8-bit float representation of bf16/f32 // inputs that combines the advantages of e4m3 and e5m2 into a single format. // It supports seeking at a granularity of 1 and decoding to bf16/f32. @@ -179,29 +174,67 @@ struct NuqStream { }; #pragma pack(pop) +template +constexpr bool IsF32() { + return hwy::IsSame, float>(); +} + +template +constexpr bool IsBF16() { + return hwy::IsSame, BF16>(); +} + +template +constexpr bool IsSfpStream() { + return hwy::IsSame, SfpStream>(); +} + +template +constexpr bool IsNuqStream() { + return hwy::IsSame, NuqStream>(); +} + +// Instruction-tuned models require extra 'turn structure' tokens in prompts. +enum class ModelTraining { GEMMA_IT, GEMMA_PT, PALIGEMMA }; + +// Tensor types for loading weights. Note that not all types are supported as +// weights for a model, but can be used for other purposes, such as types for +// ModelWeightsPtrs. When adding a new type that is supported, also +// update gemma.cc, weights.*, and add instantiations/new_one.cc. 
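
The enum defined in the next hunk and its string table are index-aligned, and TypeEnum<T>() / TypeName<T>() are the two lookups. Usage sketch:

    static_assert(static_cast<size_t>(Type::kU128) + 1 ==
                  sizeof(kTypeStrings) / sizeof(kTypeStrings[0]));
    const Type t = TypeEnum<BF16>();      // Type::kBF16
    const char* name = TypeName<BF16>();  // "bf16"
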
+enum class Type { kUnknown, kF32, kBF16, kSFP, kNUQ, kF64, kC64, kU128 }; +constexpr const char* kTypeStrings[] = {"unknown", "f32", "bf16", "sfp", + "nuq", "f64", "c64", "u128"}; + +// Returns a Type enum for the type of the template parameter. template -const char* TypeName() { +Type TypeEnum() { using Packed = hwy::RemoveCvRef; if constexpr (hwy::IsSame()) { - return "f32"; + return Type::kF32; } else if constexpr (hwy::IsSame()) { - return "b16"; + return Type::kBF16; } else if constexpr (hwy::IsSame()) { - return "sfp"; + return Type::kSFP; } else if constexpr (hwy::IsSame()) { - return "nuq"; + return Type::kNUQ; } else if constexpr (hwy::IsSame()) { - return "f64"; + return Type::kF64; } else if constexpr (hwy::IsSame>()) { - return "c64"; + return Type::kC64; } else if constexpr (hwy::IsSame()) { - return "u128"; + return Type::kU128; } else { HWY_DASSERT(false); - return "unknown"; + return Type::kUnknown; } } +// Returns a string name for the type of the template parameter. +template +const char* TypeName() { + return kTypeStrings[static_cast(TypeEnum())]; +} + template constexpr bool IsCompressed() { return hwy::IsSameEither, SfpStream, NuqStream>(); diff --git a/evals/benchmark.cc b/evals/benchmark.cc index b59079a..1ea4f65 100644 --- a/evals/benchmark.cc +++ b/evals/benchmark.cc @@ -128,8 +128,8 @@ int BenchmarkCrossEntropy(GemmaEnv& env, const Path& text, size_t num_tokens = std::min(prompt.size() - pos, batch_tokens); std::vector prompt_slice(prompt.begin() + pos, prompt.begin() + pos + num_tokens); - KVCache kv_cache = KVCache::Create( - env.GetModel()->Info().model, env.MutableConfig().prefill_tbatch_size); + KVCache kv_cache = KVCache::Create(env.GetModel()->GetModelConfig(), + env.MutableConfig().prefill_tbatch_size); float entropy = ComputeCrossEntropy( *env.GetModel(), num_tokens, prompt_slice, kv_cache, env.Verbosity()); total_entropy += entropy; diff --git a/evals/benchmark_helper.cc b/evals/benchmark_helper.cc index 63553aa..abae040 100644 --- a/evals/benchmark_helper.cc +++ b/evals/benchmark_helper.cc @@ -69,8 +69,8 @@ GemmaEnv::GemmaEnv(const LoaderArgs& loader, const InferenceArgs& inference, model_ = AllocateGemma(mutable_loader, pools_); // Only allocate one for starters because GenerateBatch might not be called. kv_caches_.resize(1); - kv_caches_[0] = - KVCache::Create(model_->Info().model, inference.prefill_tbatch_size); + kv_caches_[0] = KVCache::Create(model_->GetModelConfig(), + inference.prefill_tbatch_size); } InitGenerator(inference, gen_); runtime_config_ = { @@ -163,7 +163,7 @@ std::vector GemmaEnv::BatchQueryModel( } for (size_t i = 1; i < num_queries; ++i) { if (kv_caches_[i].seq_len == 0) { - kv_caches_[i] = KVCache::Create(model_->Info().model, + kv_caches_[i] = KVCache::Create(model_->GetModelConfig(), runtime_config_.prefill_tbatch_size); } } diff --git a/evals/cross_entropy.cc b/evals/cross_entropy.cc index 870f84c..13ff3d3 100644 --- a/evals/cross_entropy.cc +++ b/evals/cross_entropy.cc @@ -103,8 +103,7 @@ float ComputeCrossEntropy(Gemma& gemma, size_t max_generated_tokens, const StreamFunc stream_token = [](int /*token*/, float) { return true; }; // TWeight is unused, but we have to pass it to Config*. 
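
The evals changes that follow show the payoff of storing a ModelConfig: call sites read sizes straight from the config instead of re-dispatching on the Model enum. Assembled from those hunks (sketch):

    const ModelConfig& config = model.GetModelConfig();
    KVCache kv_cache = KVCache::Create(config, inference.prefill_tbatch_size);
    const int vocab_size = config.vocab_size;  // previously fetched via a CallForModel functor
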
- const int vocab_size = - CallForModel(gemma.Info().model); + const int vocab_size = gemma.GetModelConfig().vocab_size; float cross_entropy = std::log(vocab_size); // first token size_t pos = 1; diff --git a/examples/hello_world/run.cc b/examples/hello_world/run.cc index 39d4f9c..2ed9b64 100644 --- a/examples/hello_world/run.cc +++ b/examples/hello_world/run.cc @@ -24,7 +24,6 @@ #include // Placeholder for internal header, do not modify. -#include "gemma/common.h" #include "gemma/gemma.h" #include "gemma/tokenizer.h" #include "util/app.h" // LoaderArgs @@ -58,7 +57,8 @@ int main(int argc, char** argv) { gcpp::PerClusterPools pools(app.max_clusters, app.max_threads, app.pin); gcpp::Gemma model = gcpp::CreateGemma(loader, pools); gcpp::KVCache kv_cache = - gcpp::KVCache::Create(loader.Info().model, inference.prefill_tbatch_size); + gcpp::KVCache::Create(model.GetModelConfig(), + inference.prefill_tbatch_size); size_t generated = 0; // Initialize random number generator diff --git a/gemma/activations.h b/gemma/activations.h index b10b562..3983924 100644 --- a/gemma/activations.h +++ b/gemma/activations.h @@ -21,6 +21,7 @@ #include #include "compression/shared.h" // BF16 +#include "gemma/configs.h" #include "ops/matmul.h" // MatMulEnv #include "util/allocator.h" // RowVectorBatch #include "util/threading.h" @@ -30,6 +31,12 @@ namespace gcpp { struct Activations { + explicit Activations(const ModelConfig& config) + : weights_config(config), + layer_config(config.layer_configs[0]), + seq_len(config.seq_len), + cache_pos_size(config.CachePosSize()) {} + RowVectorBatch x; // input RowVectorBatch q; // query, also KV if MHA. RowVectorBatch logits; @@ -58,23 +65,24 @@ struct Activations { MatMulEnv env; + PostQKType post_qk = PostQKType::Rope; + // And the config. + const ModelConfig& weights_config; + const LayerConfig& layer_config; + size_t seq_len; + size_t cache_pos_size = 0; + // Multi-Head Attention? - template - static constexpr bool IsMHA() { - return TConfig::kHeads == TConfig::kKVHeads; - } + bool IsMHA() const { return layer_config.heads == layer_config.kv_heads; } // Stride between subsequent queries. Each of Q, K, V are of length kQKVDim, // but for MHA we store them as Q,K,V, Q,K,V, .. instead of Q..Q, K..K, V..V. - template - static constexpr size_t QStride() { - return TConfig::kQKVDim * (IsMHA() ? 3 : 1); - } + size_t QStride() const { return layer_config.qkv_dim * (IsMHA() ? 3 : 1); } - template - static RowVectorBatch CreateInvTimescale() { - constexpr size_t kQKVDim = TConfig::kQKVDim; - const size_t rope_dim = TConfig::kUseHalfRope ? kQKVDim / 2 : kQKVDim; + static RowVectorBatch CreateInvTimescale(size_t qkv_dim, + PostQKType post_qk) { + const size_t rope_dim = + post_qk == PostQKType::HalfRope ? 
qkv_dim / 2 : qkv_dim; RowVectorBatch inv_timescale(1, rope_dim / 2); for (size_t dim = 0; dim < rope_dim / 2; ++dim) { const float freq_exponents = @@ -86,40 +94,38 @@ struct Activations { return inv_timescale; } - template void Allocate(size_t batch_size, PerClusterPools& pools) { - constexpr size_t kModelDim = TConfig::kModelDim; - constexpr size_t kQKVDim = TConfig::kQKVDim; - constexpr size_t kHeads = TConfig::kHeads; - constexpr size_t kFFHiddenDim = TConfig::kFFHiddenDim; - constexpr size_t kVocabSize = TConfig::kVocabSize; - constexpr size_t kSeqLen = TConfig::kSeqLen; - constexpr size_t kGriffinLayers = TConfig::kGriffinLayers; + post_qk = layer_config.post_qk; + const size_t model_dim = weights_config.model_dim; + const size_t ff_hidden_dim = layer_config.ff_hidden_dim; + const size_t vocab_size = weights_config.vocab_size; - x = RowVectorBatch(batch_size, kModelDim); - q = RowVectorBatch(batch_size, kHeads * QStride()); - if constexpr (kVocabSize > 0) { - logits = RowVectorBatch(batch_size, kVocabSize); + x = RowVectorBatch(batch_size, model_dim); + q = RowVectorBatch(batch_size, layer_config.heads * QStride()); + if (vocab_size > 0) { + logits = RowVectorBatch(batch_size, vocab_size); } - pre_att_rms_out = RowVectorBatch(batch_size, kModelDim); - att = RowVectorBatch(batch_size, kHeads * kSeqLen); - att_out = RowVectorBatch(batch_size, kHeads * kQKVDim); - att_sums = RowVectorBatch(batch_size, kModelDim); + pre_att_rms_out = RowVectorBatch(batch_size, model_dim); + att = RowVectorBatch(batch_size, + layer_config.heads * weights_config.seq_len); + att_out = RowVectorBatch(batch_size, + layer_config.heads * layer_config.qkv_dim); + att_sums = RowVectorBatch(batch_size, model_dim); - bf_pre_ffw_rms_out = RowVectorBatch(batch_size, kModelDim); - C1 = RowVectorBatch(batch_size, kFFHiddenDim); - C2 = RowVectorBatch(batch_size, kFFHiddenDim); - ffw_out = RowVectorBatch(batch_size, kModelDim); + bf_pre_ffw_rms_out = RowVectorBatch(batch_size, model_dim); + C1 = RowVectorBatch(batch_size, ff_hidden_dim); + C2 = RowVectorBatch(batch_size, ff_hidden_dim); + ffw_out = RowVectorBatch(batch_size, model_dim); - if constexpr (kGriffinLayers > 0) { - griffin_x = RowVectorBatch(batch_size, kModelDim); - griffin_y = RowVectorBatch(batch_size, kModelDim); - griffin_gate_x = RowVectorBatch(batch_size, kModelDim); - griffin_multiplier = RowVectorBatch(batch_size, kModelDim); + if (layer_config.type == LayerAttentionType::kGriffinRecurrentBlock) { + griffin_x = RowVectorBatch(batch_size, model_dim); + griffin_y = RowVectorBatch(batch_size, model_dim); + griffin_gate_x = RowVectorBatch(batch_size, model_dim); + griffin_multiplier = RowVectorBatch(batch_size, model_dim); } - inv_timescale = CreateInvTimescale(); + inv_timescale = CreateInvTimescale(layer_config.qkv_dim, post_qk); env = MatMulEnv(pools); } diff --git a/gemma/common.cc b/gemma/common.cc index e68347b..447deb6 100644 --- a/gemma/common.cc +++ b/gemma/common.cc @@ -15,6 +15,7 @@ #include "gemma/common.h" +#include // sqrtf #include #include @@ -23,6 +24,7 @@ #include #include +#include "compression/shared.h" #include "hwy/base.h" #include "hwy/contrib/thread_pool/thread_pool.h" @@ -101,8 +103,6 @@ const char* ModelString(Model model, ModelTraining training) { static_cast(training)); } -constexpr const char* kTypeStrings[] = {"f32", "bf16", "sfp"}; - const char* StringFromType(Type type) { return kTypeStrings[static_cast(type)]; } @@ -141,4 +141,19 @@ void Wrap(const ModelInfo& info, size_t pos, std::string& prompt) { prompt = start + 
prompt + "<end_of_turn>\n<start_of_turn>model\n";
   }
 }
+
+float EmbeddingScaling(size_t model_dim) {
+  // Round to bf16 to match Gemma's Embedder, which casts before mul.
+  return hwy::ConvertScalarTo<float>(hwy::ConvertScalarTo<BF16>(
+      sqrtf(static_cast<float>(model_dim))));
+}
+
+float ChooseQueryScale(const ModelConfig& config) {
+  if (config.query_scale == QueryScaleType::SqrtModelDimDivNumHeads)
+    return 1.0f / sqrtf(static_cast<float>(config.model_dim /
+                                           config.layer_configs[0].heads));
+  // QueryScaleType::SqrtKeySize
+  return 1.0f / sqrtf(static_cast<float>(config.layer_configs[0].qkv_dim));
+}
+
 }  // namespace gcpp
diff --git a/gemma/common.h b/gemma/common.h
index 18ac5d1..e933e8d 100644
--- a/gemma/common.h
+++ b/gemma/common.h
@@ -16,37 +16,15 @@
 #ifndef THIRD_PARTY_GEMMA_CPP_GEMMA_COMMON_H_
 #define THIRD_PARTY_GEMMA_CPP_GEMMA_COMMON_H_
 
-#include <math.h>  // sqrtf
 #include <stddef.h>
 #include <string>
 
-#include "compression/compress.h"
 #include "gemma/configs.h"  // IWYU pragma: export
 #include "hwy/base.h"  // ConvertScalarTo
 
 namespace gcpp {
 
-// Model variants: see configs.h for details. When adding a new one, also
-// update GEMMA_FOREACH* and Call* below, and add instantiations/*.cc.
-enum class Model {
-  GEMMA_2B,
-  GEMMA_7B,
-  GEMMA2_9B,
-  GEMMA2_27B,
-  GRIFFIN_2B,
-  GEMMA_TINY,
-  GEMMA2_2B,
-  PALIGEMMA_224,
-};
-
-// Instruction-tuned models require extra 'turn structure' tokens in prompts.
-enum class ModelTraining { GEMMA_IT, GEMMA_PT, PALIGEMMA };
-
-// Tensor types for loading weights. When adding a new one, also
-// update GEMMA_FOREACH* and Call* below, and add instantiations/*.cc.
-enum class Type { kF32, kBF16, kSFP };
-
 // TODO(janwas): merge with functions below.
 struct ModelInfo {
   Model model;
@@ -66,198 +44,12 @@ const char* StringFromType(Type type);
 
 void Wrap(const ModelInfo& info, size_t pos, std::string& prompt);
 
-// Returns the return value of FuncT<Config*<TWeight>>().operator()(args), where
-// Config* is selected via `model`. Typically called by CallForModelAndWeight,
-// but can also be called directly when FuncT does not actually use TWeight.
-//
-// Note that a T prefix indicates a concrete type template argument, whereas a
-// T suffix indicates the argument is itself a template.
-//
-// `FuncT` must be a functor because function templates cannot be passed as a
-// template template argument, and we prefer to avoid the overhead of
-// std::function.
-template <typename TWeight, template <typename> class FuncT,
-          typename... TArgs>
-decltype(auto) CallForModel(Model model, TArgs&&... args) {
-  switch (model) {
-    case Model::GEMMA_TINY:
-      return FuncT<ConfigGemmaTiny<TWeight>>()(std::forward<TArgs>(args)...);
-    case Model::GEMMA_2B:
-      return FuncT<ConfigGemma2B<TWeight>>()(std::forward<TArgs>(args)...);
-    case Model::GEMMA_7B:
-      return FuncT<ConfigGemma7B<TWeight>>()(std::forward<TArgs>(args)...);
-    case Model::GEMMA2_9B:
-      return FuncT<ConfigGemma2_9B<TWeight>>()(std::forward<TArgs>(args)...);
-    case Model::GEMMA2_27B:
-      return FuncT<ConfigGemma2_27B<TWeight>>()(std::forward<TArgs>(args)...);
-    case Model::GRIFFIN_2B:
-      return FuncT<ConfigGriffin2B<TWeight>>()(std::forward<TArgs>(args)...);
-    case Model::GEMMA2_2B:
-      return FuncT<ConfigGemma2_2B<TWeight>>()(std::forward<TArgs>(args)...);
-    case Model::PALIGEMMA_224:
-      return FuncT<ConfigPaliGemma_224<TWeight>>()(
-          std::forward<TArgs>(args)...);
-    default:
-      HWY_ABORT("Model type %d unknown.", static_cast<int>(model));
-  }
-}
-
-// Returns the return value of FuncT<TConfig>().operator()(args),
-// where `TConfig` is selected based on `model` and `weight`.
-
-// This makes it easy to extend `Model` or `Type` without updating callers.
-//
-// Usage example: LoadWeights is type-erased so that it can be called from other
-// .cc files. It uses this function to call the appropriate instantiation of a
-// template functor LoadCompressedWeightsT.
-template