diff --git a/BUILD.bazel b/BUILD.bazel
index 1e0cc73..c480f23 100644
--- a/BUILD.bazel
+++ b/BUILD.bazel
@@ -104,8 +104,6 @@ cc_test(
     tags = ["hwy_ops_test"],
     deps = [
         ":allocator",
-        ":common",
-        ":gemma_lib",
         ":ops",
         ":test_util",
         ":threading",
@@ -183,7 +181,10 @@ cc_test(
 
 cc_library(
     name = "common",
-    srcs = ["gemma/common.cc"],
+    srcs = [
+        "gemma/common.cc",
+        "gemma/configs.cc",
+    ],
     hdrs = [
         "gemma/common.h",
         "gemma/configs.h",
@@ -195,12 +196,20 @@ cc_library(
     ],
 )
 
+cc_test(
+    name = "configs_test",
+    srcs = ["gemma/configs_test.cc"],
+    deps = [
+        ":common",
+        "@googletest//:gtest_main",
+    ],
+)
+
 cc_library(
     name = "weights",
     srcs = ["gemma/weights.cc"],
     hdrs = ["gemma/weights.h"],
     deps = [
-        ":allocator",
         ":common",
         "//compression:compress",
         "//compression:io",
@@ -219,7 +228,6 @@ cc_library(
         ":common",
         "//compression:io",
         "@highway//:hwy",
-        "@highway//:nanobenchmark",  # timer
         "@highway//:profiler",
         "@com_google_sentencepiece//:sentencepiece_processor",
     ],
 )
@@ -239,30 +247,10 @@ cc_library(
     name = "gemma_lib",
     srcs = [
         "gemma/gemma.cc",
-        "gemma/instantiations/27b_bf16.cc",
-        "gemma/instantiations/27b_f32.cc",
-        "gemma/instantiations/27b_sfp.cc",
-        "gemma/instantiations/2b_bf16.cc",
-        "gemma/instantiations/2b_f32.cc",
-        "gemma/instantiations/2b_sfp.cc",
-        "gemma/instantiations/7b_bf16.cc",
-        "gemma/instantiations/7b_f32.cc",
-        "gemma/instantiations/7b_sfp.cc",
-        "gemma/instantiations/9b_bf16.cc",
-        "gemma/instantiations/9b_f32.cc",
-        "gemma/instantiations/9b_sfp.cc",
-        "gemma/instantiations/tiny_bf16.cc",
-        "gemma/instantiations/tiny_f32.cc",
-        "gemma/instantiations/tiny_sfp.cc",
-        "gemma/instantiations/gr2b_bf16.cc",
-        "gemma/instantiations/gr2b_f32.cc",
-        "gemma/instantiations/gr2b_sfp.cc",
-        "gemma/instantiations/gemma2_2b_bf16.cc",
-        "gemma/instantiations/gemma2_2b_f32.cc",
-        "gemma/instantiations/gemma2_2b_sfp.cc",
-        "gemma/instantiations/paligemma_224_bf16.cc",
-        "gemma/instantiations/paligemma_224_f32.cc",
-        "gemma/instantiations/paligemma_224_sfp.cc",
+        "gemma/instantiations/bf16.cc",
+        "gemma/instantiations/f32.cc",
+        "gemma/instantiations/nuq.cc",
+        "gemma/instantiations/sfp.cc",
     ],
     hdrs = [
         "gemma/activations.h",
@@ -327,8 +315,6 @@ cc_library(
         ":threading",
         "//compression:io",
         "@highway//:hwy",
-        "@highway//:thread_pool",
-        "@highway//:topology",
     ],
 )
@@ -367,7 +353,6 @@ cc_test(
         ":benchmark_helper",
         ":common",
         ":gemma_lib",
-        ":tokenizer",
        "@googletest//:gtest_main",
        "@highway//:hwy",
        "@highway//:hwy_test_util",
@@ -396,7 +381,6 @@ cc_binary(
     name = "single_benchmark",
     srcs = ["evals/benchmark.cc"],
     deps = [
-        ":app",
         ":args",
         ":benchmark_helper",
         ":common",
         "//compression:io",
         "@highway//:hwy",
         "@highway//:nanobenchmark",
-        "@highway//:thread_pool",
         "@nlohmann_json//:json",
     ],
 )
@@ -429,13 +412,11 @@ cc_binary(
         "evals/debug_prompt.cc",
     ],
     deps = [
-        ":app",
         ":args",
         ":benchmark_helper",
         ":gemma_lib",
         "//compression:io",
         "@highway//:hwy",
-        "@highway//:thread_pool",
         "@nlohmann_json//:json",
     ],
 )
@@ -444,7 +425,6 @@ cc_binary(
     name = "gemma_mmlu",
     srcs = ["evals/run_mmlu.cc"],
     deps = [
-        ":app",
         ":args",
         ":benchmark_helper",
         ":gemma_lib",
@@ -488,7 +468,6 @@ cc_library(
     deps = [
         ":allocator",
         ":common",
-        ":gemma_lib",
         ":ops",
         ":prompt",
         ":weights",
@@ -508,7 +487,6 @@ cc_library(
         "backprop/forward_scalar.h",
     ],
     deps = [
-        ":allocator",
         ":common",
         ":prompt",
         ":weights",
@@ -525,7 +503,6 @@ cc_test(
         "backprop/test_util.h",
     ],
     deps = [
-        ":allocator",
         ":backprop_scalar",
         ":common",
         ":prompt",
@@ -599,6 +576,7 @@ cc_test(
         ":threading",
":weights", "@googletest//:gtest_main", + "//compression:sfp", "@highway//:thread_pool", ], ) diff --git a/CMakeLists.txt b/CMakeLists.txt index 51ab2e4..bade5de 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -68,34 +68,15 @@ set(SOURCES gemma/activations.h gemma/common.cc gemma/common.h + gemma/configs.cc gemma/configs.h gemma/gemma-inl.h gemma/gemma.cc gemma/gemma.h - gemma/instantiations/27b_bf16.cc - gemma/instantiations/27b_f32.cc - gemma/instantiations/27b_sfp.cc - gemma/instantiations/2b_bf16.cc - gemma/instantiations/2b_f32.cc - gemma/instantiations/2b_sfp.cc - gemma/instantiations/7b_bf16.cc - gemma/instantiations/7b_f32.cc - gemma/instantiations/7b_sfp.cc - gemma/instantiations/9b_bf16.cc - gemma/instantiations/9b_f32.cc - gemma/instantiations/9b_sfp.cc - gemma/instantiations/gr2b_bf16.cc - gemma/instantiations/gr2b_f32.cc - gemma/instantiations/gr2b_sfp.cc - gemma/instantiations/tiny_bf16.cc - gemma/instantiations/tiny_f32.cc - gemma/instantiations/tiny_sfp.cc - gemma/instantiations/gemma2_2b_bf16.cc - gemma/instantiations/gemma2_2b_f32.cc - gemma/instantiations/gemma2_2b_sfp.cc - gemma/instantiations/paligemma_224_bf16.cc - gemma/instantiations/paligemma_224_f32.cc - gemma/instantiations/paligemma_224_sfp.cc + gemma/instantiations/bf16.cc + gemma/instantiations/f32.cc + gemma/instantiations/nuq.cc + gemma/instantiations/sfp.cc gemma/kv_cache.cc gemma/kv_cache.h gemma/tokenizer.cc diff --git a/backprop/activations.h b/backprop/activations.h index 4f2e821..c616759 100644 --- a/backprop/activations.h +++ b/backprop/activations.h @@ -18,32 +18,27 @@ #include -#include +#include #include "compression/compress.h" // MatStorageT -#include "util/allocator.h" // ByteStorageT +#include "gemma/configs.h" // ModelConfig namespace gcpp { -template +template struct ForwardLayer { - ForwardLayer() - : input("input", kSeqLen, kModelDim), - pre_att_rms_out("pre_att_rms_out", kSeqLen, kModelDim), - qkv("qkv", kSeqLen * (kHeads + 2), kQKVDim), - att("att", kSeqLen * kHeads, kSeqLen), - att_out("att_out", kSeqLen * kHeads, kQKVDim), - att_post1("att_post1", kSeqLen, kModelDim), - attention_out("attention_out", kSeqLen, kModelDim), - bf_pre_ffw_rms_out("bf_pre_ffw_rms_out", kSeqLen, kModelDim), - ffw_hidden("ffw_hidden", kSeqLen, kFFHiddenDim * 2), - ffw_hidden_gated("ffw_hidden_gated", kSeqLen, kFFHiddenDim) {} - - static constexpr size_t kSeqLen = TConfig::kSeqLen; - static constexpr size_t kModelDim = TConfig::kModelDim; - static constexpr size_t kQKVDim = TConfig::kQKVDim; - static constexpr size_t kHeads = TConfig::kHeads; - static constexpr size_t kFFHiddenDim = TConfig::kFFHiddenDim; + ForwardLayer(const LayerConfig& config, size_t seq_len) + : input("input", seq_len, config.model_dim), + pre_att_rms_out("pre_att_rms_out", seq_len, config.model_dim), + qkv("qkv", seq_len * (config.heads + 2), config.qkv_dim), + att("att", seq_len * config.heads, seq_len), + att_out("att_out", seq_len * config.heads, config.qkv_dim), + att_post1("att_post1", seq_len, config.model_dim), + attention_out("attention_out", seq_len, config.model_dim), + bf_pre_ffw_rms_out("bf_pre_ffw_rms_out", seq_len, config.model_dim), + ffw_hidden("ffw_hidden", seq_len, config.ff_hidden_dim * 2), + ffw_hidden_gated("ffw_hidden_gated", seq_len, config.ff_hidden_dim), + layer_config(config) {} MatStorageT input; MatStorageT pre_att_rms_out; @@ -55,56 +50,30 @@ struct ForwardLayer { MatStorageT bf_pre_ffw_rms_out; MatStorageT ffw_hidden; MatStorageT ffw_hidden_gated; + const LayerConfig& layer_config; }; -template +template 
 struct ForwardPass {
-  ForwardPass()
-      : final_layer_output("final_layer_output", kSeqLen, kModelDim),
-        final_norm_output("final_norm_output", kSeqLen, kModelDim),
-        logits("logits", kSeqLen, kVocabSize),
-        probs("probs", kSeqLen, kVocabSize) {
-  }  // prevents placement-new calling memset
+  ForwardPass(const ModelConfig& config)
+      : final_layer_output("final_layer_output", config.seq_len,
+                           config.model_dim),
+        final_norm_output("final_norm_output", config.seq_len,
+                          config.model_dim),
+        logits("logits", config.seq_len, config.vocab_size),
+        probs("probs", config.seq_len, config.vocab_size),
+        weights_config(config) {
+    for (const auto& layer_config : config.layer_configs) {
+      layers.emplace_back(layer_config, config.seq_len);
+    }
+  }
 
-  static constexpr size_t kSeqLen = TConfig::kSeqLen;
-  static constexpr size_t kModelDim = TConfig::kModelDim;
-  static constexpr size_t kVocabSize = TConfig::kVocabSize;
-  static constexpr size_t kLayers = TConfig::kLayers;
-
-  std::array<ForwardLayer<T, TConfig>, kLayers> layers;
+  std::vector<ForwardLayer<T>> layers;
   MatStorageT<T> final_layer_output;
   MatStorageT<T> final_norm_output;
   MatStorageT<T> logits;
   MatStorageT<T> probs;
-};
-
-template <typename T, typename TConfig>
-struct AllocateForwardPass {
-  ByteStorageT operator()() const {
-    ByteStorageT c_weights_u8 = AllocateSizeof<ForwardPass<T, TConfig>>();
-    auto* c_weights =
-        reinterpret_cast<ForwardPass<T, TConfig>*>(c_weights_u8.get());
-    new (c_weights) ForwardPass<T, TConfig>();
-    return c_weights_u8;
-  }
-};
-
-// Owns activations and undoes the type erasure of AllocateAligned.
-template <typename T, typename TConfig>
-class ActivationsWrapper {
-  using WrappedT = ForwardPass<T, TConfig>;
-
- public:
-  ActivationsWrapper()
-      : data_(AllocateSizeof<WrappedT>()),
-        activations_(*(new(data_.get()) WrappedT())) {}
-
-  const WrappedT& get() const { return activations_; }
-  WrappedT& get() { return activations_; }
-
- private:
-  ByteStorageT data_;
-  WrappedT& activations_;
+  const ModelConfig& weights_config;
 };
 
 }  // namespace gcpp
diff --git a/backprop/backward-inl.h b/backprop/backward-inl.h
index f765a5a..2a0f330 100644
--- a/backprop/backward-inl.h
+++ b/backprop/backward-inl.h
@@ -28,6 +28,7 @@
 #include "backprop/activations.h"
 #include "backprop/prompt.h"
 #include "gemma/common.h"
+#include "gemma/weights.h"
 #include "util/allocator.h"
 #include "hwy/base.h"
 #include "hwy/contrib/thread_pool/thread_pool.h"
@@ -53,45 +54,41 @@ namespace gcpp {
 namespace HWY_NAMESPACE {
 namespace hn = hwy::HWY_NAMESPACE;
 
-template <size_t kCols, size_t kRows>
-void MatMulVJP(const float* HWY_RESTRICT weights,  // kRows * kCols,
-               const float* HWY_RESTRICT x,  // num_tokens * kCols
-               const float* HWY_RESTRICT v,  // num_tokens * kRows
-               size_t num_tokens,
-               float* HWY_RESTRICT grad_w,  // kRows * kCols,
-               float* HWY_RESTRICT grad_x,  // num_tokens * kCols
-               hwy::ThreadPool& pool) {
-  hwy::ZeroBytes(grad_x, num_tokens * kCols * sizeof(grad_x[0]));
+HWY_INLINE void MatMulVJP(const float* HWY_RESTRICT weights,  // kRows * kCols,
+                          const float* HWY_RESTRICT x,  // num_tokens * kCols
+                          const float* HWY_RESTRICT v,  // num_tokens * kRows
+                          size_t cols, size_t rows, size_t num_tokens,
+                          float* HWY_RESTRICT grad_w,  // kRows * kCols,
+                          float* HWY_RESTRICT grad_x,  // num_tokens * kCols
+                          hwy::ThreadPool& pool) {
+  hwy::ZeroBytes(grad_x, num_tokens * cols * sizeof(grad_x[0]));
   for (size_t pos = 0; pos < num_tokens; ++pos) {
-    const size_t voffs = pos * kRows;
-    const size_t xoffs = pos * kCols;
-    for (size_t j = 0; j < kRows; ++j) {
-      MulByConstAndAdd(v[voffs + j], &x[xoffs], &grad_w[j * kCols], kCols);
-      MulByConstAndAdd(v[voffs + j], &weights[j * kCols], &grad_x[xoffs],
-                       kCols);
+    const size_t voffs = pos * rows;
+    const size_t xoffs = pos * cols;
+    for (size_t j = 0; j < rows; ++j) {
+      MulByConstAndAdd(v[voffs + j], &x[xoffs], &grad_w[j * cols], cols);
+      MulByConstAndAdd(v[voffs + j], &weights[j * cols], &grad_x[xoffs], cols);
     }
   }
 }
 
-template <size_t kHeads, size_t kCols, size_t kRows>
-void MultiHeadMatMulVJP(
-    const float* HWY_RESTRICT weights,  // kHeads * kRows * kCols
-    const float* HWY_RESTRICT x,        // num_tokens * kHeads * kCols
+HWY_INLINE void MultiHeadMatMulVJP(
+    const float* HWY_RESTRICT weights,  // heads * kRows * kCols
+    const float* HWY_RESTRICT x,        // num_tokens * heads * kCols
     const float* HWY_RESTRICT v,        // num_tokens * kRows
-    size_t num_tokens,
-    float* HWY_RESTRICT grad_w,  // kHeads * kRows * kCols
-    float* HWY_RESTRICT grad_x,  // num_tokens * kHeads * kCols
+    size_t heads, size_t cols, size_t rows, size_t num_tokens,
+    float* HWY_RESTRICT grad_w,  // heads * kRows * kCols
+    float* HWY_RESTRICT grad_x,  // num_tokens * heads * kCols
     hwy::ThreadPool& pool) {
-  hwy::ZeroBytes(grad_x, num_tokens * kHeads * kCols * sizeof(grad_x[0]));
+  hwy::ZeroBytes(grad_x, num_tokens * heads * cols * sizeof(grad_x[0]));
   for (size_t pos = 0; pos < num_tokens; ++pos) {
-    for (size_t j = 0; j < kRows; ++j) {
-      for (size_t h = 0; h < kHeads; ++h) {
-        MulByConstAndAdd(v[pos * kRows + j],
-                         &x[pos * kHeads * kCols + h * kCols],
-                         &grad_w[h * kRows * kCols + j * kCols], kCols);
-        MulByConstAndAdd(v[pos * kRows + j],
-                         &weights[h * kRows * kCols + j * kCols],
-                         &grad_x[pos * kHeads * kCols + h * kCols], kCols);
+    for (size_t j = 0; j < rows; ++j) {
+      for (size_t h = 0; h < heads; ++h) {
+        MulByConstAndAdd(v[pos * rows + j], &x[pos * heads * cols + h * cols],
+                         &grad_w[h * rows * cols + j * cols], cols);
+        MulByConstAndAdd(v[pos * rows + j],
+                         &weights[h * rows * cols + j * cols],
+                         &grad_x[pos * heads * cols + h * cols], cols);
       }
     }
   }
@@ -168,39 +165,39 @@ static HWY_NOINLINE void InputEmbeddingVJP(
   }
 }
 
-template <typename TConfig, typename LayerT>
-void LayerVJP(const LayerT& weights,
-              const ForwardLayer<float, TConfig>& forward,
+template <typename T>
+void LayerVJP(const LayerWeightsPtrs<T>& weights,
+              const ForwardLayer<T>& forward,
               const float* HWY_RESTRICT next_layer_grad, size_t num_tokens,
-              LayerT& grad, ForwardLayer<float, TConfig>& backward,
+              LayerWeightsPtrs<T>& grad, ForwardLayer<T>& backward,
               const RowVectorBatch<float>& inv_timescale,
               hwy::ThreadPool& pool) {
-  static constexpr size_t kModelDim = TConfig::kModelDim;
-  static constexpr size_t kQKVDim = TConfig::kQKVDim;
-  static constexpr size_t kHeads = TConfig::kHeads;
-  static constexpr size_t kSeqLen = TConfig::kSeqLen;
-  static constexpr size_t kFFHiddenDim = TConfig::kFFHiddenDim;
-  static const float kQueryScale =
-      static_cast<float>(1.0 / sqrt(static_cast<double>(kQKVDim)));
-  HWY_ASSERT(num_tokens <= kSeqLen);
+  const LayerConfig& config = weights.layer_config;
+  const size_t model_dim = config.model_dim;
+  const size_t qkv_dim = config.qkv_dim;
+  const size_t heads = config.heads;
+  const size_t seq_len = forward.input.Rows();
+  const size_t ff_hidden_dim = config.ff_hidden_dim;
+  const float query_scale =
+      static_cast<float>(1.0 / sqrt(static_cast<double>(qkv_dim)));
+  HWY_ASSERT(num_tokens <= seq_len);
 
-  MatMulVJP<kFFHiddenDim, kModelDim>(
-      weights.linear_w.data(), forward.ffw_hidden_gated.data(), next_layer_grad,
-      num_tokens, grad.linear_w.data(), backward.ffw_hidden_gated.data(),
-      pool);
+  MatMulVJP(weights.linear_w.data(), forward.ffw_hidden_gated.data(),
+            next_layer_grad, ff_hidden_dim, model_dim, num_tokens,
+            grad.linear_w.data(), backward.ffw_hidden_gated.data(), pool);
   for (size_t pos = 0; pos < num_tokens; ++pos) {
-    const size_t hidden_offset = pos * kFFHiddenDim * 2;
+    const size_t hidden_offset = pos * ff_hidden_dim * 2;
     const float* HWY_RESTRICT f_out = forward.ffw_hidden.data() + hidden_offset;
-    const float* HWY_RESTRICT f_out_mul = f_out + kFFHiddenDim;
+    const float* HWY_RESTRICT f_out_mul = f_out + ff_hidden_dim;
     const float* HWY_RESTRICT b_out_gated =
-        backward.ffw_hidden_gated.data() + pos * kFFHiddenDim;
+        backward.ffw_hidden_gated.data() + pos * ff_hidden_dim;
     float* HWY_RESTRICT b_out = backward.ffw_hidden.data() + hidden_offset;
-    float* HWY_RESTRICT b_out_mul = b_out + kFFHiddenDim;
+    float* HWY_RESTRICT b_out_mul = b_out + ff_hidden_dim;
     namespace hn = hwy::HWY_NAMESPACE;
     using DF = hn::ScalableTag<float>;
     DF df;
-    for (size_t i = 0; i < kFFHiddenDim; i += Lanes(df)) {
+    for (size_t i = 0; i < ff_hidden_dim; i += Lanes(df)) {
       const auto y = Load(df, f_out + i);
       const auto x = Load(df, f_out_mul + i);
       const auto v = Load(df, b_out_gated + i);
@@ -209,101 +206,94 @@ void LayerVJP(const LayerT& weights,
     }
   }
 
-  MatMulVJP<kModelDim, kFFHiddenDim * 2>(
-      weights.gating_einsum_w.data(),
-      forward.bf_pre_ffw_rms_out.data(), backward.ffw_hidden.data(),
-      num_tokens, grad.gating_einsum_w.data(),
-      backward.bf_pre_ffw_rms_out.data(), pool);
-  RMSNormVJP(weights.pre_ffw_norm_scale.data(),
-             forward.attention_out.data(),
-             backward.bf_pre_ffw_rms_out.data(),
-             kModelDim, num_tokens,
-             grad.pre_ffw_norm_scale.data(),
-             backward.attention_out.data(), pool);
+  MatMulVJP(weights.gating_einsum_w.data(), forward.bf_pre_ffw_rms_out.data(),
+            backward.ffw_hidden.data(), model_dim, ff_hidden_dim * 2,
+            num_tokens, grad.gating_einsum_w.data(),
+            backward.bf_pre_ffw_rms_out.data(), pool);
+  RMSNormVJP(weights.pre_ffw_norm_scale.data(), forward.attention_out.data(),
+             backward.bf_pre_ffw_rms_out.data(), model_dim, num_tokens,
+             grad.pre_ffw_norm_scale.data(), backward.attention_out.data(),
+             pool);
 
   for (size_t pos = 0; pos < num_tokens; ++pos) {
-    AddFrom(next_layer_grad + pos * kModelDim,
-            backward.attention_out.data() + pos * kModelDim, kModelDim);
+    AddFrom(next_layer_grad + pos * model_dim,
+            backward.attention_out.data() + pos * model_dim, model_dim);
   }
 
   backward.qkv.ZeroInit();
-  MultiHeadMatMulVJP<kHeads, kQKVDim, kModelDim>(
-      weights.attn_vec_einsum_w.data(), forward.att_out.data(),
-      backward.attention_out.data(), num_tokens,
-      grad.attn_vec_einsum_w.data(), backward.att_out.data(), pool);
+  MultiHeadMatMulVJP(weights.attn_vec_einsum_w.data(), forward.att_out.data(),
+                     backward.attention_out.data(), heads, qkv_dim, model_dim,
+                     num_tokens, grad.attn_vec_einsum_w.data(),
+                     backward.att_out.data(), pool);
 
-  for (size_t head = 0; head < kHeads; ++head) {
+  for (size_t head = 0; head < heads; ++head) {
     for (size_t pos = 0; pos < num_tokens; ++pos) {
-      const size_t aoffset = head * kSeqLen + pos * kHeads * kSeqLen;
+      const size_t aoffset = head * seq_len + pos * heads * seq_len;
       const float* HWY_RESTRICT f_head_att = forward.att.data() + aoffset;
       const float* HWY_RESTRICT b_att_out =
-          backward.att_out.data() + (pos * kHeads + head) * kQKVDim;
+          backward.att_out.data() + (pos * heads + head) * qkv_dim;
       float* HWY_RESTRICT b_head_att = backward.att.data() + aoffset;
       for (size_t pos2 = 0; pos2 <= pos; ++pos2) {
-        const size_t v2offs = (pos2 * (kHeads + 2) + kHeads + 1) * kQKVDim;
+        const size_t v2offs = (pos2 * (heads + 2) + heads + 1) * qkv_dim;
         const float* HWY_RESTRICT f_v2 = forward.qkv.data() + v2offs;
         float* HWY_RESTRICT b_v2 = backward.qkv.data() + v2offs;
-        b_head_att[pos2] = Dot(b_att_out, f_v2, kQKVDim);
-        MulByConstAndAdd(f_head_att[pos2], b_att_out, b_v2, kQKVDim);
+        b_head_att[pos2] = Dot(b_att_out, f_v2, qkv_dim);
+        MulByConstAndAdd(f_head_att[pos2], b_att_out, b_v2, qkv_dim);
       }
     }
   }
 
-  for (size_t head = 0; head < kHeads; ++head) {
+  for (size_t head = 0; head < heads; ++head) {
     for (size_t pos = 0; pos < num_tokens; ++pos) {
-      const size_t aoffset = head * kSeqLen + pos * kHeads * kSeqLen;
+      const size_t aoffset = head * seq_len + pos * heads * seq_len;
       const float* HWY_RESTRICT f_head_att = forward.att.data() + aoffset;
       float* HWY_RESTRICT b_head_att = backward.att.data() + aoffset;
       SoftmaxVJP(f_head_att, b_head_att, pos + 1);
     }
   }
 
-  for (size_t head = 0; head < kHeads; ++head) {
+  for (size_t head = 0; head < heads; ++head) {
     for (size_t pos = 0; pos < num_tokens; ++pos) {
-      const size_t qoffs = (pos * (kHeads + 2) + head) * kQKVDim;
-      const size_t aoffs = head * kSeqLen + pos * kHeads * kSeqLen;
+      const size_t qoffs = (pos * (heads + 2) + head) * qkv_dim;
+      const size_t aoffs = head * seq_len + pos * heads * seq_len;
       const float* HWY_RESTRICT f_q = forward.qkv.data() + qoffs;
       const float* HWY_RESTRICT b_head_att = backward.att.data() + aoffs;
       float* HWY_RESTRICT b_q = backward.qkv.data() + qoffs;
       for (size_t pos2 = 0; pos2 <= pos; ++pos2) {
-        const size_t k2offs = (pos2 * (kHeads + 2) + kHeads) * kQKVDim;
+        const size_t k2offs = (pos2 * (heads + 2) + heads) * qkv_dim;
         const float* HWY_RESTRICT f_k2 = forward.qkv.data() + k2offs;
         float* HWY_RESTRICT b_k2 = backward.qkv.data() + k2offs;
-        MulByConstAndAdd(b_head_att[pos2], f_k2, b_q, kQKVDim);
-        MulByConstAndAdd(b_head_att[pos2], f_q, b_k2, kQKVDim);
+        MulByConstAndAdd(b_head_att[pos2], f_k2, b_q, qkv_dim);
+        MulByConstAndAdd(b_head_att[pos2], f_q, b_k2, qkv_dim);
       }
     }
   }
 
   for (int pos = 0; pos < static_cast<int>(num_tokens); ++pos) {
     float* HWY_RESTRICT b_kv =
-        backward.qkv.data() + (pos * (kHeads + 2) + kHeads) * kQKVDim;
-    Rope(b_kv, kQKVDim, inv_timescale.Const(), -pos);
+        backward.qkv.data() + (pos * (heads + 2) + heads) * qkv_dim;
+    Rope(b_kv, qkv_dim, inv_timescale.Const(), -pos);
   }
 
-  for (size_t head = 0; head < kHeads; ++head) {
+  for (size_t head = 0; head < heads; ++head) {
     for (size_t pos = 0; pos < num_tokens; ++pos) {
       float* HWY_RESTRICT b_q =
-          backward.qkv.data() + (pos * (kHeads + 2) + head) * kQKVDim;
-      MulByConst(kQueryScale, b_q, kQKVDim);
-      Rope(b_q, kQKVDim, inv_timescale.Const(), -pos);
+          backward.qkv.data() + (pos * (heads + 2) + head) * qkv_dim;
+      MulByConst(query_scale, b_q, qkv_dim);
+      Rope(b_q, qkv_dim, inv_timescale.Const(), -pos);
     }
   }
 
-  MatMulVJP<kModelDim, (kHeads + 2) * kQKVDim>(
-      weights.qkv_einsum_w.data(), forward.pre_att_rms_out.data(),
-      backward.qkv.data(), num_tokens,
-      grad.qkv_einsum_w.data(), backward.pre_att_rms_out.data(), pool);
-  RMSNormVJP(weights.pre_attention_norm_scale.data(),
-             forward.input.data(),
-             backward.pre_att_rms_out.data(),
-             kModelDim, num_tokens,
-             grad.pre_attention_norm_scale.data(),
-             backward.input.data(), pool);
+  MatMulVJP(weights.qkv_einsum_w.data(), forward.pre_att_rms_out.data(),
+            backward.qkv.data(), model_dim, (heads + 2) * qkv_dim, num_tokens,
+            grad.qkv_einsum_w.data(), backward.pre_att_rms_out.data(), pool);
+  RMSNormVJP(weights.pre_attention_norm_scale.data(), forward.input.data(),
+             backward.pre_att_rms_out.data(), model_dim, num_tokens,
+             grad.pre_attention_norm_scale.data(), backward.input.data(), pool);
 
   for (size_t pos = 0; pos < num_tokens; ++pos) {
-    AddFrom(backward.attention_out.data() + pos * kModelDim,
-            backward.input.data() + pos * kModelDim, kModelDim);
+    AddFrom(backward.attention_out.data() + pos * model_dim,
+            backward.input.data() + pos * model_dim, model_dim);
   }
 }
 
@@ -342,20 +332,22 @@ static HWY_NOINLINE void CrossEntropyLossGrad(
   }
 }
 
-template <typename TConfig, typename WeightsT, typename LayerT>
-void CrossEntropyLossBackwardPass(const Prompt& prompt, const WeightsT& weights,
-                                  const ForwardPass<float, TConfig>& forward,
-                                  WeightsT& grad,
-                                  ForwardPass<float, TConfig>& backward,
-                                  RowVectorBatch<float>& inv_timescale,
-                                  hwy::ThreadPool& pool) {
-  static constexpr size_t kVocabSize = TConfig::kVocabSize;
-  static constexpr size_t kModelDim = TConfig::kModelDim;
-  static constexpr size_t kLayers = TConfig::kLayers;
-  const float kEmbScaling = EmbeddingScaling<TConfig>();
-  static_assert(!TConfig::kAbsolutePE);
-  static_assert(TConfig::kPostNorm == PostNormType::None);
-  static_assert(TConfig::kKVHeads == 1);
+template <typename T>
+void CrossEntropyLossBackwardPassInl(const Prompt& prompt,
+                                     const ModelWeightsPtrs<T>& weights,
+                                     const ForwardPass<T>& forward,
+                                     ModelWeightsPtrs<T>& grad,
+                                     ForwardPass<T>& backward,
+                                     RowVectorBatch<float>& inv_timescale,
+                                     hwy::ThreadPool& pool) {
+  const ModelConfig& config = weights.weights_config;
+  const size_t kVocabSize = config.vocab_size;
+  const size_t model_dim = config.model_dim;
+  const size_t kLayers = config.layer_configs.size();
+  const float kEmbScaling = EmbeddingScaling(model_dim);
+  HWY_ASSERT(!config.absolute_pe);
+  HWY_ASSERT(config.layer_configs[0].post_norm == PostNormType::None);
+  HWY_ASSERT(config.layer_configs[0].kv_heads == 1);
 
   HWY_DASSERT(prompt.context_size > 0);
   HWY_DASSERT(prompt.context_size < prompt.tokens.size());
@@ -370,42 +362,38 @@ void CrossEntropyLossBackwardPass(const Prompt& prompt, const WeightsT& weights,
                          kVocabSize);
   }
 
-  if constexpr (TConfig::kFinalCap > 0.0f) {
+  if (config.final_cap > 0.0f) {
     for (size_t pos = 0; pos < num_tokens; ++pos) {
-      SoftcapVJP(TConfig::kFinalCap, forward.logits.data() + pos * kVocabSize,
+      SoftcapVJP(config.final_cap, forward.logits.data() + pos * kVocabSize,
                  backward.logits.data() + pos * kVocabSize, kVocabSize);
     }
   }
 
-  MatMulVJP<kModelDim, kVocabSize>(
-      weights.embedder_input_embedding.data(), forward.final_norm_output.data(),
-      backward.logits.data(), num_tokens,
-      grad.embedder_input_embedding.data(), backward.final_norm_output.data(),
-      pool);
+  MatMulVJP(weights.embedder_input_embedding.data(),
+            forward.final_norm_output.data(), backward.logits.data(), model_dim,
+            kVocabSize, num_tokens, grad.embedder_input_embedding.data(),
+            backward.final_norm_output.data(), pool);
 
-  RMSNormVJP(weights.final_norm_scale.data(),
-             forward.final_layer_output.data(),
-             backward.final_norm_output.data(),
-             kModelDim, num_tokens,
-             grad.final_norm_scale.data(),
-             backward.final_layer_output.data(), pool);
+  RMSNormVJP(weights.final_norm_scale.data(), forward.final_layer_output.data(),
+             backward.final_norm_output.data(), model_dim, num_tokens,
+             grad.final_norm_scale.data(), backward.final_layer_output.data(),
+             pool);
 
   for (int layer = static_cast<int>(kLayers) - 1; layer >= 0; --layer) {
-    auto type = TConfig::kLayerConfig[layer];
+    auto layer_config = config.layer_configs[layer];
     // TODO(szabadka) Implement Griffin layer vjp.
-    HWY_ASSERT(type == LayerAttentionType::kGemma);
+    HWY_ASSERT(layer_config.type == LayerAttentionType::kGemma);
     float* next_layer_grad = layer + 1 < kLayers
                                  ? backward.layers[layer + 1].input.data()
                                  : backward.final_layer_output.data();
-    LayerVJP(*weights.GetLayer(layer), forward.layers[layer],
-             next_layer_grad, num_tokens,
-             *grad.GetLayer(layer), backward.layers[layer],
-             inv_timescale, pool);
+    LayerVJP(*weights.GetLayer(layer), forward.layers[layer], next_layer_grad,
+             num_tokens, *grad.GetLayer(layer), backward.layers[layer],
+             inv_timescale, pool);
   }
 
   InputEmbeddingVJP(weights.embedder_input_embedding.data(), prompt.tokens,
                     kEmbScaling, backward.layers[0].input.data(),
-                    grad.embedder_input_embedding.data(), kModelDim);
+                    grad.embedder_input_embedding.data(), model_dim);
 }
 
 // NOLINTNEXTLINE(google-readability-namespace-comments)
diff --git a/backprop/backward.cc b/backprop/backward.cc
index c186952..868b391 100644
--- a/backprop/backward.cc
+++ b/backprop/backward.cc
@@ -38,44 +38,15 @@ HWY_BEFORE_NAMESPACE();
 namespace gcpp {
 namespace HWY_NAMESPACE {
 
-template <typename TConfig>
-void CrossEntropyLossBackwardPass(const Prompt& prompt,
-                                  const ByteStorageT& weights_u8,
-                                  const ByteStorageT& forward_u8,
-                                  ByteStorageT& grad_u8,
-                                  ByteStorageT& backward_u8,
-                                  RowVectorBatch<float>& inv_timescale,
-                                  hwy::ThreadPool& pool) {
-  using TWeights = CompressedWeights<TConfig>;
-  const auto& weights = *reinterpret_cast<const TWeights*>(weights_u8.get());
-  auto& grad = *reinterpret_cast<TWeights*>(grad_u8.get());
-  using TAct = ForwardPass<float, TConfig>;
-  const auto& forward = *reinterpret_cast<const TAct*>(forward_u8.get());
-  auto& backward = *reinterpret_cast<TAct*>(backward_u8.get());
-  CrossEntropyLossBackwardPass<TConfig, CompressedWeights<TConfig>,
-                               CompressedLayer<TConfig>>(
-      prompt, weights, forward, grad, backward, inv_timescale, pool);
-}
-
-void CrossEntropyLossBackwardPassT(Model model, const Prompt& prompt,
-                                   const ByteStorageT& weights,
-                                   const ByteStorageT& forward,
-                                   ByteStorageT& grad, ByteStorageT& backward,
+void CrossEntropyLossBackwardPassT(const Prompt& prompt,
+                                   const ModelWeightsPtrs<float>& weights,
+                                   const ForwardPass<float>& forward,
+                                   ModelWeightsPtrs<float>& grad,
+                                   ForwardPass<float>& backward,
                                    RowVectorBatch<float>& inv_timescale,
                                    hwy::ThreadPool& pool) {
-  // TODO(janwas): use CallFunctorForModel
-  switch (model) {
-    case Model::GEMMA_2B:
-      CrossEntropyLossBackwardPass<ConfigGemma2B<float>>(
-          prompt, weights, forward, grad, backward, inv_timescale, pool);
-      break;
-    case Model::GEMMA_TINY:
-      CrossEntropyLossBackwardPass<ConfigGemmaTiny<float>>(
-          prompt, weights, forward, grad, backward, inv_timescale, pool);
-      break;
-    default:
-      HWY_ABORT("Model type %d unknown.", static_cast<int>(model));
-  }
+  CrossEntropyLossBackwardPassInl(prompt, weights, forward, grad, backward,
+                                  inv_timescale, pool);
 }
 
 }  // namespace HWY_NAMESPACE
@@ -87,14 +58,15 @@ namespace gcpp {
 
 HWY_EXPORT(CrossEntropyLossBackwardPassT);
 
-void CrossEntropyLossBackwardPass(const Model& model, const Prompt& prompt,
-                                  const ByteStorageT& weights,
-                                  const ByteStorageT& forward,
-                                  ByteStorageT& grad, ByteStorageT& backward,
+void CrossEntropyLossBackwardPass(const Prompt& prompt,
+                                  const ModelWeightsPtrs<float>& weights,
+                                  const ForwardPass<float>& forward,
+                                  ModelWeightsPtrs<float>& grad,
+                                  ForwardPass<float>& backward,
                                   RowVectorBatch<float>& inv_timescale,
                                   hwy::ThreadPool& pool) {
   return HWY_DYNAMIC_DISPATCH(CrossEntropyLossBackwardPassT)(
-      model, prompt, weights, forward, grad, backward, inv_timescale, pool);
+      prompt, weights, forward, grad, backward, inv_timescale, pool);
 }
 
 }  // namespace gcpp
diff --git a/backprop/backward.h b/backprop/backward.h
index 0ac218a..d8e50c7 100644
--- a/backprop/backward.h
+++ b/backprop/backward.h
@@ -16,17 +16,19 @@
 #ifndef THIRD_PARTY_GEMMA_CPP_GEMMA_BACKWARD_H_
 #define THIRD_PARTY_GEMMA_CPP_GEMMA_BACKWARD_H_
 
+#include "backprop/activations.h"
"backprop/prompt.h" -#include "gemma/activations.h" -#include "gemma/common.h" +#include "gemma/weights.h" +#include "util/allocator.h" #include "hwy/contrib/thread_pool/thread_pool.h" namespace gcpp { -void CrossEntropyLossBackwardPass(const Model& model, const Prompt& prompt, - const ByteStorageT& weights, - const ByteStorageT& forward, - ByteStorageT& grad, ByteStorageT& backward, +void CrossEntropyLossBackwardPass(const Prompt& prompt, + const ModelWeightsPtrs& weights, + const ForwardPass& forward, + ModelWeightsPtrs& grad, + ForwardPass& backward, RowVectorBatch& inv_timescale, hwy::ThreadPool& pool); diff --git a/backprop/backward_scalar.h b/backprop/backward_scalar.h index a804cd3..b0a37b3 100644 --- a/backprop/backward_scalar.h +++ b/backprop/backward_scalar.h @@ -125,65 +125,64 @@ void GatedGeluVJP(const T* in, const T* d_out, T* d_in, size_t N, size_t K) { } } - -template +template void MaskedAttentionVJP(const T* qkv, const T* doutput, T* dqkv, - size_t num_tokens, size_t kHeads, size_t kQKVDim, - size_t kSeqLen) { + size_t num_tokens, size_t kHeads, size_t qkv_dim, + size_t seq_len) { for (size_t pos = 0; pos < num_tokens; ++pos) { - const size_t offset = pos * (kHeads + 2) * kQKVDim; - memset(dqkv + offset, 0, (kHeads + 1) * kQKVDim * sizeof(qkv[0])); + const size_t offset = pos * (kHeads + 2) * qkv_dim; + memset(dqkv + offset, 0, (kHeads + 1) * qkv_dim * sizeof(qkv[0])); } for (size_t head = 0; head < kHeads; ++head) { for (size_t pos = 0; pos < num_tokens; ++pos) { - const size_t qoffs = (pos * (kHeads + 2) + head) * kQKVDim; - const size_t aoffs = head * kSeqLen + pos * kHeads * kSeqLen; + const size_t qoffs = (pos * (kHeads + 2) + head) * qkv_dim; + const size_t aoffs = head * seq_len + pos * kHeads * seq_len; const T* q = qkv + qoffs; const T* dout = doutput + aoffs; T* dq = dqkv + qoffs; for (size_t pos2 = 0; pos2 <= pos; ++pos2) { - const size_t koffs = (pos2 * (kHeads + 2) + kHeads) * kQKVDim; + const size_t koffs = (pos2 * (kHeads + 2) + kHeads) * qkv_dim; const T* k = qkv + koffs; T* dk = dqkv + koffs; - MulByConstAndAddT(dout[pos2], k, dq, kQKVDim); - MulByConstAndAddT(dout[pos2], q, dk, kQKVDim); + MulByConstAndAddT(dout[pos2], k, dq, qkv_dim); + MulByConstAndAddT(dout[pos2], q, dk, qkv_dim); } } } } -template -void MaskedSoftmaxVJPT(const T* y, T* dy, size_t num_tokens, - size_t kHeads, size_t kSeqLen) { +template +void MaskedSoftmaxVJPT(const T* y, T* dy, size_t num_tokens, size_t kHeads, + size_t seq_len) { for (size_t head = 0; head < kHeads; ++head) { for (size_t pos = 0; pos < num_tokens; ++pos) { - size_t offset = pos * kHeads * kSeqLen + head * kSeqLen; + size_t offset = pos * kHeads * seq_len + head * seq_len; SoftmaxVJPT(y + offset, dy + offset, pos + 1); - memset(dy + offset + pos + 1, 0, (kSeqLen - pos - 1) * sizeof(T)); + memset(dy + offset + pos + 1, 0, (seq_len - pos - 1) * sizeof(T)); } } } -template +template void MixByAttentionVJP(const T* qkv, const T* attention, const T* doutput, - T* dqkv, T* dattention, size_t num_tokens, - size_t kHeads, size_t kQKVDim, size_t kSeqLen) { + T* dqkv, T* dattention, size_t num_tokens, size_t kHeads, + size_t qkv_dim, size_t seq_len) { auto v_offset = [&](size_t pos) { - return (pos * (kHeads + 2) + kHeads + 1) * kQKVDim; + return (pos * (kHeads + 2) + kHeads + 1) * qkv_dim; }; for (size_t pos = 0; pos < num_tokens; ++pos) { - memset(&dqkv[v_offset(pos)], 0, kQKVDim * sizeof(qkv[0])); + memset(&dqkv[v_offset(pos)], 0, qkv_dim * sizeof(qkv[0])); } for (size_t head = 0; head < kHeads; ++head) { for (size_t pos = 
     for (size_t pos = 0; pos < num_tokens; ++pos) {
-      const size_t offset = head * kQKVDim + pos * kHeads * kQKVDim;
-      const size_t aoffset = head * kSeqLen + pos * kHeads * kSeqLen;
+      const size_t offset = head * qkv_dim + pos * kHeads * qkv_dim;
+      const size_t aoffset = head * seq_len + pos * kHeads * seq_len;
       const T* att = &attention[aoffset];
       const T* dout = &doutput[offset];
       T* datt = &dattention[aoffset];
       for (size_t pos2 = 0; pos2 <= pos; ++pos2) {
-        datt[pos2] = DotT(dout, &qkv[v_offset(pos2)], kQKVDim);
-        MulByConstAndAddT(att[pos2], dout, &dqkv[v_offset(pos2)], kQKVDim);
+        datt[pos2] = DotT(dout, &qkv[v_offset(pos2)], qkv_dim);
+        MulByConstAndAddT(att[pos2], dout, &dqkv[v_offset(pos2)], qkv_dim);
       }
     }
   }
@@ -199,77 +198,76 @@ void InputEmbeddingVJPT(const T* w, const std::vector<int>& tokens, T scaling,
   }
 }
 
-template <typename T, typename TConfig>
-void LayerVJP(const CompressedLayer<TConfig>& weights,
-              const ForwardLayer<T, TConfig>& forward, const T* dy,
-              CompressedLayer<TConfig>& grad,
-              ForwardLayer<T, TConfig>& backward, size_t num_tokens) {
-  static constexpr size_t kModelDim = TConfig::kModelDim;
-  static constexpr size_t kSeqLen = TConfig::kSeqLen;
-  static constexpr size_t kQKVDim = TConfig::kQKVDim;
-  static constexpr size_t kHeads = TConfig::kHeads;
-  static constexpr size_t kFFHiddenDim = TConfig::kFFHiddenDim;
-  static const T kQueryScale = 1.0 / std::sqrt(T(kQKVDim));
+template <typename T>
+void LayerVJP(const LayerWeightsPtrs<T>& weights,
+              const ForwardLayer<T>& forward, const T* dy,
+              LayerWeightsPtrs<T>& grad, ForwardLayer<T>& backward,
+              size_t num_tokens) {
+  const LayerConfig& layer_config = weights.layer_config;
+  const size_t model_dim = layer_config.model_dim;
+  const size_t seq_len = forward.input.Rows();
+  const size_t qkv_dim = layer_config.qkv_dim;
+  const size_t kHeads = layer_config.heads;
+  const size_t kFFHiddenDim = layer_config.ff_hidden_dim;
+  const T kQueryScale = 1.0 / std::sqrt(T(qkv_dim));
 
-  MatMulVJPT(weights.linear_w.data(), forward.ffw_hidden_gated.data(),
-             dy, grad.linear_w.data(), backward.ffw_hidden_gated.data(),
-             kModelDim, kFFHiddenDim, num_tokens);
+  MatMulVJPT(weights.linear_w.data(), forward.ffw_hidden_gated.data(), dy,
+             grad.linear_w.data(), backward.ffw_hidden_gated.data(), model_dim,
+             kFFHiddenDim, num_tokens);
   GatedGeluVJP(forward.ffw_hidden.data(), backward.ffw_hidden_gated.data(),
                backward.ffw_hidden.data(), kFFHiddenDim, num_tokens);
 
   MatMulVJPT(weights.gating_einsum_w.data(), forward.bf_pre_ffw_rms_out.data(),
              backward.ffw_hidden.data(), grad.gating_einsum_w.data(),
-             backward.bf_pre_ffw_rms_out.data(), kFFHiddenDim * 2, kModelDim,
+             backward.bf_pre_ffw_rms_out.data(), kFFHiddenDim * 2, model_dim,
              num_tokens);
 
   RMSNormVJPT(weights.pre_ffw_norm_scale.data(), forward.attention_out.data(),
               backward.bf_pre_ffw_rms_out.data(),
               grad.pre_ffw_norm_scale.data(), backward.attention_out.data(),
-              kModelDim, num_tokens);
+              model_dim, num_tokens);
 
-  AddFromT(dy, backward.attention_out.data(), num_tokens * kModelDim);
+  AddFromT(dy, backward.attention_out.data(), num_tokens * model_dim);
 
   MultiHeadMatMulVJPT(weights.attn_vec_einsum_w.data(), forward.att_out.data(),
                       backward.attention_out.data(),
-                      grad.attn_vec_einsum_w.data(),
-                      backward.att_out.data(),
-                      kHeads, kModelDim, kQKVDim, num_tokens);
+                      grad.attn_vec_einsum_w.data(), backward.att_out.data(),
+                      kHeads, model_dim, qkv_dim, num_tokens);
 
   MixByAttentionVJP(forward.qkv.data(), forward.att.data(),
                     backward.att_out.data(), backward.qkv.data(),
-                    backward.att.data(), num_tokens, kHeads, kQKVDim,
-                    kSeqLen);
+                    backward.att.data(), num_tokens, kHeads, qkv_dim, seq_len);
 
-  MaskedSoftmaxVJPT(forward.att.data(), backward.att.data(),
-                    num_tokens, kHeads, kSeqLen);
+  MaskedSoftmaxVJPT(forward.att.data(), backward.att.data(), num_tokens, kHeads,
+                    seq_len);
 
   MaskedAttentionVJP(forward.qkv.data(), backward.att.data(),
-                     backward.qkv.data(), num_tokens, kHeads, kQKVDim, kSeqLen);
+                     backward.qkv.data(), num_tokens, kHeads, qkv_dim, seq_len);
 
   for (size_t pos = 0; pos < num_tokens; ++pos) {
-    T* qkv = backward.qkv.data() + pos * (kHeads + 2) * kQKVDim;
-    MulByConstT(kQueryScale, qkv, kHeads * kQKVDim);
+    T* qkv = backward.qkv.data() + pos * (kHeads + 2) * qkv_dim;
+    MulByConstT(kQueryScale, qkv, kHeads * qkv_dim);
   }
 
   for (int pos = 0; pos < num_tokens; ++pos) {
-    T* qkv = backward.qkv.data() + pos * (kHeads + 2) * kQKVDim;
+    T* qkv = backward.qkv.data() + pos * (kHeads + 2) * qkv_dim;
     for (size_t h = 0; h <= kHeads; ++h) {
-      Rope(qkv + h * kQKVDim, kQKVDim, -pos);
+      Rope(qkv + h * qkv_dim, qkv_dim, -pos);
     }
   }
 
   MatMulVJPT(weights.qkv_einsum_w.data(), forward.pre_att_rms_out.data(),
              backward.qkv.data(), grad.qkv_einsum_w.data(),
-             backward.pre_att_rms_out.data(),
-             (kHeads + 2) * kQKVDim, kModelDim, num_tokens);
+             backward.pre_att_rms_out.data(), (kHeads + 2) * qkv_dim, model_dim,
+             num_tokens);
   RMSNormVJPT(weights.pre_attention_norm_scale.data(), forward.input.data(),
               backward.pre_att_rms_out.data(),
-              grad.pre_attention_norm_scale.data(),
-              backward.input.data(), kModelDim, num_tokens);
+              grad.pre_attention_norm_scale.data(), backward.input.data(),
+              model_dim, num_tokens);
 
   AddFromT(backward.attention_out.data(), backward.input.data(),
-           num_tokens * kModelDim);
+           num_tokens * model_dim);
 }
 
 template <typename T>
@@ -296,56 +294,54 @@ void CrossEntropyLossGrad(const T* x, T* dx, const Prompt& prompt, size_t V) {
   }
 }
 
-template <typename T, typename TConfig>
+template <typename T>
 void CrossEntropyLossBackwardPass(const Prompt& prompt,
-                                  const CompressedWeights<TConfig>& weights,
-                                  const ForwardPass<T, TConfig>& forward,
-                                  CompressedWeights<TConfig>& grad,
-                                  ForwardPass<T, TConfig>& backward) {
-  static constexpr size_t kModelDim = TConfig::kModelDim;
-  static constexpr size_t kVocabSize = TConfig::kVocabSize;
-  static constexpr size_t kLayers = TConfig::kLayers;
+                                  const ModelWeightsPtrs<T>& weights,
+                                  const ForwardPass<T>& forward,
+                                  ModelWeightsPtrs<T>& grad,
+                                  ForwardPass<T>& backward) {
+  const ModelConfig& config = weights.weights_config;
+  const size_t model_dim = config.model_dim;
+  const size_t vocab_size = config.vocab_size;
+  const size_t layers = config.layer_configs.size();
   const std::vector<int> tokens = prompt.tokens;
   const size_t num_tokens = tokens.empty() ? 0 : tokens.size() - 1;
 
   CrossEntropyLossGrad(forward.probs.data(), backward.logits.data(), prompt,
-                       kVocabSize);
+                       vocab_size);
 
-  SoftmaxVJPT(forward.probs.data(), backward.logits.data(),
-              kVocabSize, num_tokens);
+  SoftmaxVJPT(forward.probs.data(), backward.logits.data(), vocab_size,
+              num_tokens);
 
-  if constexpr (TConfig::kFinalCap > 0.0f) {
+  if (config.final_cap > 0.0f) {
     for (size_t i = 0; i < num_tokens; ++i) {
-      SoftcapVJPT(TConfig::kFinalCap, forward.logits.data() + i * kVocabSize,
-                  backward.logits.data() + i * kVocabSize, kVocabSize);
+      SoftcapVJPT(config.final_cap, forward.logits.data() + i * vocab_size,
+                  backward.logits.data() + i * vocab_size, vocab_size);
     }
   }
 
-  MatMulVJPT(weights.embedder_input_embedding.data(),
-             forward.final_norm_output.data(),
-             backward.logits.data(),
-             grad.embedder_input_embedding.data(),
-             backward.final_norm_output.data(),
-             kVocabSize, kModelDim, num_tokens);
+  MatMulVJPT(
+      weights.embedder_input_embedding.data(), forward.final_norm_output.data(),
+      backward.logits.data(), grad.embedder_input_embedding.data(),
+      backward.final_norm_output.data(), vocab_size, model_dim, num_tokens);
 
   RMSNormVJPT(weights.final_norm_scale.data(),
               forward.final_layer_output.data(),
-              backward.final_norm_output.data(),
-              grad.final_norm_scale.data(),
-              backward.final_layer_output.data(), kModelDim, num_tokens);
+              backward.final_norm_output.data(), grad.final_norm_scale.data(),
+              backward.final_layer_output.data(), model_dim, num_tokens);
 
-  for (int layer = static_cast<int>(kLayers) - 1; layer >= 0; --layer) {
-    T* next_layer_grad = layer + 1 < kLayers
-                             ? backward.layers[layer + 1].input.data()
-                             : backward.final_layer_output.data();
+  for (int layer = static_cast<int>(layers) - 1; layer >= 0; --layer) {
+    T* next_layer_grad = layer + 1 < layers
+                             ? backward.layers[layer + 1].input.data()
+                             : backward.final_layer_output.data();
     LayerVJP(*weights.GetLayer(layer), forward.layers[layer], next_layer_grad,
              *grad.GetLayer(layer), backward.layers[layer], num_tokens);
   }
 
-  const T kEmbScaling = EmbeddingScaling(kModelDim);
-  InputEmbeddingVJPT(weights.embedder_input_embedding.data(),
-                     tokens, kEmbScaling, backward.layers[0].input.data(),
-                     grad.embedder_input_embedding.data(), kModelDim);
+  const T kEmbScaling = EmbeddingScaling(model_dim);
+  InputEmbeddingVJPT(weights.embedder_input_embedding.data(), tokens,
+                     kEmbScaling, backward.layers[0].input.data(),
+                     grad.embedder_input_embedding.data(), model_dim);
 }
 
 }  // namespace gcpp
diff --git a/backprop/backward_scalar_test.cc b/backprop/backward_scalar_test.cc
index 262a121..b5e39db 100644
--- a/backprop/backward_scalar_test.cc
+++ b/backprop/backward_scalar_test.cc
@@ -19,7 +19,6 @@
 #include <stddef.h>
 #include <string.h>  // memcpy
 
-#include <array>
 #include <complex>
 #include <random>
 #include <vector>
@@ -384,44 +383,49 @@ TEST(BackPropTest, InputEmbeddingVJP) {
   }
 }
 
-template <typename T>
-struct TestConfig : ConfigBaseGemmaV2 {
-  using Weight = T;
-  static constexpr int kSeqLen = 18;
-  static constexpr int kVocabSize = 12;
-  static constexpr int kModelDim = 32;
-  static constexpr int kHeads = 3;
-  static constexpr int kQKVDim = 12;
-  static constexpr int kFFHiddenDim = 48;
-  static constexpr std::array<LayerAttentionType, 2> kLayerConfig =
-      FixedLayerConfig<2>(LayerAttentionType::kGemma);
-  static constexpr int kLayers = kLayerConfig.size();
-  static constexpr int kNumTensorScales = 4 * kLayers;
-  static constexpr bool kAbsolutePE = false;
-  static constexpr PostNormType kPostNorm = PostNormType::None;
-
-  static constexpr int kKVHeads = 1;
-  static constexpr int kGemmaLayers = kLayers;
-};
+static ModelConfig TestConfig() {
+  ModelConfig config;
+  config.scale_names = {"att_ein", "qkv_ein", "gr_lin_x_w", "gr_lin_y_w",
+                        "gr_lin_out_w", "gr_gate_w", "gating_ein", "linear_w"};
+  config.model_dim = 32;
+  config.vocab_size = 12;
+  config.seq_len = 18;
+  LayerConfig layer_config = {
+      .model_dim = config.model_dim,
+      .ff_hidden_dim = 48,
+      .heads = 3,
+      .kv_heads = 1,
+      .qkv_dim = 12,
+  };
+  config.layer_configs = std::vector<LayerConfig>(2, layer_config);
+  config.num_tensor_scales = 4 * config.layer_configs.size();
+  config.query_scale = QueryScaleType::SqrtKeySize;
+  config.attention_window_sizes = FixedAttentionWindowSizes<2>(32);
+  // This is required for optimize_test to pass.
+  config.final_cap = 30.0f;
+  return config;
+}
 
 TEST(BackPropTest, LayerVJP) {
   std::mt19937 gen(42);
   using T = double;
   using TC = std::complex<T>;
-  const size_t kOutputSize = TestConfig<T>::kSeqLen * TestConfig<T>::kModelDim;
-  CompressedLayer<TestConfig<T>> weights;
-  CompressedLayer<TestConfig<T>> grad;
-  ForwardLayer<T, TestConfig<T>> forward;
-  ForwardLayer<T, TestConfig<T>> backward = {};
-  CompressedLayer<TestConfig<TC>> c_weights;
-  ForwardLayer<TC, TestConfig<TC>> c_forward;
-  std::array<T, kOutputSize> y;
+  ModelConfig config = TestConfig();
+  const size_t kOutputSize = config.seq_len * config.model_dim;
+  LayerWeightsPtrs<T> weights(config.layer_configs[0]);
+  LayerWeightsPtrs<T> grad(config.layer_configs[0]);
+  ForwardLayer<T> forward(config.layer_configs[0], config.seq_len);
+  ForwardLayer<T> backward(config.layer_configs[0], config.seq_len);
+  LayerWeightsPtrs<TC> c_weights(config.layer_configs[0]);
+  ForwardLayer<TC> c_forward(config.layer_configs[0], config.seq_len);
+  MatStorageT<T> y("y", kOutputSize, 1);
   MatStorageT<T> dy("dy", kOutputSize, 1);
-  std::array<TC, kOutputSize> c_y;
+  MatStorageT<TC> c_y("c_y", kOutputSize, 1);
   const size_t num_tokens = 3;
-  weights.Allocate();
-  grad.Allocate();
-  c_weights.Allocate();
+  std::vector<MatStorage> layer_storage;
+  weights.Allocate(layer_storage);
+  grad.Allocate(layer_storage);
+  c_weights.Allocate(layer_storage);
   backward.input.ZeroInit();
 
   for (size_t iter = 0; iter < 10; ++iter) {
@@ -432,7 +436,7 @@ TEST(BackPropTest, LayerVJP) {
     Complexify(forward.input, c_forward.input);
     auto func = [&]() {
       ApplyLayer(c_weights, c_forward, num_tokens, c_y.data());
-      return DotT(dy.data(), c_y.data(), num_tokens * TestConfig<T>::kModelDim);
+      return DotT(dy.data(), c_y.data(), num_tokens * config.model_dim);
     };
     grad.ZeroInit(/*layer_idx=*/0);
     ApplyLayer(weights, forward, num_tokens, y.data());
@@ -447,12 +451,13 @@ TEST(BackPropTest, EndToEnd) {
   std::mt19937 gen(42);
   using T = double;
   using TC = std::complex<T>;
-  WeightsWrapper<T, TestConfig<T>> weights;
-  WeightsWrapper<T, TestConfig<T>> grad;
-  ForwardPass<T, TestConfig<T>> forward;
-  ForwardPass<T, TestConfig<T>> backward;
-  WeightsWrapper<TC, TestConfig<TC>> c_weights;
-  ForwardPass<TC, TestConfig<TC>> c_forward;
+  ModelConfig config = TestConfig();
+  WeightsWrapper<T> weights(config);
+  WeightsWrapper<T> grad(config);
+  ForwardPass<T> forward(config);
+  ForwardPass<T> backward(config);
+  WeightsWrapper<TC> c_weights(config);
+  ForwardPass<TC> c_forward(config);
 
   ReverseSequenceSampler training_task({0, 0, 1, 1});
   std::vector<Prompt> batch = training_task.SampleBatch(3, gen);
@@ -474,9 +479,9 @@ TEST(BackPropTest, EndToEnd) {
   }
 }
 
-template <typename T, typename TConfig>
-void MulByConstAndAddT(T c, const CompressedLayer<TConfig>& x,
-                       CompressedLayer<TConfig>& out) {
+template <typename T>
+void MulByConstAndAddT(T c, const LayerWeightsPtrs<T>& x,
+                       LayerWeightsPtrs<T>& out) {
   MulByConstAndAddT(c, x.pre_attention_norm_scale,
                     out.pre_attention_norm_scale);
   MulByConstAndAddT(c, x.attn_vec_einsum_w, out.attn_vec_einsum_w);
@@ -486,23 +491,23 @@ void MulByConstAndAddT(T c, const CompressedLayer<TConfig>& x,
   MulByConstAndAddT(c, x.linear_w, out.linear_w);
 }
 
-template <typename T, typename TConfig>
-void MulByConstAndAddT(T c, const CompressedWeights<TConfig>& x,
-                       CompressedWeights<TConfig>& out) {
-  static constexpr size_t kLayers = TConfig::kLayers;
+template <typename T>
+void MulByConstAndAddT(T c, const ModelWeightsPtrs<T>& x,
+                       ModelWeightsPtrs<T>& out) {
+  const size_t layers = x.c_layers.size();
   MulByConstAndAddT(c, x.embedder_input_embedding,
                     out.embedder_input_embedding);
   MulByConstAndAddT(c, x.final_norm_scale, out.final_norm_scale);
-  for (size_t i = 0; i < kLayers; ++i) {
+  for (size_t i = 0; i < layers; ++i) {
     MulByConstAndAddT(c, *x.GetLayer(i), *out.GetLayer(i));
   }
 }
 
 // Evaluates forward pass on a batch.
-template <typename T, typename TConfig>
+template <typename T>
 T CrossEntropyLossForwardPass(const std::vector<Prompt>& batch,
-                              const WeightsWrapper<T, TConfig>& weights,
-                              ForwardPass<T, TConfig>& forward) {
+                              const WeightsWrapper<T>& weights,
+                              ForwardPass<T>& forward) {
   T loss = 0.0;
   for (const Prompt& prompt : batch) {
     loss += CrossEntropyLossForwardPass(prompt, weights.get(), forward);
@@ -514,12 +519,11 @@ T CrossEntropyLossForwardPass(const std::vector<Prompt>& batch,
 // Evaluates forward pass on a batch by applying gradient with the given
 // learning rate. Does not update weights, but uses the given tmp weights
 // instead.
-template <typename T, typename TConfig>
+template <typename T>
 T CrossEntropyLossForwardPass(T learning_rate, const std::vector<Prompt>& batch,
-                              const WeightsWrapper<T, TConfig>& weights,
-                              const WeightsWrapper<T, TConfig>& grad,
-                              WeightsWrapper<T, TConfig>& tmp,
-                              ForwardPass<T, TConfig>& forward) {
+                              const WeightsWrapper<T>& weights,
+                              const WeightsWrapper<T>& grad,
+                              WeightsWrapper<T>& tmp, ForwardPass<T>& forward) {
   tmp.CopyFrom(weights);
   const T scale = -learning_rate / batch.size();
   MulByConstAndAddT(scale, grad.get(), tmp.get());
@@ -529,11 +533,9 @@ T CrossEntropyLossForwardPass(T learning_rate, const std::vector<Prompt>& batch,
 // Uses line search in the negative gradient direction to update weights. We do
 // this so that we can test that each step during the gradient descent can
 // decrease the objective function value.
-template <typename T, typename TConfig>
-T FindOptimalUpdate(const WeightsWrapper<T, TConfig>& grad,
-                    WeightsWrapper<T, TConfig>& weights,
-                    WeightsWrapper<T, TConfig>& tmp,
-                    ForwardPass<T, TConfig>& forward,
+template <typename T>
+T FindOptimalUpdate(const WeightsWrapper<T>& grad, WeightsWrapper<T>& weights,
+                    WeightsWrapper<T>& tmp, ForwardPass<T>& forward,
                     const std::vector<Prompt>& batch, T loss,
                     T initial_learning_rate) {
   T lr0 = initial_learning_rate;
@@ -568,13 +570,14 @@ TEST(BackProptest, Convergence) {
   std::mt19937 gen(42);
   using T = float;
   using TC = std::complex<double>;
-  WeightsWrapper<T, TestConfig<T>> weights;
-  WeightsWrapper<T, TestConfig<T>> grad;
-  WeightsWrapper<T, TestConfig<T>> tmp;
-  ForwardPass<T, TestConfig<T>> forward;
-  ForwardPass<T, TestConfig<T>> backward;
-  WeightsWrapper<TC, TestConfig<TC>> c_weights;
-  ForwardPass<TC, TestConfig<TC>> c_forward;
+  ModelConfig config = TestConfig();
+  WeightsWrapper<T> weights(config);
+  WeightsWrapper<T> grad(config);
+  WeightsWrapper<T> tmp(config);
+  ForwardPass<T> forward(config);
+  ForwardPass<T> backward(config);
+  WeightsWrapper<TC> c_weights(config);
+  ForwardPass<TC> c_forward(config);
   constexpr size_t kBatchSize = 5;
   ReverseSequenceSampler training_task({0, 0, 0, 1, 1});
   T learning_rate = 0.01;
diff --git a/backprop/backward_test.cc b/backprop/backward_test.cc
index 01c5e73..2b82c12 100644
--- a/backprop/backward_test.cc
+++ b/backprop/backward_test.cc
@@ -19,7 +19,6 @@
 #include <stddef.h>
 
-#include <array>
 #include <complex>
 #include <cmath>  // std::abs
 #include <random>
@@ -34,7 +33,6 @@
 #include "backprop/test_util.h"
 #include "gemma/activations.h"
 #include "gemma/configs.h"
-#include "gemma/weights.h"
 #include "hwy/base.h"
 #include "hwy/contrib/thread_pool/thread_pool.h"
 
@@ -50,6 +48,7 @@
 #include "backprop/forward-inl.h"
 #include "compression/compress.h"
 #include "ops/ops-inl.h"
+#include "util/allocator.h"
 
 HWY_BEFORE_NAMESPACE();
 namespace gcpp {
@@ -85,8 +84,8 @@ void TestMatMulVJP() {
     };
 
     grad.ZeroInit();
-    MatMulVJP<kCols, kRows>(weights.data(), x.data(), dy.data(), kTokens,
-                            grad.data(), dx.data(), pool);
+    MatMulVJP(weights.data(), x.data(), dy.data(), kCols, kRows, kTokens,
+              grad.data(), dx.data(), pool);
     TestGradient(dx, c_x, func, 5e-5f, 5e-5f, __LINE__);
     TestGradient(grad, c_weights, func, 5e-5f, 5e-5f, __LINE__);
@@ -130,9 +129,8 @@ void TestMultiHeadMatMulVJP() {
     };
 
     grad.ZeroInit();
-    MultiHeadMatMulVJP<kHeads, kCols, kRows>(
-        weights.data(), x.data(), dy.data(), kTokens, grad.data(), dx.data(),
-        pool);
+    MultiHeadMatMulVJP(weights.data(), x.data(), dy.data(), kHeads, kCols,
+                       kRows, kTokens, grad.data(), dx.data(), pool);
     TestGradient(dx, c_x, func, 5e-5f, 5e-5f, __LINE__);
     TestGradient(grad, c_weights, func, 5e-5f, 5e-5f, __LINE__);
@@ -186,63 +184,63 @@ void TestRMSNormVJP() {
   }
 }
 
-template <typename T>
-struct TestConfig : ConfigBaseGemmaV2 {
-  using Weight = T;
-  static constexpr int kSeqLen = 24;
-  static constexpr int kVocabSize = 16;
-  static constexpr int kModelDim = 32;
-  static constexpr int kHeads = 3;
-  static constexpr int kQKVDim = 16;
-  static constexpr int kFFHiddenDim = 64;
-  static constexpr std::array<LayerAttentionType, 2> kLayerConfig =
-      FixedLayerConfig<2>(LayerAttentionType::kGemma);
-  static constexpr int kLayers = kLayerConfig.size();
-  static constexpr int kNumTensorScales = 4 * kLayers;
-  static constexpr bool kAbsolutePE = false;
-  static constexpr PostNormType kPostNorm = PostNormType::None;
-
-  static constexpr int kKVHeads = 1;
-  static constexpr int kGemmaLayers = kLayers;
-};
+static ModelConfig TestConfig() {
+  ModelConfig config;
+  config.scale_names = {"att_ein", "qkv_ein", "gr_lin_x_w", "gr_lin_y_w",
+                        "gr_lin_out_w", "gr_gate_w", "gating_ein", "linear_w"};
+  config.model_dim = 32;
+  config.vocab_size = 16;
+  config.seq_len = 24;
+  LayerConfig layer_config = {
+      .model_dim = config.model_dim,
+      .ff_hidden_dim = 64,
+      .heads = 3,
+      .kv_heads = 1,
+      .qkv_dim = 16,
+  };
+  config.layer_configs = std::vector<LayerConfig>(2, layer_config);
+  config.num_tensor_scales = 4 * config.layer_configs.size();
+  config.query_scale = QueryScaleType::SqrtKeySize;
+  config.attention_window_sizes = FixedAttentionWindowSizes<2>(32);
+  // This is required for optimize_test to pass.
+  config.att_cap = 50.0f;
+  config.final_cap = 30.0f;
+  return config;
+}
 
 void TestEndToEnd() {
   std::mt19937 gen(42);
   hwy::ThreadPool pool(0);
-  using WeightsF = CompressedWeights<TestConfig<float>>;
-  using LayerF = CompressedLayer<TestConfig<float>>;
-  WeightsWrapper<float, TestConfig<float>> weights;
-  WeightsWrapper<float, TestConfig<float>> grad;
-  ActivationsWrapper<float, TestConfig<float>> forward0;
-  ActivationsWrapper<float, TestConfig<float>> forward1;
-  ActivationsWrapper<float, TestConfig<float>> backward;
+  ModelConfig config = TestConfig();
+  WeightsWrapper<float> weights(config);
+  WeightsWrapper<float> grad(config);
+  ForwardPass<float> forward0(config);
+  ForwardPass<float> forward1(config);
+  ForwardPass<float> backward(config);
   using TC = std::complex<double>;
-  WeightsWrapper<TC, TestConfig<TC>> c_weights;
-  ForwardPass<TC, TestConfig<TC>> c_forward;
+  WeightsWrapper<TC> c_weights(config);
+  ForwardPass<TC> c_forward(config);
 
   ReverseSequenceSampler training_task({0, 0, 1, 1});
   std::vector<Prompt> batch = training_task.SampleBatch(3, gen);
 
-  RowVectorBatch<float> inv_timescale =
-      Activations::CreateInvTimescale<TestConfig<float>>();
+  RowVectorBatch<float> inv_timescale = Activations::CreateInvTimescale(
+      config.layer_configs[0].qkv_dim, config.layer_configs[0].post_qk);
   for (const Prompt& prompt : batch) {
     ReverseSequenceSampler::LogPrompt(prompt);
     RandInit(weights.get(), 1.0f, gen);
-    float loss0 = CrossEntropyLossForwardPass(
-        prompt, weights.get(), forward0.get());
+    float loss0 = CrossEntropyLossForwardPass(prompt, weights.get(), forward0);
 
-    float loss1 =
-        CrossEntropyLossForwardPass<TestConfig<float>, WeightsF, LayerF>(
-            prompt.tokens, prompt.context_size, weights.get(), forward1.get(),
-            inv_timescale, pool);
+    float loss1 = CrossEntropyLossForwardPass(
+        prompt.tokens, prompt.context_size, weights.get(), forward1,
+        inv_timescale, pool);
 
     EXPECT_NEAR(loss1, loss0, std::abs(loss0) * 2e-5);
 
     grad.ZeroInit();
-    CrossEntropyLossBackwardPass<TestConfig<float>, WeightsF, LayerF>(
-        prompt, weights.get(), forward1.get(), grad.get(), backward.get(),
-        inv_timescale, pool);
+    CrossEntropyLossBackwardPassInl(prompt, weights.get(), forward1, grad.get(),
+                                    backward, inv_timescale, pool);
 
     Complexify(weights.get(), c_weights.get());
     auto func = [&]() {
diff --git a/backprop/forward-inl.h b/backprop/forward-inl.h
index b6b1dc0..ca969c4 100644
--- a/backprop/forward-inl.h
+++ b/backprop/forward-inl.h
@@ -26,6 +26,7 @@
 #include "backprop/activations.h"
 #include "gemma/common.h"
 #include "gemma/configs.h"
+#include "gemma/weights.h"
 #include "util/allocator.h"
 #include "hwy/base.h"
 #include "hwy/contrib/thread_pool/thread_pool.h"
@@ -93,29 +94,29 @@ static HWY_NOINLINE float CrossEntropyLoss(const float* HWY_RESTRICT probs,
   return loss * scaling;
 }
 
-template <typename TConfig, typename LayerT>
-void ApplyForwardLayer(const LayerT& weights,
-                       ForwardLayer<float, TConfig>& activations,
-                       size_t num_tokens, float* HWY_RESTRICT output,
+template <typename T>
+void ApplyForwardLayer(const LayerWeightsPtrs<T>& weights,
+                       ForwardLayer<T>& activations, size_t num_tokens,
+                       float* HWY_RESTRICT output,
                        const RowVectorBatch<float>& inv_timescale,
                        hwy::ThreadPool& pool) {
-  static constexpr size_t kModelDim = TConfig::kModelDim;
-  static constexpr size_t kSeqLen = TConfig::kSeqLen;
-  static constexpr size_t kQKVDim = TConfig::kQKVDim;
-  static constexpr size_t kHeads = TConfig::kHeads;
-  static const float kQueryScale =
+  const LayerConfig& config = weights.layer_config;
+  const size_t model_dim = config.model_dim;
+  const size_t kSeqLen = activations.input.Rows();
+  const size_t kQKVDim = config.qkv_dim;
+  const size_t kHeads = config.heads;
+  static const float query_scale =
       static_cast<float>(1.0 / sqrt(static_cast<double>(kQKVDim)));
   HWY_ASSERT(num_tokens <= kSeqLen);
 
   ApplyRMSNorm(weights.pre_attention_norm_scale.data(),
-               activations.input.data(), kModelDim, num_tokens,
+               activations.input.data(), model_dim, num_tokens,
                activations.pre_att_rms_out.data(), pool);
 
   for (size_t pos = 0; pos < num_tokens; ++pos) {
-    MatVec<(kHeads + 2) * kQKVDim, kModelDim>(
-        weights.qkv_einsum_w, 0,
-        activations.pre_att_rms_out.data() + pos * kModelDim,
-        activations.qkv.data() + pos * (kHeads + 2) * kQKVDim, pool);
+    MatVec(weights.qkv_einsum_w, 0, (kHeads + 2) * kQKVDim, model_dim,
+           activations.pre_att_rms_out.data() + pos * model_dim,
+           activations.qkv.data() + pos * (kHeads + 2) * kQKVDim, pool);
   }
 
   const size_t num_tasks = kHeads * num_tokens;
@@ -130,7 +131,7 @@ void ApplyForwardLayer(const LayerT& weights,
       float* HWY_RESTRICT q =
          activations.qkv.data() + (pos * (kHeads + 2) + head) * kQKVDim;
       Rope(q, kQKVDim, inv_timescale.Const(), pos);
-      MulByConst(kQueryScale, q, kQKVDim);
+      MulByConst(query_scale, q, kQKVDim);
     });
 
   pool.Run(0, num_tasks, [&](const uint64_t task, size_t thread) HWY_ATTR {
@@ -174,29 +175,29 @@ void ApplyForwardLayer(const LayerT& weights,
   activations.attention_out.ZeroInit();
   for (size_t pos = 0; pos < num_tokens; ++pos) {
     for (size_t head = 0; head < kHeads; ++head) {
-      MatVec<kModelDim, kQKVDim>(
-          weights.attn_vec_einsum_w, head * kModelDim * kQKVDim,
+      MatVec(
+          weights.attn_vec_einsum_w, head * model_dim * kQKVDim, model_dim,
+          kQKVDim,
          activations.att_out.data() + pos * kHeads * kQKVDim + head * kQKVDim,
-          activations.att_post1.data() + pos * kModelDim, pool);
-      AddFrom(activations.att_post1.data() + pos * kModelDim,
-              activations.attention_out.data() + pos * kModelDim, kModelDim);
+          activations.att_post1.data() + pos * model_dim, pool);
+      AddFrom(activations.att_post1.data() + pos * model_dim,
+              activations.attention_out.data() + pos * model_dim, model_dim);
     }
   }
 
   for (size_t pos = 0; pos < num_tokens; ++pos) {
-    AddFrom(activations.input.data() + pos * kModelDim,
-            activations.attention_out.data() + pos * kModelDim, kModelDim);
+    AddFrom(activations.input.data() + pos * model_dim,
+            activations.attention_out.data() + pos * model_dim, model_dim);
   }
 
   ApplyRMSNorm(weights.pre_ffw_norm_scale.data(),
-               activations.attention_out.data(), kModelDim, num_tokens,
+               activations.attention_out.data(), model_dim, num_tokens,
                activations.bf_pre_ffw_rms_out.data(), pool);
 
-  static constexpr size_t kFFHiddenDim = TConfig::kFFHiddenDim;
+  const size_t kFFHiddenDim = config.ff_hidden_dim;
   for (size_t pos = 0; pos < num_tokens; ++pos) {
-    MatVec<kFFHiddenDim * 2, kModelDim>(
-        weights.gating_einsum_w, 0,
-        activations.bf_pre_ffw_rms_out.data() + pos * kModelDim,
-        activations.ffw_hidden.data() + pos * kFFHiddenDim * 2, pool);
+    MatVec(weights.gating_einsum_w, 0, kFFHiddenDim * 2, model_dim,
+           activations.bf_pre_ffw_rms_out.data() + pos * model_dim,
+           activations.ffw_hidden.data() + pos * kFFHiddenDim * 2, pool);
   }
 
   for (size_t pos = 0; pos < num_tokens; ++pos) {
     const size_t hidden_offset = pos * kFFHiddenDim * 2;
@@ -215,77 +216,76 @@ void ApplyForwardLayer(const LayerT& weights,
     }
   }
 
   for (size_t pos = 0; pos < num_tokens; ++pos) {
-    MatVec<kModelDim, kFFHiddenDim>(
-        weights.linear_w, 0,
-        activations.ffw_hidden_gated.data() + pos * kFFHiddenDim,
-        output + pos * kModelDim, pool);
+    MatVec(weights.linear_w, 0, model_dim, kFFHiddenDim,
+           activations.ffw_hidden_gated.data() + pos * kFFHiddenDim,
+           output + pos * model_dim, pool);
   }
 
   for (size_t pos = 0; pos < num_tokens; ++pos) {
-    AddFrom(activations.attention_out.data() + pos * kModelDim,
-            output + pos * kModelDim, kModelDim);
+    AddFrom(activations.attention_out.data() + pos * model_dim,
+            output + pos * model_dim, model_dim);
   }
 }
 
-template <typename TConfig, typename WeightsT, typename LayerT>
+template <typename T>
 float CrossEntropyLossForwardPass(const std::vector<int>& prompt,
-                                  size_t context_size, const WeightsT& weights,
-                                  ForwardPass<float, TConfig>& forward,
+                                  size_t context_size,
+                                  const ModelWeightsPtrs<T>& weights,
+                                  ForwardPass<T>& forward,
                                   const RowVectorBatch<float>& inv_timescale,
                                   hwy::ThreadPool& pool) {
-  static constexpr size_t kVocabSize = TConfig::kVocabSize;
-  static constexpr size_t kModelDim = TConfig::kModelDim;
-  static constexpr size_t kLayers = TConfig::kLayers;
-  const float kEmbScaling = EmbeddingScaling<TConfig>();
-  static_assert(!TConfig::kAbsolutePE);
-  static_assert(TConfig::kPostNorm == PostNormType::None);
-  static_assert(TConfig::kKVHeads == 1);
+  const ModelConfig& config = weights.weights_config;
+  const size_t vocab_size = config.vocab_size;
+  const size_t model_dim = config.model_dim;
+  const size_t layers = config.layer_configs.size();
+  const float emb_scaling = EmbeddingScaling(model_dim);
+  HWY_ASSERT(!config.absolute_pe);
+  HWY_ASSERT(config.layer_configs[0].post_norm == PostNormType::None);
+  HWY_ASSERT(config.layer_configs[0].kv_heads == 1);
 
   HWY_DASSERT(context_size > 0);
   HWY_DASSERT(context_size < prompt.size());
   const size_t num_tokens = prompt.size() - 1;
 
-  InputEmbedding(weights.embedder_input_embedding, prompt, kEmbScaling,
-                 forward.layers[0].input.data(), kModelDim, kVocabSize);
+  InputEmbedding(weights.embedder_input_embedding, prompt, emb_scaling,
+                 forward.layers[0].input.data(), model_dim, vocab_size);
 
-  for (size_t layer = 0; layer < kLayers; ++layer) {
-    auto type = TConfig::kLayerConfig[layer];
+  for (size_t layer = 0; layer < config.layer_configs.size(); ++layer) {
+    auto type = config.layer_configs[layer].type;
     // TODO(szabadka) Implement Griffin layer.
     HWY_ASSERT(type == LayerAttentionType::kGemma);
-    float* HWY_RESTRICT output = layer + 1 < kLayers ?
- forward.layers[layer + 1].input.data() : - forward.final_layer_output.data(); - ApplyForwardLayer(*weights.GetLayer(layer), - forward.layers[layer], num_tokens, - output, inv_timescale, pool); + float* HWY_RESTRICT output = layer + 1 < layers + ? forward.layers[layer + 1].input.data() + : forward.final_layer_output.data(); + ApplyForwardLayer(*weights.GetLayer(layer), forward.layers[layer], + num_tokens, output, inv_timescale, pool); } ApplyRMSNorm(weights.final_norm_scale.data(), - forward.final_layer_output.data(), - kModelDim, num_tokens, forward.final_norm_output.data(), pool); + forward.final_layer_output.data(), model_dim, num_tokens, + forward.final_norm_output.data(), pool); for (size_t pos = 0; pos < num_tokens; ++pos) { - MatVec( - weights.embedder_input_embedding, 0, - forward.final_norm_output.data() + pos * kModelDim, - forward.logits.data() + pos * kVocabSize, pool); + MatVec(weights.embedder_input_embedding, 0, vocab_size, model_dim, + forward.final_norm_output.data() + pos * model_dim, + forward.logits.data() + pos * vocab_size, pool); } - if constexpr (TConfig::kFinalCap > 0.0f) { + if (config.final_cap > 0.0f) { for (size_t pos = 0; pos < num_tokens; ++pos) { - LogitsSoftCap(TConfig::kFinalCap, - forward.logits.data() + pos * kVocabSize, kVocabSize); + LogitsSoftCap(config.final_cap, forward.logits.data() + pos * vocab_size, + vocab_size); } } hwy::CopyBytes(forward.logits.data(), forward.probs.data(), - num_tokens * kVocabSize * sizeof(forward.logits.At(0))); + num_tokens * vocab_size * sizeof(forward.logits.At(0))); for (size_t pos = 0; pos < num_tokens; ++pos) { - Softmax(forward.probs.data() + pos * kVocabSize, kVocabSize); + Softmax(forward.probs.data() + pos * vocab_size, vocab_size); } return CrossEntropyLoss(forward.probs.data(), prompt, context_size, - kVocabSize, pool); + vocab_size, pool); } // NOLINTNEXTLINE(google-readability-namespace-comments) diff --git a/backprop/forward.cc b/backprop/forward.cc index 5b2cf1a..0c6cc5c 100644 --- a/backprop/forward.cc +++ b/backprop/forward.cc @@ -17,8 +17,9 @@ #include "backprop/activations.h" #include "backprop/prompt.h" -#include "gemma/activations.h" #include "gemma/common.h" +#include "gemma/configs.h" +#include "util/allocator.h" #include "hwy/contrib/thread_pool/thread_pool.h" // Compiles this file for multiple architectures via "foreach_target.h", to @@ -36,38 +37,13 @@ HWY_BEFORE_NAMESPACE(); namespace gcpp { namespace HWY_NAMESPACE { -template -float CrossEntropyLossForwardPass(const Prompt& prompt, - const ByteStorageT& weights_u8, - ByteStorageT& forward_u8, - RowVectorBatch& inv_timescale, - hwy::ThreadPool& pool) { - const auto& weights = - *reinterpret_cast*>(weights_u8.get()); - auto& forward = - *reinterpret_cast*>(forward_u8.get()); - return CrossEntropyLossForwardPass, - CompressedLayer>( - prompt.tokens, prompt.context_size, weights, forward, inv_timescale, - pool); -} - -float CrossEntropyLossForwardPassT(Model model, const Prompt& prompt, - const ByteStorageT& weights, - ByteStorageT& forward, +float CrossEntropyLossForwardPassT(const Prompt& prompt, + const ModelWeightsPtrs& weights, + ForwardPass& forward, RowVectorBatch& inv_timescale, hwy::ThreadPool& pool) { - // TODO(janwas): use CallFunctorForModel - switch (model) { - case Model::GEMMA_2B: - return CrossEntropyLossForwardPass>( - prompt, weights, forward, inv_timescale, pool); - case Model::GEMMA_TINY: - return CrossEntropyLossForwardPass>( - prompt, weights, forward, inv_timescale, pool); - default: - HWY_ABORT("Model type %d unknown.", 
static_cast(model)); - } + return CrossEntropyLossForwardPass(prompt.tokens, prompt.context_size, + weights, forward, inv_timescale, pool); } } // namespace HWY_NAMESPACE @@ -79,13 +55,13 @@ namespace gcpp { HWY_EXPORT(CrossEntropyLossForwardPassT); -float CrossEntropyLossForwardPass(const Model& model, const Prompt& prompt, - const ByteStorageT& weights, - ByteStorageT& forward, +float CrossEntropyLossForwardPass(const Prompt& prompt, + const ModelWeightsPtrs& weights, + ForwardPass& forward, RowVectorBatch& inv_timescale, hwy::ThreadPool& pool) { return HWY_DYNAMIC_DISPATCH(CrossEntropyLossForwardPassT)( - model, prompt, weights, forward, inv_timescale, pool); + prompt, weights, forward, inv_timescale, pool); } } // namespace gcpp diff --git a/backprop/forward.h b/backprop/forward.h index 92ca371..3b42298 100644 --- a/backprop/forward.h +++ b/backprop/forward.h @@ -16,16 +16,17 @@ #ifndef THIRD_PARTY_GEMMA_CPP_GEMMA_FORWARD_H_ #define THIRD_PARTY_GEMMA_CPP_GEMMA_FORWARD_H_ +#include "backprop/activations.h" #include "backprop/prompt.h" -#include "gemma/activations.h" -#include "gemma/common.h" +#include "gemma/weights.h" +#include "util/allocator.h" #include "hwy/contrib/thread_pool/thread_pool.h" namespace gcpp { -float CrossEntropyLossForwardPass(const Model& model, const Prompt& prompt, - const ByteStorageT& weights, - ByteStorageT& forward, +float CrossEntropyLossForwardPass(const Prompt& prompt, + const ModelWeightsPtrs& weights, + ForwardPass& forward, RowVectorBatch& inv_timescale, hwy::ThreadPool& pool); diff --git a/backprop/forward_scalar.h b/backprop/forward_scalar.h index 064112b..617d0c3 100644 --- a/backprop/forward_scalar.h +++ b/backprop/forward_scalar.h @@ -127,108 +127,107 @@ void InputEmbedding(const T* w, const std::vector& tokens, T scaling, } } -template -void MaskedAttention(const T* qkv, T* output, size_t num_tokens, - size_t kHeads, size_t kQKVDim, size_t kSeqLen) { +template +void MaskedAttention(const T* qkv, T* output, size_t num_tokens, size_t heads, + size_t qkv_dim, size_t seq_len) { for (size_t pos = 0; pos < num_tokens; ++pos) { - for (size_t head = 0; head < kHeads; ++head) { - const size_t qoffset = pos * (kHeads + 2) * kQKVDim; - const size_t aoffset = pos * kHeads * kSeqLen + head * kSeqLen; - const T* q = qkv + qoffset + head * kQKVDim; + for (size_t head = 0; head < heads; ++head) { + const size_t qoffset = pos * (heads + 2) * qkv_dim; + const size_t aoffset = pos * heads * seq_len + head * seq_len; + const T* q = qkv + qoffset + head * qkv_dim; for (size_t pos2 = 0; pos2 <= pos; ++pos2) { - const T* k = qkv + (pos2 * (kHeads + 2) + kHeads) * kQKVDim; - output[aoffset + pos2] = DotT(q, k, kQKVDim); + const T* k = qkv + (pos2 * (heads + 2) + heads) * qkv_dim; + output[aoffset + pos2] = DotT(q, k, qkv_dim); } } } } -template -void MaskedSoftmax(T* x, size_t num_tokens, size_t kHeads, size_t kSeqLen) { +template +void MaskedSoftmax(T* x, size_t num_tokens, size_t heads, size_t seq_len) { for (size_t pos = 0; pos < num_tokens; ++pos) { - for (size_t head = 0; head < kHeads; ++head) { - size_t offset = pos * kHeads * kSeqLen + head * kSeqLen; + for (size_t head = 0; head < heads; ++head) { + size_t offset = pos * heads * seq_len + head * seq_len; Softmax(x + offset, pos + 1); - memset(x + offset + pos + 1, 0, (kSeqLen - pos - 1) * sizeof(T)); + memset(x + offset + pos + 1, 0, (seq_len - pos - 1) * sizeof(T)); } } } -template +template void MixByAttention(const T* qkv, const T* attention, T* output, - size_t num_tokens, size_t kHeads, size_t kQKVDim, - 
size_t kSeqLen) { + size_t num_tokens, size_t heads, size_t qkv_dim, + size_t seq_len) { for (size_t pos = 0; pos < num_tokens; ++pos) { - for (size_t head = 0; head < kHeads; ++head) { - const T* att = &attention[pos * kHeads * kSeqLen + head * kSeqLen]; - T* out = &output[head * kQKVDim + pos * kHeads * kQKVDim]; - memset(out, 0, kQKVDim * sizeof(out[0])); + for (size_t head = 0; head < heads; ++head) { + const T* att = &attention[pos * heads * seq_len + head * seq_len]; + T* out = &output[head * qkv_dim + pos * heads * qkv_dim]; + memset(out, 0, qkv_dim * sizeof(out[0])); for (size_t pos2 = 0; pos2 <= pos; ++pos2) { - size_t v_offset = (pos2 * (kHeads + 2) + kHeads + 1) * kQKVDim; + size_t v_offset = (pos2 * (heads + 2) + heads + 1) * qkv_dim; const T* v = &qkv[v_offset]; - MulByConstAndAddT(att[pos2], v, out, kQKVDim); + MulByConstAndAddT(att[pos2], v, out, qkv_dim); } } } } -template -void ApplyLayer(const CompressedLayer& weights, - ForwardLayer& activations, size_t num_tokens, - T* output) { - static constexpr size_t kModelDim = TConfig::kModelDim; - static constexpr size_t kSeqLen = TConfig::kSeqLen; - static constexpr size_t kQKVDim = TConfig::kQKVDim; - static constexpr size_t kHeads = TConfig::kHeads; - static constexpr size_t kFFHiddenDim = TConfig::kFFHiddenDim; - static const T kQueryScale = T(1.0) / std::sqrt(T(kQKVDim)); +template +void ApplyLayer(const LayerWeightsPtrs& weights, + ForwardLayer& activations, size_t num_tokens, T* output) { + const LayerConfig& layer_config = weights.layer_config; + const size_t model_dim = layer_config.model_dim; + const size_t seq_len = activations.input.Rows(); + const size_t qkv_dim = layer_config.qkv_dim; + const size_t heads = layer_config.heads; + const size_t ff_hidden_dim = layer_config.ff_hidden_dim; + static const T query_scale = T(1.0) / std::sqrt(T(qkv_dim)); RMSNormT(weights.pre_attention_norm_scale.data(), activations.input.data(), - activations.pre_att_rms_out.data(), kModelDim, num_tokens); + activations.pre_att_rms_out.data(), model_dim, num_tokens); MatMulT(weights.qkv_einsum_w.data(), activations.pre_att_rms_out.data(), - activations.qkv.data(), (kHeads + 2) * kQKVDim, kModelDim, - num_tokens); + activations.qkv.data(), (heads + 2) * qkv_dim, model_dim, num_tokens); for (size_t pos = 0; pos < num_tokens; ++pos) { - T* qkv = activations.qkv.data() + pos * (kHeads + 2) * kQKVDim; - for (size_t h = 0; h <= kHeads; ++h) { - Rope(qkv + h * kQKVDim, kQKVDim, pos); + T* qkv = activations.qkv.data() + pos * (heads + 2) * qkv_dim; + for (size_t h = 0; h <= heads; ++h) { + Rope(qkv + h * qkv_dim, qkv_dim, pos); } } for (size_t pos = 0; pos < num_tokens; ++pos) { - T* qkv = activations.qkv.data() + pos * (kHeads + 2) * kQKVDim; - MulByConstT(kQueryScale, qkv, kHeads * kQKVDim); + T* qkv = activations.qkv.data() + pos * (heads + 2) * qkv_dim; + MulByConstT(query_scale, qkv, heads * qkv_dim); } - MaskedAttention(activations.qkv.data(), activations.att.data(), - num_tokens, kHeads, kQKVDim, kSeqLen); + MaskedAttention(activations.qkv.data(), activations.att.data(), num_tokens, + heads, qkv_dim, seq_len); - MaskedSoftmax(activations.att.data(), num_tokens, kHeads, kSeqLen); + MaskedSoftmax(activations.att.data(), num_tokens, heads, seq_len); MixByAttention(activations.qkv.data(), activations.att.data(), - activations.att_out.data(), num_tokens, kHeads, kQKVDim, - kSeqLen); + activations.att_out.data(), num_tokens, heads, qkv_dim, + seq_len); MultiHeadMatMul(weights.attn_vec_einsum_w.data(), activations.att_out.data(), - 
activations.attention_out.data(), kHeads, kModelDim, kQKVDim, + activations.attention_out.data(), heads, model_dim, qkv_dim, num_tokens); AddFromT(activations.input.data(), activations.attention_out.data(), - num_tokens * kModelDim); + num_tokens * model_dim); RMSNormT(weights.pre_ffw_norm_scale.data(), activations.attention_out.data(), - activations.bf_pre_ffw_rms_out.data(), kModelDim, num_tokens); + activations.bf_pre_ffw_rms_out.data(), model_dim, num_tokens); MatMulT(weights.gating_einsum_w.data(), activations.bf_pre_ffw_rms_out.data(), - activations.ffw_hidden.data(), kFFHiddenDim * 2, kModelDim, + activations.ffw_hidden.data(), ff_hidden_dim * 2, model_dim, num_tokens); GatedGelu(activations.ffw_hidden.data(), activations.ffw_hidden_gated.data(), - kFFHiddenDim, num_tokens); + ff_hidden_dim, num_tokens); - MatMulT(weights.linear_w.data(), activations.ffw_hidden_gated.data(), - output, kModelDim, kFFHiddenDim, num_tokens); + MatMulT(weights.linear_w.data(), activations.ffw_hidden_gated.data(), output, + model_dim, ff_hidden_dim, num_tokens); - AddFromT(activations.attention_out.data(), output, num_tokens * kModelDim); + AddFromT(activations.attention_out.data(), output, num_tokens * model_dim); } template @@ -247,48 +246,47 @@ T CrossEntropyLoss(const T* x, const Prompt& prompt, size_t V) { return loss * scaling; } -template +template T CrossEntropyLossForwardPass(const Prompt& prompt, - const CompressedWeights& weights, - ForwardPass& forward) { - static constexpr size_t kModelDim = TConfig::kModelDim; - static constexpr size_t kVocabSize = TConfig::kVocabSize; - static constexpr size_t kLayers = TConfig::kLayers; + const ModelWeightsPtrs& weights, + ForwardPass& forward) { + const ModelConfig& config = weights.weights_config; + const size_t model_dim = config.model_dim; + const size_t vocab_size = config.vocab_size; + const size_t layers = config.layer_configs.size(); const std::vector tokens = prompt.tokens; const size_t num_tokens = tokens.empty() ? 0 : tokens.size() - 1; - const T kEmbScaling = EmbeddingScaling(kModelDim); - InputEmbedding(weights.embedder_input_embedding.data(), tokens, - kEmbScaling, forward.layers[0].input.data(), kModelDim); + const T kEmbScaling = EmbeddingScaling(model_dim); + InputEmbedding(weights.embedder_input_embedding.data(), tokens, kEmbScaling, + forward.layers[0].input.data(), model_dim); - for (size_t layer = 0; layer < kLayers; ++layer) { - T* output = layer + 1 < kLayers ? - forward.layers[layer + 1].input.data() : - forward.final_layer_output.data(); + for (size_t layer = 0; layer < layers; ++layer) { + T* output = layer + 1 < layers ? 
forward.layers[layer + 1].input.data() + : forward.final_layer_output.data(); ApplyLayer(*weights.GetLayer(layer), forward.layers[layer], num_tokens, output); } - RMSNormT(weights.final_norm_scale.data(), - forward.final_layer_output.data(), - forward.final_norm_output.data(), kModelDim, num_tokens); + RMSNormT(weights.final_norm_scale.data(), forward.final_layer_output.data(), + forward.final_norm_output.data(), model_dim, num_tokens); MatMulT(weights.embedder_input_embedding.data(), - forward.final_norm_output.data(), - forward.logits.data(), kVocabSize, kModelDim, num_tokens); + forward.final_norm_output.data(), forward.logits.data(), vocab_size, + model_dim, num_tokens); for (size_t pos = 0; pos < num_tokens; ++pos) { - if constexpr (TConfig::kFinalCap > 0.0f) { - Softcap(TConfig::kFinalCap, forward.logits.data() + pos * kVocabSize, - kVocabSize); + if (config.final_cap > 0.0f) { + Softcap(config.final_cap, forward.logits.data() + pos * vocab_size, + vocab_size); } } memcpy(forward.probs.data(), forward.logits.data(), - num_tokens * kVocabSize * sizeof(forward.logits.At(0))); - Softmax(forward.probs.data(), kVocabSize, num_tokens); + num_tokens * vocab_size * sizeof(forward.logits.At(0))); + Softmax(forward.probs.data(), vocab_size, num_tokens); - return CrossEntropyLoss(forward.probs.data(), prompt, kVocabSize); + return CrossEntropyLoss(forward.probs.data(), prompt, vocab_size); } } // namespace gcpp diff --git a/backprop/optimize_test.cc b/backprop/optimize_test.cc index 26698c6..b47a48d 100644 --- a/backprop/optimize_test.cc +++ b/backprop/optimize_test.cc @@ -16,6 +16,7 @@ #include #include +#include #include #include @@ -26,8 +27,10 @@ #include "backprop/optimizer.h" #include "backprop/prompt.h" #include "backprop/sampler.h" +#include "compression/shared.h" #include "gemma/activations.h" #include "gemma/common.h" +#include "gemma/configs.h" #include "gemma/gemma.h" #include "gemma/weights.h" #include "util/threading.h" @@ -45,20 +48,18 @@ TEST(OptimizeTest, GradientDescent) { .training = ModelTraining::GEMMA_IT, .weight = Type::kF32, }; - ByteStorageT grad = CallForModelAndWeight( - info.model, info.weight, pool); - ByteStorageT grad_m = CallForModelAndWeight( - info.model, info.weight, pool); - ByteStorageT grad_v = CallForModelAndWeight( - info.model, info.weight, pool); - ByteStorageT forward = - CallForModelAndWeight(info.model, info.weight); - ByteStorageT backward = - CallForModelAndWeight(info.model, info.weight); - KVCache kv_cache = KVCache::Create(info.model, /*prefill_tbatch_size=*/16); + ModelConfig config = ConfigFromModel(info.model); + ModelWeightsStorage grad, grad_m, grad_v; + grad.Allocate(info.model, info.weight, pool); + grad_m.Allocate(info.model, info.weight, pool); + grad_v.Allocate(info.model, info.weight, pool); + grad_m.ZeroInit(); + grad_v.ZeroInit(); + ForwardPass forward(config), backward(config); + KVCache kv_cache = KVCache::Create(config, /*prefill_tbatch_size=*/16); - RowVectorBatch inv_timescale = - Activations::CreateInvTimescale>(); + RowVectorBatch inv_timescale = Activations::CreateInvTimescale( + config.layer_configs[0].qkv_dim, config.layer_configs[0].post_qk); Gemma gemma(GemmaTokenizer(), info, pools); @@ -92,14 +93,11 @@ TEST(OptimizeTest, GradientDescent) { reply.begin() + context.size()); }; - RandInitWeights(info.model, info.weight, gemma.Weights(), pool, gen); - CallForModelAndWeight(info.model, info.weight, - grad_m, pool); - CallForModelAndWeight(info.model, info.weight, - grad_v, pool); + gemma.MutableWeights().RandInit(gen); + 
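
Note on the backprop changes up to this point: they all apply one pattern, namely replacing `TConfig::k...` compile-time constants with loads from a runtime `LayerConfig`/`ModelConfig`, so a single compiled body serves every model and only the weight types need instantiating. A minimal standalone sketch of the pattern; `ApplyLayerSketch` is a hypothetical stand-in, not a function in this repo:

#include <cstddef>
#include <vector>

struct LayerConfig {  // mirrors the fields the diff reads at runtime
  size_t model_dim;
  size_t heads;
  size_t qkv_dim;
};

// Was: template <typename TConfig> reading TConfig::kModelDim etc., which
// forced one instantiation per model configuration. Now the same body reads
// plain members, so only the per-weight-type instantiation files remain.
void ApplyLayerSketch(const LayerConfig& config, std::vector<float>& qkv) {
  const size_t model_dim = config.model_dim;  // was TConfig::kModelDim
  const size_t heads = config.heads;          // was TConfig::kHeads
  const size_t qkv_dim = config.qkv_dim;      // was TConfig::kQKVDim
  (void)model_dim;
  qkv.resize((heads + 2) * qkv_dim);  // one Q row per head, plus K and V
}
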
gemma.MutableWeights().AllocAndCopyWithTranspose(pool); printf("Initial weights:\n"); - LogWeightStats(info.model, info.weight, gemma.Weights()); + gemma.MutableWeights().LogWeightStats(); constexpr size_t kBatchSize = 8; const float alpha = 0.001f; @@ -113,29 +111,29 @@ TEST(OptimizeTest, GradientDescent) { size_t num_ok; for (; steps < 1000000; ++steps) { std::mt19937 sgen(42); - CallForModelAndWeight(info.model, info.weight, - grad, pool); + grad.ZeroInit(); float total_loss = 0.0f; num_ok = 0; for (size_t i = 0; i < kBatchSize; ++i) { Prompt prompt = training_task.Sample(sgen); total_loss += CrossEntropyLossForwardPass( - info.model, prompt, gemma.Weights(), forward, inv_timescale, pool); - CrossEntropyLossBackwardPass(info.model, prompt, gemma.Weights(), forward, - grad, backward, inv_timescale, pool); - CallForModelAndWeight( - info.model, info.weight, gemma.MutableWeights(), pool); + prompt, *gemma.Weights().GetWeightsOfType(), forward, + inv_timescale, pool); + CrossEntropyLossBackwardPass( + prompt, *gemma.Weights().GetWeightsOfType(), forward, + *grad.GetWeightsOfType(), backward, inv_timescale, pool); + gemma.MutableWeights().CopyWithTranspose(pool); num_ok += verify(prompt) ? 1 : 0; } total_loss /= kBatchSize; - AdamUpdate(info.model, info.weight, grad, alpha, beta1, beta2, epsilon, - steps + 1, gemma.Weights(), grad_m, grad_v, pool); + AdamUpdate(info.weight, grad, alpha, beta1, beta2, epsilon, steps + 1, + gemma.Weights(), grad_m, grad_v, pool); printf("step: %zu total_loss: %.15f num_ok: %zu/%zu\n", steps, total_loss, num_ok, kBatchSize); if (steps % 100 == 0) { printf("Batch gradient:\n"); - LogWeightStats(info.model, info.weight, grad); + grad.LogWeightStats(); } if (total_loss < 0.5f) { break; @@ -143,7 +141,7 @@ TEST(OptimizeTest, GradientDescent) { } printf("Num steps: %zu\n", steps); printf("Final weights:\n"); - LogWeightStats(info.model, info.weight, gemma.Weights()); + gemma.MutableWeights().LogWeightStats(); EXPECT_LT(steps, 300); EXPECT_EQ(num_ok, kBatchSize); } diff --git a/backprop/optimizer.cc b/backprop/optimizer.cc index 800f2fa..9187bf7 100644 --- a/backprop/optimizer.cc +++ b/backprop/optimizer.cc @@ -16,7 +16,6 @@ #include "backprop/optimizer.h" #include -#include #include "compression/compress.h" #include "gemma/common.h" @@ -30,37 +29,6 @@ namespace gcpp { namespace { -class WeightInitializer { - public: - WeightInitializer(std::mt19937& gen) : dist_(0.0f, 1.0f), gen_(gen) {} - - void operator()(const char* name, hwy::Span tensors) { - float* data = tensors[0]->data(); - for (size_t i = 0; i < tensors[0]->NumElements(); ++i) { - data[i] = dist_(gen_); - } - tensors[0]->set_scale(1.0f); - } - - private: - std::normal_distribution dist_; - std::mt19937& gen_; -}; - -template -struct RandInitWeightsT { - void operator()(const ByteStorageT& weights_u8, hwy::ThreadPool& pool, - std::mt19937& gen) const { - auto& weights = - *reinterpret_cast*>(weights_u8.get()); - // TODO(szabadka) Use the same weight initialization method as in the python - // version. 
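
The `WeightInitializer` functor deleted below filled each f32 tensor with N(0,1) samples and reset its scale; judging by the test changes above, `ModelWeightsStorage::RandInit(gen)` now plays that role. A minimal sketch of that initialization, with `Tensor` as a hypothetical stand-in for the repo's tensor type:

#include <random>
#include <vector>

struct Tensor {  // hypothetical stand-in, not the repo's MatStorageT
  std::vector<float> data;
  float scale = 1.0f;
};

void RandInitTensor(Tensor& t, std::mt19937& gen) {
  std::normal_distribution<float> dist(0.0f, 1.0f);
  for (float& v : t.data) v = dist(gen);
  t.scale = 1.0f;  // compressed tensors carry a scale; N(0,1) init resets it
}
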
- WeightInitializer init(gen); - CompressedWeights::ForEachTensor({&weights}, - ForEachType::kLoadNoToc, init); - } -}; - class AdamUpdater { public: explicit AdamUpdater(float alpha, float beta1, float beta2, float epsilon, @@ -97,42 +65,31 @@ class AdamUpdater { float epsilon_; }; -template -struct AdamUpdateT { - void operator()(const ByteStorageT& grad_u8, float alpha, float beta1, - float beta2, float epsilon, size_t t, - const ByteStorageT& weights_u8, const ByteStorageT& grad_m_u8, - const ByteStorageT& grad_v_u8, hwy::ThreadPool& pool) const { - using TWeights = CompressedWeights; - auto& grad = *reinterpret_cast(grad_u8.get()); - auto& weights = *reinterpret_cast(weights_u8.get()); - auto& grad_m = *reinterpret_cast(grad_m_u8.get()); - auto& grad_v = *reinterpret_cast(grad_v_u8.get()); - AdamUpdater updater(alpha, beta1, beta2, epsilon, t); - TWeights::ForEachTensor( - {&grad, &weights, &grad_m, &grad_v}, ForEachType::kLoadNoToc, - [&updater](const char* name, hwy::Span tensors) { - updater(name, *tensors[0], *tensors[1], *tensors[2], *tensors[3]); - }); - } -}; +void AdamUpdate(ModelWeightsPtrs* grad, float alpha, float beta1, + float beta2, float epsilon, size_t t, + ModelWeightsPtrs* weights, + ModelWeightsPtrs* grad_m, + ModelWeightsPtrs* grad_v, hwy::ThreadPool& pool) { + AdamUpdater updater(alpha, beta1, beta2, epsilon, t); + ModelWeightsPtrs::ForEachTensor( + {grad, weights, grad_m, grad_v}, ForEachType::kLoadNoToc, + [&updater](const char* name, hwy::Span tensors) { + updater(name, *tensors[0], *tensors[1], *tensors[2], *tensors[3]); + }); +} } // namespace -void RandInitWeights(Model model_type, Type weight_type, - const ByteStorageT& weights, hwy::ThreadPool& pool, - std::mt19937& gen) { +void AdamUpdate(Type weight_type, const ModelWeightsStorage& grad, float alpha, + float beta1, float beta2, float epsilon, size_t t, + const ModelWeightsStorage& weights, + const ModelWeightsStorage& grad_m, + const ModelWeightsStorage& grad_v, hwy::ThreadPool& pool) { HWY_ASSERT(weight_type == Type::kF32); - CallForModel(model_type, weights, pool, gen); -} - -void AdamUpdate(Model model_type, Type weight_type, const ByteStorageT& grad, - float alpha, float beta1, float beta2, float epsilon, size_t t, - const ByteStorageT& weights, const ByteStorageT& grad_m, - const ByteStorageT& grad_v, hwy::ThreadPool& pool) { - HWY_ASSERT(weight_type == Type::kF32); - CallForModel(model_type, grad, alpha, beta1, beta2, - epsilon, t, weights, grad_m, grad_v, pool); + AdamUpdate(grad.GetWeightsOfType(), alpha, beta1, beta2, epsilon, t, + weights.GetWeightsOfType(), + grad_m.GetWeightsOfType(), grad_v.GetWeightsOfType(), + pool); } } // namespace gcpp diff --git a/backprop/optimizer.h b/backprop/optimizer.h index b42f311..8b25c52 100644 --- a/backprop/optimizer.h +++ b/backprop/optimizer.h @@ -16,22 +16,17 @@ #ifndef THIRD_PARTY_GEMMA_CPP_GEMMA_OPTIMIZER_H_ #define THIRD_PARTY_GEMMA_CPP_GEMMA_OPTIMIZER_H_ -#include - #include "gemma/common.h" -#include "util/allocator.h" +#include "gemma/weights.h" #include "hwy/contrib/thread_pool/thread_pool.h" namespace gcpp { -void RandInitWeights(Model model_type, Type weight_type, - const ByteStorageT& weights, hwy::ThreadPool& pool, - std::mt19937& gen); - -void AdamUpdate(Model model_type, Type weight_type, const ByteStorageT& grad, - float alpha, float beta1, float beta2, float epsilon, size_t t, - const ByteStorageT& weights, const ByteStorageT& grad_m, - const ByteStorageT& grad_v, hwy::ThreadPool& pool); +void AdamUpdate(Type weight_type, const 
ModelWeightsStorage& grad, float alpha, + float beta1, float beta2, float epsilon, size_t t, + const ModelWeightsStorage& weights, + const ModelWeightsStorage& grad_m, + const ModelWeightsStorage& grad_v, hwy::ThreadPool& pool); } // namespace gcpp diff --git a/backprop/test_util.h b/backprop/test_util.h index bfa2cc5..86f99b1 100644 --- a/backprop/test_util.h +++ b/backprop/test_util.h @@ -21,11 +21,12 @@ #include #include #include +#include #include "gtest/gtest.h" #include "compression/compress.h" +#include "gemma/configs.h" #include "gemma/weights.h" -#include "util/allocator.h" #include "hwy/contrib/thread_pool/thread_pool.h" namespace gcpp { @@ -39,8 +40,8 @@ void RandInit(MatPtrT& x, T stddev, std::mt19937& gen) { } // TODO: make a member of Layer. -template -void RandInit(CompressedLayer& w, T stddev, std::mt19937& gen) { +template +void RandInit(LayerWeightsPtrs& w, T stddev, std::mt19937& gen) { RandInit(w.pre_attention_norm_scale, stddev, gen); RandInit(w.attn_vec_einsum_w, stddev, gen); RandInit(w.qkv_einsum_w, stddev, gen); @@ -49,9 +50,9 @@ void RandInit(CompressedLayer& w, T stddev, std::mt19937& gen) { RandInit(w.linear_w, stddev, gen); } -template -void RandInit(CompressedWeights& w, T stddev, std::mt19937& gen) { - static constexpr size_t kLayers = TConfig::kLayers; +template +void RandInit(ModelWeightsPtrs& w, T stddev, std::mt19937& gen) { + const size_t kLayers = w.c_layers.size(); RandInit(w.embedder_input_embedding, stddev, gen); RandInit(w.final_norm_scale, stddev, gen); for (size_t i = 0; i < kLayers; ++i) { @@ -66,9 +67,8 @@ void Complexify(const MatPtrT& x, MatPtrT>& c_x) { } } -template -void Complexify(const CompressedLayer& w, - CompressedLayer& c_w) { +template +void Complexify(const LayerWeightsPtrs& w, LayerWeightsPtrs& c_w) { Complexify(w.pre_attention_norm_scale, c_w.pre_attention_norm_scale); Complexify(w.attn_vec_einsum_w, c_w.attn_vec_einsum_w); Complexify(w.qkv_einsum_w, c_w.qkv_einsum_w); @@ -77,10 +77,9 @@ void Complexify(const CompressedLayer& w, Complexify(w.linear_w, c_w.linear_w); } -template -void Complexify(const CompressedWeights& w, - CompressedWeights& c_w) { - static constexpr size_t kLayers = TConfig::kLayers; +template +void Complexify(const ModelWeightsPtrs& w, ModelWeightsPtrs& c_w) { + const size_t kLayers = w.c_layers.size(); Complexify(w.embedder_input_embedding, c_w.embedder_input_embedding); Complexify(w.final_norm_scale, c_w.final_norm_scale); for (size_t i = 0; i < kLayers; ++i) { @@ -88,26 +87,27 @@ void Complexify(const CompressedWeights& w, } } -// Owns weights and provides access to TConfig. -template +// Somewhat duplicates ModelWeightsStorage, but that has neither double nor +// complex types allowed and it would cause code bloat to add them there. 
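
For reference, the element-wise update `AdamUpdater` applies above is standard Adam with bias correction. A scalar sketch, with hyperparameter names matching the diff (`t` is 1-indexed, as in the test's `steps + 1`):

#include <cmath>
#include <cstddef>

void AdamStep(float g, float& w, float& m, float& v, float alpha, float beta1,
              float beta2, float epsilon, size_t t) {
  m = beta1 * m + (1.0f - beta1) * g;      // first-moment EMA
  v = beta2 * v + (1.0f - beta2) * g * g;  // second-moment EMA
  const float mhat = m / (1.0f - std::pow(beta1, static_cast<float>(t)));
  const float vhat = v / (1.0f - std::pow(beta2, static_cast<float>(t)));
  w -= alpha * mhat / (std::sqrt(vhat) + epsilon);  // bias-corrected step
}
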
+template class WeightsWrapper { public: - WeightsWrapper() - : pool_(0), - data_(AllocateCompressedWeights()(pool_)), - weights_(reinterpret_cast*>(data_.get())) {} + explicit WeightsWrapper(const ModelConfig& config) + : pool_(0), weights_(config, pool_) { + weights_.Allocate(data_, pool_); + } - const CompressedWeights& get() const { return *weights_; } - CompressedWeights& get() { return *weights_; } - void ZeroInit() { weights_->ZeroInit(); } - void CopyFrom(const WeightsWrapper& other) { - get().CopyFrom(other.get()); + const ModelWeightsPtrs& get() const { return weights_; } + ModelWeightsPtrs& get() { return weights_; } + void ZeroInit() { weights_.ZeroInit(); } + void CopyFrom(const WeightsWrapper& other) { + weights_.CopyFrom(other.weights_); } private: hwy::ThreadPool pool_; - ByteStorageT data_; - CompressedWeights* weights_; + std::vector data_; + ModelWeightsPtrs weights_; }; template @@ -173,9 +173,9 @@ void TestGradient(const MatPtrT& grad, MatPtrT>& x, TestGradient(grad, x, func, 1e-50, max_abs_err, max_rel_error, line); } -template -void TestGradient(const CompressedLayer& grad, - CompressedLayer& c_weights, FUNC func, T max_err) { +template +void TestGradient(const LayerWeightsPtrs& grad, + LayerWeightsPtrs& c_weights, FUNC func, T max_err) { TestGradient(grad.pre_attention_norm_scale, c_weights.pre_attention_norm_scale, func, max_err, max_err, __LINE__); @@ -191,15 +191,15 @@ void TestGradient(const CompressedLayer& grad, func, max_err, max_err, __LINE__); } -template -void TestGradient(const CompressedWeights& grad, - CompressedWeights& c_weights, FUNC func, T max_err) { +template +void TestGradient(const ModelWeightsPtrs& grad, + ModelWeightsPtrs& c_weights, FUNC func, T max_err) { TestGradient(grad.embedder_input_embedding, c_weights.embedder_input_embedding, func, 2 * max_err, max_err, __LINE__); TestGradient(grad.final_norm_scale, c_weights.final_norm_scale, func, max_err, max_err, __LINE__); - for (int i = 0; i < TConfig::kLayers; ++i) { + for (size_t i = 0; i < grad.c_layers.size(); ++i) { TestGradient(*grad.GetLayer(i), *c_weights.GetLayer(i), func, max_err); } } diff --git a/compression/blob_store.cc b/compression/blob_store.cc index 24248a1..57f50f5 100644 --- a/compression/blob_store.cc +++ b/compression/blob_store.cc @@ -21,7 +21,6 @@ #include #include #include -#include #include #include @@ -276,6 +275,7 @@ BlobError BlobReader::ReadAll(hwy::ThreadPool& pool) { [pfile, &requests, &err](uint64_t i, size_t /*thread*/) { if (!pfile->Read(requests[i].offset, requests[i].size, requests[i].data)) { + fprintf(stderr, "Failed to read blob %zu\n", i); err.test_and_set(); } }); diff --git a/compression/compress.h b/compression/compress.h index e0ea0d7..adb35a1 100644 --- a/compression/compress.h +++ b/compression/compress.h @@ -102,8 +102,8 @@ class CompressedArray { class MatPtr { public: // Full constructor for dynamic sizing. 
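
The `Complexify`/`TestGradient` machinery above checks gradients via complex-step differentiation: evaluating the complexified loss at x + i*h gives df/dx ≈ Im(f(x + i*h)) / h with no subtractive cancellation, which is why the 1e-50 step passed to `TestGradient` is usable at all. A self-contained demonstration of the technique:

#include <cmath>
#include <complex>
#include <cstdio>

int main() {
  const double x = 0.7;
  const double h = 1e-50;  // same step TestGradient uses above
  const std::complex<double> fx = std::sin(std::complex<double>(x, h));
  const double dfdx = fx.imag() / h;  // no cancellation, unlike (f(x+h)-f(x))/h
  std::printf("complex-step: %.15f  analytic: %.15f\n", dfdx, std::cos(x));
  return 0;
}
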
- MatPtr(const std::string& name, const std::string& type, size_t element_size, - size_t rows, size_t cols) + MatPtr(const std::string& name, Type type, size_t element_size, size_t rows, + size_t cols) : name_(name), type_(type), element_size_(element_size), @@ -129,7 +129,7 @@ class MatPtr { MatPtr(const hwy::uint128_t& key0, const hwy::uint128_t& key1, const hwy::uint128_t& key2, const hwy::uint128_t& key3) : name_(StringFromKey(key0)), - type_(StringFromKey(key1)), + type_(static_cast(key1.lo)), element_size_(key2.hi), num_elements_(key2.lo), rows_(key3.lo), @@ -138,7 +138,7 @@ class MatPtr { // Adds the contents entry to the table of contents. void AddToToc(std::vector& toc) const { toc.push_back(MakeKey(name_.c_str())); - toc.push_back(MakeKey(type_.c_str())); + toc.push_back({static_cast(type_), 0}); toc.push_back({num_elements_, element_size_}); toc.push_back({rows_, cols_}); } @@ -167,7 +167,7 @@ class MatPtr { void SetName(const std::string& name) { name_ = name; } // Returns the type of the blob. - const std::string& Type() const { return type_; } + Type GetType() const { return type_; } // Returns the size of each element in bytes. size_t ElementSize() const { return element_size_; } @@ -219,8 +219,8 @@ class MatPtr { protected: // Arbitrary name for the array of preferably <= 16 characters. std::string name_; - // Should be the result of TypeName for CallUpcasted() to work. - std::string type_; + // Should be the result of TypeEnum for CallUpcasted() to work. + Type type_; // sizeof(T) size_t element_size_ = 0; // Number of elements in the array. @@ -247,7 +247,7 @@ class MatPtrT : public MatPtr { // Full constructor for dynamic sizing. MatPtrT(const std::string& name, size_t rows, size_t cols) - : MatPtr(name, TypeName(), sizeof(MatT), rows, cols) {} + : MatPtr(name, TypeEnum(), sizeof(MatT), rows, cols) {} // Copying allowed as the metadata is small. MatPtrT(const MatPtr& other) : MatPtr(other) {} @@ -330,17 +330,20 @@ class MatPtrT : public MatPtr { template decltype(auto) MatPtr::CallUpcasted(FuncT& func, TArgs&&... args) { - if (type_ == TypeName()) { + if (type_ == TypeEnum()) { return func(dynamic_cast*>(this), std::forward(args)...); - } else if (type_ == TypeName()) { + } else if (type_ == TypeEnum()) { return func(dynamic_cast*>(this), std::forward(args)...); - } else if (type_ == TypeName()) { + } else if (type_ == TypeEnum()) { return func(dynamic_cast*>(this), std::forward(args)...); + } else if (type_ == TypeEnum()) { + return func(dynamic_cast*>(this), + std::forward(args)...); } else { - HWY_ABORT("Type %s unknown.", type_.c_str()); + HWY_ABORT("Type %d unknown.", type_); } } @@ -563,9 +566,10 @@ class CacheLoader { } // Returns whether all tensors are successfully loaded from cache. - bool ReadAll(hwy::ThreadPool& pool, std::vector& model_memory) { + BlobError ReadAll(hwy::ThreadPool& pool, + std::vector& model_memory) { // reader_ invalid or any Enqueue failed - if (err_ != 0) return false; + if (err_ != 0) return err_; // Setup the model_memory. 
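
`CallUpcasted` now keys on the `Type` enum instead of comparing type-name strings. A pared-down sketch of that tag-dispatch pattern; `Base`/`Derived` are hypothetical stand-ins for `MatPtr`/`MatPtrT`, and the real code uses `dynamic_cast` and forwards the functor's return value:

#include <cstdio>

enum class Type { kF32, kF64 };

struct Base {
  explicit Base(Type t) : type(t) {}
  virtual ~Base() = default;
  Type type;  // tag identifying the concrete Derived<T>
};

template <typename T>
struct Derived : public Base {
  explicit Derived(Type t) : Base(t) {}
  T value{};
};

template <typename Func>
void CallUpcastedSketch(Base* b, Func&& func) {
  switch (b->type) {
    case Type::kF32:
      func(static_cast<Derived<float>*>(b));
      break;
    case Type::kF64:
      func(static_cast<Derived<double>*>(b));
      break;
  }
}

int main() {
  Derived<float> f(Type::kF32);
  CallUpcastedSketch(&f, [](auto* d) { std::printf("%zu\n", sizeof(d->value)); });
}
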
for (int b = 0; b < model_toc_.size(); ++b) { const std::string& file_key = file_keys_[b]; @@ -574,12 +578,12 @@ class CacheLoader { const MatPtr* toc_blob = file_toc_.Get(file_key); if (toc_blob == nullptr) { fprintf(stderr, "Blob %s not found in TOC\n", file_key.c_str()); - return false; + return __LINE__; } if (toc_blob->Rows() != blob->Rows() || toc_blob->Cols() != blob->Cols()) { fprintf(stderr, "Blob %s has size mismatch TOC\n", file_key.c_str()); - return false; + return __LINE__; } MatStorage toc_blob_array(*toc_blob); model_memory.push_back(std::move(toc_blob_array)); @@ -603,17 +607,10 @@ class CacheLoader { "Failed to read blob %s (error %d) of size %zu x %zu x %zu\n", blob.Name().c_str(), err_, blob.Rows(), blob.Cols(), blob.ElementSize()); - return false; + return err_; } } - - err_ = reader_.ReadAll(pool); - if (err_ != 0) { - fprintf(stderr, "Failed to read all tensors (error %d)\n", err_); - return false; - } - - return true; + return reader_.ReadAll(pool); } private: diff --git a/compression/compress_weights.cc b/compression/compress_weights.cc index 51897af..1a4fc52 100644 --- a/compression/compress_weights.cc +++ b/compression/compress_weights.cc @@ -24,6 +24,7 @@ #include "hwy/highway.h" // After highway.h #include "compression/compress-inl.h" +#include "gemma/configs.h" #ifndef GEMMA_COMPRESS_WEIGHTS_ONCE #define GEMMA_COMPRESS_WEIGHTS_ONCE @@ -150,29 +151,22 @@ HWY_BEFORE_NAMESPACE(); namespace gcpp { namespace HWY_NAMESPACE { -template +template void CompressWeights(const Path& weights_path, const Path& compressed_weights_path, Model model_type, - Type weight_type, hwy::ThreadPool& pool) { + hwy::ThreadPool& pool) { if (!weights_path.Exists()) { HWY_ABORT("The model weights file '%s' does not exist.", weights_path.path.c_str()); } printf("Compressing weights from %s to %s\n", weights_path.path.c_str(), compressed_weights_path.path.c_str()); - - using CConfig = typename Configs::c; - using UCConfig = typename Configs::uc; - // Allocate compressed weights. - using CWeights = CompressedWeights; - ByteStorageT c_weights_u8 = AllocateCompressedWeights()(pool); - CWeights* c_weights = reinterpret_cast(c_weights_u8.get()); - - // Allocate uncompressed weights. - using UCWeights = CompressedWeights; - ByteStorageT uc_weights_u8 = AllocateCompressedWeights()(pool); - UCWeights* uc_weights = reinterpret_cast(uc_weights_u8.get()); - + ModelConfig config = ConfigFromModel(model_type); + std::vector model_storage; + ModelWeightsPtrs c_weights(config, pool); + c_weights.Allocate(model_storage, pool); + ModelWeightsPtrs uc_weights(config, pool); + uc_weights.Allocate(model_storage, pool); // Get uncompressed weights, compress, and store. 
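
`CacheLoader::ReadAll` above now returns a `BlobError` (0 on success, a `__LINE__`-style code for TOC mismatches, or the reader's I/O error) instead of a bare bool, so callers can report why loading failed. A hypothetical caller-side sketch, assuming only that `BlobError` is an int-like code:

#include <cstdio>

using BlobError = int;  // 0 means success, matching blob_store's convention

// Hypothetical stand-in for CacheLoader::ReadAll.
BlobError ReadAllSketch(bool toc_ok, BlobError io_err) {
  if (!toc_ok) return __LINE__;  // distinguishes TOC failures from I/O ones
  return io_err;
}

bool LoadOrReport() {
  if (const BlobError err = ReadAllSketch(/*toc_ok=*/true, /*io_err=*/0)) {
    std::fprintf(stderr, "Cache load failed (error %d)\n", err);
    return false;
  }
  return true;
}
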
FILE* fptr = fopen(weights_path.path.c_str(), "rb"); if (fptr == nullptr) { @@ -181,22 +175,22 @@ void CompressWeights(const Path& weights_path, } bool ok = true; uint64_t total_size = 0; - CompressedWeights::ForEachTensor( - {uc_weights}, ForEachType::kLoadNoToc, + ModelWeightsPtrs::ForEachTensor( + {&uc_weights}, ForEachType::kLoadNoToc, [&](const char* name, hwy::Span tensors) { fprintf(stderr, "Loading Parameters (size %zu): %s\n", tensors[0]->SizeBytes(), name); ok &= 1 == fread(tensors[0]->Ptr(), tensors[0]->SizeBytes(), 1, fptr); total_size += tensors[0]->SizeBytes(); }); - const bool scale_for_compression = UCConfig::kNumTensorScales > 0; + const bool scale_for_compression = config.num_tensor_scales > 0; std::vector scales; if (scale_for_compression) { - uc_weights->GetOrApplyScales(scales); + uc_weights.GetOrApplyScales(scales); } Compressor compressor(pool); - CompressedWeights::ForEachTensor( - {reinterpret_cast*>(uc_weights), c_weights}, + ModelWeightsPtrs::ForEachTensor( + {reinterpret_cast*>(&uc_weights), &c_weights}, ForEachType::kLoadNoToc, [&compressor](const char* name, hwy::Span tensors) { tensors[1]->CallUpcasted( @@ -221,9 +215,26 @@ void Run(Args& args) { HWY_ABORT("PaliGemma is not supported in compress_weights."); } const Type weight_type = args.WeightType(); - GEMMA_EXPORT_AND_DISPATCH( - model_type, weight_type, CompressWeights, - (args.weights, args.compressed_weights, model_type, weight_type, pool)); + switch (weight_type) { + case Type::kF32: + HWY_EXPORT_AND_DYNAMIC_DISPATCH_T(CompressWeights) + (args.weights, args.compressed_weights, model_type, pool); + break; + case Type::kBF16: + HWY_EXPORT_AND_DYNAMIC_DISPATCH_T(CompressWeights) + (args.weights, args.compressed_weights, model_type, pool); + break; + case Type::kSFP: + HWY_EXPORT_AND_DYNAMIC_DISPATCH_T(CompressWeights) + (args.weights, args.compressed_weights, model_type, pool); + break; + case Type::kNUQ: + HWY_EXPORT_AND_DYNAMIC_DISPATCH_T(CompressWeights) + (args.weights, args.compressed_weights, model_type, pool); + break; + default: + HWY_ABORT("Weight type %d unsupported.", static_cast(weight_type)); + } } } // namespace gcpp diff --git a/compression/shared.h b/compression/shared.h index c216d24..74b7454 100644 --- a/compression/shared.h +++ b/compression/shared.h @@ -32,11 +32,6 @@ namespace gcpp { using BF16 = hwy::bfloat16_t; -template -constexpr bool IsF32() { - return hwy::IsSame, float>(); -} - // Switching Floating Point: a hybrid 8-bit float representation of bf16/f32 // inputs that combines the advantages of e4m3 and e5m2 into a single format. // It supports seeking at a granularity of 1 and decoding to bf16/f32. @@ -179,29 +174,67 @@ struct NuqStream { }; #pragma pack(pop) +template +constexpr bool IsF32() { + return hwy::IsSame, float>(); +} + +template +constexpr bool IsBF16() { + return hwy::IsSame, BF16>(); +} + +template +constexpr bool IsSfpStream() { + return hwy::IsSame, SfpStream>(); +} + +template +constexpr bool IsNuqStream() { + return hwy::IsSame, NuqStream>(); +} + +// Instruction-tuned models require extra 'turn structure' tokens in prompts. +enum class ModelTraining { GEMMA_IT, GEMMA_PT, PALIGEMMA }; + +// Tensor types for loading weights. Note that not all types are supported as +// weights for a model, but can be used for other purposes, such as types for +// ModelWeightsPtrs. When adding a new type that is supported, also +// update gemma.cc, weights.*, and add instantiations/new_one.cc. 
+enum class Type { kUnknown, kF32, kBF16, kSFP, kNUQ, kF64, kC64, kU128 };
+constexpr const char* kTypeStrings[] = {"unknown", "f32", "bf16", "sfp",
+                                        "nuq",     "f64", "c64",  "u128"};
+
+// Returns a Type enum for the type of the template parameter.
 template <typename PackedT>
-const char* TypeName() {
+Type TypeEnum() {
   using Packed = hwy::RemoveCvRef<PackedT>;
   if constexpr (hwy::IsSame<Packed, float>()) {
-    return "f32";
+    return Type::kF32;
   } else if constexpr (hwy::IsSame<Packed, BF16>()) {
-    return "b16";
+    return Type::kBF16;
   } else if constexpr (hwy::IsSame<Packed, SfpStream>()) {
-    return "sfp";
+    return Type::kSFP;
   } else if constexpr (hwy::IsSame<Packed, NuqStream>()) {
-    return "nuq";
+    return Type::kNUQ;
   } else if constexpr (hwy::IsSame<Packed, double>()) {
-    return "f64";
+    return Type::kF64;
   } else if constexpr (hwy::IsSame<Packed, std::complex<double>>()) {
-    return "c64";
+    return Type::kC64;
   } else if constexpr (hwy::IsSame<Packed, hwy::uint128_t>()) {
-    return "u128";
+    return Type::kU128;
   } else {
     HWY_DASSERT(false);
-    return "unknown";
+    return Type::kUnknown;
   }
 }
 
+// Returns a string name for the type of the template parameter.
+template <typename PackedT>
+const char* TypeName() {
+  return kTypeStrings[static_cast<size_t>(TypeEnum<PackedT>())];
+}
+
 template <typename Packed>
 constexpr bool IsCompressed() {
   return hwy::IsSameEither<hwy::RemoveCvRef<Packed>, SfpStream, NuqStream>();
diff --git a/evals/benchmark.cc b/evals/benchmark.cc
index b59079a..1ea4f65 100644
--- a/evals/benchmark.cc
+++ b/evals/benchmark.cc
@@ -128,8 +128,8 @@ int BenchmarkCrossEntropy(GemmaEnv& env, const Path& text,
     size_t num_tokens = std::min(prompt.size() - pos, batch_tokens);
     std::vector<int> prompt_slice(prompt.begin() + pos,
                                   prompt.begin() + pos + num_tokens);
-    KVCache kv_cache = KVCache::Create(
-        env.GetModel()->Info().model, env.MutableConfig().prefill_tbatch_size);
+    KVCache kv_cache = KVCache::Create(env.GetModel()->GetModelConfig(),
+                                       env.MutableConfig().prefill_tbatch_size);
     float entropy = ComputeCrossEntropy(
         *env.GetModel(), num_tokens, prompt_slice, kv_cache, env.Verbosity());
     total_entropy += entropy;
diff --git a/evals/benchmark_helper.cc b/evals/benchmark_helper.cc
index 63553aa..abae040 100644
--- a/evals/benchmark_helper.cc
+++ b/evals/benchmark_helper.cc
@@ -69,8 +69,8 @@ GemmaEnv::GemmaEnv(const LoaderArgs& loader, const InferenceArgs& inference,
     model_ = AllocateGemma(mutable_loader, pools_);
     // Only allocate one for starters because GenerateBatch might not be called.
     kv_caches_.resize(1);
-    kv_caches_[0] =
-        KVCache::Create(model_->Info().model, inference.prefill_tbatch_size);
+    kv_caches_[0] = KVCache::Create(model_->GetModelConfig(),
+                                    inference.prefill_tbatch_size);
   }
   InitGenerator(inference, gen_);
   runtime_config_ = {
@@ -163,7 +163,7 @@ std::vector<std::string> GemmaEnv::BatchQueryModel(
   }
   for (size_t i = 1; i < num_queries; ++i) {
     if (kv_caches_[i].seq_len == 0) {
-      kv_caches_[i] = KVCache::Create(model_->Info().model,
+      kv_caches_[i] = KVCache::Create(model_->GetModelConfig(),
                                       runtime_config_.prefill_tbatch_size);
     }
   }
diff --git a/evals/cross_entropy.cc b/evals/cross_entropy.cc
index 870f84c..13ff3d3 100644
--- a/evals/cross_entropy.cc
+++ b/evals/cross_entropy.cc
@@ -103,8 +103,7 @@ float ComputeCrossEntropy(Gemma& gemma, size_t max_generated_tokens,
   const StreamFunc stream_token = [](int /*token*/, float) { return true; };
   // TWeight is unused, but we have to pass it to Config*.
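
A quick check of the `TypeEnum`/`TypeName` round-trip introduced above: the enum value indexes `kTypeStrings`, so the two stay in sync by construction. Standalone sketch; it re-declares minimal pieces and uses `std::is_same_v` in place of `hwy::IsSame`:

#include <cstdio>
#include <type_traits>

enum class Type { kUnknown, kF32, kBF16, kSFP, kNUQ, kF64, kC64, kU128 };
constexpr const char* kTypeStrings[] = {"unknown", "f32", "bf16", "sfp",
                                        "nuq",     "f64", "c64",  "u128"};

template <typename T>
constexpr Type TypeEnumSketch() {
  if constexpr (std::is_same_v<T, float>) return Type::kF32;
  else if constexpr (std::is_same_v<T, double>) return Type::kF64;
  else return Type::kUnknown;
}

template <typename T>
constexpr const char* TypeNameSketch() {
  // The enum value is the index into the string table.
  return kTypeStrings[static_cast<int>(TypeEnumSketch<T>())];
}

int main() {
  std::printf("%s %s\n", TypeNameSketch<float>(), TypeNameSketch<double>());
  return 0;  // prints: f32 f64
}
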
- const int vocab_size = - CallForModel(gemma.Info().model); + const int vocab_size = gemma.GetModelConfig().vocab_size; float cross_entropy = std::log(vocab_size); // first token size_t pos = 1; diff --git a/examples/hello_world/run.cc b/examples/hello_world/run.cc index 39d4f9c..2ed9b64 100644 --- a/examples/hello_world/run.cc +++ b/examples/hello_world/run.cc @@ -24,7 +24,6 @@ #include // Placeholder for internal header, do not modify. -#include "gemma/common.h" #include "gemma/gemma.h" #include "gemma/tokenizer.h" #include "util/app.h" // LoaderArgs @@ -58,7 +57,8 @@ int main(int argc, char** argv) { gcpp::PerClusterPools pools(app.max_clusters, app.max_threads, app.pin); gcpp::Gemma model = gcpp::CreateGemma(loader, pools); gcpp::KVCache kv_cache = - gcpp::KVCache::Create(loader.Info().model, inference.prefill_tbatch_size); + gcpp::KVCache::Create(model.GetModelConfig(), + inference.prefill_tbatch_size); size_t generated = 0; // Initialize random number generator diff --git a/gemma/activations.h b/gemma/activations.h index b10b562..3983924 100644 --- a/gemma/activations.h +++ b/gemma/activations.h @@ -21,6 +21,7 @@ #include #include "compression/shared.h" // BF16 +#include "gemma/configs.h" #include "ops/matmul.h" // MatMulEnv #include "util/allocator.h" // RowVectorBatch #include "util/threading.h" @@ -30,6 +31,12 @@ namespace gcpp { struct Activations { + explicit Activations(const ModelConfig& config) + : weights_config(config), + layer_config(config.layer_configs[0]), + seq_len(config.seq_len), + cache_pos_size(config.CachePosSize()) {} + RowVectorBatch x; // input RowVectorBatch q; // query, also KV if MHA. RowVectorBatch logits; @@ -58,23 +65,24 @@ struct Activations { MatMulEnv env; + PostQKType post_qk = PostQKType::Rope; + // And the config. + const ModelConfig& weights_config; + const LayerConfig& layer_config; + size_t seq_len; + size_t cache_pos_size = 0; + // Multi-Head Attention? - template - static constexpr bool IsMHA() { - return TConfig::kHeads == TConfig::kKVHeads; - } + bool IsMHA() const { return layer_config.heads == layer_config.kv_heads; } // Stride between subsequent queries. Each of Q, K, V are of length kQKVDim, // but for MHA we store them as Q,K,V, Q,K,V, .. instead of Q..Q, K..K, V..V. - template - static constexpr size_t QStride() { - return TConfig::kQKVDim * (IsMHA() ? 3 : 1); - } + size_t QStride() const { return layer_config.qkv_dim * (IsMHA() ? 3 : 1); } - template - static RowVectorBatch CreateInvTimescale() { - constexpr size_t kQKVDim = TConfig::kQKVDim; - const size_t rope_dim = TConfig::kUseHalfRope ? kQKVDim / 2 : kQKVDim; + static RowVectorBatch CreateInvTimescale(size_t qkv_dim, + PostQKType post_qk) { + const size_t rope_dim = + post_qk == PostQKType::HalfRope ? 
qkv_dim / 2 : qkv_dim; RowVectorBatch inv_timescale(1, rope_dim / 2); for (size_t dim = 0; dim < rope_dim / 2; ++dim) { const float freq_exponents = @@ -86,40 +94,38 @@ struct Activations { return inv_timescale; } - template void Allocate(size_t batch_size, PerClusterPools& pools) { - constexpr size_t kModelDim = TConfig::kModelDim; - constexpr size_t kQKVDim = TConfig::kQKVDim; - constexpr size_t kHeads = TConfig::kHeads; - constexpr size_t kFFHiddenDim = TConfig::kFFHiddenDim; - constexpr size_t kVocabSize = TConfig::kVocabSize; - constexpr size_t kSeqLen = TConfig::kSeqLen; - constexpr size_t kGriffinLayers = TConfig::kGriffinLayers; + post_qk = layer_config.post_qk; + const size_t model_dim = weights_config.model_dim; + const size_t ff_hidden_dim = layer_config.ff_hidden_dim; + const size_t vocab_size = weights_config.vocab_size; - x = RowVectorBatch(batch_size, kModelDim); - q = RowVectorBatch(batch_size, kHeads * QStride()); - if constexpr (kVocabSize > 0) { - logits = RowVectorBatch(batch_size, kVocabSize); + x = RowVectorBatch(batch_size, model_dim); + q = RowVectorBatch(batch_size, layer_config.heads * QStride()); + if (vocab_size > 0) { + logits = RowVectorBatch(batch_size, vocab_size); } - pre_att_rms_out = RowVectorBatch(batch_size, kModelDim); - att = RowVectorBatch(batch_size, kHeads * kSeqLen); - att_out = RowVectorBatch(batch_size, kHeads * kQKVDim); - att_sums = RowVectorBatch(batch_size, kModelDim); + pre_att_rms_out = RowVectorBatch(batch_size, model_dim); + att = RowVectorBatch(batch_size, + layer_config.heads * weights_config.seq_len); + att_out = RowVectorBatch(batch_size, + layer_config.heads * layer_config.qkv_dim); + att_sums = RowVectorBatch(batch_size, model_dim); - bf_pre_ffw_rms_out = RowVectorBatch(batch_size, kModelDim); - C1 = RowVectorBatch(batch_size, kFFHiddenDim); - C2 = RowVectorBatch(batch_size, kFFHiddenDim); - ffw_out = RowVectorBatch(batch_size, kModelDim); + bf_pre_ffw_rms_out = RowVectorBatch(batch_size, model_dim); + C1 = RowVectorBatch(batch_size, ff_hidden_dim); + C2 = RowVectorBatch(batch_size, ff_hidden_dim); + ffw_out = RowVectorBatch(batch_size, model_dim); - if constexpr (kGriffinLayers > 0) { - griffin_x = RowVectorBatch(batch_size, kModelDim); - griffin_y = RowVectorBatch(batch_size, kModelDim); - griffin_gate_x = RowVectorBatch(batch_size, kModelDim); - griffin_multiplier = RowVectorBatch(batch_size, kModelDim); + if (layer_config.type == LayerAttentionType::kGriffinRecurrentBlock) { + griffin_x = RowVectorBatch(batch_size, model_dim); + griffin_y = RowVectorBatch(batch_size, model_dim); + griffin_gate_x = RowVectorBatch(batch_size, model_dim); + griffin_multiplier = RowVectorBatch(batch_size, model_dim); } - inv_timescale = CreateInvTimescale(); + inv_timescale = CreateInvTimescale(layer_config.qkv_dim, post_qk); env = MatMulEnv(pools); } diff --git a/gemma/common.cc b/gemma/common.cc index e68347b..447deb6 100644 --- a/gemma/common.cc +++ b/gemma/common.cc @@ -15,6 +15,7 @@ #include "gemma/common.h" +#include // sqrtf #include #include @@ -23,6 +24,7 @@ #include #include +#include "compression/shared.h" #include "hwy/base.h" #include "hwy/contrib/thread_pool/thread_pool.h" @@ -101,8 +103,6 @@ const char* ModelString(Model model, ModelTraining training) { static_cast(training)); } -constexpr const char* kTypeStrings[] = {"f32", "bf16", "sfp"}; - const char* StringFromType(Type type) { return kTypeStrings[static_cast(type)]; } @@ -141,4 +141,19 @@ void Wrap(const ModelInfo& info, size_t pos, std::string& prompt) { prompt = start + 
prompt + "\nmodel\n"; } } + +float EmbeddingScaling(size_t model_dim) { + // Round to bf16 to match Gemma's Embedder, which casts before mul. + return hwy::ConvertScalarTo(hwy::ConvertScalarTo( + sqrtf(static_cast(model_dim)))); +} + +float ChooseQueryScale(const ModelConfig& config) { + if (config.query_scale == QueryScaleType::SqrtModelDimDivNumHeads) + return 1.0f / sqrtf(static_cast(config.model_dim / + config.layer_configs[0].heads)); + // QueryScaleType::SqrtKeySize + return 1.0f / sqrtf(static_cast(config.layer_configs[0].qkv_dim)); +} + } // namespace gcpp diff --git a/gemma/common.h b/gemma/common.h index 18ac5d1..e933e8d 100644 --- a/gemma/common.h +++ b/gemma/common.h @@ -16,37 +16,15 @@ #ifndef THIRD_PARTY_GEMMA_CPP_GEMMA_COMMON_H_ #define THIRD_PARTY_GEMMA_CPP_GEMMA_COMMON_H_ -#include // sqrtf #include #include -#include "compression/compress.h" #include "gemma/configs.h" // IWYU pragma: export #include "hwy/base.h" // ConvertScalarTo namespace gcpp { -// Model variants: see configs.h for details. When adding a new one, also -// update GEMMA_FOREACH* and Call* below, and add instantiations/*.cc. -enum class Model { - GEMMA_2B, - GEMMA_7B, - GEMMA2_9B, - GEMMA2_27B, - GRIFFIN_2B, - GEMMA_TINY, - GEMMA2_2B, - PALIGEMMA_224, -}; - -// Instruction-tuned models require extra 'turn structure' tokens in prompts. -enum class ModelTraining { GEMMA_IT, GEMMA_PT, PALIGEMMA }; - -// Tensor types for loading weights. When adding a new one, also -// update GEMMA_FOREACH* and Call* below, and add instantiations/*.cc. -enum class Type { kF32, kBF16, kSFP }; - // TODO(janwas): merge with functions below. struct ModelInfo { Model model; @@ -66,198 +44,12 @@ const char* StringFromType(Type type); void Wrap(const ModelInfo& info, size_t pos, std::string& prompt); -// Returns the return value of FuncT>().operator()(args), where -// Config* is selected via `model`. Typically called by CallForModelAndWeight, -// but can also be called directly when FuncT does not actually use TWeight. -// -// Note that a T prefix indicates a concrete type template argument, whereas a -// T suffix indicates the argument is itself a template. -// -// `FuncT` must be a functor because function templates cannot be passed as a -// template template argument, and we prefer to avoid the overhead of -// std::function. -template class FuncT, - typename... TArgs> -decltype(auto) CallForModel(Model model, TArgs&&... args) { - switch (model) { - case Model::GEMMA_TINY: - return FuncT>()(std::forward(args)...); - case Model::GEMMA_2B: - return FuncT>()(std::forward(args)...); - case Model::GEMMA_7B: - return FuncT>()(std::forward(args)...); - case Model::GEMMA2_9B: - return FuncT>()(std::forward(args)...); - case Model::GEMMA2_27B: - return FuncT>()(std::forward(args)...); - case Model::GRIFFIN_2B: - return FuncT>()(std::forward(args)...); - case Model::GEMMA2_2B: - return FuncT>()(std::forward(args)...); - case Model::PALIGEMMA_224: - return FuncT>()( - std::forward(args)...); - default: - HWY_ABORT("Model type %d unknown.", static_cast(model)); - } -} - -// Returns the return value of FuncT().operator()(args), -// where `TConfig` is selected based on `model` and `weight`. - -// This makes it easy to extend `Model` or `Type` without updating callers. -// -// Usage example: LoadWeights is type-erased so that it can be called from other -// .cc files. It uses this function to call the appropriate instantiation of a -// template functor LoadCompressedWeightsT. -template