diff --git a/BUILD.bazel b/BUILD.bazel
index 621233f..11893c1 100644
--- a/BUILD.bazel
+++ b/BUILD.bazel
@@ -53,6 +53,36 @@ cc_test(
     ],
 )
 
+cc_library(
+    name = "common",
+    srcs = ["gemma/common.cc"],
+    hdrs = [
+        "gemma/common.h",
+        "gemma/configs.h",
+    ],
+    deps = [
+        "//compression:compress",
+        "//compression:io",
+        "@hwy//:hwy",  # base.h
+        "@hwy//:thread_pool",
+    ],
+)
+
+cc_library(
+    name = "weights",
+    srcs = ["gemma/weights.cc"],
+    hdrs = ["gemma/weights.h"],
+    deps = [
+        ":common",
+        "//compression:compress",
+        "//compression:io",
+        "@hwy//:hwy",
+        "@hwy//:profiler",
+        "@hwy//:stats",
+        "@hwy//:thread_pool",
+    ],
+)
+
 cc_library(
     name = "gemma_lib",
     srcs = [
@@ -60,14 +90,12 @@ cc_library(
     ],
     hdrs = [
         "gemma/activations.h",
-        "gemma/common.h",
-        "gemma/common-inl.h",
-        "gemma/configs.h",
         "gemma/gemma.h",
-        "gemma/weights.h",
     ],
     deps = [
+        ":common",
         ":ops",
+        ":weights",
         "//compression:compress",
         "//compression:io",
         "@hwy//:hwy",
@@ -93,6 +121,7 @@ cc_library(
     hdrs = ["util/app.h"],
     deps = [
         ":args",
+        ":common",
         ":gemma_lib",
         "//compression:io",
         "@hwy//:hwy",
@@ -126,6 +155,7 @@ cc_binary(
     deps = [
         ":app",
         ":args",
+        ":common",
         ":gemma_lib",
         # "//base",
         "//compression:compress",
@@ -141,7 +171,9 @@ cc_binary(
     srcs = ["gemma/compress_weights.cc"],
     deps = [
         ":args",
+        ":common",
         ":gemma_lib",
+        ":weights",
         # "//base",
         "//compression:compress",
         "@hwy//:hwy",
@@ -157,7 +189,9 @@ cc_binary(
     deps = [
         ":app",
         ":args",
+        ":common",
         ":gemma_lib",
+        "//compression:io",
         "@hwy//:hwy",
         "@hwy//:nanobenchmark",
         "@hwy//:thread_pool",
diff --git a/CMakeLists.txt b/CMakeLists.txt
index dcc9087..6c4d12f 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -50,8 +50,6 @@ set(SOURCES
   backprop/backward.h
   backprop/backward-inl.h
   backprop/backward_scalar.h
-  backprop/common_scalar.cc
-  backprop/common_scalar.h
   backprop/forward.cc
   backprop/forward.h
   backprop/forward-inl.h
@@ -59,10 +57,9 @@ set(SOURCES
   backprop/optimizer.cc
   backprop/optimizer.h
   gemma/configs.h
-  gemma/activations.cc
   gemma/activations.h
+  gemma/common.cc
   gemma/common.h
-  gemma/common-inl.h
   gemma/gemma.cc
   gemma/gemma.h
   gemma/ops.h
diff --git a/DEVELOPERS.md b/DEVELOPERS.md
index 865ac26..324a33e 100644
--- a/DEVELOPERS.md
+++ b/DEVELOPERS.md
@@ -151,9 +151,9 @@ file (usually `tokenizer.spm`), call `Encode()` to go from string prompts to
 token id vectors, or `Decode()` to go from token id vector outputs from the
 model back to strings.
 
-### `GenerateGemma()` is the entrypoint for token generation
+### `model.Generate()` is the entrypoint for token generation
 
-Calling into `GenerateGemma` with a tokenized prompt will 1) mutate the
+Calling into `model.Generate` with a tokenized prompt will 1) mutate the
 activation values in `model` and 2) invoke StreamFunc - a lambda callback for
 each generated token.
 
@@ -170,11 +170,11 @@ no-op which is what `run.cc` does.
 ### `Transformer()` implements the inference (i.e. `forward()` method in
 PyTorch or Jax) computation of the neural network
 
-For high-level applications, you might only call `GenerateGemma()` and never
+For high-level applications, you might only call `model.Generate()` and never
 interact directly with the neural network, but if you're doing something a bit
-more custom you can call transformer which performs a single inference
-operation on a single token and mutates the Activations and the KVCache through
-the neural network computation.
+more custom you can call transformer which performs a single inference operation
+on a single token and mutates the Activations and the KVCache through the neural
+network computation.
 
 ### For low level operations, defining new architectures, call `ops.h`
 functions directly
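[Note] A minimal sketch of the renamed entrypoint described above, assembled only from call sites updated later in this patch (debug_prompt.cc, examples/hello_world/run.cc, gemma/benchmark.cc); the `loader`, `pool`, and `runtime_config` setup is elided, and the string prompt is illustrative:

    // Build the model and a matching KV cache, then generate from a prompt.
    gcpp::Gemma model(loader.tokenizer, loader.weights, loader.ModelType(), pool);
    gcpp::KVCache kv_cache = gcpp::KVCache::Create(loader.ModelType());
    std::vector<int> prompt;
    HWY_ASSERT(model.Tokenizer().Encode("Hello", &prompt));
    gcpp::TimingInfo timing_info;
    model.Generate(runtime_config, prompt, /*start_pos=*/0, kv_cache,
                   timing_info, /*layers_output=*/nullptr);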
diff --git a/backprop/backward-inl.h b/backprop/backward-inl.h
index 6d11fea..a0a6be3 100644
--- a/backprop/backward-inl.h
+++ b/backprop/backward-inl.h
@@ -28,8 +28,8 @@
 
 #include "backprop/prompt.h"
 #include "gemma/activations.h"
+#include "gemma/common.h"
 #include "gemma/weights.h"
-
 #include "hwy/base.h"
 #include "hwy/contrib/thread_pool/thread_pool.h"
 
@@ -43,7 +43,6 @@
 #define THIRD_PARTY_GEMMA_CPP_BACKWARD_TOGGLE
 #endif
 
-#include "gemma/common-inl.h"
 #include "gemma/ops.h"
 
 HWY_BEFORE_NAMESPACE();
diff --git a/backprop/backward.cc b/backprop/backward.cc
index d6987a4..4baeb0a 100644
--- a/backprop/backward.cc
+++ b/backprop/backward.cc
@@ -54,6 +54,7 @@ void CrossEntropyLossBackwardPassT(Model model,
                                    ByteStorageT& grad,
                                    ByteStorageT& backward,
                                    hwy::ThreadPool& pool) {
+  // TODO(janwas): use CallFunctorForModel
   switch (model) {
     case Model::GEMMA_2B:
       CrossEntropyLossBackwardPass<ConfigGemma2B>(
diff --git a/backprop/backward_scalar.h b/backprop/backward_scalar.h
index 1c9fbbc..fe5ca46 100644
--- a/backprop/backward_scalar.h
+++ b/backprop/backward_scalar.h
@@ -23,9 +23,9 @@
 #include <cmath>
 #include <complex>
 
-#include "backprop/common_scalar.h"
 #include "backprop/prompt.h"
 #include "gemma/activations.h"
+#include "gemma/common.h"  // EmbeddingScaling
 #include "gemma/weights.h"
 
 namespace gcpp {
diff --git a/backprop/backward_scalar_test.cc b/backprop/backward_scalar_test.cc
index 17f1b6c..2ae1298 100644
--- a/backprop/backward_scalar_test.cc
+++ b/backprop/backward_scalar_test.cc
@@ -19,10 +19,10 @@
 #include <complex>
 #include <random>
 
+#include "gtest/gtest.h"
 #include "backprop/forward_scalar.h"
 #include "backprop/sampler.h"
 #include "backprop/test_util.h"
-#include "gtest/gtest.h"
 
 namespace gcpp {
 
diff --git a/backprop/backward_test.cc b/backprop/backward_test.cc
index cf13e6e..aad6838 100644
--- a/backprop/backward_test.cc
+++ b/backprop/backward_test.cc
@@ -30,11 +30,11 @@
 #include "backprop/sampler.h"
 #include "backprop/test_util.h"
 #include "compression/compress.h"
+#include "gemma/gemma.h"
+#include "gemma/weights.h"
 #include "hwy/aligned_allocator.h"
 #include "hwy/base.h"
 #include "hwy/contrib/thread_pool/thread_pool.h"
-#include "gemma/gemma.h"
-#include "gemma/weights.h"
 
 // clang-format off
 #undef HWY_TARGET_INCLUDE
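[Note] The "-inl.h" hunks above (and the matching forward-inl.h hunks below) remove an include from inside highway's per-target toggle guard. For readers unfamiliar with the pattern, a minimal sketch; MYLIB_EXAMPLE_TOGGLE is a placeholder name, and the real instance appears in the gemma/common-inl.h deletion later in this patch:

    // Re-evaluated once per SIMD target by hwy/foreach_target.h:
    #if defined(MYLIB_EXAMPLE_TOGGLE) == defined(HWY_TARGET_TOGGLE)
    #ifdef MYLIB_EXAMPLE_TOGGLE
    #undef MYLIB_EXAMPLE_TOGGLE
    #else
    #define MYLIB_EXAMPLE_TOGGLE
    #endif

    // Per-target code goes here, e.g. #include "gemma/ops.h".

    #endif  // MYLIB_EXAMPLE_TOGGLE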
diff --git a/backprop/common_scalar.cc b/backprop/common_scalar.cc
deleted file mode 100644
index b04dffb..0000000
--- a/backprop/common_scalar.cc
+++ /dev/null
@@ -1,51 +0,0 @@
-// Copyright 2024 Google LLC
-// SPDX-License-Identifier: Apache-2.0
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      https://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// Compiles this file for multiple architectures via "foreach_target.h", to
-// which we pass the filename via macro 'argument'.
-#undef HWY_TARGET_INCLUDE
-#define HWY_TARGET_INCLUDE "backprop/common_scalar.cc"  // NOLINT
-#include "hwy/foreach_target.h"  // IWYU pragma: keep
-
-#include "hwy/highway.h"
-// After highway.h
-#include "gemma/ops.h"
-
-HWY_BEFORE_NAMESPACE();
-namespace gcpp {
-namespace HWY_NAMESPACE {
-namespace hn = hwy::HWY_NAMESPACE;
-
-float EmbeddingScaling(int model_dim) {
-  // Round to bf16 to match Gemma's Embedder, which casts before mul.
-  return hwy::ConvertScalarTo<float>(hwy::ConvertScalarTo<hwy::bfloat16_t>(
-      Sqrt(static_cast<float>(model_dim))));
-}
-
-}  // namespace HWY_NAMESPACE
-}  // namespace gcpp
-HWY_AFTER_NAMESPACE();
-
-#if HWY_ONCE
-namespace gcpp {
-
-HWY_EXPORT(EmbeddingScaling);
-
-float EmbeddingScaling(int model_dim) {
-  return HWY_DYNAMIC_DISPATCH(EmbeddingScaling)(model_dim);
-}
-
-}  // namespace gcpp
-#endif  // HWY_ONCE
diff --git a/backprop/common_scalar.h b/backprop/common_scalar.h
deleted file mode 100644
index 628962b..0000000
--- a/backprop/common_scalar.h
+++ /dev/null
@@ -1,119 +0,0 @@
-// Copyright 2024 Google LLC
-// SPDX-License-Identifier: Apache-2.0
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      https://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef THIRD_PARTY_GEMMA_CPP_GEMMA_COMMON_SCALAR_H_
-#define THIRD_PARTY_GEMMA_CPP_GEMMA_COMMON_SCALAR_H_
-
-#include <complex>
-
-namespace gcpp {
-
-template<typename T, typename U>
-U DotT(const T* a, const U* b, size_t N) {
-  U sum = {};
-  for (size_t i = 0; i < N; ++i) {
-    sum += a[i] * b[i];
-  }
-  return sum;
-}
-
-template<>
-std::complex<double> DotT(const float* a, const std::complex<double>* b,
-                          size_t N) {
-  std::complex<double> sum = {};
-  for (size_t i = 0; i < N; ++i) {
-    sum += static_cast<double>(a[i]) * b[i];
-  }
-  return sum;
-}
-
-template<typename T>
-void MulByConstT(T c, T* x, size_t N) {
-  for (size_t i = 0; i < N; ++i) {
-    x[i] *= c;
-  }
-}
-
-// out += c * x
-template<typename T>
-void MulByConstAndAddT(T c, const T* x, T* out, size_t N) {
-  for (size_t i = 0; i < N; ++i) {
-    out[i] += c * x[i];
-  }
-}
-
-template<typename T, size_t N>
-void MulByConstAndAddT(T c, const std::array<T, N>& x, std::array<T, N>& out) {
-  MulByConstAndAddT(c, x.data(), out.data(), N);
-}
-
-template<typename T>
-void AddFromT(const T* a, T* out, size_t N) {
-  for (size_t i = 0; i < N; ++i) {
-    out[i] += a[i];
-  }
-}
-
-template<typename T>
-T SquaredL2(const T* x, size_t N) {
-  T sum = {};
-  for (size_t i = 0; i < N; ++i) {
-    sum += x[i] * x[i];
-  }
-  return sum;
-}
-
-template<typename T>
-T Gelu(T x) {
-  static const T kMul = 0.044715;
-  static const T kSqrt2OverPi = 0.797884560804236;
-
-  const T x3 = x * x * x;
-  const T arg = kSqrt2OverPi * (kMul * x3 + x);
-  const T cdf = T(0.5) * (T(1.0) + std::tanh(arg));
-  return x * cdf;
-}
-
-template<typename T, typename U>
-void Rope(T* x, U base, size_t N, int i) {
-  const size_t N2 = N / 2;
-  for (size_t dim = 0; dim < N2; ++dim) {
-    const T freq_exponents = T(2 * dim) / T(N);
-    const T timescale = std::pow(base, freq_exponents);
-    const T theta = T(i) / timescale;
-    const T cos_val = std::cos(theta);
-    const T sin_val = std::sin(theta);
-    const T x0 = x[dim];
-    const T x1 = x[dim + N2];
-    x[dim] = x0 * cos_val - x1 * sin_val;
-    x[dim + N2] = x0 * sin_val + x1 * cos_val;
-  }
-}
-
-template<typename T>
-void Rope(T* x, size_t N, int i) {
-  Rope(x, T(10000.0), N, i);
-}
-
-template<typename T>
-void Rope(std::complex<T>* x, size_t N, int i) {
-  Rope(x, T(10000.0), N, i);
-}
-
-float EmbeddingScaling(int model_dim);
-
-}  // namespace gcpp
-
-#endif  // THIRD_PARTY_GEMMA_CPP_GEMMA_COMMON_SCALAR_H_
diff --git a/backprop/forward-inl.h b/backprop/forward-inl.h
index 4b7cdf1..253319c 100644
--- a/backprop/forward-inl.h
+++ b/backprop/forward-inl.h
@@ -24,8 +24,8 @@
 #include <vector>
 
 #include "gemma/activations.h"
+#include "gemma/common.h"
 #include "gemma/configs.h"
-
 #include "hwy/base.h"
 #include "hwy/contrib/thread_pool/thread_pool.h"
 
@@ -39,7 +39,6 @@
 #define THIRD_PARTY_GEMMA_CPP_FORWARD_TOGGLE
 #endif
 
-#include "gemma/common-inl.h"
 #include "gemma/ops.h"
 
 HWY_BEFORE_NAMESPACE();
diff --git a/backprop/forward.cc b/backprop/forward.cc
index fb67c2a..b712ce6 100644
--- a/backprop/forward.cc
+++ b/backprop/forward.cc
@@ -48,6 +48,7 @@ float CrossEntropyLossForwardPassT(Model model, const Prompt& prompt,
                                    const ByteStorageT& weights,
                                    ByteStorageT& forward,
                                    hwy::ThreadPool& pool) {
+  // TODO(janwas): use CallFunctorForModel
   switch (model) {
     case Model::GEMMA_2B:
       return CrossEntropyLossForwardPass<ConfigGemma2B>(
diff --git a/backprop/forward_scalar.h b/backprop/forward_scalar.h
index 5643530..44182a8 100644
--- a/backprop/forward_scalar.h
+++ b/backprop/forward_scalar.h
@@ -23,9 +23,9 @@
 #include <cmath>
 #include <complex>
 
-#include "backprop/common_scalar.h"
 #include "backprop/prompt.h"
 #include "gemma/activations.h"
+#include "gemma/common.h"  // EmbeddingScaling
 #include "gemma/weights.h"
 
 namespace gcpp {
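[Note] With common_scalar.cc gone, the scalar and SIMD paths share one EmbeddingScaling definition via gemma/common.h. A sketch of the bf16 round-trip it performs, using the hwy calls visible in the deleted file; the model_dim value is illustrative:

    #include <math.h>      // sqrtf
    #include "hwy/base.h"  // hwy::ConvertScalarTo, hwy::bfloat16_t

    // Round sqrt(model_dim) to bf16, matching the embedder, which casts to
    // bf16 before multiplying. For model_dim = 2048:
    float raw = sqrtf(2048.0f);  // ~45.2548
    float scale = hwy::ConvertScalarTo<float>(
        hwy::ConvertScalarTo<hwy::bfloat16_t>(raw));  // 45.25 after rounding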
diff --git a/backprop/optimize_test.cc b/backprop/optimize_test.cc
index 5c354a9..32d4b93 100644
--- a/backprop/optimize_test.cc
+++ b/backprop/optimize_test.cc
@@ -16,6 +16,7 @@
 #include <random>
 #include <vector>
 
+#include "gtest/gtest.h"
 #include "backprop/backward.h"
 #include "backprop/forward.h"
 #include "backprop/optimizer.h"
@@ -23,7 +24,6 @@
 #include "gemma/activations.h"
 #include "gemma/gemma.h"
 #include "gemma/weights.h"
-#include "gtest/gtest.h"
 
 namespace gcpp {
 
@@ -32,20 +32,20 @@ TEST(OptimizeTest, GradientDescent) {
   std::mt19937 gen(42);
 
   Model model_type = Model::GEMMA_TINY;
-  ByteStorageT weights = AllocateWeights(model_type, pool);
-  ByteStorageT grad = AllocateWeights(model_type, pool);
-  ByteStorageT grad_m = AllocateWeights(model_type, pool);
-  ByteStorageT grad_v = AllocateWeights(model_type, pool);
-  ByteStorageT forward = AllocateForwardPass(model_type);
-  ByteStorageT backward = AllocateForwardPass(model_type);
-  ByteStorageT inference = AllocateInferenceState(model_type);
-  auto kv_cache = CreateKVCache(model_type);
+  ByteStorageT grad = CallFunctorForModel<AllocateWeights>(model_type, pool);
+  ByteStorageT grad_m = CallFunctorForModel<AllocateWeights>(model_type, pool);
+  ByteStorageT grad_v = CallFunctorForModel<AllocateWeights>(model_type, pool);
+  ByteStorageT forward = CallFunctorForModel<AllocateForwardPass>(model_type);
+  ByteStorageT backward = CallFunctorForModel<AllocateForwardPass>(model_type);
+  KVCache kv_cache = KVCache::Create(model_type);
   size_t max_tokens = 32;
   size_t max_generated_tokens = 16;
   float temperature = 1.0f;
   int verbosity = 0;
   const auto accept_token = [](int) { return true; };
+  Gemma gemma(GemmaTokenizer(), model_type, pool);
+
   const auto generate = [&](const std::vector<int>& prompt) {
     std::vector<int> reply;
     auto stream_token = [&reply](int token, float) {
@@ -57,8 +57,7 @@ TEST(OptimizeTest, GradientDescent) {
       stream_token, accept_token, ReverseSequenceSampler::kEndToken,
     };
     TimingInfo timing_info;
-    GenerateGemma(model_type, weights, inference, runtime, prompt, 0,
-                  kv_cache, pool, timing_info);
+    gemma.Generate(runtime, prompt, 0, kv_cache, timing_info);
     return reply;
   };
 
@@ -74,12 +73,12 @@ TEST(OptimizeTest, GradientDescent) {
     return ok;
   };
 
-  RandInitWeights(model_type, weights, pool, gen);
-  ZeroInitWeights(model_type, grad_m, pool);
-  ZeroInitWeights(model_type, grad_v, pool);
+  CallFunctorForModel<RandInitWeights>(model_type, gemma.Weights(), pool, gen);
+  CallFunctorForModel<ZeroInitWeights>(model_type, grad_m, pool);
+  CallFunctorForModel<ZeroInitWeights>(model_type, grad_v, pool);
 
   printf("Initial weights:\n");
-  LogWeightStats(model_type, weights);
+  LogWeightStats(model_type, gemma.Weights());
 
   constexpr size_t kBatchSize = 8;
   float learning_rate = 0.0005f;
@@ -91,21 +90,21 @@ TEST(OptimizeTest, GradientDescent) {
   size_t num_ok;
   for (; steps < 1000000; ++steps) {
     std::mt19937 sgen(42);
-    ZeroInitWeights(model_type, grad, pool);
+    CallFunctorForModel<ZeroInitWeights>(model_type, grad, pool);
     float total_loss = 0.0f;
     num_ok = 0;
     for (size_t i = 0; i < kBatchSize; ++i) {
       Prompt prompt = training_task.Sample(sgen);
-      total_loss += CrossEntropyLossForwardPass(
-          model_type, prompt, weights, forward, pool);
-      CrossEntropyLossBackwardPass(
-          model_type, prompt, weights, forward, grad, backward, pool);
+      total_loss += CrossEntropyLossForwardPass(model_type, prompt,
+                                                gemma.Weights(), forward, pool);
+      CrossEntropyLossBackwardPass(model_type, prompt, gemma.Weights(), forward,
+                                   grad, backward, pool);
       num_ok += verify(prompt) ? 1 : 0;
     }
     total_loss /= kBatchSize;
     const float scale = -learning_rate / kBatchSize;
-    UpdateWeights(model_type, grad, scale, weights, pool);
+    UpdateWeights(model_type, grad, scale, gemma.Weights(), pool);
     printf("step: %zu total_loss: %.15f num_ok: %zu/%zu\n", steps, total_loss,
            num_ok, kBatchSize);
     if (steps % 100 == 0) {
@@ -119,7 +118,7 @@ TEST(OptimizeTest, GradientDescent) {
   }
   printf("Num steps: %zu\n", steps);
   printf("Final weights:\n");
-  LogWeightStats(model_type, weights);
+  LogWeightStats(model_type, gemma.Weights());
   EXPECT_LT(steps, 3000);
   EXPECT_EQ(num_ok, kBatchSize);
 }
diff --git a/backprop/optimizer.cc b/backprop/optimizer.cc
index 5b1c61d..79659b7 100644
--- a/backprop/optimizer.cc
+++ b/backprop/optimizer.cc
@@ -41,14 +41,16 @@ class WeightInitializer {
 };
 
 template <typename TConfig>
-void RandInitWeights(ByteStorageT& weights_u8, hwy::ThreadPool& pool,
-                     std::mt19937& gen) {
-  auto& weights = *reinterpret_cast<Weights<TConfig>*>(weights_u8.get());
-  // TODO(szabadka) Use the same weight initialization method as in the python
-  // version.
-  WeightInitializer init(gen);
-  ForEachTensor1(init, weights);
-}
+struct RandInitWeights {
+  void operator()(ByteStorageT& weights_u8, hwy::ThreadPool& pool,
+                  std::mt19937& gen) const {
+    auto& weights = *reinterpret_cast<Weights<TConfig>*>(weights_u8.get());
+    // TODO(szabadka) Use the same weight initialization method as in the python
+    // version.
+    WeightInitializer init(gen);
+    ForEachTensor1(init, weights);
+  }
+};
 
 class WeightUpdater {
  public:
@@ -67,55 +69,22 @@ class WeightUpdater {
 };
 
 template <typename TConfig>
-void UpdateWeights(const ByteStorageT& grad_u8, float scale,
-                   ByteStorageT& weights_u8, hwy::ThreadPool& pool) {
-  const auto& grad =
-      *reinterpret_cast<const Weights<TConfig>*>(grad_u8.get());
-  auto& weights = *reinterpret_cast<Weights<TConfig>*>(weights_u8.get());
-  WeightUpdater updater(scale);
-  ForEachTensor2(updater, grad, weights);
-}
+struct UpdateWeightsT {
+  void operator()(const ByteStorageT& grad_u8, float scale,
+                  ByteStorageT& weights_u8, hwy::ThreadPool& pool) const {
+    const auto& grad =
+        *reinterpret_cast<const Weights<TConfig>*>(grad_u8.get());
+    auto& weights = *reinterpret_cast<Weights<TConfig>*>(weights_u8.get());
+    WeightUpdater updater(scale);
+    ForEachTensor2(updater, grad, weights);
+  }
+};
 
 }  // namespace
 
-void RandInitWeights(Model model, ByteStorageT& weights_u8,
-                     hwy::ThreadPool& pool, std::mt19937& gen) {
-  switch (model) {
-    case Model::GEMMA_2B:
-      RandInitWeights<ConfigGemma2B>(weights_u8, pool, gen);
-      break;
-    case Model::GEMMA_7B:
-      RandInitWeights<ConfigGemma7B>(weights_u8, pool, gen);
-      break;
-    case Model::GRIFFIN_2B:
-      RandInitWeights<ConfigGriffin2B>(weights_u8, pool, gen);
-      break;
-    case Model::GEMMA_TINY:
-      RandInitWeights<ConfigGemmaTiny>(weights_u8, pool, gen);
-      break;
-    default:
-      HWY_ABORT("Model type %d unknown.", static_cast<int>(model));
-  }
-}
-
 void UpdateWeights(Model model, const ByteStorageT& grad, float scale,
                    ByteStorageT& weights, hwy::ThreadPool& pool) {
-  switch (model) {
-    case Model::GEMMA_2B:
-      UpdateWeights<ConfigGemma2B>(grad, scale, weights, pool);
-      break;
-    case Model::GEMMA_7B:
-      UpdateWeights<ConfigGemma7B>(grad, scale, weights, pool);
-      break;
-    case Model::GRIFFIN_2B:
-      UpdateWeights<ConfigGriffin2B>(grad, scale, weights, pool);
-      break;
-    case Model::GEMMA_TINY:
-      UpdateWeights<ConfigGemmaTiny>(grad, scale, weights, pool);
-      break;
-    default:
-      HWY_ABORT("Model type %d unknown.", static_cast<int>(model));
-  }
+  CallFunctorForModel<UpdateWeightsT>(model, grad, scale, weights, pool);
 }
 
 }  // namespace gcpp
diff --git a/backprop/optimizer.h b/backprop/optimizer.h
index 343db97..157352b 100644
--- a/backprop/optimizer.h
+++ b/backprop/optimizer.h
@@ -16,16 +16,11 @@
 #ifndef THIRD_PARTY_GEMMA_CPP_GEMMA_OPTIMIZER_H_
 #define THIRD_PARTY_GEMMA_CPP_GEMMA_OPTIMIZER_H_
 
-#include <random>
-
 #include "gemma/common.h"
 #include "hwy/contrib/thread_pool/thread_pool.h"
 
 namespace gcpp {
 
-void RandInitWeights(Model model, ByteStorageT& weights, hwy::ThreadPool& pool,
-                     std::mt19937& gen);
-
 void UpdateWeights(Model model, const ByteStorageT& grad, float scale,
                    ByteStorageT& weights, hwy::ThreadPool& pool);
 
diff --git a/debug_prompt.cc b/debug_prompt.cc
index 8954f29..0ceb545 100644
--- a/debug_prompt.cc
+++ b/debug_prompt.cc
@@ -37,7 +37,7 @@ std::pair<std::string, size_t> QueryModel(
     gcpp::KVCache& kv_cache, hwy::ThreadPool& pool, const std::string& input,
     gcpp::LayersOutputT* layers_output) {
   std::vector<int> prompt;
-  HWY_ASSERT(model.Tokenizer()->Encode(input, &prompt));
+  HWY_ASSERT(model.Tokenizer().Encode(input, &prompt));
 
   // For both pre-trained and instruction-tuned models: prepend "<bos>" token
   // if needed.
@@ -48,11 +48,10 @@ std::pair<std::string, size_t> QueryModel(
   std::mt19937 gen;
   gen.seed(42);
 
-  auto stream_token = [&res, &total_tokens, &app,
-                       tokenizer = model.Tokenizer()](int token, float) {
+  auto stream_token = [&res, &total_tokens, &model](int token, float) {
     ++total_tokens;
     std::string token_text;
-    HWY_ASSERT(tokenizer->Decode(std::vector<int>{token}, &token_text));
+    HWY_ASSERT(model.Tokenizer().Decode(std::vector<int>{token}, &token_text));
     res += token_text;
     return true;
   };
@@ -70,8 +69,8 @@ std::pair<std::string, size_t> QueryModel(
       .stream_token = stream_token,
       .accept_token = accept_token,
   };
-  GenerateGemma(model, runtime_config, prompt, /*start_pos=*/0, kv_cache, pool,
-                timing_info, layers_output);
+  model.Generate(runtime_config, prompt, /*start_pos=*/0, kv_cache, timing_info,
+                 layers_output);
   return {res, total_tokens};
 }
 
@@ -115,7 +114,7 @@ int main(int argc, char** argv) {
   }
 
   gcpp::Gemma model(loader.tokenizer, loader.weights, loader.ModelType(), pool);
-  auto kv_cache = CreateKVCache(loader.ModelType());
+  gcpp::KVCache kv_cache = gcpp::KVCache::Create(loader.ModelType());
 
   const std::string& prompt = prompt_args.prompt;
   if (prompt.empty()) {
diff --git a/examples/hello_world/run.cc b/examples/hello_world/run.cc
index 9bb0d9f..9ac7259 100644
--- a/examples/hello_world/run.cc
+++ b/examples/hello_world/run.cc
@@ -40,7 +40,7 @@ int main(int argc, char** argv) {
 
   // Instantiate model and KV Cache
   gcpp::Gemma model(loader.tokenizer, loader.weights, loader.ModelType(), pool);
-  auto kv_cache = CreateKVCache(loader.ModelType());
+  gcpp::KVCache kv_cache = gcpp::KVCache::Create(loader.ModelType());
   size_t pos = 0;  // KV Cache position
 
   // Initialize random number generator
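[Note] Tokenizer() now returns a reference instead of a pointer, so call sites switch from "->" to ".". The assumed round-trip, per the hunks above:

    std::vector<int> ids;
    HWY_ASSERT(model.Tokenizer().Encode("Hello world", &ids));
    std::string text;
    HWY_ASSERT(model.Tokenizer().Decode(ids, &text));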
- -#include "gemma/activations.h" - -#include "gemma/common.h" -#include "gemma/configs.h" - -namespace gcpp { - -ByteStorageT AllocateForwardPass(Model model) { - switch (model) { - case Model::GEMMA_2B: - return ForwardPass::Allocate(); - case Model::GEMMA_7B: - return ForwardPass::Allocate(); - case Model::GRIFFIN_2B: - return ForwardPass::Allocate(); - case Model::GEMMA_TINY: - return ForwardPass::Allocate(); - default: - HWY_ABORT("Model type %d unknown.", static_cast(model)); - } -} - -} // namespace gcpp diff --git a/gemma/activations.h b/gemma/activations.h index 9a43344..6894d67 100644 --- a/gemma/activations.h +++ b/gemma/activations.h @@ -16,8 +16,11 @@ #ifndef THIRD_PARTY_GEMMA_CPP_GEMMA_ACTIVATIONS_H_ #define THIRD_PARTY_GEMMA_CPP_GEMMA_ACTIVATIONS_H_ -#include "gemma/common.h" -#include "hwy/aligned_allocator.h" +#include + +#include + +#include "gemma/common.h" // ByteStorageT namespace gcpp { @@ -54,9 +57,12 @@ struct ForwardPass { std::array final_norm_output; std::array logits; std::array probs; +}; - static ByteStorageT Allocate() { - return hwy::AllocateAligned(sizeof(ForwardPass)); +template +struct AllocateForwardPass { + ByteStorageT operator()() const { + return AllocateSizeof>(); } }; @@ -78,8 +84,6 @@ class ActivationsWrapper { WrappedT& activations_; }; -ByteStorageT AllocateForwardPass(Model model); - } // namespace gcpp #endif // THIRD_PARTY_GEMMA_CPP_GEMMA_ACTIVATIONS_H_ diff --git a/gemma/benchmark.cc b/gemma/benchmark.cc index 2233de9..1d41153 100644 --- a/gemma/benchmark.cc +++ b/gemma/benchmark.cc @@ -1,4 +1,5 @@ #include +#include // EXIT_FAILURE #include #include #include @@ -8,6 +9,7 @@ #include // std::pair #include +#include "compression/io.h" // Path #include "gemma/gemma.h" #include "util/app.h" #include "util/args.h" @@ -61,7 +63,7 @@ std::pair QueryModel( gcpp::Gemma& model, gcpp::InferenceArgs& args, gcpp::AppArgs& app, gcpp::KVCache& kv_cache, hwy::ThreadPool& pool, const std::string& input) { std::vector prompt; - HWY_ASSERT(model.Tokenizer()->Encode(input, &prompt)); + HWY_ASSERT(model.Tokenizer().Encode(input, &prompt)); // For both pre-trained and instruction-tuned models: prepend "" token // if needed. 
diff --git a/gemma/benchmark.cc b/gemma/benchmark.cc
index 2233de9..1d41153 100644
--- a/gemma/benchmark.cc
+++ b/gemma/benchmark.cc
@@ -1,4 +1,5 @@
 #include <stdio.h>
+#include <stdlib.h>  // EXIT_FAILURE
 #include <fstream>
 #include <iostream>
 #include <random>
@@ -8,6 +9,7 @@
 #include <utility>  // std::pair
 #include <vector>
 
+#include "compression/io.h"  // Path
 #include "gemma/gemma.h"
 #include "util/app.h"
 #include "util/args.h"
@@ -61,7 +63,7 @@ std::pair<std::string, size_t> QueryModel(
     gcpp::Gemma& model, gcpp::InferenceArgs& args, gcpp::AppArgs& app,
     gcpp::KVCache& kv_cache, hwy::ThreadPool& pool, const std::string& input) {
   std::vector<int> prompt;
-  HWY_ASSERT(model.Tokenizer()->Encode(input, &prompt));
+  HWY_ASSERT(model.Tokenizer().Encode(input, &prompt));
 
   // For both pre-trained and instruction-tuned models: prepend "<bos>" token
   // if needed.
@@ -73,11 +75,11 @@ std::pair<std::string, size_t> QueryModel(
   gen.seed(42);
 
   const double time_start = hwy::platform::Now();
-  auto stream_token = [&res, &total_tokens, &time_start, &app,
-                       tokenizer = model.Tokenizer()](int token, float) {
+  auto stream_token = [&res, &total_tokens, &time_start, &app, &model](
+                          int token, float) {
     ++total_tokens;
     std::string token_text;
-    HWY_ASSERT(tokenizer->Decode(std::vector<int>{token}, &token_text));
+    HWY_ASSERT(model.Tokenizer().Decode(std::vector<int>{token}, &token_text));
     res += token_text;
     if (app.verbosity >= 1 && total_tokens % 100 == 0) {
       LogSpeedStats(time_start, total_tokens);
@@ -98,8 +100,8 @@ std::pair<std::string, size_t> QueryModel(
       .stream_token = stream_token,
       .accept_token = accept_token,
   };
-  GenerateGemma(model, runtime_config, prompt, /*start_pos=*/0, kv_cache, pool,
-                timing_info, /*layers_output=*/nullptr);
+  model.Generate(runtime_config, prompt, /*start_pos=*/0, kv_cache, timing_info,
+                 /*layers_output=*/nullptr);
   if (app.verbosity >= 1) {
     LogSpeedStats(time_start, total_tokens);
   }
@@ -190,7 +192,7 @@ int BenchmarkCrossEntropy(gcpp::Gemma& model, gcpp::Model model_type,
                           size_t batch_tokens) {
   std::string input = ReadFile(text);
   std::vector<int> prompt;
-  HWY_ASSERT(model.Tokenizer()->Encode(input, &prompt));
+  HWY_ASSERT(model.Tokenizer().Encode(input, &prompt));
   prompt.resize(std::min(args.max_tokens, prompt.size()));
   std::cout << "Number of input tokens: " << prompt.size() << "\n";
   const double time_start = hwy::platform::Now();
@@ -201,14 +203,13 @@ int BenchmarkCrossEntropy(gcpp::Gemma& model, gcpp::Model model_type,
     size_t num_tokens = std::min(prompt.size() - pos, batch_tokens);
     std::vector<int> prompt_slice(prompt.begin() + pos,
                                   prompt.begin() + pos + num_tokens);
-    auto kv_cache = CreateKVCache(model_type);
-    float entropy =
-        ComputeCrossEntropy(model, num_tokens, prompt_slice, kv_cache, pool,
-                            app.verbosity);
+    gcpp::KVCache kv_cache = gcpp::KVCache::Create(model_type);
+    float entropy = model.ComputeCrossEntropy(num_tokens, prompt_slice,
+                                              kv_cache, app.verbosity);
     total_entropy += entropy;
     LogSpeedStats(time_start, pos + num_tokens);
     std::string text_slice;
-    HWY_ASSERT(model.Tokenizer()->Decode(prompt_slice, &text_slice));
+    HWY_ASSERT(model.Tokenizer().Decode(prompt_slice, &text_slice));
     total_input_len += text_slice.size();
     printf("Cross entropy per byte: %f [cumulative: %f]\n",
            entropy / text_slice.size(), total_entropy / total_input_len);
@@ -277,7 +278,7 @@ int main(int argc, char** argv) {
   }
 
   gcpp::Gemma model(loader.tokenizer, loader.weights, loader.ModelType(), pool);
-  auto kv_cache = CreateKVCache(loader.ModelType());
+  gcpp::KVCache kv_cache = gcpp::KVCache::Create(loader.ModelType());
 
   if (!benchmark_args.goldens.path.empty()) {
     const std::string golden_path =
diff --git a/gemma/common-inl.h b/gemma/common-inl.h
deleted file mode 100644
index ac39d73..0000000
--- a/gemma/common-inl.h
+++ /dev/null
@@ -1,68 +0,0 @@
-// Copyright 2024 Google LLC
-// SPDX-License-Identifier: Apache-2.0
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      https://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// Include guard for non-SIMD code.
-#ifndef THIRD_PARTY_GEMMA_CPP_GEMMA_COMMON_INL_H_
-#define THIRD_PARTY_GEMMA_CPP_GEMMA_COMMON_INL_H_
-
-#include <math.h>
-#include <stddef.h>
-
-#include <array>
-#include <cmath>
-
-#include "gemma/activations.h"
-
-#include "hwy/base.h"
-#include "hwy/contrib/thread_pool/thread_pool.h"
-
-#endif  // THIRD_PARTY_GEMMA_CPP_GEMMA_COMMON_INL_H_
-
-// Include guard for (potentially) SIMD code.
-#if defined(THIRD_PARTY_GEMMA_CPP_COMMON_TOGGLE) == defined(HWY_TARGET_TOGGLE)
-#ifdef THIRD_PARTY_GEMMA_CPP_COMMON_TOGGLE
-#undef THIRD_PARTY_GEMMA_CPP_COMMON_TOGGLE
-#else
-#define THIRD_PARTY_GEMMA_CPP_COMMON_TOGGLE
-#endif
-
-#include "gemma/ops.h"
-
-HWY_BEFORE_NAMESPACE();
-namespace gcpp {
-namespace HWY_NAMESPACE {
-namespace hn = hwy::HWY_NAMESPACE;
-
-// `EmbeddingScaling` can be constexpr only if `Sqrt` and `hwy::ConvertScalarTo`
-// are both constexpr
-#if HWY_COMPILER_GCC_ACTUAL
-#define GEMMA_CONSTEXPR_EMBSCALING HWY_BF16_CONSTEXPR
-#else
-#define GEMMA_CONSTEXPR_EMBSCALING
-#endif
-
-template <typename TConfig>
-GEMMA_CONSTEXPR_EMBSCALING float EmbeddingScaling() {
-  // Round to bf16 to match Gemma's Embedder, which casts before mul.
-  return hwy::ConvertScalarTo<float>(hwy::ConvertScalarTo<hwy::bfloat16_t>(
-      Sqrt(static_cast<float>(TConfig::kModelDim))));
-}
-
-// NOLINTNEXTLINE(google-readability-namespace-comments)
-}  // namespace HWY_NAMESPACE
-}  // namespace gcpp
-HWY_AFTER_NAMESPACE();
-
-#endif  // NOLINT
+ +#include "gemma/common.h" + +#include +#include + +#include // std::transform +#include +#include +#include + +#include "hwy/base.h" +#include "hwy/contrib/thread_pool/thread_pool.h" + +namespace gcpp { + +namespace { +constexpr const char* kModelFlags[] = {"2b-pt", "7b-pt", "gr2b-pt", "2b-it", + "7b-it", "gr2b-it", "tiny"}; +constexpr Model kModelTypes[] = { + Model::GEMMA_2B, Model::GEMMA_7B, Model::GRIFFIN_2B, Model::GEMMA_2B, + Model::GEMMA_7B, Model::GRIFFIN_2B, Model::GEMMA_TINY}; +constexpr ModelTraining kModelTraining[] = { + ModelTraining::GEMMA_PT, ModelTraining::GEMMA_PT, ModelTraining::GEMMA_PT, + ModelTraining::GEMMA_IT, ModelTraining::GEMMA_IT, ModelTraining::GEMMA_IT, + ModelTraining::GEMMA_IT}; +} // namespace + +const char* ParseModelTypeAndTraining(const std::string& model_flag, + Model& model, ModelTraining& training) { + constexpr size_t kNum = std::end(kModelFlags) - std::begin(kModelFlags); + static char kErrorMessageBuffer[kNum * 8 + 1024] = + "Invalid or missing model flag, need to specify one of "; + for (size_t i = 0; i + 1 < kNum; i++) { + strcat(kErrorMessageBuffer, kModelFlags[i]); // NOLINT + strcat(kErrorMessageBuffer, ", "); // NOLINT + } + strcat(kErrorMessageBuffer, kModelFlags[kNum - 1]); // NOLINT + strcat(kErrorMessageBuffer, "."); // NOLINT + std::string model_type_lc = model_flag; + std::transform(begin(model_type_lc), end(model_type_lc), begin(model_type_lc), + [](unsigned char c) { return std::tolower(c); }); + for (size_t i = 0; i < kNum; i++) { + if (kModelFlags[i] == model_type_lc) { + model = kModelTypes[i]; + training = kModelTraining[i]; + return nullptr; + } + } + return kErrorMessageBuffer; +} + +} // namespace gcpp diff --git a/gemma/common.h b/gemma/common.h index d497259..e277095 100644 --- a/gemma/common.h +++ b/gemma/common.h @@ -16,15 +16,115 @@ #ifndef THIRD_PARTY_GEMMA_CPP_GEMMA_COMMON_H_ #define THIRD_PARTY_GEMMA_CPP_GEMMA_COMMON_H_ +#include // sqrtf +#include + +#include + +#include "gemma/configs.h" // IWYU pragma: export #include "hwy/aligned_allocator.h" +#include "hwy/base.h" // ConvertScalarTo namespace gcpp { using ByteStorageT = hwy::AlignedFreeUniquePtr; +template +ByteStorageT AllocateSizeof() { + return hwy::AllocateAligned(sizeof(T)); +} + // Model variants: see configs.h for details. enum class Model { GEMMA_2B, GEMMA_7B, GRIFFIN_2B, GEMMA_TINY }; +enum class ModelTraining { GEMMA_IT, GEMMA_PT }; + +// Returns the return value of Func().operator() called with `args`, where +// `T` is selected based on `model`. +// +// This is used to implement type-erased functions such as +// LoadCompressedWeights, which can be called from other .cc files, by calling a +// functor LoadCompressedWeightsT, which has a template argument. `Func` must +// be a functor because function templates cannot be passed as a template +// template argument, and we prefer to avoid the overhead of std::function. +// +// This function avoids having to update all call sites when we extend `Model`. +template