From 7d0720675f99f41ca884ef9b2cc1331a5b6a07b7 Mon Sep 17 00:00:00 2001 From: Jan Wassenberg Date: Mon, 17 Jun 2024 06:16:17 -0700 Subject: [PATCH] Move raw_weights into separate header, used mainly by compress_weights. Fix warnings in backprop/* (include) PiperOrigin-RevId: 643983136 --- BUILD.bazel | 21 ++- CMakeLists.txt | 1 + backprop/backward-inl.h | 1 - backprop/backward.h | 2 - backprop/backward_scalar.h | 3 +- backprop/backward_scalar_test.cc | 12 +- backprop/backward_test.cc | 9 +- backprop/forward-inl.h | 9 +- backprop/forward_scalar.h | 2 +- backprop/test_util.h | 33 +--- gemma/compress_weights.cc | 4 +- gemma/gemma.cc | 5 +- gemma/weights.cc | 4 +- gemma/weights.h | 284 ++++++++----------------------- gemma/weights_raw.h | 242 ++++++++++++++++++++++++++ 15 files changed, 353 insertions(+), 279 deletions(-) create mode 100644 gemma/weights_raw.h diff --git a/BUILD.bazel b/BUILD.bazel index fc64dc5..42bfeea 100644 --- a/BUILD.bazel +++ b/BUILD.bazel @@ -83,6 +83,18 @@ cc_library( ], ) +cc_library( + name = "weights_raw", + hdrs = ["gemma/weights_raw.h"], + deps = [ + ":common", + ":weights", + "//compression:compress", + "@hwy//:hwy", + "@hwy//:thread_pool", + ], +) + cc_library( name = "gemma_lib", srcs = [ @@ -214,6 +226,7 @@ cc_binary( ":common", ":gemma_lib", ":weights", + ":weights_raw", # Placeholder for internal dep, do not remove., "//compression:compress", "@hwy//:hwy", @@ -331,7 +344,7 @@ cc_library( ":common", ":gemma_lib", ":prompt", - ":weights", + ":weights_raw", ], ) @@ -346,7 +359,7 @@ cc_test( ":backprop_scalar", ":prompt", ":sampler", - ":weights", + ":weights_raw", "@googletest//:gtest_main", ], ) @@ -363,11 +376,9 @@ cc_test( ":backprop_scalar", ":gemma_lib", ":ops", - ":prompt", ":sampler", - ":weights", + ":weights_raw", "@googletest//:gtest_main", - "//compression:compress", "@hwy//:hwy", "@hwy//:hwy_test_util", "@hwy//:thread_pool", diff --git a/CMakeLists.txt b/CMakeLists.txt index 1473d3b..ae4fb73 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -73,6 +73,7 @@ set(SOURCES gemma/ops.h gemma/weights.cc gemma/weights.h + gemma/weights_raw.h util/app.h util/args.h ) diff --git a/backprop/backward-inl.h b/backprop/backward-inl.h index d6c4d68..837dc13 100644 --- a/backprop/backward-inl.h +++ b/backprop/backward-inl.h @@ -23,7 +23,6 @@ #include #include -#include #include #include "backprop/prompt.h" diff --git a/backprop/backward.h b/backprop/backward.h index 6917f20..aac2122 100644 --- a/backprop/backward.h +++ b/backprop/backward.h @@ -16,8 +16,6 @@ #ifndef THIRD_PARTY_GEMMA_CPP_GEMMA_BACKWARD_H_ #define THIRD_PARTY_GEMMA_CPP_GEMMA_BACKWARD_H_ -#include - #include "backprop/prompt.h" #include "gemma/common.h" #include "hwy/contrib/thread_pool/thread_pool.h" diff --git a/backprop/backward_scalar.h b/backprop/backward_scalar.h index 024e2a4..aa652ac 100644 --- a/backprop/backward_scalar.h +++ b/backprop/backward_scalar.h @@ -20,14 +20,13 @@ #include #include -#include #include #include "backprop/common_scalar.h" #include "backprop/prompt.h" #include "gemma/activations.h" #include "gemma/common.h" // EmbeddingScaling -#include "gemma/weights.h" +#include "gemma/weights_raw.h" namespace gcpp { template diff --git a/backprop/backward_scalar_test.cc b/backprop/backward_scalar_test.cc index 9a94484..2a9d99b 100644 --- a/backprop/backward_scalar_test.cc +++ b/backprop/backward_scalar_test.cc @@ -15,6 +15,9 @@ #include "backprop/backward_scalar.h" +#include +#include // memset + #include #include #include @@ -23,6 +26,7 @@ #include "backprop/forward_scalar.h" 
#include "backprop/sampler.h" #include "backprop/test_util.h" +#include "gemma/weights_raw.h" namespace gcpp { @@ -55,8 +59,8 @@ TEST(BackPropTest, MatMulVJP) { memset(&grad, 0, sizeof(grad)); MatMulVJPT(weights.data(), x.data(), dy.data(), grad.data(), dx.data(), kRows, kCols, kTokens); - TestGradient(dx, c_x, func, 1e-11, 1e-12,__LINE__); - TestGradient(grad, c_weights, func, 1e-14, 1e-12,__LINE__); + TestGradient(dx, c_x, func, 1e-11, 1e-12, __LINE__); + TestGradient(grad, c_weights, func, 1e-14, 1e-12, __LINE__); } } @@ -91,8 +95,8 @@ TEST(BackPropTest, MultiHeadMatMulVJP) { memset(&grad, 0, sizeof(grad)); MultiHeadMatMulVJPT(weights.data(), x.data(), dy.data(), grad.data(), dx.data(), kHeads, kRows, kCols, kTokens); - TestGradient(dx, c_x, func, 1e-15, 1e-13,__LINE__); - TestGradient(grad, c_weights, func, 1e-15, 1e-13,__LINE__); + TestGradient(dx, c_x, func, 1e-15, 1e-13, __LINE__); + TestGradient(grad, c_weights, func, 1e-15, 1e-13, __LINE__); } } diff --git a/backprop/backward_test.cc b/backprop/backward_test.cc index 4a3e4cc..146ce67 100644 --- a/backprop/backward_test.cc +++ b/backprop/backward_test.cc @@ -19,7 +19,6 @@ #include -#include #include #include #include @@ -29,10 +28,8 @@ #include "backprop/forward_scalar.h" #include "backprop/sampler.h" #include "backprop/test_util.h" -#include "compression/compress.h" -#include "gemma/gemma.h" -#include "gemma/weights.h" -#include "hwy/aligned_allocator.h" +#include "gemma/activations.h" +#include "gemma/weights_raw.h" #include "hwy/base.h" #include "hwy/contrib/thread_pool/thread_pool.h" @@ -52,8 +49,6 @@ HWY_BEFORE_NAMESPACE(); namespace gcpp { namespace HWY_NAMESPACE { -namespace hn = hwy::HWY_NAMESPACE; - void TestMatMulVJP() { static const size_t kRows = 8; static const size_t kCols = 64; diff --git a/backprop/forward-inl.h b/backprop/forward-inl.h index 49efedd..b322061 100644 --- a/backprop/forward-inl.h +++ b/backprop/forward-inl.h @@ -20,8 +20,8 @@ #include #include -#include #include +#include #include "gemma/activations.h" #include "gemma/common.h" @@ -40,11 +40,11 @@ #endif #include "gemma/ops.h" +#include "hwy/highway.h" HWY_BEFORE_NAMESPACE(); namespace gcpp { namespace HWY_NAMESPACE { -namespace hn = hwy::HWY_NAMESPACE; template void InputEmbedding(const ArrayT& weights, const std::vector& prompt, @@ -202,11 +202,10 @@ void ApplyForwardLayer(const LayerT& weights, activations.ffw_hidden_gated.data() + pos * kFFHiddenDim; namespace hn = hwy::HWY_NAMESPACE; using DF = hn::ScalableTag; - using VF = hn::Vec; DF df; for (size_t i = 0; i < kFFHiddenDim; i += Lanes(df)) { - const auto y = Load(df, out + i); - const auto x = Load(df, out_mul + i); + const auto y = hn::Load(df, out + i); + const auto x = hn::Load(df, out_mul + i); hn::Store(hn::Mul(x, Gelu(df, y)), df, out_gated + i); } } diff --git a/backprop/forward_scalar.h b/backprop/forward_scalar.h index 8bc125e..95c5f0c 100644 --- a/backprop/forward_scalar.h +++ b/backprop/forward_scalar.h @@ -27,7 +27,7 @@ #include "backprop/prompt.h" #include "gemma/activations.h" #include "gemma/common.h" // EmbeddingScaling -#include "gemma/weights.h" +#include "gemma/weights_raw.h" namespace gcpp { diff --git a/backprop/test_util.h b/backprop/test_util.h index 939411d..387b979 100644 --- a/backprop/test_util.h +++ b/backprop/test_util.h @@ -16,43 +16,16 @@ #ifndef THIRD_PARTY_GEMMA_CPP_GEMMA_TEST_UTIL_H_ #define THIRD_PARTY_GEMMA_CPP_GEMMA_TEST_UTIL_H_ +#include + #include #include -#include -#include "gemma/weights.h" #include "gtest/gtest.h" +#include "gemma/weights_raw.h" 
 namespace gcpp {
 
-template <typename T, size_t kLen>
-void RandInit(std::array<T, kLen>& x, T stddev, std::mt19937& gen) {
-  std::normal_distribution<T> dist(0.0, stddev);
-  for (size_t i = 0; i < kLen; ++i) {
-    x[i] = dist(gen);
-  }
-}
-
-template <typename T, typename TConfig>
-void RandInit(Layer<T, TConfig>& w, T stddev, std::mt19937& gen) {
-  RandInit(w.pre_attention_norm_scale, stddev, gen);
-  RandInit(w.attn_vec_einsum_w, stddev, gen);
-  RandInit(w.qkv_einsum_w, stddev, gen);
-  RandInit(w.pre_ffw_norm_scale, stddev, gen);
-  RandInit(w.gating_einsum_w, stddev, gen);
-  RandInit(w.linear_w, stddev, gen);
-}
-
-template <typename T, typename TConfig>
-void RandInit(Weights<T, TConfig>& w, T stddev, std::mt19937& gen) {
-  static constexpr size_t kLayers = TConfig::kLayers;
-  RandInit(w.embedder_input_embedding, stddev, gen);
-  RandInit(w.final_norm_scale, stddev, gen);
-  for (size_t i = 0; i < kLayers; ++i) {
-    RandInit(*w.GetLayer(i), stddev, gen);
-  }
-}
-
 template <typename T, size_t kLen>
 void Complexify(const std::array<T, kLen>& x,
                 std::array<std::complex<T>, kLen>& c_x) {
diff --git a/gemma/compress_weights.cc b/gemma/compress_weights.cc
index 8552802..bff3460 100644
--- a/gemma/compress_weights.cc
+++ b/gemma/compress_weights.cc
@@ -40,6 +40,7 @@
 #include "compression/io.h"  // Path
 #include "gemma/common.h"    // Model
 #include "gemma/weights.h"
+#include "gemma/weights_raw.h"
 #include "util/args.h"
 #include "hwy/base.h"
 #include "hwy/contrib/thread_pool/thread_pool.h"
@@ -317,7 +318,8 @@ void CompressWeights(const Path& weights_path,
   WeightsF<TConfig>* weights =
       reinterpret_cast<WeightsF<TConfig>*>(weights_u8.get());
   Compressor compressor(pool);
-  ForEachTensor(weights, *c_weights, compressor);
+  ForEachTensor>(
+      weights, *c_weights, compressor);
   compressor.AddScales(weights->scales.data(), weights->scales.size());
   compressor.WriteAll(pool, compressed_weights_path);
 
diff --git a/gemma/gemma.cc b/gemma/gemma.cc
index 1716399..347a4cd 100644
--- a/gemma/gemma.cc
+++ b/gemma/gemma.cc
@@ -66,7 +66,6 @@ constexpr bool kShowTokenization = false;
 // Must be aligned.
 template
 struct Activations {
-  using LayerConfig = LayerF<TConfig>;
   static constexpr size_t kModelDim = TConfig::kModelDim;
   static constexpr size_t kQKVDim = TConfig::kQKVDim;
   static constexpr size_t kHeads = TConfig::kHeads;
@@ -979,8 +978,8 @@ Gemma::Gemma(GemmaTokenizer&& tokenizer, Model model_type, Type weight_type,
 }
 
 Gemma::~Gemma() {
-  CallForModelAndWeight<DeleteLayersPtrs>(model_type_, weight_type_,
-                                          weights_u8_);
+  CallForModelAndWeight<DeleteCompressedWeights>(model_type_, weight_type_,
+                                                 weights_u8_);
 }
 
 void Gemma::Generate(const RuntimeConfig& runtime_config,
diff --git a/gemma/weights.cc b/gemma/weights.cc
index d850cfd..660576a 100644
--- a/gemma/weights.cc
+++ b/gemma/weights.cc
@@ -15,7 +15,6 @@
 
 #include "gemma/weights.h"
 
-#include
 #include
 
 #include "compression/compress.h"
@@ -47,7 +46,8 @@ struct LoadCompressedWeightsT {
    std::array<float, TConfig::kNumTensorScales> scales;
 
    CacheLoader loader(weights);
-   ForEachTensor(nullptr, *c_weights, loader);
+   const void* raw_weights = nullptr;  // ForEachTensor requires const.
+   ForEachTensor(raw_weights, *c_weights, loader);
    loader.LoadScales(scales.data(), scales.size());
    if (!loader.ReadAll(pool)) {
      HWY_ABORT("Failed to load model weights.");
diff --git a/gemma/weights.h b/gemma/weights.h
index 8d21cf3..5192319 100644
--- a/gemma/weights.h
+++ b/gemma/weights.h
@@ -25,12 +25,17 @@
 
 namespace gcpp {
 
-// ----------------------------------------------------------------------------
-// Uncompressed
+template <class TConfig>
+struct CompressedLayer {
+  // No ctor/dtor, allocated via AllocateAligned.
+
+  using Weight = typename TConfig::Weight;
+  // If weights are f32, also f32; otherwise at least bf16.
Useful for ops that + // do not yet support smaller compressed types, or require at least bf16. When + // weights are f32, we also want such tensors to be f32. + using WeightF32OrBF16 = + hwy::If(), float, hwy::bfloat16_t>; -template -struct Layer { - Layer() {} static constexpr size_t kHeads = TConfig::kHeads; static constexpr size_t kKVHeads = TConfig::kKVHeads; static constexpr size_t kModelDim = TConfig::kModelDim; @@ -49,111 +54,6 @@ struct Layer { static constexpr size_t kGriffinDim = TConfig::kGriffinLayers > 0 ? kModelDim : 0; - union { - struct { - std::array attn_vec_einsum_w; - std::array qkv_einsum_w; - std::array attention_output_biases; - }; - - struct { - std::array linear_x_w; - std::array linear_x_biases; - std::array linear_y_w; - std::array linear_y_biases; - std::array linear_out_w; - std::array linear_out_biases; - std::array conv_w; - std::array conv_biases; - std::array gate_w; - std::array gate_biases; - std::array a; - } griffin; - }; - - std::array gating_einsum_w; - std::array linear_w; - std::array pre_attention_norm_scale; - std::array pre_ffw_norm_scale; - std::array post_attention_norm_scale; - std::array post_ffw_norm_scale; - - std::array ffw_gating_biases; - std::array ffw_output_biases; -}; - -template -using LayerF = Layer; - -// Array instead of single large allocation for parallel mem init. Split out of -// Weights so that only these pointers are initialized. -template -struct LayerPointers { - explicit LayerPointers(hwy::ThreadPool& pool) { - pool.Run(0, TConfig::kLayers, [this](uint64_t task, size_t /*thread*/) { - this->layers[task] = hwy::AllocateAligned>(1); - }); - } - - using TLayer = Layer; - std::array, TConfig::kLayers> layers; -}; - -template -struct Weights { - // No ctor/dtor, allocated via AllocateAligned. - - std::array - embedder_input_embedding; - - std::array final_norm_scale; - - LayerPointers layer_ptrs; - - std::array scales; - - const Layer* GetLayer(size_t layer) const { - return layer_ptrs.layers[layer].get(); - } - Layer* GetLayer(size_t layer) { - return layer_ptrs.layers[layer].get(); - } -}; - -template -using WeightsF = Weights; - -// ---------------------------------------------------------------------------- -// Compressed - -template -struct CompressedLayer { - // No ctor/dtor, allocated via AllocateAligned. - - using TLayer = gcpp::LayerF; - using Weight = typename TConfig::Weight; - // If weights are f32, also f32; otherwise at least bf16. Useful for ops that - // do not yet support smaller compressed types, or require at least bf16. When - // weights are f32, we also want such tensors to be f32. 
- using WeightF32OrBF16 = - hwy::If(), float, hwy::bfloat16_t>; - - static constexpr size_t kHeads = TLayer::kHeads; - static constexpr size_t kKVHeads = TLayer::kKVHeads; - static constexpr size_t kModelDim = TLayer::kModelDim; - static constexpr size_t kQKVDim = TLayer::kQKVDim; - static constexpr size_t kFFHiddenDim = TLayer::kFFHiddenDim; - static constexpr size_t kAttVecEinsumWSize = TLayer::kAttVecEinsumWSize; - static constexpr size_t kQKVEinsumWSize = TLayer::kQKVEinsumWSize; - static constexpr size_t kGatingEinsumWSize = TLayer::kGatingEinsumWSize; - static constexpr size_t kConv1dWidth = TLayer::kConv1dWidth; - static constexpr bool kFFBiases = TLayer::kFFBiases; - static constexpr bool kPostNormScale = TConfig::kPostNormScale; - static constexpr size_t kAOBiasDim = TLayer::kAOBiasDim; - static constexpr size_t kGriffinDim = TLayer::kGriffinDim; - - // Compressed Parameters - template using ArrayT = CompressedArray; @@ -171,7 +71,7 @@ struct CompressedLayer { ArrayT linear_y_biases; ArrayT linear_out_w; ArrayT linear_out_biases; - ArrayT conv_w; + ArrayT conv_w; ArrayT conv_biases; ArrayT gate_w; ArrayT gate_biases; @@ -179,7 +79,7 @@ struct CompressedLayer { } griffin; }; - ArrayT gating_einsum_w; + ArrayT gating_einsum_w; ArrayT linear_w; // We don't yet have an RMSNorm that accepts all Weight. ArrayT pre_attention_norm_scale; @@ -251,25 +151,6 @@ struct CompressedWeights { // ---------------------------------------------------------------------------- // Interface -// TODO: can we use TConfig::Weight instead of T? -template -struct AllocateWeights { - ByteStorageT operator()(hwy::ThreadPool& pool) const { - using TWeights = Weights; - ByteStorageT weights_u8 = AllocateSizeof(); - TWeights* weights = reinterpret_cast(weights_u8.get()); - new (&weights->layer_ptrs) LayerPointers(pool); - return weights_u8; - } -}; - -template -struct AllocateWeightsF { - ByteStorageT operator()(hwy::ThreadPool& pool) const { - return AllocateWeights()(pool); - } -}; - template struct AllocateCompressedWeights { ByteStorageT operator()(hwy::ThreadPool& pool) const { @@ -281,86 +162,26 @@ struct AllocateCompressedWeights { } }; -template -struct ZeroInitWeights { - void operator()(ByteStorageT& weights, hwy::ThreadPool& pool) const { - Weights& w = - *reinterpret_cast*>(weights.get()); - hwy::ZeroBytes(&w.embedder_input_embedding, - sizeof(w.embedder_input_embedding)); - hwy::ZeroBytes(&w.final_norm_scale, sizeof(w.final_norm_scale)); - for (int i = 0; i < TConfig::kLayers; ++i) { - hwy::ZeroBytes(w.GetLayer(i), sizeof(*w.GetLayer(i))); - } - } -}; - -template -struct ZeroInitWeightsF { - void operator()(ByteStorageT& weights, hwy::ThreadPool& pool) const { - ZeroInitWeights()(weights, pool); - } -}; - template struct ZeroInitCompressedWeights { - void operator()(ByteStorageT& weights, hwy::ThreadPool& pool) const { - CompressedWeights& w = - *reinterpret_cast*>(weights.get()); - w.ZeroInit(); + void operator()(ByteStorageT& weights_u8, hwy::ThreadPool& pool) const { + CompressedWeights& weights = + *reinterpret_cast*>(weights_u8.get()); + weights.ZeroInit(); } }; -template -struct CopyWeights { -void operator()(Weights& dst, - const Weights& src) const { - hwy::CopyBytes(&src.embedder_input_embedding, &dst.embedder_input_embedding, - sizeof(src.embedder_input_embedding)); - hwy::CopyBytes(&src.final_norm_scale, &dst.final_norm_scale, - sizeof(src.final_norm_scale)); - for (int i = 0; i < TConfig::kLayers; ++i) { - hwy::CopyBytes(src.GetLayer(i), dst.GetLayer(i), - sizeof(*dst.GetLayer(i))); - } - 
} -}; +// TODO: also add RandInitCompressedWeights template -struct DeleteLayersPtrs { +struct DeleteCompressedWeights { void operator()(ByteStorageT& weights_u8) const { - auto* weights = - reinterpret_cast*>(weights_u8.get()); - weights->~CompressedWeights(); + CompressedWeights& weights = + *reinterpret_cast*>(weights_u8.get()); + weights.~CompressedWeights(); } }; -// Owns weights and provides access to TConfig. -template -class WeightsWrapper { - public: - WeightsWrapper() - : pool_(0), - data_(AllocateWeights()(pool_)), - weights_(reinterpret_cast*>(data_.get())) {} - - ~WeightsWrapper() { - get().layer_ptrs.~LayerPointers(); - } - - const Weights& get() const { return *weights_; } - Weights& get() { return *weights_; } - void clear() { ZeroInitWeights()(data_, pool_); } - void copy(const WeightsWrapper& other) { - CopyWeights()(get(), other.get()); - } - - private: - hwy::ThreadPool pool_; - ByteStorageT data_; - Weights* weights_; -}; - ByteStorageT LoadCompressedWeights(const Path& weights, Model model_type, Type weight_type, hwy::ThreadPool& pool); @@ -369,30 +190,60 @@ void LogWeightStats(Model model, Type weight_type, const ByteStorageT& weights); // ---------------------------------------------------------------------------- // Iterators -// Calls func(name, float*, CompressedArray&) for each tensor. float* is null -// if weights = null, which happens during the first call where we attempt to -// load from cache. -// -// This avoids repeating the list of tensors between loading and compressing. -template -void ForEachTensor(const WeightsF* weights, - CompressedWeights& c_weights, Func& func) { - func("c_embedding", - weights ? weights->embedder_input_embedding.data() : nullptr, - c_weights.embedder_input_embedding); - func("c_final_norm", weights ? weights->final_norm_scale.data() : nullptr, - c_weights.final_norm_scale); +// We rely on `if constexpr` to ensure raw_weights->member is only compiled +// when valid, i.e., kHaveRaw == true, but the IDE analysis does not understand +// this, hence hide the member access from it. +#if HWY_IDE +#define GEMMA_MEMBER(aggregate, member) nullptr +#else +#define GEMMA_MEMBER(aggregate, member) aggregate->member +#endif +// Used by ForEachTensor for tensors that are not in a layer. +#define GEMMA_CALL_TOP_FUNC(name, member) \ + { \ + const float* raw_tensor = nullptr; \ + if constexpr (kHaveRaw) { \ + raw_tensor = GEMMA_MEMBER(raw_weights, member.data()); \ + } \ + func(name, raw_tensor, c_weights.member); \ + } + +// Used by ForEachTensor for per-layer tensors. Writes into name_buf. #define GEMMA_CALL_FUNC(name, member) \ snprintf(name_buf, sizeof(name_buf), name "_%d", layer_idx); \ - func(name_buf, layer ? layer->member.data() : nullptr, layer_weights->member) + { \ + const float* raw_tensor = nullptr; \ + if constexpr (kHaveRaw) { \ + raw_tensor = GEMMA_MEMBER(raw_layer, member.data()); \ + } \ + func(name_buf, raw_tensor, c_layer->member); \ + } + +// Calls func(name, float*, CompressedArray&) for each tensor. float* is +// null if !kHaveRaw, in which case raw_weights can be nullptr. This happens +// when loading weights from BlobStore. If kHaveRaw, then RawLayer must be +// specified and we pass a float* pointing to the raw float weights for that +// tensor for use by compress_weights.cc. +// +// This avoids repeating the list of tensors between loading and compressing, +// while also avoiding dependency on raw_weights.h. 
+template
+void ForEachTensor(const RawWeights* raw_weights,
+                   CompressedWeights<TConfig>& c_weights, Func& func) {
+  GEMMA_CALL_TOP_FUNC("c_embedding", embedder_input_embedding);
+  GEMMA_CALL_TOP_FUNC("c_final_norm", final_norm_scale);
   char name_buf[16];
   for (int layer_idx = 0; layer_idx < TConfig::kLayers; ++layer_idx) {
     auto type = TConfig::kLayerConfig[layer_idx];
     const size_t idx = static_cast<size_t>(layer_idx);
-    const LayerF<TConfig>* layer = weights ? weights->GetLayer(idx) : nullptr;
-    CompressedLayer<TConfig>* layer_weights = c_weights.GetLayer(idx);
+    const RawLayer* raw_layer = nullptr;
+    if constexpr (kHaveRaw) {
+      raw_layer = raw_weights->GetLayer(idx);
+    }
+    CompressedLayer<TConfig>* c_layer = c_weights.GetLayer(idx);
     GEMMA_CALL_FUNC("pre_ff_ns", pre_ffw_norm_scale);
     GEMMA_CALL_FUNC("gating_ein", gating_einsum_w);
@@ -430,6 +281,7 @@ void ForEachTensor(const WeightsF<TConfig>* weights,
   }
 }
 #undef GEMMA_CALL_FUNC
+#undef GEMMA_CALL_TOP_FUNC
 }  // ForEachTensor
 
 #define GEMMA_CALL_TOP_FUNC1(name, member) func(name, weights1.member)
diff --git a/gemma/weights_raw.h b/gemma/weights_raw.h
new file mode 100644
index 0000000..cb66876
--- /dev/null
+++ b/gemma/weights_raw.h
@@ -0,0 +1,242 @@
+// Copyright 2024 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef THIRD_PARTY_GEMMA_CPP_GEMMA_WEIGHTS_RAW_H_
+#define THIRD_PARTY_GEMMA_CPP_GEMMA_WEIGHTS_RAW_H_
+
+// NOTE: this file should only be used by compress_weights; it is currently
+// also referenced by backprop, but we plan to remove that. Historical note:
+// this was the original f32-only simple on-disk format created by a Python
+// export script. BlobStore is now the preferred on-disk format, and we load
+// that into CompressedWeights.
+
+#include
+
+#include "gemma/common.h"
+#include "hwy/aligned_allocator.h"
+#include "hwy/base.h"
+#include "hwy/contrib/thread_pool/thread_pool.h"
+
+namespace gcpp {
+
+template <typename T, typename TConfig>
+struct Layer {
+  Layer() {}
+  static constexpr size_t kHeads = TConfig::kHeads;
+  static constexpr size_t kKVHeads = TConfig::kKVHeads;
+  static constexpr size_t kModelDim = TConfig::kModelDim;
+  static constexpr size_t kQKVDim = TConfig::kQKVDim;
+  static constexpr size_t kFFHiddenDim = TConfig::kFFHiddenDim;
+  static constexpr size_t kAttVecEinsumWSize = kHeads * kQKVDim * kModelDim;
+  static constexpr size_t kQKVEinsumWSize =
+      (kHeads + 2 * kKVHeads) * kQKVDim * kModelDim;
+  // 2x for (gelu gating vector, gated vector)
+  static constexpr size_t kGatingEinsumWSize = 2 * kFFHiddenDim * kModelDim;
+  static constexpr size_t kConv1dWidth = TConfig::kConv1dWidth;
+  static constexpr bool kFFBiases = TConfig::kFFBiases;
+  static constexpr bool kPostNormScale = TConfig::kPostNormScale;
+  static constexpr size_t kAOBiasDim =
+      TConfig::kSoftmaxAttnOutputBiases ? kModelDim : 0;
+  static constexpr size_t kGriffinDim =
+      TConfig::kGriffinLayers > 0 ? kModelDim : 0;
+
+  union {
+    struct {
+      std::array<T, kAttVecEinsumWSize> attn_vec_einsum_w;
+      std::array<T, kQKVEinsumWSize> qkv_einsum_w;
+      std::array<T, kAOBiasDim> attention_output_biases;
+    };
+
+    struct {
+      std::array<T, kGriffinDim * kGriffinDim> linear_x_w;
+      std::array<T, kGriffinDim> linear_x_biases;
+      std::array<T, kGriffinDim * kGriffinDim> linear_y_w;
+      std::array<T, kGriffinDim> linear_y_biases;
+      std::array<T, kGriffinDim * kGriffinDim> linear_out_w;
+      std::array<T, kGriffinDim> linear_out_biases;
+      std::array<T, kConv1dWidth * kGriffinDim> conv_w;
+      std::array<T, kGriffinDim> conv_biases;
+      std::array<T, kGriffinDim * kGriffinDim / kHeads * 2> gate_w;
+      std::array<T, kGriffinDim * 2> gate_biases;
+      std::array<T, kGriffinDim> a;
+    } griffin;
+  };
+
+  std::array<T, kGatingEinsumWSize> gating_einsum_w;
+  std::array<T, kModelDim * kFFHiddenDim> linear_w;
+  std::array<T, kModelDim> pre_attention_norm_scale;
+  std::array<T, kModelDim> pre_ffw_norm_scale;
+  std::array<T, kPostNormScale ? kModelDim : 0> post_attention_norm_scale;
+  std::array<T, kPostNormScale ? kModelDim : 0> post_ffw_norm_scale;
+
+  std::array<T, kFFBiases ? 2 * kFFHiddenDim : 0> ffw_gating_biases;
+  std::array<T, kFFBiases ? kModelDim : 0> ffw_output_biases;
+};
+
+template <class TConfig>
+using LayerF = Layer<float, TConfig>;
+
+// Array instead of single large allocation for parallel mem init. Split out of
+// Weights so that only these pointers are initialized.
+template <typename T, typename TConfig>
+struct LayerPointers {
+  explicit LayerPointers(hwy::ThreadPool& pool) {
+    pool.Run(0, TConfig::kLayers, [this](uint64_t task, size_t /*thread*/) {
+      this->layers[task] = hwy::AllocateAligned<Layer<T, TConfig>>(1);
+    });
+  }
+
+  using TLayer = Layer<T, TConfig>;
+  std::array<hwy::AlignedFreeUniquePtr<TLayer[]>, TConfig::kLayers> layers;
+};
+
+template <typename T, typename TConfig>
+struct Weights {
+  // No ctor/dtor, allocated via AllocateAligned.
+
+  std::array<T, TConfig::kVocabSize * TConfig::kModelDim>
+      embedder_input_embedding;
+
+  std::array<T, TConfig::kModelDim> final_norm_scale;
+
+  LayerPointers<T, TConfig> layer_ptrs;
+
+  std::array<float, TConfig::kNumTensorScales> scales;
+
+  const Layer<T, TConfig>* GetLayer(size_t layer) const {
+    return layer_ptrs.layers[layer].get();
+  }
+  Layer<T, TConfig>* GetLayer(size_t layer) {
+    return layer_ptrs.layers[layer].get();
+  }
+};
+
+template <class TConfig>
+using WeightsF = Weights<float, TConfig>;
+
+// TODO: can we use TConfig::Weight instead of T?
+template <typename T, typename TConfig>
+struct AllocateWeights {
+  ByteStorageT operator()(hwy::ThreadPool& pool) const {
+    using TWeights = Weights<T, TConfig>;
+    ByteStorageT weights_u8 = AllocateSizeof<TWeights>();
+    TWeights* weights = reinterpret_cast<TWeights*>(weights_u8.get());
+    new (&weights->layer_ptrs) LayerPointers<T, TConfig>(pool);
+    return weights_u8;
+  }
+};
+
+template <class TConfig>
+struct AllocateWeightsF {
+  ByteStorageT operator()(hwy::ThreadPool& pool) const {
+    return AllocateWeights<float, TConfig>()(pool);
+  }
+};
+
+// TODO: make a member of Weights.
+template <typename T, typename TConfig>
+struct ZeroInitWeights {
+  void operator()(ByteStorageT& weights, hwy::ThreadPool& pool) const {
+    Weights<T, TConfig>& w =
+        *reinterpret_cast<Weights<T, TConfig>*>(weights.get());
+    hwy::ZeroBytes(&w.embedder_input_embedding,
+                   sizeof(w.embedder_input_embedding));
+    hwy::ZeroBytes(&w.final_norm_scale, sizeof(w.final_norm_scale));
+    for (int i = 0; i < TConfig::kLayers; ++i) {
+      hwy::ZeroBytes(w.GetLayer(i), sizeof(*w.GetLayer(i)));
+    }
+  }
+};
+
+template <class TConfig>
+struct ZeroInitWeightsF {
+  void operator()(ByteStorageT& weights, hwy::ThreadPool& pool) const {
+    ZeroInitWeights<float, TConfig>()(weights, pool);
+  }
+};
+
+template <typename T, typename TConfig>
+struct CopyWeights {
+void operator()(Weights<T, TConfig>& dst,
+                const Weights<T, TConfig>& src) const {
+  hwy::CopyBytes(&src.embedder_input_embedding, &dst.embedder_input_embedding,
+                 sizeof(src.embedder_input_embedding));
+  hwy::CopyBytes(&src.final_norm_scale, &dst.final_norm_scale,
+                 sizeof(src.final_norm_scale));
+  for (int i = 0; i < TConfig::kLayers; ++i) {
+    hwy::CopyBytes(src.GetLayer(i), dst.GetLayer(i),
+                   sizeof(*dst.GetLayer(i)));
+  }
+ }
+};
+
+template <typename T, size_t kLen>
+void RandInit(std::array<T, kLen>& x, T stddev, std::mt19937& gen) {
+  std::normal_distribution<T> dist(0.0, stddev);
+  for (size_t i = 0; i < kLen; ++i) {
+    x[i] = dist(gen);
+  }
+}
+
+// TODO: make a member of Layer.
+template <typename T, typename TConfig>
+void RandInit(Layer<T, TConfig>& w, T stddev, std::mt19937& gen) {
+  RandInit(w.pre_attention_norm_scale, stddev, gen);
+  RandInit(w.attn_vec_einsum_w, stddev, gen);
+  RandInit(w.qkv_einsum_w, stddev, gen);
+  RandInit(w.pre_ffw_norm_scale, stddev, gen);
+  RandInit(w.gating_einsum_w, stddev, gen);
+  RandInit(w.linear_w, stddev, gen);
+}
+
+template <typename T, typename TConfig>
+void RandInit(Weights<T, TConfig>& w, T stddev, std::mt19937& gen) {
+  static constexpr size_t kLayers = TConfig::kLayers;
+  RandInit(w.embedder_input_embedding, stddev, gen);
+  RandInit(w.final_norm_scale, stddev, gen);
+  for (size_t i = 0; i < kLayers; ++i) {
+    RandInit(*w.GetLayer(i), stddev, gen);
+  }
+}
+
+// Owns weights and provides access to TConfig.
+template <typename T, typename TConfig>
+class WeightsWrapper {
+ public:
+  WeightsWrapper()
+      : pool_(0),
+        data_(AllocateWeights<T, TConfig>()(pool_)),
+        weights_(reinterpret_cast<Weights<T, TConfig>*>(data_.get())) {}
+
+  ~WeightsWrapper() {
+    get().layer_ptrs.~LayerPointers();
+  }
+
+  const Weights<T, TConfig>& get() const { return *weights_; }
+  Weights<T, TConfig>& get() { return *weights_; }
+  void clear() { ZeroInitWeights<T, TConfig>()(data_, pool_); }
+  void copy(const WeightsWrapper& other) {
+    CopyWeights<T, TConfig>()(get(), other.get());
+  }
+
+ private:
+  hwy::ThreadPool pool_;
+  ByteStorageT data_;
+  Weights<T, TConfig>* weights_;
+};
+
+}  // namespace gcpp
+
+#endif  // THIRD_PARTY_GEMMA_CPP_GEMMA_WEIGHTS_RAW_H_