From 4c23932289af7737bf53239bc6ac5bdb2490a624 Mon Sep 17 00:00:00 2001 From: Luca Versari Date: Thu, 4 Apr 2024 13:39:44 +0200 Subject: [PATCH] Improve weight handling. - Allow scaling of SFP weights - Allow using uncompressed weights - Do not try to compress weights in the main model calls - Reduce code duplication in weight handling with some macros Co-authored-by: Eugene Kliuchnikov Co-authored-by: Thomas Fischbacher Co-authored-by: Zoltan Szabadka --- compression/blob_store.cc | 45 ++-- compression/blob_store.h | 6 + compression/compress-inl.h | 41 ++- compression/compress.h | 18 +- configs.h | 14 ++ examples/hello_world/run.cc | 5 +- gemma.cc | 483 ++++++++++++++++++++++-------------- gemma.h | 19 +- ops_test.cc | 1 + run.cc | 23 +- util/app.h | 39 +-- 11 files changed, 424 insertions(+), 270 deletions(-) diff --git a/compression/blob_store.cc b/compression/blob_store.cc index b47515a..2458fb9 100644 --- a/compression/blob_store.cc +++ b/compression/blob_store.cc @@ -29,15 +29,15 @@ // copybara:import_next_line:gemma_cpp #include "compression/blob_store.h" +#include // open #include #include // SEEK_END - unistd isn't enough for IDE. #include // O_RDONLY -#include // open #if HWY_OS_WIN -#include // read, write, close #include +#include // read, write, close #else -#include // read, write, close +#include // read, write, close #endif #include @@ -113,8 +113,9 @@ hwy::uint128_t MakeKey(const char* string) { return ret; } -static void EnqueueChunkRequests(uint64_t offset, uint64_t size, uint8_t* data, - std::vector& requests) { +namespace { +void EnqueueChunkRequests(uint64_t offset, uint64_t size, uint8_t* data, + std::vector& requests) { // Split into chunks for load-balancing even if blob sizes vary. constexpr size_t kChunkSize = 4 * 1024 * 1024; @@ -129,7 +130,7 @@ static void EnqueueChunkRequests(uint64_t offset, uint64_t size, uint8_t* data, requests.emplace_back(offset + pos, size - pos, data + pos, 0); } } - +} // namespace struct IO { // Returns size in bytes or 0. @@ -197,12 +198,6 @@ static_assert(HWY_IS_LITTLE_ENDIAN, "Assumes little endian"); class BlobStore { static constexpr uint32_t kMagic = 0x0A534253; // SBS\n - // Blob offsets on disk and memory addresses are a multiple of this, because - // we pad the header and each blob's size. This matches CUDA alignment and the - // maximum SVE vector size, and exceeds typical x86 cache line sizes (64 or - // 128), which can help performance. - static constexpr size_t kAlign = 256; - public: // NOT including padding, so that we can also use ZeroFillPadding after // copying the header. @@ -215,13 +210,13 @@ class BlobStore { // blobs. Requires num_blobs_ to already be set, typically by reading // sizeof(BlobStore) bytes from disk. size_t PaddedHeaderSize() const { - return hwy::RoundUpTo(HeaderSize(num_blobs_), kAlign); + return hwy::RoundUpTo(HeaderSize(num_blobs_), kBlobAlign); } // Returns aligned offset and zero-fills between that and `offset`. 
uint64_t ZeroFillPadding(uint64_t offset) { uint8_t* const bytes = reinterpret_cast(this); - const uint64_t padded = hwy::RoundUpTo(offset, kAlign); + const uint64_t padded = hwy::RoundUpTo(offset, kBlobAlign); hwy::ZeroBytes(bytes + offset, padded - offset); return padded; } @@ -236,7 +231,7 @@ class BlobStore { for (size_t i = 0; i < num_blobs_; ++i) { const hwy::uint128_t val = keys_[num_blobs_ + i]; if (val.lo != offset) return __LINE__; - offset = ZeroFillPadding(offset + val.hi); + offset = hwy::RoundUpTo(offset + val.hi, kBlobAlign); } if (offset != file_size_) return __LINE__; @@ -253,25 +248,24 @@ class BlobStore { static std::vector PrepareWriteRequests( const hwy::uint128_t keys[], const hwy::Span blobs[], - size_t num_blobs) { + size_t num_blobs, BlobStore* bs) { // Sanity check and ensure the cast below is safe. HWY_ASSERT(num_blobs < (1ULL << 20)); // Allocate var-length header. const size_t header_size = HeaderSize(num_blobs); - const size_t padded_header_size = hwy::RoundUpTo(header_size, kAlign); - BlobStorePtr bs = Allocate(padded_header_size); + const size_t padded_header_size = hwy::RoundUpTo(header_size, kBlobAlign); const uint64_t padded_header_end = bs->ZeroFillPadding(header_size); HWY_ASSERT(padded_header_end == padded_header_size); // All-zero buffer used to write padding to the file without copying the // input blobs. - static uint8_t zeros[kAlign] = {0}; + static uint8_t zeros[kBlobAlign] = {0}; // Total file size will be the header plus all padded blobs. uint64_t payload = 0; for (size_t i = 0; i < num_blobs; ++i) { - payload += hwy::RoundUpTo(blobs[i].size(), kAlign); + payload += hwy::RoundUpTo(blobs[i].size(), kBlobAlign); } const size_t total_size = padded_header_size + payload; @@ -285,7 +279,7 @@ class BlobStore { std::vector requests; requests.reserve(1 + 2 * num_blobs); requests.emplace_back(/*offset=*/0, padded_header_size, - reinterpret_cast(bs.get()), 0); + reinterpret_cast(bs), 0); // Fill second half of keys_ with offset/size and prepare IO requests. uint64_t offset = padded_header_end; @@ -295,10 +289,10 @@ class BlobStore { EnqueueChunkRequests(offset, blobs[i].size(), blobs[i].data(), requests); offset += blobs[i].size(); - const size_t padded_size = hwy::RoundUpTo(blobs[i].size(), kAlign); + const size_t padded_size = hwy::RoundUpTo(blobs[i].size(), kBlobAlign); if (padded_size != blobs[i].size()) { const size_t padding = padded_size - blobs[i].size(); - HWY_ASSERT(padding <= kAlign); + HWY_ASSERT(padding <= kBlobAlign); requests.emplace_back(offset, padding, zeros, 0); offset += padding; } @@ -418,8 +412,11 @@ BlobError BlobWriter::WriteAll(hwy::ThreadPool& pool, HWY_ASSERT(keys_.size() == blobs_.size()); // Concatenate blobs in memory. + const size_t header_size = BlobStore::HeaderSize(keys_.size()); + const size_t padded_header_size = hwy::RoundUpTo(header_size, kBlobAlign); + BlobStorePtr bs = BlobStore::Allocate(padded_header_size); std::vector requests = BlobStore::PrepareWriteRequests( - keys_.data(), blobs_.data(), keys_.size()); + keys_.data(), blobs_.data(), keys_.size(), bs.get()); // Create/replace existing file. #if HWY_OS_WIN diff --git a/compression/blob_store.h b/compression/blob_store.h index 6ced37f..8736d0f 100644 --- a/compression/blob_store.h +++ b/compression/blob_store.h @@ -40,6 +40,12 @@ using BlobStorePtr = hwy::AlignedFreeUniquePtr; // 0 if successful, otherwise the line number of the failing check. 
using BlobError = int; +// Blob offsets on disk and memory addresses are a multiple of this, because +// we pad the header and each blob's size. This matches CUDA alignment and the +// maximum SVE vector size, and exceeds typical x86 cache line sizes (64 or +// 128), which can help performance. +static constexpr size_t kBlobAlign = 256; + struct BlobIO { BlobIO(uint64_t offset, size_t size, void* data, uint64_t padding) : offset(offset), size(size), data(data), padding(padding) {} diff --git a/compression/compress-inl.h b/compression/compress-inl.h index 5f11ca1..5717545 100644 --- a/compression/compress-inl.h +++ b/compression/compress-inl.h @@ -381,13 +381,14 @@ HWY_INLINE void Compress(const std::array& in, } // Decompresses `num` values from `compressed` starting at `compressed_ofs`. -template -HWY_NOINLINE void Decompress(const CompressedArray& compressed, - size_t compressed_ofs, OutT* out, size_t num) { - HWY_DASSERT(compressed_ofs + num <= compressed.NumElements()); +template +HWY_NOINLINE void Decompress(const ArrayT& compressed, size_t compressed_ofs, + OutT* out, size_t num) { + HWY_DASSERT(compressed_ofs + num <= compressed.size()); const hn::ScalableTag d; - using Traits = CompressTraits; - Traits::Decompress(d, kCapacity, compressed.data(), compressed_ofs, out, num); + using Traits = CompressTraits; + Traits::Decompress(d, compressed.size(), compressed.data(), compressed_ofs, + out, num); } // As above, but with threading and benchmarking. @@ -395,7 +396,7 @@ template HWY_INLINE void Decompress(const CompressedArray& compressed, size_t compressed_ofs, OutT* out, size_t num, hwy::ThreadPool& pool) { - HWY_DASSERT(compressed_ofs + num <= compressed.NumElements()); + HWY_DASSERT(compressed_ofs + num <= compressed.size()); const double t0 = hwy::platform::Now(); using Traits = CompressTraits; @@ -407,7 +408,7 @@ HWY_INLINE void Decompress(const CompressedArray& compressed, const size_t ofs = idx_batch * kBatch; const size_t num = idx_batch == num_batches - 1 ? (num - ofs) : kBatch; - Traits::Decompress(d, compressed.NumElements(), compressed.data(), + Traits::Decompress(d, compressed.size(), compressed.data(), compressed_ofs + ofs, out + ofs, num); }); @@ -417,16 +418,28 @@ HWY_INLINE void Decompress(const CompressedArray& compressed, fprintf(stderr, "Decompress %.1f MB/s\n", mbps); } +// Returns dot product with `vec_aligned` of length `num`. +template +HWY_INLINE float Dot(DF df, const ArrayT& compressed, size_t compressed_ofs, + const VecT* vec_aligned, size_t num) { + HWY_DASSERT(compressed_ofs + num <= compressed.size()); + HWY_DASSERT(hn::IsAligned(df, vec_aligned)); + using Traits = CompressTraits; + return Traits::Dot(df, compressed.size(), compressed.data(), compressed_ofs, + vec_aligned, num); +} + // Returns dot product with `vec_aligned` of length `num`. template HWY_INLINE float Dot(DF df, const CompressedArray& compressed, size_t compressed_ofs, const VecT* vec_aligned, size_t num) { - HWY_DASSERT(compressed_ofs + num <= compressed.NumElements()); + HWY_DASSERT(compressed_ofs + num <= compressed.size()); HWY_DASSERT(hn::IsAligned(df, vec_aligned)); using Traits = CompressTraits; - return Traits::Dot(df, kCapacity, compressed.data(), compressed_ofs, - vec_aligned, num); + return (compressed.scale() * Traits::Dot(df, compressed.size(), + compressed.data(), compressed_ofs, + vec_aligned, num)); } // Callback used by ForeachTensor. 
@@ -445,6 +458,12 @@ class Compressor { compressed.CompressedSize()); } + void AddScales(float* scales, size_t len) { + if (len) { + writer_.Add(CacheKey("scales"), scales, len * sizeof(scales[0])); + } + } + void WriteAll(hwy::ThreadPool& pool, const char* blob_filename) { const BlobError err = writer_.WriteAll(pool, blob_filename); if (err != 0) { diff --git a/compression/compress.h b/compression/compress.h index e09d7e5..118ded2 100644 --- a/compression/compress.h +++ b/compression/compress.h @@ -71,10 +71,15 @@ class CompressedArray { } public: + using value_type = MatT; + MatT* data() { return data_.data(); } const MatT* data() const { return data_.data(); } - constexpr size_t NumElements() const { return kCapacity; } + float scale() const { return scale_[0]; } + void set_scale(float scale) { scale_[0] = scale; } + + constexpr size_t size() const { return kCapacity; } constexpr size_t CompressedSize() const { return NumCompressed() * sizeof(MatT); @@ -82,6 +87,7 @@ class CompressedArray { private: std::array data_; + float scale_[kBlobAlign / sizeof(float)]; }; #if COMPRESS_STATS @@ -187,11 +193,21 @@ class CacheLoader { err_ = reader_.Enqueue(CacheKey(name), compressed.data(), compressed.CompressedSize()); + compressed.set_scale(1.0f); if (err_ != 0) { fprintf(stderr, "Failed to read cache %s (error %d)\n", name, err_); } } + void LoadScales(float* scales, size_t len) { + if (0 != reader_.Enqueue(CacheKey("scales"), scales, + len * sizeof(scales[0]))) { + for (size_t i = 0; i < len; ++i) { + scales[i] = 1.0f; + } + } + } + // Returns whether all tensors are successfully loaded from cache. bool ReadAll(hwy::ThreadPool& pool) { // reader_ invalid or any Enqueue failed diff --git a/configs.h b/configs.h index e704664..f1d7f9d 100644 --- a/configs.h +++ b/configs.h @@ -30,6 +30,16 @@ #include +// copybara:import_next_line:gemma_cpp +#include "compression/sfp.h" +#include "hwy/base.h" // hwy::bfloat16_t + +// Allowable types for GEMMA_WEIGHT_T (can be specified at compilation time): +// float, hwy::bfloat16_t, SfpStream, NuqStream +#ifndef GEMMA_WEIGHT_T +#define GEMMA_WEIGHT_T SfpStream +#endif // !GEMMA_WEIGHT_T + namespace gcpp { static constexpr size_t kSeqLen = GEMMA_MAX_SEQLEN; @@ -45,6 +55,8 @@ struct ConfigGemma7B { static constexpr int kKVHeads = 16; // standard MHA static constexpr int kQKVDim = 256; // query size == key size == value size static constexpr int kTopK = gcpp::kTopK; + static constexpr int kNumTensorScales = 0; + using WeightT = GEMMA_WEIGHT_T; }; struct ConfigGemma2B { @@ -57,6 +69,8 @@ struct ConfigGemma2B { static constexpr int kKVHeads = 1; static constexpr int kQKVDim = 256; // query size == key size == value size static constexpr int kTopK = gcpp::kTopK; + static constexpr int kNumTensorScales = 0; + using WeightT = GEMMA_WEIGHT_T; }; } // namespace gcpp diff --git a/examples/hello_world/run.cc b/examples/hello_world/run.cc index a352250..484b4a4 100644 --- a/examples/hello_world/run.cc +++ b/examples/hello_world/run.cc @@ -19,9 +19,9 @@ #include "gemma.h" // copybara:import_next_line:gemma_cpp #include "util/app.h" // LoaderArgs +#include "hwy/contrib/thread_pool/thread_pool.h" // copybara:import_next_line:gemma_cpp #include "util/args.h" -#include "hwy/contrib/thread_pool/thread_pool.h" std::vector tokenize( const std::string& prompt_string, @@ -43,8 +43,7 @@ int main(int argc, char** argv) { hwy::ThreadPool pool(num_threads); // Instantiate model and KV Cache - gcpp::Gemma model(loader.tokenizer, loader.compressed_weights, - loader.ModelType(), pool); + 
gcpp::Gemma model(loader.tokenizer, loader.weights, loader.ModelType(), pool); auto kv_cache = CreateKVCache(loader.ModelType()); size_t pos = 0; // KV Cache position diff --git a/gemma.cc b/gemma.cc index bfaa812..77478bd 100644 --- a/gemma.cc +++ b/gemma.cc @@ -19,18 +19,18 @@ // which we pass the filename via macro 'argument'. #undef HWY_TARGET_INCLUDE #define HWY_TARGET_INCLUDE "gemma.cc" // NOLINT -#include "hwy/foreach_target.h" // IWYU pragma: keep +#include "hwy/foreach_target.h" // IWYU pragma: keep // Must come after foreach_target.h to avoid redefinition errors. // copybara:import_next_line:gemma_cpp #include "compression/compress-inl.h" // copybara:import_next_line:gemma_cpp #include "ops.h" -// copybara:import_next_line:gemma_cpp -#include "util/args.h" // Path #include "hwy/contrib/matvec/matvec-inl.h" #include "hwy/highway.h" #include "hwy/profiler.h" #include "hwy/timer.h" +// copybara:import_next_line:gemma_cpp +#include "util/args.h" // Path // Non-SIMD includes and types. Note that HWY_ONCE is only true on the last // compile pass, whereas we want this defined in the first. @@ -64,6 +64,12 @@ // copybara:import_next_line:sentencepiece #include "src/sentencepiece_processor.h" +// Setting this to true disables fread() calls that read the model file. +constexpr bool kDryRunFread = false; + +// Setting this to false will load and use uncompressed weights. +constexpr bool kWeightsAreCompressed = true; + namespace gcpp { template @@ -88,70 +94,145 @@ struct Layer { std::array pre_ffw_norm_scale; }; +float ScaleWeights(float* data, size_t len) { + float maxabs = 0.0; + for (size_t i = 0; i < len; ++i) { + maxabs = std::max(maxabs, std::abs(data[i])); + } + const float kMaxRange = 1.875f; + if (maxabs <= kMaxRange) { + return 1.0f; + } + const float scale = maxabs / kMaxRange; + const float inv_scale = 1.0f / scale; + for (size_t i = 0; i < len; ++i) { + data[i] *= inv_scale; + } + return scale; +} + +// Array instead of single large allocation for parallel mem init. Split out of +// Weights so that only these pointers are initialized. +template +struct LayerPointers { + explicit LayerPointers(hwy::ThreadPool& pool) { + pool.Run(0, TConfig::kLayers, [this](uint64_t task, size_t /*thread*/) { + this->layers[task] = hwy::AllocateAligned>(1); + }); + } + + using TLayer = Layer; + std::array, TConfig::kLayers> layers; +}; + template struct Weights { - Weights() = default; - - hwy::AlignedUniquePtr[]> layers; // kLayers + // No ctor/dtor, allocated via AllocateAligned. std::array embedder_input_embedding; std::array final_norm_scale; + + LayerPointers layer_ptrs; + + std::array scales; + + const Layer* GetLayer(size_t layer) const { + return layer_ptrs.layers[layer].get(); + } + Layer* GetLayer(size_t layer) { + return layer_ptrs.layers[layer].get(); + } }; -// Only called if cached loading fails. template -hwy::AlignedUniquePtr> LoadWeights(const Path& checkpoint) { +hwy::AlignedFreeUniquePtr LoadWeights( + const Path& checkpoint, hwy::ThreadPool& pool, + bool scale_for_compression = false) { PROFILER_ZONE("Startup.LoadWeights"); - using TWeights = Weights; - hwy::AlignedUniquePtr weights = hwy::MakeUniqueAligned(); - weights->layers = - hwy::MakeUniqueAlignedArray>(TConfig::kLayers); - - if (checkpoint.path.empty()) { - HWY_ABORT( - "Loading --compressed_weights failed; we require a --weights argument. 
" - "Please see issue #11 on how to create this file.\n"); + if (!std::filesystem::exists(checkpoint.path)) { + HWY_ABORT("The model weights file '%s' does not exist.", + checkpoint.path.c_str()); } + using TWeights = Weights; + hwy::AlignedFreeUniquePtr weights_u8 = + hwy::AllocateAligned(sizeof(TWeights)); + TWeights* weights = reinterpret_cast(weights_u8.get()); + new (&weights->layer_ptrs) LayerPointers(pool); + + size_t scale_pos = 0; FILE* fptr; - fptr = fopen(checkpoint.path.c_str(), "rb"); - if (fptr == nullptr) { - HWY_ABORT("Failed to open model file %s - does it exist?", - checkpoint.path.c_str()); + if constexpr (kDryRunFread) { + fprintf(stderr, "Dry-Run, not reading model-file.\n"); + } else { + fptr = fopen(checkpoint.path.c_str(), "rb"); + if (fptr == nullptr) { + HWY_ABORT("Failed to open model file %s - does it exist?", + checkpoint.path.c_str()); + } } bool ok = true; uint64_t total_size = 0; - ok &= 1 == fread(&(weights->embedder_input_embedding), - sizeof(weights->embedder_input_embedding), 1, fptr); - ok &= 1 == fread(&(weights->final_norm_scale), - sizeof(weights->final_norm_scale), 1, fptr); - total_size += sizeof(weights->embedder_input_embedding) + - sizeof(weights->final_norm_scale); + auto do_fread = [&](void* var, int layer, const char* name, size_t size) { + if (layer == -1) { + fprintf(stderr, "Loading Parameters (size %zu): %s\n", size, name); + } else { + fprintf(stderr, "Loading Parameters (layer=%d, size %zu): %s\n", layer, + size, name); + } + if constexpr (!kDryRunFread) { + ok &= 1 == fread(var, size, 1, fptr); + total_size += size; + } + }; + do_fread(&(weights->embedder_input_embedding), -1, "embedder_input_embedding", + sizeof(weights->embedder_input_embedding)); + do_fread(&(weights->final_norm_scale), -1, "final_norm_scale", + sizeof(weights->final_norm_scale)); for (size_t layer = 0; layer < TConfig::kLayers; ++layer) { - Layer* layer_view = &weights->layers[layer]; - ok &= 1 == fread(&layer_view->attn_vec_einsum_w, - sizeof(layer_view->attn_vec_einsum_w), 1, fptr); - ok &= 1 == fread(&layer_view->qkv_einsum_w, - sizeof(layer_view->qkv_einsum_w), 1, fptr); - ok &= 1 == fread(&layer_view->gating_einsum_w, - sizeof(layer_view->gating_einsum_w), 1, fptr); - ok &= 1 == - fread(&layer_view->linear_w, sizeof(layer_view->linear_w), 1, fptr); - ok &= 1 == fread(&layer_view->pre_attention_norm_scale, - sizeof(layer_view->pre_attention_norm_scale), 1, fptr); - ok &= 1 == fread(&layer_view->pre_ffw_norm_scale, - sizeof(layer_view->pre_ffw_norm_scale), 1, fptr); - total_size += sizeof(*layer_view); + Layer* layer_view = weights->GetLayer(layer); + +#define READ_WEIGHTS(name) \ + do { \ + do_fread(&(layer_view->name), layer, #name, sizeof(layer_view->name)); \ + } while (0) + +#define SCALE_WEIGHTS(name) \ + do { \ + if (ok && !kDryRunFread && scale_for_compression) { \ + weights->scales[scale_pos++] = \ + ScaleWeights(layer_view->name.data(), layer_view->name.size()); \ + } \ + } while (0) + // Make sure we don't have uninitialized memory. + hwy::ZeroBytes(layer_view, sizeof(*layer_view)); + READ_WEIGHTS(attn_vec_einsum_w); + READ_WEIGHTS(qkv_einsum_w); + SCALE_WEIGHTS(attn_vec_einsum_w); + SCALE_WEIGHTS(qkv_einsum_w); + READ_WEIGHTS(gating_einsum_w); + READ_WEIGHTS(linear_w); + SCALE_WEIGHTS(gating_einsum_w); + SCALE_WEIGHTS(linear_w); + READ_WEIGHTS(pre_attention_norm_scale); + READ_WEIGHTS(pre_ffw_norm_scale); +#undef READ_WEIGHTS } if (!ok) { - HWY_ABORT("Failed to read from %s - might be a directory, or too small? 
" - "expected size: %d kB", checkpoint.path.c_str(), - static_cast(total_size >> 10)); + HWY_ABORT( + "Failed to read from %s - might be a directory, or too small? " + "expected size: %d kB", + checkpoint.path.c_str(), static_cast(total_size >> 10)); } - HWY_ASSERT(0 == fclose(fptr)); - return weights; + if (!kDryRunFread) { + HWY_ASSERT(0 == fclose(fptr)); + if (scale_for_compression) { + HWY_ASSERT(scale_pos == TConfig::kNumTensorScales); + } + } + return weights_u8; } template @@ -159,18 +240,19 @@ struct CompressedLayer { // No ctor/dtor, allocated via AllocateAligned. using TLayer = gcpp::Layer; + using WeightT = typename TConfig::WeightT; static constexpr size_t kModelDim = TConfig::kModelDim; static constexpr size_t kFFHiddenDim = TConfig::kFFHiddenDim; // Compressed Parameters // We don't yet have an RMSNorm that accepts all WeightT. - CompressedArray c_pre_attention_norm_scale; - CompressedArray c_pre_ffw_norm_scale; - CompressedArray c_gating_einsum_w; - CompressedArray c_linear_w; - CompressedArray c_qkv_einsum_w; - CompressedArray c_attn_vec_einsum_w; + CompressedArray pre_attention_norm_scale; + CompressedArray pre_ffw_norm_scale; + CompressedArray gating_einsum_w; + CompressedArray linear_w; + CompressedArray qkv_einsum_w; + CompressedArray attn_vec_einsum_w; }; // Array instead of single large allocation for parallel mem init. Split out of @@ -193,21 +275,25 @@ struct CompressedWeights { // No ctor/dtor, allocated via AllocateAligned. CompressedArray - c_embedder_input_embedding; + embedder_input_embedding; - CompressedArray c_final_norm_scale; + CompressedArray final_norm_scale; // Must be last so that the other arrays remain aligned. CompressedLayerPointers c_layer_ptrs; - const CompressedLayer* CLayer(size_t layer) const { + const CompressedLayer* GetLayer(size_t layer) const { return c_layer_ptrs.c_layers[layer].get(); } - CompressedLayer* CLayer(size_t layer) { + CompressedLayer* GetLayer(size_t layer) { return c_layer_ptrs.c_layers[layer].get(); } }; +template +using WeightsT = hwy::If, + Weights>; + // Aligned. 
template struct Activations { @@ -272,16 +358,27 @@ KVCache CreateKVCache(Model type) { } } +namespace { +template +void DeleteLayersPtrs(CompressedWeights* c_weights) { + c_weights->c_layer_ptrs.~CompressedLayerPointers(); +} +template +void DeleteLayersPtrs(Weights* weights) { + weights->layer_ptrs.~LayerPointers(); +} +} // namespace + template struct GemmaImpl : public GemmaInterface { GemmaImpl(std::unique_ptr& tokenizer, - hwy::AlignedFreeUniquePtr& compressed_weights, + hwy::AlignedFreeUniquePtr& weights_u8, hwy::ThreadPool& pool); ~GemmaImpl() { - using CWeights = CompressedWeights; - CWeights* c_weights = reinterpret_cast(compressed_weights.get()); - c_weights->c_layer_ptrs.~CompressedLayerPointers(); + WeightsT* weights = + reinterpret_cast*>(weights_u8.get()); + DeleteLayersPtrs(weights); } const sentencepiece::SentencePieceProcessor* Tokenizer() const override { @@ -296,7 +393,7 @@ struct GemmaImpl : public GemmaInterface { int verbosity) override; std::unique_ptr tokenizer; - hwy::AlignedFreeUniquePtr compressed_weights; + hwy::AlignedFreeUniquePtr weights_u8; hwy::AlignedUniquePtr> prefill; hwy::AlignedUniquePtr> state; }; @@ -309,11 +406,11 @@ HWY_BEFORE_NAMESPACE(); namespace gcpp { namespace HWY_NAMESPACE { -template +template HWY_NOINLINE void Attention(size_t batch_start, size_t batch_idx, size_t layer, Activations& activations, - const CompressedLayer* c_layer, - KVCache& kv_cache, hwy::ThreadPool& pool) { + const LayerT* layer_weights, KVCache& kv_cache, + hwy::ThreadPool& pool) { PROFILER_ZONE("Gen.Attention"); const size_t pos = batch_start + batch_idx; HWY_DASSERT(batch_idx < kBatchSize); @@ -329,27 +426,26 @@ HWY_NOINLINE void Attention(size_t batch_start, size_t batch_idx, size_t layer, static const float kQueryScale = static_cast(1.0 / sqrt(static_cast(kQKVDim))); - const size_t batch_offset = batch_idx * kModelDim; + float* x = activations.pre_att_rms_out.data() + batch_idx * kModelDim; auto ProjQ = [&](uint64_t head, size_t head_offset) HWY_ATTR { float* HWY_RESTRICT q = activations.q.data() + head * kQKVDim + batch_idx * kHeads * kQKVDim; - MatVecLoop( - c_layer->c_qkv_einsum_w, head_offset + 0 * kQKVDim * kModelDim, - activations.pre_att_rms_out.data() + batch_offset, q); + MatVecLoop(layer_weights->qkv_einsum_w, + head_offset + 0 * kQKVDim * kModelDim, x, q); }; - auto ProjKV = - [&](size_t k_offset, size_t v_offset, size_t kv_offset) HWY_ATTR { - TwoOfsMatVecLoop( - c_layer->c_qkv_einsum_w, k_offset, v_offset, - activations.pre_att_rms_out.data() + batch_offset, - kv_cache.key_cache.get() + kv_offset, - kv_cache.value_cache.get() + kv_offset); + auto ProjKV = [&](size_t k_offset, size_t v_offset, + size_t kv_offset) HWY_ATTR { + float* HWY_RESTRICT k = kv_cache.key_cache.get() + kv_offset; + float* HWY_RESTRICT v = kv_cache.value_cache.get() + kv_offset; - Rope(kv_cache.key_cache.get() + kv_offset, kQKVDim, pos); - }; + TwoOfsMatVecLoop(layer_weights->qkv_einsum_w, k_offset, + v_offset, x, k, v); + + Rope(k, kQKVDim, pos); + }; auto Attn = [&](uint64_t head, size_t head_offset) HWY_ATTR { // Calculate scores @@ -388,7 +484,7 @@ HWY_NOINLINE void Attention(size_t batch_start, size_t batch_idx, size_t layer, head == 0 ? 
activations.att_post2.data() + batch_idx * kModelDim : activations.att_post1.data() + head * kBatchSize * kModelDim; - MatVecLoop(c_layer->c_attn_vec_einsum_w, + MatVecLoop(layer_weights->attn_vec_einsum_w, head * kModelDim * kQKVDim, att_out, head_out); }; @@ -431,9 +527,9 @@ HWY_NOINLINE void Attention(size_t batch_start, size_t batch_idx, size_t layer, } } -template +template HWY_NOINLINE void FFW(Activations& activations, - size_t batch_idx, const CompressedLayer* c_layer, + size_t batch_idx, const LayerT* layer_weights, hwy::ThreadPool& pool) { HWY_DASSERT(batch_idx < kBatchSize); static constexpr size_t kModelDim = TConfig::kModelDim; @@ -449,12 +545,12 @@ HWY_NOINLINE void FFW(Activations& activations, // Same matrix, first and second half of rows. Could fuse into one MatVec, // but separating them could help on NUMA e.g. multiple sockets. - MatVec(c_layer->c_gating_einsum_w, + MatVec(layer_weights->gating_einsum_w, kFFHiddenDim * kModelDim, vec, out_mul, pool); // Gate, will go through the nonlinearity. - MatVec(c_layer->c_gating_einsum_w, 0, vec, out, + MatVec(layer_weights->gating_einsum_w, 0, vec, out, pool); namespace hn = hwy::HWY_NAMESPACE; @@ -467,7 +563,7 @@ HWY_NOINLINE void FFW(Activations& activations, PROFILER_ZONE("Gen.FFW\\GatedGELU"); MatVec( - c_layer->c_linear_w, 0, activations.ffw_hidden.data() + hidden_offset, + layer_weights->linear_w, 0, activations.ffw_hidden.data() + hidden_offset, activations.ffw_out.data() + batch_idx * kModelDim, pool); } @@ -486,9 +582,9 @@ GEMMA_CONSTEXPR_EMBSCALING float EmbeddingScaling() { Sqrt(static_cast(TConfig::kModelDim)))); } -template +template HWY_NOINLINE void Prefill(const int* tokens, size_t num_tokens, size_t pos, - const CompressedWeights& c_weights, + const WeightArrayT& weights, Activations& activations, KVCache& kv_cache, hwy::ThreadPool& pool, hwy::ThreadPool& inner_pool) { @@ -500,22 +596,22 @@ HWY_NOINLINE void Prefill(const int* tokens, size_t num_tokens, size_t pos, pool.Run( 0, num_tokens, [&](const uint64_t token_idx, size_t /*thread*/) HWY_ATTR { const int token = tokens[token_idx]; - Decompress(c_weights.c_embedder_input_embedding, token * kModelDim, + Decompress(weights.embedder_input_embedding, token * kModelDim, activations.x.data() + token_idx * kModelDim, kModelDim); MulByConst(kEmbScaling, activations.x.data() + token_idx * kModelDim, kModelDim); }); for (size_t layer = 0; layer < TConfig::kLayers; ++layer) { - const CompressedLayer* c_layer = c_weights.CLayer(layer); + const auto* layer_weights = weights.GetLayer(layer); for (size_t token_idx = 0; token_idx < num_tokens; ++token_idx) { RMSNorm(activations.x.data() + token_idx * kModelDim, - c_layer->c_pre_attention_norm_scale.data(), + layer_weights->pre_attention_norm_scale.data(), activations.pre_att_rms_out.data() + token_idx * kModelDim, kModelDim); - Attention(pos, token_idx, layer, activations, - c_layer, kv_cache, pool); + Attention(pos, token_idx, layer, activations, layer_weights, + kv_cache, pool); } // TODO: sink the loop into these functions, i.e. make them matmuls. 
@@ -525,10 +621,10 @@ HWY_NOINLINE void Prefill(const int* tokens, size_t num_tokens, size_t pos, AddFrom(activations.att_post2.data() + token_idx * kModelDim, activations.x.data() + token_idx * kModelDim, kModelDim); RMSNorm(activations.x.data() + token_idx * kModelDim, - c_layer->c_pre_ffw_norm_scale.data(), + layer_weights->pre_ffw_norm_scale.data(), activations.bf_pre_ffw_rms_out.data() + token_idx * kModelDim, kModelDim); - FFW(activations, token_idx, c_layer, inner_pool); + FFW(activations, token_idx, layer_weights, inner_pool); AddFrom(activations.ffw_out.data() + token_idx * kModelDim, activations.x.data() + token_idx * kModelDim, kModelDim); }); @@ -536,21 +632,20 @@ HWY_NOINLINE void Prefill(const int* tokens, size_t num_tokens, size_t pos, pool.Run( 0, num_tokens, [&](const uint64_t token_idx, size_t /*thread*/) HWY_ATTR { - RMSNormInplace(c_weights.c_final_norm_scale.data(), + RMSNormInplace(weights.final_norm_scale.data(), activations.x.data() + token_idx * kModelDim, kModelDim); }); } // n = 1 specialization -template -void Transformer(int token, size_t pos, - const CompressedWeights& c_weights, +template +void Transformer(int token, size_t pos, const WeightArrayT& weights, Activations& activations, KVCache& kv_cache, hwy::ThreadPool& pool, hwy::ThreadPool& inner_pool) { static constexpr size_t kLayers = TConfig::kLayers; static constexpr size_t kModelDim = TConfig::kModelDim; - Decompress(c_weights.c_embedder_input_embedding, token * kModelDim, + Decompress(weights.embedder_input_embedding, token * kModelDim, activations.x.data(), kModelDim); GEMMA_CONSTEXPR_EMBSCALING const float kEmbScaling = @@ -558,17 +653,18 @@ void Transformer(int token, size_t pos, MulByConst(kEmbScaling, activations.x.data(), kModelDim); for (size_t layer = 0; layer < kLayers; ++layer) { - const CompressedLayer* c_layer = c_weights.CLayer(layer); - RMSNorm(activations.x.data(), c_layer->c_pre_attention_norm_scale.data(), + const auto* layer_weights = weights.GetLayer(layer); + RMSNorm(activations.x.data(), + layer_weights->pre_attention_norm_scale.data(), activations.pre_att_rms_out.data(), kModelDim); - Attention(pos, 0, layer, activations, c_layer, kv_cache, pool); + Attention<1>(pos, 0, layer, activations, layer_weights, kv_cache, pool); AddFrom(activations.att_post2.data(), activations.x.data(), kModelDim); - RMSNorm(activations.x.data(), c_layer->c_pre_ffw_norm_scale.data(), + RMSNorm(activations.x.data(), layer_weights->pre_ffw_norm_scale.data(), activations.bf_pre_ffw_rms_out.data(), kModelDim); - FFW(activations, /* batch_idx = */ 0, c_layer, pool); + FFW<1>(activations, /* batch_idx = */ 0, layer_weights, pool); AddFrom(activations.ffw_out.data(), activations.x.data(), kModelDim); } - RMSNormInplace(c_weights.c_final_norm_scale.data(), activations.x.data(), + RMSNormInplace(weights.final_norm_scale.data(), activations.x.data(), kModelDim); } @@ -609,9 +705,9 @@ void GenerateImpl(GemmaImpl& gemma, size_t max_tokens, Activations& activations = *gemma.state.get(); Activations& prefill_activations = *gemma.prefill.get(); - const CompressedWeights& c_weights = - *reinterpret_cast*>( - gemma.compressed_weights.get()); + + const WeightsT& weights = + *reinterpret_cast*>(gemma.weights_u8.get()); size_t prompt_size = prompt.size(); RangeChecks(max_tokens, max_generated_tokens, prompt_size); @@ -643,9 +739,8 @@ void GenerateImpl(GemmaImpl& gemma, size_t max_tokens, HWY_DASSERT(batch_size <= kPrefillBatchSize); HWY_DASSERT(pos_offset + batch_size <= prompt_size - 1); const int* batch_tokens = 
prompt.data() + pos_offset; - Prefill(batch_tokens, batch_size, pos, - c_weights, prefill_activations, - kv_cache, pool, inner_pool); + Prefill(batch_tokens, batch_size, pos, weights, + prefill_activations, kv_cache, pool, inner_pool); for (size_t idx = 0; idx < batch_size; ++idx) { stream_token(batch_tokens[idx], 0.0f); } @@ -672,7 +767,7 @@ void GenerateImpl(GemmaImpl& gemma, size_t max_tokens, for (size_t generate_pos = 0; pos < max_tokens && generate_pos < max_generated_tokens; ++pos, ++pos_offset, ++generate_pos) { - Transformer(token, pos, c_weights, activations, kv_cache, pool, inner_pool); + Transformer(token, pos, weights, activations, kv_cache, pool, inner_pool); float* final_activation = activations.x.data(); // The condition below is always true if we are doing Prefill above. // We keep it here for clarity so that the code is correct even if Prefill @@ -680,9 +775,9 @@ void GenerateImpl(GemmaImpl& gemma, size_t max_tokens, if (pos_offset >= prompt_size - 1) { PROFILER_ZONE("Gen.Embedding"); // Generation phase - MatVec( - c_weights.c_embedder_input_embedding, 0, final_activation, - activations.logits.data(), pool); + MatVec(weights.embedder_input_embedding, + 0, final_activation, + activations.logits.data(), pool); // Barrier: must have all logits so we can subtract max. Softmax(activations.logits.data(), kVocabSize); token = SampleTopK(activations.logits.data(), kVocabSize, @@ -743,52 +838,37 @@ void ForEachTensor(const Weights* weights, CompressedWeights& c_weights, Func& func) { func("c_embedding", weights ? weights->embedder_input_embedding.data() : nullptr, - c_weights.c_embedder_input_embedding); + c_weights.embedder_input_embedding); func("c_final_norm", weights ? weights->final_norm_scale.data() : nullptr, - c_weights.c_final_norm_scale); + c_weights.final_norm_scale); - char name[16]; - for (int layer_idx = 0; layer_idx < static_cast(TConfig::kLayers); - ++layer_idx) { + char name_buf[16]; + for (int layer_idx = 0; layer_idx < TConfig::kLayers; ++layer_idx) { const size_t idx = static_cast(layer_idx); - Layer* layer = weights ? &weights->layers[idx] : nullptr; - CompressedLayer* c_layer = c_weights.CLayer(idx); + const Layer* layer = weights ? weights->GetLayer(idx) : nullptr; + CompressedLayer* layer_weights = c_weights.GetLayer(idx); - snprintf(name, sizeof(name), "pre_ff_ns_%d", layer_idx); - func(name, layer ? layer->pre_ffw_norm_scale.data() : nullptr, - c_layer->c_pre_ffw_norm_scale); +#define CALL_FUNC(name, member) \ + snprintf(name_buf, sizeof(name_buf), name "_%d", layer_idx); \ + func(name_buf, layer ? layer->member.data() : nullptr, layer_weights->member) - snprintf(name, sizeof(name), "gating_ein_%d", layer_idx); - func(name, layer ? layer->gating_einsum_w.data() : nullptr, - c_layer->c_gating_einsum_w); - - snprintf(name, sizeof(name), "linear_w_%d", layer_idx); - func(name, layer ? layer->linear_w.data() : nullptr, c_layer->c_linear_w); - snprintf(name, sizeof(name), "qkv_ein_%d", layer_idx); - - func(name, layer ? layer->qkv_einsum_w.data() : nullptr, - c_layer->c_qkv_einsum_w); - snprintf(name, sizeof(name), "att_ein_%d", layer_idx); - - func(name, layer ? layer->attn_vec_einsum_w.data() : nullptr, - c_layer->c_attn_vec_einsum_w); - - snprintf(name, sizeof(name), "pre_att_ns_%d", layer_idx); - func(name, layer ? 
layer->pre_attention_norm_scale.data() : nullptr, - c_layer->c_pre_attention_norm_scale); + CALL_FUNC("pre_ff_ns", pre_ffw_norm_scale); + CALL_FUNC("gating_ein", gating_einsum_w); + CALL_FUNC("linear_w", linear_w); + CALL_FUNC("qkv_ein", qkv_einsum_w); + CALL_FUNC("att_ein", attn_vec_einsum_w); + CALL_FUNC("pre_att_ns", pre_attention_norm_scale); +#undef CALL_FUNC } } template -hwy::AlignedFreeUniquePtr GetCompressedWeights( - const Path& weights_path, const Path& cache, hwy::ThreadPool& pool) { +hwy::AlignedFreeUniquePtr LoadCompressedWeights( + const Path& weights, hwy::ThreadPool& pool) { PROFILER_ZONE("Startup.LoadCache"); - - if (!std::filesystem::exists(weights_path.path) && - !std::filesystem::exists(cache.path)) { - HWY_ABORT( - "Either the model weights (--weights) or cached compressed weights " - "(--compressed_weights) must exist."); + if (!std::filesystem::exists(weights.path)) { + HWY_ABORT("The model weights file '%s' does not exist.", + weights.path.c_str()); } // Allocate compressed weights. @@ -798,32 +878,49 @@ hwy::AlignedFreeUniquePtr GetCompressedWeights( CWeights* c_weights = reinterpret_cast(c_weights_u8.get()); new (&c_weights->c_layer_ptrs) CompressedLayerPointers(pool); - // First attempt to load them from cache, without requiring weights. - CacheLoader loader(cache.path.c_str()); + std::array scales; + CacheLoader loader(weights.path.c_str()); ForEachTensor(nullptr, *c_weights, loader); - if (loader.ReadAll(pool)) return c_weights_u8; - - // Get weights, compress, and store in cache. - const hwy::AlignedUniquePtr> weights = - LoadWeights(weights_path); - Compressor compressor(pool); - ForEachTensor(weights.get(), *c_weights, compressor); - compressor.WriteAll(pool, cache.path.c_str()); - + loader.LoadScales(scales.data(), scales.size()); + if (!loader.ReadAll(pool)) { + HWY_ABORT("Failed to load model weights."); + } + if (TConfig::kNumTensorScales > 0) { + size_t scale_pos = 0; + for (int layer_idx = 0; layer_idx < TConfig::kLayers; ++layer_idx) { + const size_t idx = static_cast(layer_idx); + CompressedLayer* layer_weights = c_weights->GetLayer(idx); + layer_weights->attn_vec_einsum_w.set_scale(scales[scale_pos++]); + layer_weights->qkv_einsum_w.set_scale(scales[scale_pos++]); + layer_weights->gating_einsum_w.set_scale(scales[scale_pos++]); + layer_weights->linear_w.set_scale(scales[scale_pos++]); + } + HWY_ASSERT(scale_pos == TConfig::kNumTensorScales); + } return c_weights_u8; } // Type-erased because this function is called via a function pointer. 
-hwy::AlignedFreeUniquePtr GetCompressedWeightsT( - gcpp::Model model, const Path& weights, const Path& compressed_weights, - hwy::ThreadPool& pool) { +hwy::AlignedFreeUniquePtr LoadCompressedWeightsT( + gcpp::Model model, const Path& weights, hwy::ThreadPool& pool) { switch (model) { case Model::GEMMA_2B: - return GetCompressedWeights(weights, compressed_weights, - pool); + return LoadCompressedWeights(weights, pool); case Model::GEMMA_7B: - return GetCompressedWeights(weights, compressed_weights, - pool); + return LoadCompressedWeights(weights, pool); + default: + HWY_ABORT("Model type %d unknown.", static_cast(model)); + } +} + +hwy::AlignedFreeUniquePtr LoadWeightsT(gcpp::Model model, + const Path& weights, + hwy::ThreadPool& pool) { + switch (model) { + case Model::GEMMA_2B: + return LoadWeights(weights, pool); + case Model::GEMMA_7B: + return LoadWeights(weights, pool); default: HWY_ABORT("Model type %d unknown.", static_cast(model)); } @@ -846,18 +943,22 @@ void CompressWeights(const Path& weights_path, new (&c_weights->c_layer_ptrs) CompressedLayerPointers(pool); // Get weights, compress, and store. - const hwy::AlignedUniquePtr> weights = - LoadWeights(weights_path); + const bool scale_for_compression = TConfig::kNumTensorScales > 0; + const hwy::AlignedFreeUniquePtr weights_u8 = + LoadWeights(weights_path, pool, scale_for_compression); + Weights* weights = + reinterpret_cast*>(weights_u8.get()); Compressor compressor(pool); - ForEachTensor(weights.get(), *c_weights, compressor); + ForEachTensor(weights, *c_weights, compressor); + compressor.AddScales(weights->scales.data(), weights->scales.size()); compressor.WriteAll(pool, compressed_weights_path.path.c_str()); + weights->layer_ptrs.~LayerPointers(); c_weights->c_layer_ptrs.~CompressedLayerPointers(); } void CompressWeightsT(gcpp::Model model, const Path& weights, - const Path& compressed_weights, - hwy::ThreadPool& pool) { + const Path& compressed_weights, hwy::ThreadPool& pool) { switch (model) { case Model::GEMMA_2B: CompressWeights(weights, compressed_weights, pool); @@ -877,7 +978,8 @@ HWY_AFTER_NAMESPACE(); #if HWY_ONCE namespace gcpp { -HWY_EXPORT(GetCompressedWeightsT); +HWY_EXPORT(LoadCompressedWeightsT); +HWY_EXPORT(LoadWeightsT); HWY_EXPORT(CompressWeightsT); HWY_EXPORT(Generate2B); HWY_EXPORT(Generate7B); @@ -892,10 +994,9 @@ KVCache CreateKVCache(size_t size_cache_pos, size_t seq_len) { template GemmaImpl::GemmaImpl( std::unique_ptr& tokenizer, - hwy::AlignedFreeUniquePtr& compressed_weights, - hwy::ThreadPool& pool) + hwy::AlignedFreeUniquePtr& weights_u8, hwy::ThreadPool& pool) : tokenizer(std::move(tokenizer)), - compressed_weights(std::move(compressed_weights)), + weights_u8(std::move(weights_u8)), prefill(hwy::MakeUniqueAligned>()), state(hwy::MakeUniqueAligned>()) {} @@ -922,10 +1023,8 @@ void GemmaImpl::Generate( kv_cache, pool, inner_pool, stream_token, accept_token, gen, verbosity); } -Gemma::Gemma(const Path& tokenizer_path, const Path& compressed_weights_path, - const Path& weights_path, Model model_type, ModelTraining training, - hwy::ThreadPool& pool) - : model_training(training) { +Gemma::Gemma(const Path& tokenizer_path, const Path& weights, Model model_type, + hwy::ThreadPool& pool) { std::unique_ptr tokenizer; { PROFILER_ZONE("Startup.tokenizer"); @@ -934,16 +1033,21 @@ Gemma::Gemma(const Path& tokenizer_path, const Path& compressed_weights_path, HWY_ABORT("Failed to load the tokenizer file."); } } - auto compressed_weights = HWY_DYNAMIC_DISPATCH(GetCompressedWeightsT)( - model_type, weights_path, 
compressed_weights_path, pool); + + hwy::AlignedFreeUniquePtr weights_u8; + if constexpr (kWeightsAreCompressed) { + weights_u8 = + HWY_DYNAMIC_DISPATCH(LoadCompressedWeightsT)(model_type, weights, pool); + } else { + weights_u8 = HWY_DYNAMIC_DISPATCH(LoadWeightsT)(model_type, weights, pool); + } + switch (model_type) { case Model::GEMMA_2B: - impl_.reset( - new GemmaImpl(tokenizer, compressed_weights, pool)); + impl_.reset(new GemmaImpl(tokenizer, weights_u8, pool)); break; case Model::GEMMA_7B: - impl_.reset( - new GemmaImpl(tokenizer, compressed_weights, pool)); + impl_.reset(new GemmaImpl(tokenizer, weights_u8, pool)); break; default: HWY_ABORT("Model type %d unknown.", static_cast(model_type)); @@ -981,10 +1085,9 @@ void GenerateGemma(Gemma& gemma, RuntimeConfig runtime_config, } void CompressWeights(gcpp::Model model, const Path& weights, - const Path& compressed_weights, - hwy::ThreadPool& pool) { - HWY_DYNAMIC_DISPATCH(CompressWeightsT)( - model, weights, compressed_weights, pool); + const Path& compressed_weights, hwy::ThreadPool& pool) { + HWY_DYNAMIC_DISPATCH(CompressWeightsT) + (model, weights, compressed_weights, pool); } } // namespace gcpp diff --git a/gemma.h b/gemma.h index a3caa43..dc96c7b 100644 --- a/gemma.h +++ b/gemma.h @@ -24,22 +24,18 @@ // copybara:import_next_line:gemma_cpp #include "compression/compress.h" // SfpStream/NuqStream // copybara:import_next_line:gemma_cpp -#include "util/args.h" // Path +#include "configs.h" #include "hwy/aligned_allocator.h" #include "hwy/base.h" // hwy::bfloat16_t #include "hwy/contrib/thread_pool/thread_pool.h" +// copybara:import_next_line:gemma_cpp +#include "util/args.h" // Path // copybara:import_next_line:sentencepiece #include "src/sentencepiece_processor.h" namespace gcpp { -// Allowable types for GEMMA_WEIGHT_T (can be specified at compilation time): -// float, hwy::bfloat16_t, SfpStream, NuqStream -#ifndef GEMMA_WEIGHT_T -#define GEMMA_WEIGHT_T SfpStream -#endif // !GEMMA_WEIGHT_T -using WeightT = GEMMA_WEIGHT_T; - +using GemmaWeightT = GEMMA_WEIGHT_T; using EmbedderInputT = hwy::bfloat16_t; constexpr size_t kPrefillBatchSize = 16; constexpr bool kSystemPrompt = false; @@ -65,13 +61,11 @@ struct RuntimeConfig { struct GemmaInterface; struct Gemma { - Gemma(const Path& tokenizer_path, const Path& compressed_weights_path, - const Path& weights_path, Model model_type, ModelTraining training, + Gemma(const Path& tokenizer_path, const Path& weights, Model model_type, hwy::ThreadPool& pool); ~Gemma(); // must be defined after GemmaInterface's dtor is defined. 
const sentencepiece::SentencePieceProcessor* Tokenizer() const; std::unique_ptr impl_; - gcpp::ModelTraining model_training; }; KVCache CreateKVCache(Model type); // convenient workaround for now @@ -99,8 +93,7 @@ void GenerateGemma(Gemma& gemma, RuntimeConfig runtime_config, const StreamFunc& stream_token, std::mt19937& gen); void CompressWeights(gcpp::Model model, const Path& weights, - const Path& compressed_weights, - hwy::ThreadPool& pool); + const Path& compressed_weights, hwy::ThreadPool& pool); constexpr int EOS_ID = 1; diff --git a/ops_test.cc b/ops_test.cc index c711946..d74ceb8 100644 --- a/ops_test.cc +++ b/ops_test.cc @@ -369,6 +369,7 @@ CompressedArray GenerateMat(size_t offset) { } } Compress(content, ws, mat, pool); + mat.set_scale(1.0f); return mat; } diff --git a/run.cc b/run.cc index 46ac1ba..7ebf7b4 100644 --- a/run.cc +++ b/run.cc @@ -29,14 +29,14 @@ #include "gemma.h" // Gemma // copybara:import_next_line:gemma_cpp #include "util/app.h" -// copybara:import_next_line:gemma_cpp -#include "util/args.h" // HasHelp #include "hwy/base.h" #include "hwy/contrib/thread_pool/thread_pool.h" #include "hwy/highway.h" #include "hwy/per_target.h" #include "hwy/profiler.h" #include "hwy/timer.h" +// copybara:import_next_line:gemma_cpp +#include "util/args.h" // HasHelp namespace gcpp { @@ -66,7 +66,7 @@ void ShowConfig(LoaderArgs& loader, InferenceArgs& inference, AppArgs& app) { << hwy::VectorBytes() * 8 << " bits)" << "\n" << "Compiled config : " << CompiledConfig() << "\n" << "Weight Type : " - << gcpp::TypeName(gcpp::WeightT()) << "\n" + << gcpp::TypeName(gcpp::GemmaWeightT()) << "\n" << "EmbedderInput Type : " << gcpp::TypeName(gcpp::EmbedderInputT()) << "\n"; } @@ -93,10 +93,11 @@ void ShowHelp(gcpp::LoaderArgs& loader, gcpp::InferenceArgs& inference, std::cerr << "\n"; } -void ReplGemma(gcpp::Gemma& model, gcpp::KVCache& kv_cache, - hwy::ThreadPool& pool, hwy::ThreadPool& inner_pool, - const InferenceArgs& args, int verbosity, - const gcpp::AcceptFunc& accept_token, std::string& eot_line) { +void ReplGemma(gcpp::Gemma& model, ModelTraining training, + gcpp::KVCache& kv_cache, hwy::ThreadPool& pool, + hwy::ThreadPool& inner_pool, const InferenceArgs& args, + int verbosity, const gcpp::AcceptFunc& accept_token, + std::string& eot_line) { PROFILER_ZONE("Gen.misc"); int abs_pos = 0; // absolute token index over all turns int current_pos = 0; // token index within the current turn @@ -177,7 +178,7 @@ void ReplGemma(gcpp::Gemma& model, gcpp::KVCache& kv_cache, continue; } - if (model.model_training == ModelTraining::GEMMA_IT) { + if (training == ModelTraining::GEMMA_IT) { // For instruction-tuned models: add control tokens. 
prompt_string = "user\n" + prompt_string + "\nmodel\n"; @@ -232,8 +233,7 @@ void Run(LoaderArgs& loader, InferenceArgs& inference, AppArgs& app) { [](uint64_t /*task*/, size_t thread) { PinThreadToCore(thread); }); } - gcpp::Gemma model(loader.tokenizer, loader.compressed_weights, loader.weights, - loader.ModelType(), loader.ModelTraining(), pool); + gcpp::Gemma model(loader.tokenizer, loader.weights, loader.ModelType(), pool); auto kv_cache = CreateKVCache(loader.ModelType()); @@ -265,7 +265,8 @@ void Run(LoaderArgs& loader, InferenceArgs& inference, AppArgs& app) { } ReplGemma( - model, kv_cache, pool, inner_pool, inference, app.verbosity, + model, loader.ModelTraining(), kv_cache, pool, inner_pool, inference, + app.verbosity, /*accept_token=*/[](int) { return true; }, app.eot_line); } diff --git a/util/app.h b/util/app.h index 4735e8f..2348051 100644 --- a/util/app.h +++ b/util/app.h @@ -36,9 +36,9 @@ #include "configs.h" // copybara:import_next_line:gemma_cpp #include "gemma.h" +#include "hwy/base.h" // HWY_ASSERT // copybara:import_next_line:gemma_cpp #include "util/args.h" -#include "hwy/base.h" // HWY_ASSERT namespace gcpp { @@ -151,7 +151,7 @@ struct LoaderArgs : public ArgsBase { } // Returns error string or nullptr if OK. - const char* Validate() const { + const char* Validate() { const std::string model_type_lc = ToLower(model_type); if (model_type.empty()) { return "Missing --model flag, need to specify either 2b-pt, 7b-pt, " @@ -165,37 +165,42 @@ struct LoaderArgs : public ArgsBase { if (tokenizer.path.empty()) { return "Missing --tokenizer flag, a file for the tokenizer is required."; } - if (compressed_weights.path.empty()) { - return "Missing --compressed_weights flag, a file for the compressed " - "model."; + if (!compressed_weights.path.empty()) { + if (weights.path.empty()) { + weights = compressed_weights; + } else { + return "Only one of --weights and --compressed_weights can be " + "specified. To create compressed weights use the compress_weights " + "tool."; + } } + if (weights.path.empty()) { + return "Missing --weights flag, a file for the model weights."; + } + if (!weights.exists()) { + return "Can't open file specified with --weights flag."; + } return nullptr; } Path tokenizer; - Path weights; // uncompressed weights file location - Path compressed_weights; // compressed weights file location + Path weights; // weights file location + Path compressed_weights; std::string model_type; template void ForEach(const Visitor& visitor) { visitor(tokenizer, "tokenizer", Path(), "Path name of tokenizer model file.\n Required argument."); - visitor( - compressed_weights, "compressed_weights", Path(), - "Path name of compressed weights file, regenerated from `--weights` " - "file if " - "the compressed weights file does not exist.\n Required argument."); + visitor(weights, "weights", Path(), + "Path name of model weights (.sbs) file.\n Required argument."); + visitor(compressed_weights, "compressed_weights", Path(), + "Alias for --weights."); visitor(model_type, "model", std::string(), "Model type\n 2b-it = 2B parameters, instruction-tuned\n " "2b-pt = 2B parameters, pretrained\n 7b-it = 7B parameters " "instruction-tuned\n 7b-pt = 7B parameters, pretrained\n" " Required argument."); - visitor(weights, "weights", Path(), - "Path name of model weights (.sbs) file. Only required if " - "compressed_weights file is not present and needs to be " - "regenerated. This parameter is only required for compressing " - "new model weight exports, otherwise it is not needed."); } };
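The per-tensor scaling is the least self-evident part of the diff, so here is a minimal standalone sketch (plain C++, not part of the patch) of the scheme it introduces: ScaleWeights() in gemma.cc divides a tensor down so that its largest magnitude fits kMaxRange, the returned factor is stored next to the compressed data via CompressedArray::set_scale() (written/read as the "scales" blob), and the Dot() overload in compress-inl.h multiplies the factor back into the result. The plain std::vector<float> below merely stands in for the SFP-compressed array, and the name ScaledDot is illustrative only.

// Standalone sketch of the scaling scheme from this patch. The plain float
// vector stands in for the real SFP codec; kMaxRange matches the constant
// used by ScaleWeights() in gemma.cc.
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <cstdio>
#include <vector>

namespace {

constexpr float kMaxRange = 1.875f;  // largest magnitude kept after scaling

// Same logic as ScaleWeights() in gemma.cc: scale in place, return the factor.
float ScaleWeights(float* data, size_t len) {
  float maxabs = 0.0f;
  for (size_t i = 0; i < len; ++i) {
    maxabs = std::max(maxabs, std::abs(data[i]));
  }
  if (maxabs <= kMaxRange) return 1.0f;  // already fits, nothing to do
  const float scale = maxabs / kMaxRange;
  const float inv_scale = 1.0f / scale;
  for (size_t i = 0; i < len; ++i) data[i] *= inv_scale;
  return scale;
}

// Dot product against the scaled tensor; multiplying by `scale` restores the
// original magnitude, mirroring `compressed.scale() * Traits::Dot(...)`.
float ScaledDot(const std::vector<float>& weights, float scale,
                const std::vector<float>& vec) {
  float dot = 0.0f;
  for (size_t i = 0; i < weights.size(); ++i) dot += weights[i] * vec[i];
  return scale * dot;
}

}  // namespace

int main() {
  // Weights exceed kMaxRange, so ScaleWeights() shrinks them and reports scale.
  std::vector<float> weights = {4.0f, -2.0f, 0.5f, 3.0f};
  const std::vector<float> vec = {1.0f, 2.0f, 3.0f, 4.0f};

  const float reference = 4.0f * 1 - 2.0f * 2 + 0.5f * 3 + 3.0f * 4;  // 13.5

  const float scale = ScaleWeights(weights.data(), weights.size());
  const float result = ScaledDot(weights, scale, vec);

  printf("scale=%g  scaled dot=%g  reference=%g\n", scale, result, reference);
  return 0;
}

The kMaxRange constant in ScaleWeights() suggests SFP represents magnitudes only up to about 1.875; tensors with larger weights would otherwise clip during compression, so CompressWeights() rescales them before compressing (when kNumTensorScales > 0) and the stored factors are re-applied at dot-product time, leaving results unchanged.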