diff --git a/compression/compress-inl.h b/compression/compress-inl.h
index 2bac834..b3e943d 100644
--- a/compression/compress-inl.h
+++ b/compression/compress-inl.h
@@ -22,6 +22,7 @@
 #include <stdio.h>
 
 #include <array>
+#include <cmath>  // lroundf, only if COMPRESS_STATS
 
 #include "compression/blob_store.h"
 #include "compression/compress.h"
@@ -55,6 +56,7 @@ namespace hn = hwy::HWY_NAMESPACE;
 template <typename T>  // primary, must specialize
 struct CompressTraits {};
 
+// Useful for backprop/, where weights are currently f32.
 template <>
 struct CompressTraits<float> {
   using MatT = float;
@@ -267,11 +269,14 @@ struct CompressTraits<hwy::bfloat16_t> {
   }
 };
 
+// Switching floating point: 8-bit, 2..3 mantissa bits.
 template <>
 struct CompressTraits<SfpStream> {
   using MatT = SfpStream;
   static constexpr bool kSupportsEvenOdd = true;
 
+  // Callers are responsible for scaling `in` such that its magnitudes do not
+  // exceed 1.875. See CompressedArray::scale().
   template <class DF, HWY_IF_F32_D(DF)>
   static HWY_INLINE void Compress(DF df, const float* HWY_RESTRICT in,
                                   size_t num, CompressPerThread& tls,
@@ -351,6 +356,7 @@ struct CompressTraits<SfpStream> {
   }
 };
 
+// Nonuniform quantization, 4.5 bits per element, two separate streams.
 template <>
 struct CompressTraits<NuqStream> {
   using MatT = NuqStream;
@@ -525,12 +531,12 @@ HWY_INLINE float Dot(DF df, const CompressedArray<MatT, kCapacity>& compressed,
   return compressed.scale() * dot_result;
 }
 
-// Callback used by ForeachTensor.
+// Functor called for each tensor, which compresses it and stores it, along
+// with its scaling factor, to BlobStore.
 class Compressor {
  public:
   explicit Compressor(hwy::ThreadPool& pool) : pool_(pool) {}
 
-  // Called for each tensor; compresses it and stores to the cache.
   template <typename MatT, size_t kCapacity>
   void operator()(const char* name, const float* weights,
                   CompressedArray<MatT, kCapacity>& compressed) {
diff --git a/compression/compress.h b/compression/compress.h
index edb7fdb..7df9b73 100644
--- a/compression/compress.h
+++ b/compression/compress.h
@@ -79,6 +79,9 @@ class CompressedArray {
   MatT* data() { return data_.data(); }
   const MatT* data() const { return data_.data(); }
 
+  // Decoded elements should be multiplied by this to restore their original
+  // range. This is required because SfpStream can only encode a limited range
+  // of magnitudes.
   float scale() const { return scale_[0]; }
   void set_scale(float scale) { scale_[0] = scale; }
 
@@ -90,6 +93,7 @@ class CompressedArray {
 
  private:
   std::array<MatT, NumCompressed()> data_;
+  // Sized to kBlobAlign bytes because blobs are at least that large anyway.
   float scale_[kBlobAlign / sizeof(float)];
 };
 
@@ -172,6 +176,8 @@ hwy::uint128_t CacheKey(const char* name) {
   return MakeKey((std::string(1, prefix) + name).c_str());
 }
 
+// Functor called for each tensor, which loads it and its scaling factor from
+// BlobStore.
 class CacheLoader {
  public:
   explicit CacheLoader(const Path& blob_filename) {
diff --git a/compression/sfp-inl.h b/compression/sfp-inl.h
index e96a1a0..86505c7 100644
--- a/compression/sfp-inl.h
+++ b/compression/sfp-inl.h
@@ -260,7 +260,8 @@ class SfpCodec {
     hi = hn::BitwiseIfThenElse(k80, sign_in_msb, hn::ShiftRight<1>(biased_e));
   }
 
-  // Encodes `num` bf16 values from `in_bf` to `out_packed`.
+  // Encodes `num` bf16 values from `in_bf` to `out_packed`. Their magnitude
+  // must be at most 1.875.
   template <class DBF, HWY_IF_BF16_D(DBF)>
   static HWY_INLINE void Enc(DBF dbf, const hwy::bfloat16_t* HWY_RESTRICT in_bf,
                              size_t num, SfpStream* HWY_RESTRICT out_packed) {
@@ -288,7 +289,8 @@ class SfpCodec {
     }
   }
 
-  // Encodes `num` f32 values from `in_f` to `packed`.
+  // Encodes `num` f32 values from `in_f` to `out_packed`. Their magnitude
+  // must be at most 1.875.
   template <class DF, HWY_IF_F32_D(DF)>
   static HWY_INLINE void Enc(DF df, const float* HWY_RESTRICT in_f, size_t num,
                              SfpStream* HWY_RESTRICT out_packed) {