Fix file mapping: was letting the smart pointer go out of scope

Also save+print the IO mode used.

PiperOrigin-RevId: 788848165
Jan Wassenberg 2025-07-30 04:29:27 -07:00 committed by Copybara-Service
parent 2141d4788d
commit d831ddce5b
8 changed files with 95 additions and 58 deletions

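The headline fix addresses a classic lifetime bug: `MapOrReadAll` obtained the file mapping as a local smart pointer, so the mapping was destroyed when the function returned while the weight tensors still pointed into it. Below is a minimal standalone sketch of the bug pattern and the fix, with `std::unique_ptr` standing in for the real `MapPtr` type (illustrative only, not the gemma.cpp code itself):

#include <cstdio>
#include <memory>

using MapPtr = std::unique_ptr<int[]>;  // stand-in for the real mapping type

int* g_tensor = nullptr;  // points into the mapping, like the weight tensors

// Buggy: `mapped` is destroyed on return, so `g_tensor` dangles afterwards.
void MapAllBuggy() {
  MapPtr mapped(new int[4]{1, 2, 3, 4});
  g_tensor = &mapped[2];
}  // mapping released here

// Fixed: return the smart pointer so the caller owns the mapping's lifetime;
// this commit stores it in the new `WeightsPtrs::mapped_` member.
MapPtr MapAllFixed() {
  MapPtr mapped(new int[4]{1, 2, 3, 4});
  g_tensor = &mapped[2];
  return mapped;
}

int main() {
  MapPtr keep_alive = MapAllFixed();
  std::printf("%d\n", *g_tensor);  // valid: the mapping is still alive
  return 0;
}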
View File

@@ -56,7 +56,8 @@ GemmaEnv::GemmaEnv(const LoaderArgs& loader, const ThreadingArgs& threading,
   kv_caches_.push_back(KVCache(config, inference, ctx_.allocator));
   if (inference.verbosity >= 2) {
-    ShowConfig(loader, threading, inference, config, ctx_);
+    ShowConfig(loader, threading, inference, config, gemma_.WeightReadMode(),
+               ctx_);
   }
   InitGenerator(inference, gen_);
@@ -229,13 +230,15 @@ static constexpr const char* CompiledConfig() {
 void ShowConfig(const LoaderArgs& loader, const ThreadingArgs& threading,
                 const InferenceArgs& inference, const ModelConfig& config,
+                const WeightsPtrs::Mode weight_read_mode,
                 const ThreadingContext& ctx) {
   threading.Print(inference.verbosity);
   loader.Print(inference.verbosity);
   inference.Print(inference.verbosity);
-  fprintf(stderr, "Model : %s, to_bf16 %d, mmap %d\n",
-          config.Specifier().c_str(), static_cast<int>(loader.to_bf16),
-          static_cast<int>(loader.map));
+  fprintf(
+      stderr, "Model : %s, to_bf16 %d, mmap %d => %s\n",
+      config.Specifier().c_str(), static_cast<int>(loader.to_bf16),
+      static_cast<int>(loader.map), WeightsPtrs::ToString(weight_read_mode));
   if (inference.verbosity >= 2) {
     time_t now = time(nullptr);

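For reference, the new log line appends the resolved I/O mode after `=>`. With a placeholder for the model specifier and illustrative values for the Tristate flags, the output would look like:

Model : <specifier>, to_bf16 0, mmap 1 => Map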
View File

@@ -125,6 +125,7 @@ void LogSpeedStats(double time_start, size_t total_tokens);
 void ShowConfig(const LoaderArgs& loader, const ThreadingArgs& threading,
                 const InferenceArgs& inference, const ModelConfig& config,
+                WeightsPtrs::Mode weight_read_mode,
                 const ThreadingContext& ctx);
 void ShowHelp(const LoaderArgs& loader, const ThreadingArgs& threading,
               const InferenceArgs& inference);

View File

@@ -609,7 +609,9 @@ Gemma::Gemma(const LoaderArgs& loader, const InferenceArgs& inference,
       weights_(model_.Config()),
       chat_template_(model_.Tokenizer(), model_.Config().model),
       inference_(inference) {
-  weights_.ReadFromBlobs(model_, reader_, loader, inference, mat_owners_, ctx);
+  weight_read_mode_ = weights_.ReadFromBlobs(model_, reader_, loader,
+                                             inference, mat_owners_, ctx);
+  // Read everything into memory, or `weights_.mapped_` keeps the mapping alive.
   reader_.CloseFile();
 }

View File

@@ -239,6 +239,7 @@ class Gemma {
   const ModelConfig& Config() const { return model_.Config(); }
   const GemmaTokenizer& Tokenizer() const { return model_.Tokenizer(); }
   const WeightsPtrs& Weights() const { return weights_; }
+  WeightsPtrs::Mode WeightReadMode() const { return weight_read_mode_; }
   const GemmaChatTemplate& ChatTemplate() const { return chat_template_; }
   const InferenceArgs& Inference() const { return inference_; }
@@ -271,6 +272,7 @@
   ModelStore model_;
   std::vector<MatOwner> mat_owners_;
   WeightsPtrs weights_;
+  WeightsPtrs::Mode weight_read_mode_;
   GemmaChatTemplate chat_template_;
   InferenceArgs inference_;
 };

View File

@@ -285,7 +285,8 @@ void Run(const LoaderArgs& loader, const ThreadingArgs& threading,
   if (inference.IsInteractive()) {
     std::cout << "\033[2J\033[1;1H"  // clear screen
               << kAsciiArtBanner << "\n\n";
-    ShowConfig(loader, threading, inference, gemma.Config(), ctx);
+    ShowConfig(loader, threading, inference, gemma.Config(),
+               gemma.WeightReadMode(), ctx);
     std::cout << "\n" << instructions << "\n";
   }
 }

View File

@@ -223,7 +223,7 @@ void WeightsPtrs::CopyFrom(const WeightsPtrs& other) {
 }
 
 // For reshaping file tensors to the shape expected by the code. This would
-// ideally already happen in the importer. Called by WeightsOwner::Fixup.
+// ideally already happen in the importer. Called by `ReadFromBlobs`.
 void WeightsPtrs::Fixup(std::vector<MatOwner>& mat_owners,
                         ThreadingContext& ctx) {
   // TODO: use 1D parallel-for helper function
@@ -251,21 +251,11 @@ std::vector<uint32_t> WeightsPtrs::AddTensorDataToWriter(
   return serialized_mat_ptrs;
 }
 
-enum class Mode {
-  // Parallel I/O, decompress to BF16. Best for large batch sizes.
-  kReadBF16,
-  // Parallel I/O, insert row-wise padding. Safe default.
-  kRead,
-  // Best for large weights relative to available memory, especially for
-  // frequent invocations of small batches and short sequences. Adds noise to
-  // performance measurements due to I/O variability.
-  kMap
-};
-
 // Decides whether to read or map based on heuristics and user override.
-static Mode ChooseMode(uint64_t file_bytes, const LoaderArgs& loader,
-                       const InferenceArgs& inference,
-                       const Allocator& allocator) {
+static WeightsPtrs::Mode ChooseMode(uint64_t file_bytes,
+                                    const LoaderArgs& loader,
+                                    const InferenceArgs& inference,
+                                    const Allocator& allocator) {
   Tristate to_bf16 = loader.to_bf16;
   Tristate map = loader.map;
@@ -283,8 +273,8 @@ static Mode ChooseMode(uint64_t file_bytes, const LoaderArgs& loader,
   if (to_bf16 == Tristate::kTrue && map == Tristate::kTrue) {
     HWY_WARN("Cannot have to_bf16 && map, to_bf16 takes precedence.");
   }
-  if (to_bf16 == Tristate::kTrue) return Mode::kReadBF16;
-  if (map == Tristate::kTrue) return Mode::kMap;
+  if (to_bf16 == Tristate::kTrue) return WeightsPtrs::Mode::kReadBF16;
+  if (map == Tristate::kTrue) return WeightsPtrs::Mode::kMap;
 
   if (to_bf16 == Tristate::kDefault) {
     // Heuristic: sub-bf16 compression is not helpful if compute-bound.
@@ -307,8 +297,9 @@ static Mode ChooseMode(uint64_t file_bytes, const LoaderArgs& loader,
   }
 
   // If the `map` heuristic triggers, use that for safety.
-  if (map == Tristate::kTrue) return Mode::kMap;
-  return (to_bf16 == Tristate::kTrue) ? Mode::kReadBF16 : Mode::kRead;
+  if (map == Tristate::kTrue) return WeightsPtrs::Mode::kMap;
+  return (to_bf16 == Tristate::kTrue) ? WeightsPtrs::Mode::kReadBF16
+                                      : WeightsPtrs::Mode::kRead;
 }
 
 struct TensorToRead {
@@ -324,7 +315,8 @@ struct TensorToRead {
 // Allocates multiple in parallel and binds to NUMA nodes.
 static void AllocateAndBindAll(std::vector<TensorToRead>& tensors,
-                               const Mode mode, std::vector<MatOwner>& owners,
+                               const WeightsPtrs::Mode mode,
+                               std::vector<MatOwner>& owners,
                                ThreadingContext& ctx) {
   const size_t start = owners.size();
   owners.resize(start + tensors.size());
@@ -342,7 +334,7 @@
     if (tensor.prev_type == Type::kF32 || mat.Rows() < 1024) {
       tensor.keep_type = true;
       tensor.padding = MatPadding::kPacked;  // single I/O for simplicity
-    } else if (mode == Mode::kReadBF16) {
+    } else if (mode == WeightsPtrs::Mode::kReadBF16) {
       mat.SetType(Type::kBF16);
     }
@@ -354,7 +346,7 @@
 // Mode == kMap
 static void MapAll(const std::vector<TensorToRead>& tensors,
-                   const MapPtr& mapped) {
+                   const MapPtr& mapped, uint64_t file_bytes) {
   PROFILER_ZONE("Startup.Weights.Map");
   for (size_t i = 0; i < tensors.size(); ++i) {
     // SetPtr does not change the stride, but it is expected to be packed
@@ -362,10 +354,12 @@
     const size_t mat_bytes = tensors[i].mat->PackedBytes();
     // Ensure blob size matches that computed from metadata.
     HWY_ASSERT_M(mat_bytes == tensors[i].range.bytes, tensors[i].mat->Name());
-    tensors[i].mat->SetPtr(
-        const_cast<uint8_t*>(mapped.get() + tensors[i].range.offset),
-        tensors[i].mat->Stride());
+    // Ensure the blob lies within the file mapping.
+    const uint64_t offset = tensors[i].range.offset;
+    HWY_ASSERT_M(offset + mat_bytes <= file_bytes, tensors[i].mat->Name());
+    tensors[i].mat->SetPtr(const_cast<uint8_t*>(mapped.get() + offset),
+                           tensors[i].mat->Stride());
   }
 }
@@ -484,40 +478,49 @@ static void ReadBatches(const BlobReader& reader,
   });
 }
 
-// Aborts on error.
-static void MapOrReadAll(std::vector<TensorToRead>& tensors, BlobReader& reader,
-                         Mode mode, std::vector<MatOwner>& mat_owners,
-                         ThreadingContext& ctx) {
-  if (mode == Mode::kMap) {
-    MapPtr mapped = reader.file().Map();
-    if (mapped) return MapAll(tensors, mapped);
+// Aborts on error. Updates `mode` to the actual mode used. Returns mapped
+// memory or nullptr if `kMap` was not used.
+static MapPtr MapOrReadAll(std::vector<TensorToRead>& tensors,
+                           BlobReader& reader, WeightsPtrs::Mode* mode,
+                           std::vector<MatOwner>& mat_owners,
+                           ThreadingContext& ctx) {
+  if (*mode == WeightsPtrs::Mode::kMap) {
+    if (MapPtr mapped = reader.Map()) {
+      MapAll(tensors, mapped, reader.file().FileSize());
+      return mapped;
+    }
     HWY_WARN("Failed to map file (%zu KiB), reading instead.",
              static_cast<size_t>(reader.file_bytes() >> 10));
     // If we wanted to map but failed, memory is probably not plentiful, so
     // fall through to kRead because kReadBF16 requires more memory.
-    mode = Mode::kRead;
+    *mode = WeightsPtrs::Mode::kRead;
   }
 
   {
     PROFILER_ZONE("Startup.Weights.Allocate");
     // NOTE: this changes the stride of `mats`!
-    AllocateAndBindAll(tensors, mode, mat_owners, ctx);
+    AllocateAndBindAll(tensors, *mode, mat_owners, ctx);
   }
 
   hwy::ThreadPool& pool = ctx.pools.Pool();
-  if (mode == Mode::kReadBF16) return ReadAllToBF16(tensors, reader, pool);
+  if (*mode == WeightsPtrs::Mode::kReadBF16) {
+    ReadAllToBF16(tensors, reader, pool);
+    return MapPtr();
+  }
 
   const std::vector<IOBatch> batches =
       MakeBatches(tensors, reader.file_bytes());
   ReadBatches(reader, batches, pool);
+  return MapPtr();
 }
 
-void WeightsPtrs::ReadFromBlobs(const ModelStore& model, BlobReader& reader,
-                                const LoaderArgs& loader,
-                                const InferenceArgs& inference,
-                                std::vector<MatOwner>& mat_owners,
-                                ThreadingContext& ctx) {
+WeightsPtrs::Mode WeightsPtrs::ReadFromBlobs(const ModelStore& model,
                                             BlobReader& reader,
+                                             const LoaderArgs& loader,
+                                             const InferenceArgs& inference,
+                                             std::vector<MatOwner>& mat_owners,
+                                             ThreadingContext& ctx) {
   // List of tensors to read/map, and where from.
   std::vector<TensorToRead> tensors;
@@ -536,15 +539,14 @@ void WeightsPtrs::ReadFromBlobs(const ModelStore& model, BlobReader& reader,
     HWY_ABORT("Tensor %s is required but not found in file.", t.mat.Name());
   });
 
-  const Mode mode =
-      ChooseMode(reader.file_bytes(), loader, inference, ctx.allocator);
-  MapOrReadAll(tensors, reader, mode, mat_owners, ctx);
+  Mode mode = ChooseMode(reader.file_bytes(), loader, inference, ctx.allocator);
+  mapped_ = MapOrReadAll(tensors, reader, &mode, mat_owners, ctx);
 
   {
     PROFILER_ZONE("Startup.Fixup");
     Fixup(mat_owners, ctx);
   }
+  return mode;
 }
 
 }  // namespace gcpp

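To summarize the selection logic above: explicit user overrides win, with `to_bf16` taking precedence over `map`, and `kRead` is the safe default. Here is a simplified standalone sketch with the `Tristate` arguments reduced to booleans (the real `ChooseMode` also applies heuristics based on file size, available memory, and batch size for the `kDefault` cases):

#include <cstdio>

enum class Mode { kReadBF16, kRead, kMap };

Mode ChooseModeSketch(bool want_bf16, bool want_map) {
  if (want_bf16 && want_map) {
    std::fprintf(stderr,
                 "Cannot have to_bf16 && map, to_bf16 takes precedence.\n");
  }
  if (want_bf16) return Mode::kReadBF16;  // decompress to BF16 while reading
  if (want_map) return Mode::kMap;        // map the file instead of reading it
  return Mode::kRead;                     // safe default: parallel read
}

int main() {
  // Both overrides set: warns, then to_bf16 wins (prints 0 == kReadBF16).
  std::printf("%d\n", static_cast<int>(ChooseModeSketch(true, true)));
  return 0;
}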
View File

@@ -90,7 +90,7 @@ class MatFinder {
 };
 
 // Per-layer weight metadata and pointers. The tensor data is owned by
-// `WeightsOwner`.
+// `MatOwner`.
 struct LayerWeightsPtrs {
   // Initializes tensor metadata without allocating.
   // NOTE: do not store layer_idx, TransformerLayer and Attention may use
@@ -314,7 +314,7 @@
 };
 
 // Holds layer-independent weight metadata and pointers plus per-layer
-// `LayerWeightsPtrs`. The tensor data is owned by `WeightsOwner`.
+// `LayerWeightsPtrs`. The tensor data is owned by `MatOwner`.
 struct WeightsPtrs {
   explicit WeightsPtrs(const ModelConfig& config)
       : config_(config),
@@ -423,9 +423,34 @@
   // Copies only the allocated tensors in `*this` from tensors in `other`.
   void CopyFrom(const WeightsPtrs& other);
 
+  enum class Mode {
+    // Parallel I/O, decompress to BF16. Best for large batch sizes.
+    kReadBF16,
+    // Parallel I/O, insert row-wise padding. Safe default.
+    kRead,
+    // Best for large weights relative to available memory, especially for
+    // frequent invocations of small batches and short sequences. Adds noise to
+    // performance measurements due to I/O variability.
+    kMap
+  };
+
+  static const char* ToString(Mode mode) {
+    switch (mode) {
+      case Mode::kReadBF16:
+        return "ReadBF16";
+      case Mode::kRead:
+        return "Read";
+      case Mode::kMap:
+        return "Map";
+      default:
+        HWY_DASSERT(false);
+        return "?";
+    }
+  }
+
   // Reads tensor data from `BlobStore` or aborts on error. `map` is a user
-  // override for whether to map blobs or read them.
-  void ReadFromBlobs(const ModelStore& model, BlobReader& reader,
+  // override for whether to map blobs or read them. Returns the mode used.
+  Mode ReadFromBlobs(const ModelStore& model, BlobReader& reader,
                      const LoaderArgs& loader, const InferenceArgs& inference,
                      std::vector<MatOwner>& mat_owners, ThreadingContext& ctx);
@@ -436,6 +461,8 @@
   // For reshaping file tensors to the shape expected by the code. This would
   // ideally already happen in the importer. Called by ReadFromBlobs.
   void Fixup(std::vector<MatOwner>& mat_owners, ThreadingContext& ctx);
 
+  MapPtr mapped_;
 };  // `WeightsPtrs`
 
 #undef TENSOR_ARGS

View File

@@ -47,7 +47,7 @@ struct BlobRange {
 // Reads `BlobStore` header, converts keys to strings and creates a hash map for
 // faster lookups.
 // TODO(janwas): rename to BlobFinder or similar.
-// Thread-safe: it is safe to concurrently call all methods.
+// Thread-safe: it is safe to concurrently call all methods except `CloseFile`.
 class BlobReader {
  public:
   // Acquires ownership of `file` (which must be non-null) and reads its header.
@@ -56,11 +56,10 @@ class BlobReader {
   const Path& blob_path() const { return blob_path_; }
 
-  // Non-const version required for File::Map().
-  File& file() { return *file_; }
   const File& file() const { return *file_; }
   uint64_t file_bytes() const { return file_bytes_; }
 
+  MapPtr Map() { return file_->Map(); }
+  // OK to call if Map() was called; the smart pointer keeps the mapping alive.
   void CloseFile() { file_.reset(); }
 
   const std::vector<std::string>& Keys() const { return keys_; }
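Taken together with `CloseFile`, the intended call sequence for the new `Map` accessor is as follows (a sketch; `reader` construction is omitted, and the types are those declared above):

// BlobReader reader(...);       // assumed already constructed
MapPtr mapped = reader.Map();    // the smart pointer owns the mapping
reader.CloseFile();              // OK: the mapping outlives the file handle
// Blob data reached through `mapped` stays valid until `mapped` is destroyed.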