diff --git a/gemma/model_store.cc b/gemma/model_store.cc index ca454bd..2aab1f5 100644 --- a/gemma/model_store.cc +++ b/gemma/model_store.cc @@ -463,7 +463,7 @@ void WriteSingleFile(const ModelConfig& config, const GemmaTokenizer& tokenizer, AddBlob(kMatPtrsName, serialized_mat_ptrs, writer); - writer.WriteAll(); + writer.Finalize(); } } // namespace gcpp diff --git a/io/blob_store.cc b/io/blob_store.cc index 00d4fff..9ce7b09 100644 --- a/io/blob_store.cc +++ b/io/blob_store.cc @@ -99,7 +99,10 @@ struct BlobIO { // For V2: the file is represented as // Header + PadToBlobAlign + Payload + PadToEndAlign + Directory + Header // The Header at the beginning has num_blobs == 0; and the Header at the end has -// the correct num_blobs. +// the correct num_blobs. This allows writing blobs without knowing the total +// number of them, nor holding them all in memory. As of 2025-07-31, we support +// reading both, but always write V2. Note that its num_blobs == 0 was +// previously disallowed. To read V2, pull the latest code from the dev branch. // // Actual payload is indexed by the directory with keys, offset and bytes; keys // are unique, opaque 128-bit keys. @@ -108,10 +111,6 @@ struct BlobIO { // Additional data may be added only inside new blobs. Changes to the blob // contents or type should be handled by renaming keys. // -// The file format deliberately omits a version number because it is unchanging. -// Additional data may be added only inside new blobs. Changes to the blob -// contents or type should be handled by renaming keys. -// // This class is for internal use by `BlobReader` and `BlobWriter`. Its // interface is more low-level: fixed-size keys instead of strings. 
class BlobStore { @@ -473,7 +472,8 @@ static void EnqueueChunks(size_t key_idx, uint64_t offset, uint64_t bytes, BlobWriter::BlobWriter(const Path& filename, hwy::ThreadPool& pool) : file_(OpenFileOrNull(filename, "w+")), pool_(pool) { if (!file_) HWY_ABORT("Failed to open for writing %s", filename.path.c_str()); - // Write a fake header to the beginning of the file. + // Write a placeholder header to the beginning of the file. If append-only, + // we will later write a footer, else we will update the header. std::vector bytes_before_blobs = BlobStore::BytesBeforeBlobsV2(); file_->Write(bytes_before_blobs.data(), bytes_before_blobs.size(), 0); } @@ -503,13 +503,15 @@ void BlobWriter::Add(const std::string& key, const void* data, size_t bytes) { }); } -void BlobWriter::WriteAll() { +void BlobWriter::Finalize() { const BlobStore bs = BlobStore(keys_, blob_sizes_); // Write the rest of the bytes, which contains: paddings + directory + header. const auto bytes_after_blobs = bs.BytesAfterBlobs(); file_->Write(bytes_after_blobs.data(), bytes_after_blobs.size(), file_->FileSize()); + + file_.reset(); // closes the file } } // namespace gcpp diff --git a/io/blob_store.h b/io/blob_store.h index b7dba0f..916091c 100644 --- a/io/blob_store.h +++ b/io/blob_store.h @@ -111,21 +111,19 @@ class BlobReader { std::unordered_map key_idx_for_key_; }; -// Collects references to blobs and writes them all at once with parallel I/O. +// Writes blobs immediately using parallel I/O, and collects their metadata for +// writing the file footer. // Thread-compatible: independent instances can be used concurrently, but it // does not make sense to call the methods concurrently. class BlobWriter { public: - explicit BlobWriter(const Path& filename, hwy::ThreadPool& pool); + BlobWriter(const Path& filename, hwy::ThreadPool& pool); + // Writes the blob to disk with padding for alignment. Aborts on error. 
void Add(const std::string& key, const void* data, size_t bytes); - // For `ModelStore`: this is the `key_idx` of the next blob to be added. - size_t NumAdded() const { return keys_.size(); } - - // Stores all blobs to disk in the given order with padding for alignment. - // Aborts on error. - void WriteAll(); + // Appends a footer and closes the file. Must be called once after all `Add`. + void Finalize(); private: std::unique_ptr file_; diff --git a/io/blob_store_test.cc b/io/blob_store_test.cc index 36ba27f..078a974 100644 --- a/io/blob_store_test.cc +++ b/io/blob_store_test.cc @@ -55,7 +55,7 @@ TEST(BlobStoreTest, TestReadWrite) { BlobWriter writer(path, pool); writer.Add(keyA, "DATA", 5); writer.Add(keyB, buffer.data(), sizeof(buffer)); - writer.WriteAll(); + writer.Finalize(); HWY_ASSERT_ARRAY_EQ(kOriginalData.data(), buffer.data(), buffer.size()); std::fill(buffer.begin(), buffer.end(), 0); @@ -126,7 +126,7 @@ TEST(BlobStoreTest, TestNumBlobs) { } HWY_ASSERT(keys.size() == num_blobs); HWY_ASSERT(blobs.size() == num_blobs); - writer.WriteAll(); + writer.Finalize(); BlobReader reader(path); HWY_ASSERT_EQ(reader.Keys().size(), num_blobs);