Update BlobWriter comments, WriteAll->Finalize

PiperOrigin-RevId: 790792133
This commit is contained in:
Jan Wassenberg 2025-08-04 10:00:54 -07:00 committed by Copybara-Service
parent 701841897b
commit 4e062d68f7
4 changed files with 18 additions and 18 deletions

View File

@ -463,7 +463,7 @@ void WriteSingleFile(const ModelConfig& config, const GemmaTokenizer& tokenizer,
AddBlob(kMatPtrsName, serialized_mat_ptrs, writer);
writer.WriteAll();
writer.Finalize();
}
} // namespace gcpp

View File

@ -99,7 +99,10 @@ struct BlobIO {
// For V2: the file is represented as
// Header + PadToBlobAlign + Payload + PadToEndAlign + Directory + Header
// The Header at the beginning has num_blobs == 0; and the Header at the end has
// the correct num_blobs.
// the correct num_blobs. This allows writing blobs without knowing the total
// number of them, nor holding all of them in memory. As of 2025-07-31, we
// support reading both, but always write V2. Note that its num_blobs == 0 was
// previously disallowed. To read V2, pull the latest code from the dev branch.
//
// Actual payload is indexed by the directory with keys, offset and bytes; keys
// are unique, opaque 128-bit keys.
@ -108,10 +111,6 @@ struct BlobIO {
// Additional data may be added only inside new blobs. Changes to the blob
// contents or type should be handled by renaming keys.
//
// The file format deliberately omits a version number because it is unchanging.
// Additional data may be added only inside new blobs. Changes to the blob
// contents or type should be handled by renaming keys.
//
// This class is for internal use by `BlobReader` and `BlobWriter`. Its
// interface is more low-level: fixed-size keys instead of strings.
class BlobStore {
@ -473,7 +472,8 @@ static void EnqueueChunks(size_t key_idx, uint64_t offset, uint64_t bytes,
BlobWriter::BlobWriter(const Path& filename, hwy::ThreadPool& pool)
: file_(OpenFileOrNull(filename, "w+")), pool_(pool) {
if (!file_) HWY_ABORT("Failed to open for writing %s", filename.path.c_str());
// Write a fake header to the beginning of the file.
// Write a placeholder header to the beginning of the file. If append-only,
// we will later write a footer, else we will update the header.
std::vector<uint8_t> bytes_before_blobs = BlobStore::BytesBeforeBlobsV2();
file_->Write(bytes_before_blobs.data(), bytes_before_blobs.size(), 0);
}
@ -503,13 +503,15 @@ void BlobWriter::Add(const std::string& key, const void* data, size_t bytes) {
});
}
void BlobWriter::WriteAll() {
void BlobWriter::Finalize() {
const BlobStore bs = BlobStore(keys_, blob_sizes_);
// Write the rest of the bytes, which contains: paddings + directory + header.
const auto bytes_after_blobs = bs.BytesAfterBlobs();
file_->Write(bytes_after_blobs.data(), bytes_after_blobs.size(),
file_->FileSize());
file_.reset(); // closes the file
}
} // namespace gcpp

View File

@ -111,21 +111,19 @@ class BlobReader {
std::unordered_map<std::string, size_t> key_idx_for_key_;
};
// Collects references to blobs and writes them all at once with parallel I/O.
// Writes blobs immediately using parallel I/O, and collects their metadata for
// writing the file footer.
// Thread-compatible: independent instances can be used concurrently, but it
// does not make sense to call the methods concurrently.
class BlobWriter {
public:
explicit BlobWriter(const Path& filename, hwy::ThreadPool& pool);
BlobWriter(const Path& filename, hwy::ThreadPool& pool);
// Writes the blob to disk with padding for alignment. Aborts on error.
void Add(const std::string& key, const void* data, size_t bytes);
// For `ModelStore`: this is the `key_idx` of the next blob to be added.
size_t NumAdded() const { return keys_.size(); }
// Stores all blobs to disk in the given order with padding for alignment.
// Aborts on error.
void WriteAll();
// Appends a footer and closes the file. Must be called once after all `Add`.
void Finalize();
private:
std::unique_ptr<File> file_;

View File

@ -55,7 +55,7 @@ TEST(BlobStoreTest, TestReadWrite) {
BlobWriter writer(path, pool);
writer.Add(keyA, "DATA", 5);
writer.Add(keyB, buffer.data(), sizeof(buffer));
writer.WriteAll();
writer.Finalize();
HWY_ASSERT_ARRAY_EQ(kOriginalData.data(), buffer.data(), buffer.size());
std::fill(buffer.begin(), buffer.end(), 0);
@ -126,7 +126,7 @@ TEST(BlobStoreTest, TestNumBlobs) {
}
HWY_ASSERT(keys.size() == num_blobs);
HWY_ASSERT(blobs.size() == num_blobs);
writer.WriteAll();
writer.Finalize();
BlobReader reader(path);
HWY_ASSERT_EQ(reader.Keys().size(), num_blobs);