mirror of https://github.com/google/gemma.cpp.git
Update BlobWriter comments, WriteAll->Finalize
PiperOrigin-RevId: 790792133
This commit is contained in:
parent
701841897b
commit
4e062d68f7
|
|
@ -463,7 +463,7 @@ void WriteSingleFile(const ModelConfig& config, const GemmaTokenizer& tokenizer,
|
|||
|
||||
AddBlob(kMatPtrsName, serialized_mat_ptrs, writer);
|
||||
|
||||
writer.WriteAll();
|
||||
writer.Finalize();
|
||||
}
|
||||
|
||||
} // namespace gcpp
|
||||
|
|
|
|||
|
|
@ -99,7 +99,10 @@ struct BlobIO {
|
|||
// For V2: the file is represented as
|
||||
// Header + PadToBlobAlign + Payload + PadToEndAlign + Directory + Header
|
||||
// The Header at the beginning has num_blobs == 0; and the Header at the end has
|
||||
// the correct num_blobs.
|
||||
// the correct num_blobs. This allows writing blobs without knowing the total
|
||||
// number of them, nor holding all of them in memory. As of 2025-07-31, we support
|
||||
// reading both, but always write V2. Note that a header with num_blobs == 0 was
|
||||
// previously disallowed. To read V2, pull the latest code from the dev branch.
|
||||
//
|
||||
// Actual payload is indexed by the directory with keys, offset and bytes; keys
|
||||
// are unique, opaque 128-bit keys.
|
||||
|
|
@ -108,10 +111,6 @@ struct BlobIO {
|
|||
// Additional data may be added only inside new blobs. Changes to the blob
|
||||
// contents or type should be handled by renaming keys.
|
||||
//
|
||||
// The file format deliberately omits a version number because it is unchanging.
|
||||
// Additional data may be added only inside new blobs. Changes to the blob
|
||||
// contents or type should be handled by renaming keys.
|
||||
//
|
||||
// This class is for internal use by `BlobReader` and `BlobWriter`. Its
|
||||
// interface is more low-level: fixed-size keys instead of strings.
|
||||
class BlobStore {
|
||||
|
|
@ -473,7 +472,8 @@ static void EnqueueChunks(size_t key_idx, uint64_t offset, uint64_t bytes,
|
|||
BlobWriter::BlobWriter(const Path& filename, hwy::ThreadPool& pool)
|
||||
: file_(OpenFileOrNull(filename, "w+")), pool_(pool) {
|
||||
if (!file_) HWY_ABORT("Failed to open for writing %s", filename.path.c_str());
|
||||
// Write a fake header to the beginning of the file.
|
||||
// Write a placeholder header to the beginning of the file. If append-only,
|
||||
// we will later write a footer, else we will update the header.
|
||||
std::vector<uint8_t> bytes_before_blobs = BlobStore::BytesBeforeBlobsV2();
|
||||
file_->Write(bytes_before_blobs.data(), bytes_before_blobs.size(), 0);
|
||||
}
|
||||
|
|
@ -503,13 +503,15 @@ void BlobWriter::Add(const std::string& key, const void* data, size_t bytes) {
|
|||
});
|
||||
}
|
||||
|
||||
void BlobWriter::WriteAll() {
|
||||
void BlobWriter::Finalize() {
|
||||
const BlobStore bs = BlobStore(keys_, blob_sizes_);
|
||||
|
||||
// Write the remaining bytes: padding + directory + header.
|
||||
const auto bytes_after_blobs = bs.BytesAfterBlobs();
|
||||
file_->Write(bytes_after_blobs.data(), bytes_after_blobs.size(),
|
||||
file_->FileSize());
|
||||
|
||||
file_.reset(); // closes the file
|
||||
}
|
||||
|
||||
} // namespace gcpp
|
||||
|
|
|
|||
|
|
@ -111,21 +111,19 @@ class BlobReader {
|
|||
std::unordered_map<std::string, size_t> key_idx_for_key_;
|
||||
};
|
||||
|
||||
// Collects references to blobs and writes them all at once with parallel I/O.
|
||||
// Writes blobs immediately using parallel I/O, and collects their metadata for
|
||||
// writing the file footer.
|
||||
// Thread-compatible: independent instances can be used concurrently, but it
|
||||
// does not make sense to call the methods concurrently.
|
||||
class BlobWriter {
|
||||
public:
|
||||
explicit BlobWriter(const Path& filename, hwy::ThreadPool& pool);
|
||||
BlobWriter(const Path& filename, hwy::ThreadPool& pool);
|
||||
|
||||
// Writes the blob to disk with padding for alignment. Aborts on error.
|
||||
void Add(const std::string& key, const void* data, size_t bytes);
|
||||
|
||||
// For `ModelStore`: this is the `key_idx` of the next blob to be added.
|
||||
size_t NumAdded() const { return keys_.size(); }
|
||||
|
||||
// Stores all blobs to disk in the given order with padding for alignment.
|
||||
// Aborts on error.
|
||||
void WriteAll();
|
||||
// Appends a footer and closes the file. Call once, after all `Add` calls.
|
||||
void Finalize();
|
||||
|
||||
private:
|
||||
std::unique_ptr<File> file_;
|
||||
|
|
|
|||
|
|
@ -55,7 +55,7 @@ TEST(BlobStoreTest, TestReadWrite) {
|
|||
BlobWriter writer(path, pool);
|
||||
writer.Add(keyA, "DATA", 5);
|
||||
writer.Add(keyB, buffer.data(), sizeof(buffer));
|
||||
writer.WriteAll();
|
||||
writer.Finalize();
|
||||
HWY_ASSERT_ARRAY_EQ(kOriginalData.data(), buffer.data(), buffer.size());
|
||||
|
||||
std::fill(buffer.begin(), buffer.end(), 0);
|
||||
|
|
@ -126,7 +126,7 @@ TEST(BlobStoreTest, TestNumBlobs) {
|
|||
}
|
||||
HWY_ASSERT(keys.size() == num_blobs);
|
||||
HWY_ASSERT(blobs.size() == num_blobs);
|
||||
writer.WriteAll();
|
||||
writer.Finalize();
|
||||
|
||||
BlobReader reader(path);
|
||||
HWY_ASSERT_EQ(reader.Keys().size(), num_blobs);
|
||||
|
|
|
|||
Loading…
Reference in New Issue