mirror of https://github.com/google/gemma.cpp.git
Update BlobWriter comments, WriteAll->Finalize
PiperOrigin-RevId: 790792133
This commit is contained in:
parent
701841897b
commit
4e062d68f7
|
|
@ -463,7 +463,7 @@ void WriteSingleFile(const ModelConfig& config, const GemmaTokenizer& tokenizer,
|
||||||
|
|
||||||
AddBlob(kMatPtrsName, serialized_mat_ptrs, writer);
|
AddBlob(kMatPtrsName, serialized_mat_ptrs, writer);
|
||||||
|
|
||||||
writer.WriteAll();
|
writer.Finalize();
|
||||||
}
|
}
|
||||||
|
|
||||||
} // namespace gcpp
|
} // namespace gcpp
|
||||||
|
|
|
||||||
|
|
@ -99,7 +99,10 @@ struct BlobIO {
|
||||||
// For V2: the file is represented as
|
// For V2: the file is represented as
|
||||||
// Header + PadToBlobAlign + Payload + PadToEndAlign + Directory + Header
|
// Header + PadToBlobAlign + Payload + PadToEndAlign + Directory + Header
|
||||||
// The Header at the beginning has num_blobs == 0; and the Header at the end has
|
// The Header at the beginning has num_blobs == 0; and the Header at the end has
|
||||||
// the correct num_blobs.
|
// the correct num_blobs. This allows writing blobs without knowing the total
|
||||||
|
// number of them, nor holding all of them in memory. As of 2025-07-31, we support
|
||||||
|
// reading both, but always write V2. Note that the leading Header's num_blobs == 0 was
|
||||||
|
// previously disallowed. To read V2, pull the latest code from the dev branch.
|
||||||
//
|
//
|
||||||
// Actual payload is indexed by the directory with keys, offset and bytes; keys
|
// Actual payload is indexed by the directory with keys, offset and bytes; keys
|
||||||
// are unique, opaque 128-bit keys.
|
// are unique, opaque 128-bit keys.
|
||||||
|
|
@ -108,10 +111,6 @@ struct BlobIO {
|
||||||
// Additional data may be added only inside new blobs. Changes to the blob
|
// Additional data may be added only inside new blobs. Changes to the blob
|
||||||
// contents or type should be handled by renaming keys.
|
// contents or type should be handled by renaming keys.
|
||||||
//
|
//
|
||||||
// The file format deliberately omits a version number because it is unchanging.
|
|
||||||
// Additional data may be added only inside new blobs. Changes to the blob
|
|
||||||
// contents or type should be handled by renaming keys.
|
|
||||||
//
|
|
||||||
// This class is for internal use by `BlobReader` and `BlobWriter`. Its
|
// This class is for internal use by `BlobReader` and `BlobWriter`. Its
|
||||||
// interface is more low-level: fixed-size keys instead of strings.
|
// interface is more low-level: fixed-size keys instead of strings.
|
||||||
class BlobStore {
|
class BlobStore {
|
||||||
|
|
@ -473,7 +472,8 @@ static void EnqueueChunks(size_t key_idx, uint64_t offset, uint64_t bytes,
|
||||||
BlobWriter::BlobWriter(const Path& filename, hwy::ThreadPool& pool)
|
BlobWriter::BlobWriter(const Path& filename, hwy::ThreadPool& pool)
|
||||||
: file_(OpenFileOrNull(filename, "w+")), pool_(pool) {
|
: file_(OpenFileOrNull(filename, "w+")), pool_(pool) {
|
||||||
if (!file_) HWY_ABORT("Failed to open for writing %s", filename.path.c_str());
|
if (!file_) HWY_ABORT("Failed to open for writing %s", filename.path.c_str());
|
||||||
// Write a fake header to the beginning of the file.
|
// Write a placeholder header to the beginning of the file. If append-only,
|
||||||
|
// we will later write a footer, else we will update the header.
|
||||||
std::vector<uint8_t> bytes_before_blobs = BlobStore::BytesBeforeBlobsV2();
|
std::vector<uint8_t> bytes_before_blobs = BlobStore::BytesBeforeBlobsV2();
|
||||||
file_->Write(bytes_before_blobs.data(), bytes_before_blobs.size(), 0);
|
file_->Write(bytes_before_blobs.data(), bytes_before_blobs.size(), 0);
|
||||||
}
|
}
|
||||||
|
|
@ -503,13 +503,15 @@ void BlobWriter::Add(const std::string& key, const void* data, size_t bytes) {
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
void BlobWriter::WriteAll() {
|
void BlobWriter::Finalize() {
|
||||||
const BlobStore bs = BlobStore(keys_, blob_sizes_);
|
const BlobStore bs = BlobStore(keys_, blob_sizes_);
|
||||||
|
|
||||||
// Write the rest of the bytes, which contains: paddings + directory + header.
|
// Write the rest of the bytes, which contains: paddings + directory + header.
|
||||||
const auto bytes_after_blobs = bs.BytesAfterBlobs();
|
const auto bytes_after_blobs = bs.BytesAfterBlobs();
|
||||||
file_->Write(bytes_after_blobs.data(), bytes_after_blobs.size(),
|
file_->Write(bytes_after_blobs.data(), bytes_after_blobs.size(),
|
||||||
file_->FileSize());
|
file_->FileSize());
|
||||||
|
|
||||||
|
file_.reset(); // closes the file
|
||||||
}
|
}
|
||||||
|
|
||||||
} // namespace gcpp
|
} // namespace gcpp
|
||||||
|
|
|
||||||
|
|
@ -111,21 +111,19 @@ class BlobReader {
|
||||||
std::unordered_map<std::string, size_t> key_idx_for_key_;
|
std::unordered_map<std::string, size_t> key_idx_for_key_;
|
||||||
};
|
};
|
||||||
|
|
||||||
// Collects references to blobs and writes them all at once with parallel I/O.
|
// Writes blobs immediately using parallel I/O, and collects their metadata for
|
||||||
|
// writing the file footer.
|
||||||
// Thread-compatible: independent instances can be used concurrently, but it
|
// Thread-compatible: independent instances can be used concurrently, but it
|
||||||
// does not make sense to call the methods concurrently.
|
// does not make sense to call the methods concurrently.
|
||||||
class BlobWriter {
|
class BlobWriter {
|
||||||
public:
|
public:
|
||||||
explicit BlobWriter(const Path& filename, hwy::ThreadPool& pool);
|
BlobWriter(const Path& filename, hwy::ThreadPool& pool);
|
||||||
|
|
||||||
|
// Writes the blob to disk with padding for alignment. Aborts on error.
|
||||||
void Add(const std::string& key, const void* data, size_t bytes);
|
void Add(const std::string& key, const void* data, size_t bytes);
|
||||||
|
|
||||||
// For `ModelStore`: this is the `key_idx` of the next blob to be added.
|
// Appends a footer and closes the file. Must be called once after all `Add` calls.
|
||||||
size_t NumAdded() const { return keys_.size(); }
|
void Finalize();
|
||||||
|
|
||||||
// Stores all blobs to disk in the given order with padding for alignment.
|
|
||||||
// Aborts on error.
|
|
||||||
void WriteAll();
|
|
||||||
|
|
||||||
private:
|
private:
|
||||||
std::unique_ptr<File> file_;
|
std::unique_ptr<File> file_;
|
||||||
|
|
|
||||||
|
|
@ -55,7 +55,7 @@ TEST(BlobStoreTest, TestReadWrite) {
|
||||||
BlobWriter writer(path, pool);
|
BlobWriter writer(path, pool);
|
||||||
writer.Add(keyA, "DATA", 5);
|
writer.Add(keyA, "DATA", 5);
|
||||||
writer.Add(keyB, buffer.data(), sizeof(buffer));
|
writer.Add(keyB, buffer.data(), sizeof(buffer));
|
||||||
writer.WriteAll();
|
writer.Finalize();
|
||||||
HWY_ASSERT_ARRAY_EQ(kOriginalData.data(), buffer.data(), buffer.size());
|
HWY_ASSERT_ARRAY_EQ(kOriginalData.data(), buffer.data(), buffer.size());
|
||||||
|
|
||||||
std::fill(buffer.begin(), buffer.end(), 0);
|
std::fill(buffer.begin(), buffer.end(), 0);
|
||||||
|
|
@ -126,7 +126,7 @@ TEST(BlobStoreTest, TestNumBlobs) {
|
||||||
}
|
}
|
||||||
HWY_ASSERT(keys.size() == num_blobs);
|
HWY_ASSERT(keys.size() == num_blobs);
|
||||||
HWY_ASSERT(blobs.size() == num_blobs);
|
HWY_ASSERT(blobs.size() == num_blobs);
|
||||||
writer.WriteAll();
|
writer.Finalize();
|
||||||
|
|
||||||
BlobReader reader(path);
|
BlobReader reader(path);
|
||||||
HWY_ASSERT_EQ(reader.Keys().size(), num_blobs);
|
HWY_ASSERT_EQ(reader.Keys().size(), num_blobs);
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue