diff --git a/compression/blob_store.h b/compression/blob_store.h index 94bbace..d98235c 100644 --- a/compression/blob_store.h +++ b/compression/blob_store.h @@ -104,6 +104,9 @@ class BlobWriter { // Stores all blobs to disk in the given order with padding for alignment. BlobError WriteAll(hwy::ThreadPool& pool, const Path& filename); + // Returns the number of blobs added. + size_t DebugNumBlobsAdded() const { return keys_.size(); } + private: std::vector keys_; std::vector> blobs_; diff --git a/compression/compress-inl.h b/compression/compress-inl.h index 6d8ba28..651b8a2 100644 --- a/compression/compress-inl.h +++ b/compression/compress-inl.h @@ -705,6 +705,9 @@ class Compressor { return err; } + // Returns the number of blobs added. + size_t DebugNumBlobsAdded() const { return writer_.DebugNumBlobsAdded(); } + private: CompressWorkingSet work_; hwy::ThreadPool& pool_; diff --git a/compression/compress.h b/compression/compress.h index fc13bdf..b717ac8 100644 --- a/compression/compress.h +++ b/compression/compress.h @@ -216,8 +216,9 @@ class MatPtrT : public MatPtr { : MatPtr(name, TypeEnum(), sizeof(MatT), rows, cols) {} // Construction from TensorIndex entry to remove duplication of sizes. 
MatPtrT(const std::string& name, const TensorIndex& tensor_index) + : MatPtrT(name, tensor_index.FindName(name)) {} + MatPtrT(const std::string& name, const TensorInfo* tensor) : MatPtr(name, TypeEnum(), sizeof(MatT), 0, 0) { - const TensorInfo* tensor = tensor_index.FindName(name); HWY_ASSERT(tensor != nullptr); cols_ = tensor->shape.back(); rows_ = 1; diff --git a/compression/python/BUILD.bazel b/compression/python/BUILD.bazel index f12d8be..016f90f 100644 --- a/compression/python/BUILD.bazel +++ b/compression/python/BUILD.bazel @@ -15,6 +15,7 @@ cc_library( visibility = ["//visibility:private"], deps = [ "@abseil-cpp//absl/types:span", + "//:common", "//compression:compress", "//compression:io", "@highway//:hwy", @@ -28,6 +29,7 @@ pybind_extension( deps = [ ":compression_clif_aux", "@abseil-cpp//absl/types:span", + "//:common", "//compression:sfp", ], ) diff --git a/compression/python/compression_clif_aux.cc b/compression/python/compression_clif_aux.cc index e313bbe..fe0e128 100644 --- a/compression/python/compression_clif_aux.cc +++ b/compression/python/compression_clif_aux.cc @@ -1,5 +1,7 @@ #include "compression/python/compression_clif_aux.h" +#include +#include #include #include @@ -22,6 +24,7 @@ #include "absl/types/span.h" #include "compression/io.h" +#include "gemma/tensor_index.h" #include "hwy/base.h" #include "hwy/contrib/thread_pool/thread_pool.h" @@ -32,7 +35,8 @@ class WriterInterface { virtual ~WriterInterface() = default; virtual void Insert(std::string name, absl::Span weights, - Type type) = 0; + Type type, const TensorInfo& tensor_info, + float scale) = 0; virtual void InsertSfp(std::string name, absl::Span weights) = 0; virtual void InsertNUQ(std::string name, absl::Span weights) = 0; virtual void InsertBfloat16(std::string name, @@ -41,6 +45,8 @@ class WriterInterface { absl::Span weights) = 0; virtual void AddScales(const std::vector& scales) = 0; + virtual size_t DebugNumBlobsAdded() const = 0; + virtual int Write(std::string path) = 0; }; 
@@ -65,24 +71,39 @@ class SbsWriterImpl : public WriterInterface { std::string decorated_name = storage.CacheName(); compressor_(&storage, decorated_name.c_str(), weights.data()); } + template + void AllocateWithShape(const std::string& name, + absl::Span weights, + const TensorInfo& tensor_info, float scale) { + MatPtrT storage(name, &tensor_info); + storage.set_scale(scale); + storage.SetNumElements(CompressedArrayElements(weights.size())); + model_memory_.push_back(storage); + if (mode_ == CompressorMode::kTEST_ONLY) return; + model_memory_.back().Allocate(); + storage.SetPtr(model_memory_.back()); + std::string decorated_name = storage.CacheName(); + compressor_(&storage, decorated_name.c_str(), weights.data()); + } public: - SbsWriterImpl() : pool_(0), compressor_(pool_) {} + explicit SbsWriterImpl(CompressorMode mode) + : pool_(0), compressor_(pool_), mode_(mode) {} - void Insert(std::string name, absl::Span weights, - Type type) override { + void Insert(std::string name, absl::Span weights, Type type, + const TensorInfo& tensor_info, float scale) override { switch (type) { case Type::kSFP: - AllocateAndCompress(name, weights); + AllocateWithShape(name, weights, tensor_info, scale); break; case Type::kNUQ: - AllocateAndCompress(name, weights); + AllocateWithShape(name, weights, tensor_info, scale); break; case Type::kBF16: - AllocateAndCompress(name, weights); + AllocateWithShape(name, weights, tensor_info, scale); break; case Type::kF32: - AllocateAndCompress(name, weights); + AllocateWithShape(name, weights, tensor_info, scale); break; default: HWY_ABORT("Unsupported type"); @@ -112,6 +133,12 @@ class SbsWriterImpl : public WriterInterface { compressor_.AddScales(scales_.data(), scales_.size()); } + // Returns the number of blobs added. 
+ size_t DebugNumBlobsAdded() const override { + if (mode_ == CompressorMode::kTEST_ONLY) return model_memory_.size(); + return compressor_.DebugNumBlobsAdded(); + } + int Write(std::string path) override { return compressor_.WriteAll(pool_, gcpp::Path(path)); } @@ -121,9 +148,12 @@ class SbsWriterImpl : public WriterInterface { CompressWorkingSet working_set_; std::vector model_memory_; std::vector scales_; + CompressorMode mode_; }; -WriterInterface* NewSbsWriter() { return new SbsWriterImpl(); } +WriterInterface* NewSbsWriter(CompressorMode mode) { + return new SbsWriterImpl(mode); +} } // namespace HWY_NAMESPACE } // namespace gcpp @@ -134,12 +164,13 @@ namespace gcpp { HWY_EXPORT(NewSbsWriter); -SbsWriter::SbsWriter() : impl_(HWY_DYNAMIC_DISPATCH(NewSbsWriter)()) {} +SbsWriter::SbsWriter(CompressorMode mode) + : impl_(HWY_DYNAMIC_DISPATCH(NewSbsWriter)(mode)) {} SbsWriter::~SbsWriter() = default; void SbsWriter::Insert(std::string name, absl::Span weights, - Type type) { - impl_->Insert(name, weights, type); + Type type, const TensorInfo& tensor_info, float scale) { + impl_->Insert(name, weights, type, tensor_info, scale); } void SbsWriter::InsertSfp(std::string name, absl::Span weights) { impl_->InsertSfp(name, weights); @@ -158,6 +189,11 @@ void SbsWriter::InsertFloat(std::string name, absl::Span weights) { void SbsWriter::AddScales(const std::vector& scales) { impl_->AddScales(scales); } + +size_t SbsWriter::DebugNumBlobsAdded() const { + return impl_->DebugNumBlobsAdded(); +} + int SbsWriter::Write(std::string path) { return impl_->Write(path); } } // namespace gcpp diff --git a/compression/python/compression_clif_aux.h b/compression/python/compression_clif_aux.h index cd2e4f1..72eb4e4 100644 --- a/compression/python/compression_clif_aux.h +++ b/compression/python/compression_clif_aux.h @@ -1,29 +1,44 @@ #ifndef THIRD_PARTY_GEMMA_CPP_COMPRESSION_PYTHON_COMPRESSION_CLIF_AUX_H_ #define THIRD_PARTY_GEMMA_CPP_COMPRESSION_PYTHON_COMPRESSION_CLIF_AUX_H_ +#include
#include #include #include #include "absl/types/span.h" #include "compression/shared.h" +#include "gemma/tensor_index.h" namespace gcpp { +// How to process the data. +enum class CompressorMode { + // No compression, no write to file, just for testing. + kTEST_ONLY, + // Old-style compression, no table of contents. + kNO_TOC, + // New-style compression, with table of contents. + kWITH_TOC, +}; + class WriterInterface; class SbsWriter { public: - SbsWriter(); + explicit SbsWriter(CompressorMode mode); ~SbsWriter(); - void Insert(std::string name, absl::Span weights, Type type); + void Insert(std::string name, absl::Span weights, Type type, + const TensorInfo& tensor_info, float scale); void InsertSfp(std::string name, absl::Span weights); void InsertNUQ(std::string name, absl::Span weights); void InsertBfloat16(std::string name, absl::Span weights); void InsertFloat(std::string name, absl::Span weights); void AddScales(const std::vector& scales); + size_t DebugNumBlobsAdded() const; + int Write(std::string path); private: diff --git a/compression/python/compression_extension.cc b/compression/python/compression_extension.cc index 5fcebdf..4669f62 100644 --- a/compression/python/compression_extension.cc +++ b/compression/python/compression_extension.cc @@ -9,6 +9,7 @@ #include "compression/python/compression_clif_aux.h" #include "compression/shared.h" +using gcpp::CompressorMode; using gcpp::SbsWriter; namespace py = pybind11; @@ -23,18 +24,24 @@ void wrap_span(SbsWriter& writer, std::string name, py::array_t data) { } template void wrap_span_typed(SbsWriter& writer, std::string name, - py::array_t data, gcpp::Type type) { + py::array_t data, gcpp::Type type, + gcpp::TensorInfo tensor_info, float scale) { if (data.ndim() != 1 || data.strides(0) != sizeof(float)) { throw std::domain_error("Input array must be 1D and densely packed."); } std::invoke(Func, writer, name, absl::MakeSpan(data.data(0), data.size()), - type); + type, tensor_info, scale); } } // namespace 
PYBIND11_MODULE(compression, m) { + py::enum_(m, "CompressorMode") + .value("TEST_ONLY", CompressorMode::kTEST_ONLY) + .value("NO_TOC", CompressorMode::kNO_TOC) + .value("WITH_TOC", CompressorMode::kWITH_TOC); + py::class_(m, "SbsWriter") - .def(py::init<>()) + .def(py::init()) // NOTE: Individual compression backends may impose constraints on the // array length, such as a minimum of (say) 32 elements. .def("insert", wrap_span_typed<&SbsWriter::Insert>) @@ -43,5 +50,6 @@ PYBIND11_MODULE(compression, m) { .def("insert_bf16", wrap_span<&SbsWriter::InsertBfloat16>) .def("insert_float", wrap_span<&SbsWriter::InsertFloat>) .def("add_scales", &SbsWriter::AddScales) + .def("debug_num_blobs_added", &SbsWriter::DebugNumBlobsAdded) .def("write", &SbsWriter::Write); } diff --git a/compression/python/compression_test.py b/compression/python/compression_test.py index e25f06b..077e513 100644 --- a/compression/python/compression_test.py +++ b/compression/python/compression_test.py @@ -11,12 +11,18 @@ class CompressionTest(unittest.TestCase): def test_sbs_writer(self): temp_file = self.create_tempfile("test.sbs") + tensor_info = configs.TensorInfo() + tensor_info.name = "foo" + tensor_info.axes = [0] + tensor_info.shape = [192] - writer = compression.SbsWriter() + writer = compression.SbsWriter(compression.CompressorMode.NO_TOC) writer.insert( "foo", np.array([0.0012] * 128 + [0.001] * 64, dtype=np.float32), configs.Type.kSFP, + tensor_info, + 1.0, ) writer.insert_sfp( "bar", np.array([0.000375] * 128 + [0.00009] * 128, dtype=np.float32) @@ -30,6 +36,7 @@ class CompressionTest(unittest.TestCase): writer.insert_float( "quux", np.array([0.000375] * 128 + [0.00006] * 128, dtype=np.float32) ) + self.assertEqual(writer.debug_num_blobs_added(), 5) self.assertEqual(writer.write(temp_file.full_path), 0) diff --git a/gemma/tensor_index.cc b/gemma/tensor_index.cc index 68d9e49..b6afa4d 100644 --- a/gemma/tensor_index.cc +++ b/gemma/tensor_index.cc @@ -138,8 +138,8 @@ std::vector 
ImageLayerTensors(const ModelConfig& config, TensorInfo{ .name = "qkv_ein_w", .source_names = {"MultiHeadDotProductAttention_0/qkv/kernel"}, - .axes = {2, 0, 3, 1}, - .shape = {layer_config.heads, 3, layer_config.qkv_dim, + .axes = {1, 2, 0}, + .shape = {layer_config.heads, 3 * layer_config.qkv_dim, config.vit_model_dim}, .min_size = Type::kBF16, }, @@ -156,7 +156,7 @@ std::vector ImageLayerTensors(const ModelConfig& config, .name = "k_ein_b", .source_names = {"MultiHeadDotProductAttention_0/key/bias"}, .axes = {0, 1}, - .shape = {layer_config.heads, layer_config.qkv_dim}, + .shape = {layer_config.kv_heads, layer_config.qkv_dim}, .concat_names = {""}, .min_size = Type::kF32, }, @@ -164,15 +164,16 @@ std::vector ImageLayerTensors(const ModelConfig& config, .name = "v_ein_b", .source_names = {"MultiHeadDotProductAttention_0/value/bias"}, .axes = {0, 1}, - .shape = {layer_config.heads, layer_config.qkv_dim}, + .shape = {layer_config.kv_heads, layer_config.qkv_dim}, .concat_names = {""}, .min_size = Type::kF32, }, TensorInfo{ .name = "qkv_ein_b", .source_names = {"MultiHeadDotProductAttention_0/qkv/bias"}, - .axes = {1, 0, 2}, - .shape = {layer_config.heads * 3, layer_config.qkv_dim}, + .axes = {0, 1}, + .shape = {layer_config.heads + layer_config.kv_heads * 2, + layer_config.qkv_dim}, .min_size = Type::kF32, }, TensorInfo{ @@ -243,14 +244,15 @@ std::vector LLMLayerTensors(const ModelConfig& config, .name = "qkv1_w", .source_names = {"attn/q_einsum/w"}, .axes = {0, 2, 1}, - .shape = {layer_config.heads, layer_config.qkv_dim, config.model_dim}, + .shape = {layer_config.heads * layer_config.qkv_dim, + config.model_dim}, .concat_names = {"qkv_ein", "qkv2_w"}, }, TensorInfo{ .name = "qkv2_w", .source_names = {"attn/kv_einsum/w"}, .axes = {1, 0, 3, 2}, - .shape = {2 * layer_config.kv_heads, layer_config.qkv_dim, + .shape = {2 * layer_config.kv_heads * layer_config.qkv_dim, config.model_dim}, .concat_names = {""}, }, @@ -279,8 +281,9 @@ std::vector LLMLayerTensors(const 
ModelConfig& config, .name = "qkv_ein", .source_names = {"attn/qkv_einsum/w"}, .axes = {1, 0, 3, 2}, - .shape = {(layer_config.heads + 2 * layer_config.kv_heads), - layer_config.qkv_dim, config.model_dim}, + .shape = {(layer_config.heads + 2 * layer_config.kv_heads) * + layer_config.qkv_dim, + config.model_dim}, }, TensorInfo{ .name = "attn_ob", @@ -535,7 +538,8 @@ TensorIndex::TensorIndex(const ModelConfig& config, int llm_layer_idx, } } -TensorInfo TensorIndex::GetTensorInfo(const std::string& path) const { +TensorInfo TensorIndex::TensorInfoFromSourcePath( + const std::string& path) const { for (const auto& tensor : tensors_) { for (const auto& source_name : tensor.source_names) { auto pos = path.rfind(source_name); diff --git a/gemma/tensor_index.h b/gemma/tensor_index.h index a1acfd6..dc6b86c 100644 --- a/gemma/tensor_index.h +++ b/gemma/tensor_index.h @@ -68,7 +68,17 @@ class TensorIndex { // or an empty TensorInfo if not found. // NOTE: that the returned TensorInfo is a copy, so that the source // TensorIndex can be destroyed without affecting the returned TensorInfo. - TensorInfo GetTensorInfo(const std::string& path) const; + TensorInfo TensorInfoFromSourcePath(const std::string& path) const; + + // Returns the TensorInfo whose name matches the given name, + // or an empty TensorInfo if not found. + // NOTE: that the returned TensorInfo is a copy, so that the source + // TensorIndex can be destroyed without affecting the returned TensorInfo. + TensorInfo TensorInfoFromName(const std::string& name) const { + const TensorInfo* info = FindName(name); + if (info == nullptr) return TensorInfo(); + return *info; + } // Returns the TensorInfo for the given tensor name, for concise construction // of ModelWeightsPtrs/LayerWeightsPtrs.