From 7ccc6abe87f2cb135b5b822687a65c881ab18c48 Mon Sep 17 00:00:00 2001
From: Phil Culliton <pculliton@google.com>
Date: Wed, 5 Feb 2025 07:45:18 -0800
Subject: [PATCH] Allow conversion, loading and inference with NUQ.

PiperOrigin-RevId: 723507890
---
 compression/compress.h                     |  7 ++-
 compression/compress_test.cc               |  2 +-
 compression/python/compression_clif_aux.cc | 10 ++--
 compression/python/compression_test.py     | 14 ++++-
 compression/shared.h                       |  4 +-
 gemma/weights.cc                           | 61 ++++++++++++++++++++++
 gemma/weights.h                            | 26 +--------
 7 files changed, 92 insertions(+), 32 deletions(-)

diff --git a/compression/compress.h b/compression/compress.h
index 468c0d1..d875c4b 100644
--- a/compression/compress.h
+++ b/compression/compress.h
@@ -110,7 +110,12 @@ class MatPtr : public IFields {
   size_t NumElements() const { return num_elements_; }
 
   // Returns the number of bytes in the array.
-  size_t SizeBytes() const { return num_elements_ * element_size_; }
+  size_t SizeBytes() const {
+    if (this->GetType() == TypeEnum<NuqStream>()) {
+      return NuqStream::PackedEnd(num_elements_);
+    }
+    return num_elements_ * element_size_;
+  }
 
   // Returns the number of rows in the 2-d array (outer dimension).
   size_t Rows() const { return rows_; }

diff --git a/compression/compress_test.cc b/compression/compress_test.cc
index 0beba1a..13b1982 100644
--- a/compression/compress_test.cc
+++ b/compression/compress_test.cc
@@ -159,7 +159,7 @@ struct TestShortLengthsT {
       stats.Notify(raw[i], hwy::ConvertScalarTo<float>(dec[i]));
     }
 
-    if constexpr (false) {
+    if constexpr (true) {
       fprintf(stderr, "%s %s: %zu: %f %f %f %f\n", TypeName<T>(),
               TypeName<Packed>(), num, stats.SumL1(), stats.GeomeanValueDivL1(),
               stats.WeightedAverageL1(), stats.L1().Max());

diff --git a/compression/python/compression_clif_aux.cc b/compression/python/compression_clif_aux.cc
index b843a93..2705756 100644
--- a/compression/python/compression_clif_aux.cc
+++ b/compression/python/compression_clif_aux.cc
@@ -81,8 +81,7 @@ class SbsWriterImpl : public WriterInterface {
   template <typename Packed>
   void AllocateAndCompress(const std::string& name,
                            absl::Span<const float> weights) {
-    const size_t num_packed = CompressedArrayElements<Packed>(weights.size());
-    MatPtrT<Packed> storage(name, 1, num_packed);
+    MatPtrT<Packed> storage(name, 1, weights.size());
     model_memory_.push_back(storage);
     model_memory_.back().Allocate();
     storage.SetPtr(model_memory_.back());
@@ -95,7 +94,12 @@ class SbsWriterImpl : public WriterInterface {
                  const TensorInfo& tensor_info, float scale) {
     MatPtrT<Packed> storage(name, &tensor_info);
     storage.set_scale(scale);
-    storage.SetNumElements(CompressedArrayElements<Packed>(weights.size()));
+
+    // Don't reset num_elements for NUQ.
+    if (!hwy::IsSame<hwy::RemoveCvRef<Packed>, NuqStream>()) {
+      storage.SetNumElements(CompressedArrayElements<Packed>(weights.size()));
+    }
+
     model_memory_.push_back(storage);
     if (mode_ == CompressorMode::kTEST_ONLY) return;
     model_memory_.back().Allocate();

diff --git a/compression/python/compression_test.py b/compression/python/compression_test.py
index 33cd055..fdf00e3 100644
--- a/compression/python/compression_test.py
+++ b/compression/python/compression_test.py
@@ -39,6 +39,18 @@ class CompressionTest(absltest.TestCase):
         tensor_info,
         1.0,
     )
+
+    tensor_info_nuq = configs.TensorInfo()
+    tensor_info_nuq.name = "fooNUQ"
+    tensor_info_nuq.axes = [0]
+    tensor_info_nuq.shape = [256]
+    writer.insert(
+        "fooNUQ",
+        np.array([0.000375] * 128 + [0.00009] * 128, dtype=np.float32),
+        configs.Type.kNUQ,
+        tensor_info_nuq,
+        1.0,
+    )
     writer.insert_sfp(
         "bar", np.array([0.000375] * 128 + [0.00009] * 128, dtype=np.float32)
     )
@@ -51,7 +63,7 @@ class CompressionTest(absltest.TestCase):
     writer.insert_float(
         "quux", np.array([0.000375] * 128 + [0.00006] * 128, dtype=np.float32)
     )
-    self.assertEqual(writer.debug_num_blobs_added(), 5)
+    self.assertEqual(writer.debug_num_blobs_added(), 6)
 
     self.assertEqual(writer.write(temp_file.full_path), 0)

diff --git a/compression/shared.h b/compression/shared.h
index eb33d48..bccbb32 100644
--- a/compression/shared.h
+++ b/compression/shared.h
@@ -168,7 +168,9 @@ struct NuqStream {
   // Returns number of NuqStream to allocate for the stream, which matches its
   // size in bytes.
   static constexpr size_t PackedEnd(size_t capacity) {
-    return PackedStart(capacity) + hwy::DivCeil(capacity, 2);  // 2x 4-bit/byte
+    const size_t num_groups = hwy::DivCeil(capacity, kGroupSize);
+    return (kClusters * num_groups) +
+           hwy::DivCeil(capacity, 2);  // 2x 4-bit/byte
   }
 
   uint8_t byte;

diff --git a/gemma/weights.cc b/gemma/weights.cc
index 213bb7e..161a65d 100644
--- a/gemma/weights.cc
+++ b/gemma/weights.cc
@@ -23,6 +23,7 @@
 #include <vector>
 
 #include "compression/blob_store.h"
+#include "compression/compress-inl.h"
 #include "compression/compress.h"
 #include "compression/io.h"  // Path
 #include "compression/shared.h"
@@ -31,6 +32,7 @@
 #include "hwy/aligned_allocator.h"
 #include "hwy/base.h"  // HWY_ABORT
 #include "hwy/contrib/thread_pool/thread_pool.h"
+#include "hwy/highway.h"
 #include "hwy/profiler.h"
 #include "hwy/stats.h"
@@ -255,4 +257,63 @@ void ModelWeightsStorage::CreateForType(Type weight_type,
   }
 }
 
+template <typename Weight>
+void LayerWeightsPtrs<Weight>::Reshape(MatStorage* storage) {
+  if (attn_vec_einsum_w.data() == nullptr) return;
+
+  const size_t model_dim = layer_config.model_dim;
+  const size_t heads = layer_config.heads;
+  const size_t qkv_dim = layer_config.qkv_dim;
+
+  // Reshape [kHeads, kModelDim, kQKVDim] to [kModelDim, kHeads * kQKVDim].
+  if (storage != nullptr) {
+    storage->Allocate();
+    att_weights.SetPtr(*storage);
+  }
+
+  if (hwy::IsSame<Weight, NuqStream>()) {
+    const hwy::HWY_NAMESPACE::ScalableTag<float> df;
+
+    hwy::AlignedFreeUniquePtr<float[]> attn_vec_einsum_w_tmp =
+        hwy::AllocateAligned<float>(model_dim * heads * qkv_dim);
+    hwy::AlignedFreeUniquePtr<float[]> att_weights_tmp =
+        hwy::AllocateAligned<float>(model_dim * heads * qkv_dim);
+
+    HWY_NAMESPACE::DecompressAndZeroPad(
+        df, MakeSpan(attn_vec_einsum_w.data(), model_dim * heads * qkv_dim), 0,
+        attn_vec_einsum_w_tmp.get(), model_dim * heads * qkv_dim);
+
+    for (size_t m = 0; m < model_dim; ++m) {
+      float* HWY_RESTRICT out_row =
+          att_weights_tmp.get() + m * heads * qkv_dim;
+      for (size_t h = 0; h < heads; ++h) {
+        hwy::CopyBytes(
+            attn_vec_einsum_w_tmp.get() + h * model_dim * qkv_dim + m * qkv_dim,
+            out_row + h * qkv_dim, qkv_dim * sizeof(float));
+      }
+    }
+
+    CompressWorkingSet work;
+    hwy::ThreadPool pool(0);
+
+    HWY_NAMESPACE::Compress(
+        att_weights_tmp.get(), model_dim * heads * qkv_dim, work,
+        MakeSpan(att_weights.data(), model_dim * heads * qkv_dim),
+        /*packed_ofs=*/0, pool);
+
+    att_weights.set_scale(attn_vec_einsum_w.scale());
+
+    return;
+  }
+
+  for (size_t m = 0; m < model_dim; ++m) {
+    Weight* HWY_RESTRICT out_row = att_weights.data() + m * heads * qkv_dim;
+    for (size_t h = 0; h < heads; ++h) {
+      hwy::CopyBytes(
+          attn_vec_einsum_w.data() + h * model_dim * qkv_dim + m * qkv_dim,
+          out_row + h * qkv_dim, qkv_dim * sizeof(Weight));
+    }
+  }
+  att_weights.set_scale(attn_vec_einsum_w.scale());
+}
+
 }  // namespace gcpp

diff --git a/gemma/weights.h b/gemma/weights.h
index ca32ad3..5ddea0d 100644
--- a/gemma/weights.h
+++ b/gemma/weights.h
@@ -179,31 +179,7 @@ struct LayerWeightsPtrs {
   // Initializes att_weights from attn_vec_einsum_w, hence this must be called
   // after loading weights via ForEachTensor.
   // TODO: update compression/convert_weights to bake this in.
-  void Reshape(MatStorage* storage) {
-    if (attn_vec_einsum_w.data() == nullptr) return;
-
-    const size_t model_dim = layer_config.model_dim;
-    const size_t heads = layer_config.heads;
-    const size_t qkv_dim = layer_config.qkv_dim;
-
-    // TODO: implement a CompressTraits<NuqStream>::Copy for NUQ.
-    // static_assert(!hwy::IsSame<Weight, NuqStream>());
-
-    // Reshape [kHeads, kModelDim, kQKVDim] to [kModelDim, kHeads * kQKVDim].
-    if (storage != nullptr) {
-      storage->Allocate();
-      att_weights.SetPtr(*storage);
-    }
-    for (size_t m = 0; m < model_dim; ++m) {
-      Weight* HWY_RESTRICT out_row = att_weights.data() + m * heads * qkv_dim;
-      for (size_t h = 0; h < heads; ++h) {
-        hwy::CopyBytes(
-            attn_vec_einsum_w.data() + h * model_dim * qkv_dim + m * qkv_dim,
-            out_row + h * qkv_dim, qkv_dim * sizeof(Weight));
-      }
-    }
-    att_weights.set_scale(attn_vec_einsum_w.scale());
-  }
+  void Reshape(MatStorage* storage);
 
   // Used by ForEachTensor for per-layer tensors.
 #define GEMMA_CALL_FUNC(member)                            \