mirror of https://github.com/google/gemma.cpp.git
Allow conversion, loading and inference with NUQ.
PiperOrigin-RevId: 723507890
This commit is contained in:
parent 8a6edff319
commit 7ccc6abe87
@@ -110,7 +110,12 @@ class MatPtr : public IFields {
   size_t NumElements() const { return num_elements_; }

   // Returns the number of bytes in the array.
-  size_t SizeBytes() const { return num_elements_ * element_size_; }
+  size_t SizeBytes() const {
+    if (this->GetType() == TypeEnum<NuqStream>()) {
+      return NuqStream::PackedEnd(num_elements_);
+    }
+    return num_elements_ * element_size_;
+  }

   // Returns the number of rows in the 2-d array (outer dimension).
   size_t Rows() const { return rows_; }
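With this change, SizeBytes() no longer assumes one fixed-size element per value: NUQ packs each group of values as a small cluster table plus 4-bit indices, so the byte size comes from NuqStream::PackedEnd(). A hedged usage sketch, borrowing the (name, rows, cols) constructor shape from the SbsWriterImpl hunk further down; the exact MatPtrT API may differ:

// Illustration only, not code from this commit.
MatPtrT<SfpStream> sfp("w_sfp", 1, 256);  // one byte per element
MatPtrT<NuqStream> nuq("w_nuq", 1, 256);  // group-packed 4-bit stream
// sfp.SizeBytes() == 256 * sizeof(SfpStream) == 256 bytes.
// nuq.SizeBytes() == NuqStream::PackedEnd(256), the packed stream size,
// which is smaller because each index occupies only half a byte.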
@@ -159,7 +159,7 @@ struct TestShortLengthsT {
       stats.Notify(raw[i], hwy::ConvertScalarTo<float>(dec[i]));
     }

-    if constexpr (false) {
+    if constexpr (true) {
      fprintf(stderr, "%s %s: %zu: %f %f %f %f\n", TypeName<Packed>(),
              TypeName<T>(), num, stats.SumL1(), stats.GeomeanValueDivL1(),
              stats.WeightedAverageL1(), stats.L1().Max());
@@ -81,8 +81,7 @@ class SbsWriterImpl : public WriterInterface {
   template <typename Packed>
   void AllocateAndCompress(const std::string& name,
                            absl::Span<const float> weights) {
-    const size_t num_packed = CompressedArrayElements<Packed>(weights.size());
-    MatPtrT<Packed> storage(name, 1, num_packed);
+    MatPtrT<Packed> storage(name, 1, weights.size());
     model_memory_.push_back(storage);
     model_memory_.back().Allocate();
     storage.SetPtr(model_memory_.back());
@@ -95,7 +94,12 @@ class SbsWriterImpl : public WriterInterface {
                   const TensorInfo& tensor_info, float scale) {
     MatPtrT<Packed> storage(name, &tensor_info);
     storage.set_scale(scale);
-    storage.SetNumElements(CompressedArrayElements<Packed>(weights.size()));
+
+    // Don't reset num_elements for NUQ.
+    if (!hwy::IsSame<hwy::RemoveCvRef<Packed>, NuqStream>()) {
+      storage.SetNumElements(CompressedArrayElements<Packed>(weights.size()));
+    }
+
     model_memory_.push_back(storage);
     if (mode_ == CompressorMode::kTEST_ONLY) return;
     model_memory_.back().Allocate();
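The guard keeps num_elements at the logical value count for NUQ. For byte-per-element formats, CompressedArrayElements<Packed>() converts the float count into the packed element count; for NUQ, the new MatPtr::SizeBytes() already applies PackedEnd(), so rewriting num_elements here would presumably account for the packing twice. A hypothetical helper expressing that intent (the function name is invented for illustration; only the type names come from the diff):

// Hypothetical sketch, not part of the commit.
template <typename Packed>
size_t StoredElementCount(size_t num_floats) {
  if (hwy::IsSame<hwy::RemoveCvRef<Packed>, NuqStream>()) {
    // Keep the logical count; MatPtr::SizeBytes() derives the packed size
    // via NuqStream::PackedEnd() (see the MatPtr hunk above).
    return num_floats;
  }
  return CompressedArrayElements<Packed>(num_floats);
}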
@@ -39,6 +39,18 @@ class CompressionTest(absltest.TestCase):
         tensor_info,
         1.0,
     )
+
+    tensor_info_nuq = configs.TensorInfo()
+    tensor_info_nuq.name = "fooNUQ"
+    tensor_info_nuq.axes = [0]
+    tensor_info_nuq.shape = [256]
+    writer.insert(
+        "fooNUQ",
+        np.array([0.000375] * 128 + [0.00009] * 128, dtype=np.float32),
+        configs.Type.kNUQ,
+        tensor_info_nuq,
+        1.0,
+    )
     writer.insert_sfp(
         "bar", np.array([0.000375] * 128 + [0.00009] * 128, dtype=np.float32)
     )
@@ -51,7 +63,7 @@ class CompressionTest(absltest.TestCase):
     writer.insert_float(
         "quux", np.array([0.000375] * 128 + [0.00006] * 128, dtype=np.float32)
     )
-    self.assertEqual(writer.debug_num_blobs_added(), 5)
+    self.assertEqual(writer.debug_num_blobs_added(), 6)
     self.assertEqual(writer.write(temp_file.full_path), 0)

@@ -168,7 +168,9 @@ struct NuqStream {
   // Returns number of NuqStream to allocate for the stream, which matches its
   // size in bytes.
   static constexpr size_t PackedEnd(size_t capacity) {
-    return PackedStart(capacity) + hwy::DivCeil(capacity, 2);  // 2x 4-bit/byte
+    const size_t num_groups = hwy::DivCeil(capacity, kGroupSize);
+    return (kClusters * num_groups) +
+           hwy::DivCeil(capacity, 2);  // 2x 4-bit/byte
   }

   uint8_t byte;
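For reference, a stand-alone sketch of the packed-size arithmetic above. kClusters and kGroupSize are NuqStream constants referenced by the hunk; the concrete values below (16 clusters, groups of 256 values) are assumptions for illustration only.

// Stand-alone sketch of the new PackedEnd() formula; constants are assumed.
#include <cstddef>
#include <cstdio>

constexpr size_t kClusters = 16;    // assumed: 4-bit indices -> 16 centers
constexpr size_t kGroupSize = 256;  // assumed elements per NUQ group

constexpr size_t DivCeil(size_t a, size_t b) { return (a + b - 1) / b; }

// One cluster table per group plus two 4-bit indices per byte.
constexpr size_t NuqPackedBytes(size_t capacity) {
  const size_t num_groups = DivCeil(capacity, kGroupSize);
  return kClusters * num_groups + DivCeil(capacity, 2);
}

int main() {
  // 256 values -> 16 table bytes + 128 index bytes = 144 bytes total,
  // versus 256 bytes at one byte per element (e.g. SFP).
  printf("%zu\n", NuqPackedBytes(256));
  return 0;
}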
@@ -23,6 +23,7 @@
 #include <vector>

 #include "compression/blob_store.h"
+#include "compression/compress-inl.h"
 #include "compression/compress.h"
 #include "compression/io.h"  // Path
 #include "compression/shared.h"
@@ -31,6 +32,7 @@
 #include "hwy/aligned_allocator.h"
 #include "hwy/base.h"  // HWY_ABORT
 #include "hwy/contrib/thread_pool/thread_pool.h"
+#include "hwy/highway.h"
 #include "hwy/profiler.h"
 #include "hwy/stats.h"

@@ -255,4 +257,63 @@ void ModelWeightsStorage::CreateForType(Type weight_type,
   }
 }

+template <class Weight>
+void LayerWeightsPtrs<Weight>::Reshape(MatStorage* storage) {
+  if (attn_vec_einsum_w.data() == nullptr) return;
+
+  const size_t model_dim = layer_config.model_dim;
+  const size_t heads = layer_config.heads;
+  const size_t qkv_dim = layer_config.qkv_dim;
+
+  // Reshape [kHeads, kModelDim, kQKVDim] to [kModelDim, kHeads * kQKVDim].
+  if (storage != nullptr) {
+    storage->Allocate();
+    att_weights.SetPtr(*storage);
+  }
+
+  if (hwy::IsSame<Weight, NuqStream>()) {
+    const hwy::HWY_NAMESPACE::ScalableTag<float> df;
+
+    hwy::AlignedFreeUniquePtr<float[]> attn_vec_einsum_w_tmp =
+        hwy::AllocateAligned<float>(model_dim * heads * qkv_dim);
+    hwy::AlignedFreeUniquePtr<float[]> att_weights_tmp =
+        hwy::AllocateAligned<float>(model_dim * heads * qkv_dim);
+
+    HWY_NAMESPACE::DecompressAndZeroPad(
+        df, MakeSpan(attn_vec_einsum_w.data(), model_dim * heads * qkv_dim), 0,
+        attn_vec_einsum_w_tmp.get(), model_dim * heads * qkv_dim);
+
+    for (size_t m = 0; m < model_dim; ++m) {
+      float* HWY_RESTRICT out_row = att_weights_tmp.get() + m * heads * qkv_dim;
+      for (size_t h = 0; h < heads; ++h) {
+        hwy::CopyBytes(
+            attn_vec_einsum_w_tmp.get() + h * model_dim * qkv_dim + m * qkv_dim,
+            out_row + h * qkv_dim, qkv_dim * sizeof(float));
+      }
+    }
+
+    CompressWorkingSet work;
+    hwy::ThreadPool pool(0);
+
+    HWY_NAMESPACE::Compress(
+        att_weights_tmp.get(), model_dim * heads * qkv_dim, work,
+        MakeSpan(att_weights.data(), model_dim * heads * qkv_dim),
+        /*packed_ofs=*/0, pool);
+
+    att_weights.set_scale(attn_vec_einsum_w.scale());
+
+    return;
+  }
+
+  for (size_t m = 0; m < model_dim; ++m) {
+    Weight* HWY_RESTRICT out_row = att_weights.data() + m * heads * qkv_dim;
+    for (size_t h = 0; h < heads; ++h) {
+      hwy::CopyBytes(
+          attn_vec_einsum_w.data() + h * model_dim * qkv_dim + m * qkv_dim,
+          out_row + h * qkv_dim, qkv_dim * sizeof(Weight));
+    }
+  }
+  att_weights.set_scale(attn_vec_einsum_w.scale());
+}
+
 }  // namespace gcpp
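The NUQ branch cannot copy raw bytes per head because NUQ values are grouped and share per-group cluster tables, so the code decompresses to float, transposes, and re-compresses. The transposition itself is the same index mapping as the raw-byte path; a float-only stand-alone sketch of that mapping, with names invented for illustration:

// Illustration of the index mapping performed by Reshape(): a row-major
// [heads, model_dim, qkv_dim] tensor is rearranged into
// [model_dim, heads * qkv_dim]. The real code above decompresses NUQ to
// float first, applies this mapping, then re-compresses the result.
#include <cstddef>
#include <cstring>

void ReshapeAttnVecEinsum(const float* in, float* out, size_t heads,
                          size_t model_dim, size_t qkv_dim) {
  for (size_t m = 0; m < model_dim; ++m) {
    float* out_row = out + m * heads * qkv_dim;
    for (size_t h = 0; h < heads; ++h) {
      // Row m of head h lands in columns [h*qkv_dim, (h+1)*qkv_dim) of out row m.
      std::memcpy(out_row + h * qkv_dim,
                  in + h * model_dim * qkv_dim + m * qkv_dim,
                  qkv_dim * sizeof(float));
    }
  }
}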
@@ -179,31 +179,7 @@ struct LayerWeightsPtrs {
   // Initializes att_weights from attn_vec_einsum_w, hence this must be called
   // after loading weights via ForEachTensor.
   // TODO: update compression/convert_weights to bake this in.
-  void Reshape(MatStorage* storage) {
-    if (attn_vec_einsum_w.data() == nullptr) return;
-
-    const size_t model_dim = layer_config.model_dim;
-    const size_t heads = layer_config.heads;
-    const size_t qkv_dim = layer_config.qkv_dim;
-
-    // TODO: implement a CompressTraits::Copy for NUQ.
-    // static_assert(!hwy::IsSame<Weight, NuqStream>());
-
-    // Reshape [kHeads, kModelDim, kQKVDim] to [kModelDim, kHeads * kQKVDim].
-    if (storage != nullptr) {
-      storage->Allocate();
-      att_weights.SetPtr(*storage);
-    }
-    for (size_t m = 0; m < model_dim; ++m) {
-      Weight* HWY_RESTRICT out_row = att_weights.data() + m * heads * qkv_dim;
-      for (size_t h = 0; h < heads; ++h) {
-        hwy::CopyBytes(
-            attn_vec_einsum_w.data() + h * model_dim * qkv_dim + m * qkv_dim,
-            out_row + h * qkv_dim, qkv_dim * sizeof(Weight));
-      }
-    }
-    att_weights.set_scale(attn_vec_einsum_w.scale());
-  }
+  void Reshape(MatStorage* storage);

   // Used by ForEachTensor for per-layer tensors.
 #define GEMMA_CALL_FUNC(member) \