Allow conversion, loading and inference with NUQ.

PiperOrigin-RevId: 723507890
This commit is contained in:
Phil Culliton 2025-02-05 07:45:18 -08:00 committed by Copybara-Service
parent 8a6edff319
commit 7ccc6abe87
7 changed files with 92 additions and 32 deletions

View File

@ -110,7 +110,12 @@ class MatPtr : public IFields {
size_t NumElements() const { return num_elements_; }
// Returns the number of bytes in the array.
size_t SizeBytes() const {
  // NUQ is a packed stream: per-group tables followed by two 4-bit
  // indices per byte (see NuqStream::PackedEnd), so its byte size is
  // NOT num_elements_ * element_size_.
  if (this->GetType() == TypeEnum<NuqStream>()) {
    return NuqStream::PackedEnd(num_elements_);
  }
  // All other types are fixed-size per element.
  return num_elements_ * element_size_;
}
// Returns the number of rows in the 2-d array (outer dimension).
size_t Rows() const { return rows_; }

View File

@ -159,7 +159,7 @@ struct TestShortLengthsT {
stats.Notify(raw[i], hwy::ConvertScalarTo<float>(dec[i]));
}
if constexpr (false) {
if constexpr (true) {
fprintf(stderr, "%s %s: %zu: %f %f %f %f\n", TypeName<Packed>(),
TypeName<T>(), num, stats.SumL1(), stats.GeomeanValueDivL1(),
stats.WeightedAverageL1(), stats.L1().Max());

View File

@ -81,8 +81,7 @@ class SbsWriterImpl : public WriterInterface {
template <typename Packed>
void AllocateAndCompress(const std::string& name,
absl::Span<const float> weights) {
const size_t num_packed = CompressedArrayElements<Packed>(weights.size());
MatPtrT<Packed> storage(name, 1, num_packed);
MatPtrT<Packed> storage(name, 1, weights.size());
model_memory_.push_back(storage);
model_memory_.back().Allocate();
storage.SetPtr(model_memory_.back());
@ -95,7 +94,12 @@ class SbsWriterImpl : public WriterInterface {
const TensorInfo& tensor_info, float scale) {
MatPtrT<Packed> storage(name, &tensor_info);
storage.set_scale(scale);
storage.SetNumElements(CompressedArrayElements<Packed>(weights.size()));
// Don't reset num_elements for NUQ.
if (!hwy::IsSame<hwy::RemoveCvRef<Packed>, NuqStream>()) {
storage.SetNumElements(CompressedArrayElements<Packed>(weights.size()));
}
model_memory_.push_back(storage);
if (mode_ == CompressorMode::kTEST_ONLY) return;
model_memory_.back().Allocate();

View File

@ -39,6 +39,18 @@ class CompressionTest(absltest.TestCase):
tensor_info,
1.0,
)
tensor_info_nuq = configs.TensorInfo()
tensor_info_nuq.name = "fooNUQ"
tensor_info_nuq.axes = [0]
tensor_info_nuq.shape = [256]
writer.insert(
"fooNUQ",
np.array([0.000375] * 128 + [0.00009] * 128, dtype=np.float32),
configs.Type.kNUQ,
tensor_info_nuq,
1.0,
)
writer.insert_sfp(
"bar", np.array([0.000375] * 128 + [0.00009] * 128, dtype=np.float32)
)
@ -51,7 +63,7 @@ class CompressionTest(absltest.TestCase):
writer.insert_float(
"quux", np.array([0.000375] * 128 + [0.00006] * 128, dtype=np.float32)
)
self.assertEqual(writer.debug_num_blobs_added(), 5)
self.assertEqual(writer.debug_num_blobs_added(), 6)
self.assertEqual(writer.write(temp_file.full_path), 0)

View File

@ -168,7 +168,9 @@ struct NuqStream {
// Returns number of NuqStream to allocate for the stream, which matches its
// size in bytes.
static constexpr size_t PackedEnd(size_t capacity) {
  // Layout: kClusters table bytes per group, then the packed indices
  // at two 4-bit entries per byte.
  const size_t num_groups = hwy::DivCeil(capacity, kGroupSize);
  return (kClusters * num_groups) +
         hwy::DivCeil(capacity, 2);  // 2x 4-bit/byte
}
uint8_t byte;

View File

@ -23,6 +23,7 @@
#include <vector>
#include "compression/blob_store.h"
#include "compression/compress-inl.h"
#include "compression/compress.h"
#include "compression/io.h" // Path
#include "compression/shared.h"
@ -31,6 +32,7 @@
#include "hwy/aligned_allocator.h"
#include "hwy/base.h" // HWY_ABORT
#include "hwy/contrib/thread_pool/thread_pool.h"
#include "hwy/highway.h"
#include "hwy/profiler.h"
#include "hwy/stats.h"
@ -255,4 +257,63 @@ void ModelWeightsStorage::CreateForType(Type weight_type,
}
}
// Rebuilds att_weights as the transpose of attn_vec_einsum_w:
// [heads, model_dim, qkv_dim] -> [model_dim, heads * qkv_dim].
// Must run after weights are loaded; no-op when attn_vec_einsum_w was
// never loaded. If `storage` is non-null it is allocated and becomes the
// backing store for att_weights.
template <class Weight>
void LayerWeightsPtrs<Weight>::Reshape(MatStorage* storage) {
if (attn_vec_einsum_w.data() == nullptr) return;
const size_t model_dim = layer_config.model_dim;
const size_t heads = layer_config.heads;
const size_t qkv_dim = layer_config.qkv_dim;
// Reshape [kHeads, kModelDim, kQKVDim] to [kModelDim, kHeads * kQKVDim].
if (storage != nullptr) {
storage->Allocate();
att_weights.SetPtr(*storage);
}
// NUQ is a packed stream (per-group tables plus 4-bit indices, see
// NuqStream::PackedEnd), so rows cannot be rearranged via byte copies of
// Weight elements. Instead: decompress the whole tensor to float,
// transpose in float space, then re-compress into att_weights.
if (hwy::IsSame<Weight, NuqStream>()) {
const hwy::HWY_NAMESPACE::ScalableTag<float> df;
hwy::AlignedFreeUniquePtr<float[]> attn_vec_einsum_w_tmp =
hwy::AllocateAligned<float>(model_dim * heads * qkv_dim);
hwy::AlignedFreeUniquePtr<float[]> att_weights_tmp =
hwy::AllocateAligned<float>(model_dim * heads * qkv_dim);
HWY_NAMESPACE::DecompressAndZeroPad(
df, MakeSpan(attn_vec_einsum_w.data(), model_dim * heads * qkv_dim), 0,
attn_vec_einsum_w_tmp.get(), model_dim * heads * qkv_dim);
// Gather head h's row m into the contiguous output row for m.
for (size_t m = 0; m < model_dim; ++m) {
float* HWY_RESTRICT out_row = att_weights_tmp.get() + m * heads * qkv_dim;
for (size_t h = 0; h < heads; ++h) {
hwy::CopyBytes(
attn_vec_einsum_w_tmp.get() + h * model_dim * qkv_dim + m * qkv_dim,
out_row + h * qkv_dim, qkv_dim * sizeof(float));
}
}
CompressWorkingSet work;
// NOTE(review): constructs a fresh pool per call — presumably fine since
// this runs once per layer at load time; confirm.
hwy::ThreadPool pool(0);
HWY_NAMESPACE::Compress(
att_weights_tmp.get(), model_dim * heads * qkv_dim, work,
MakeSpan(att_weights.data(), model_dim * heads * qkv_dim),
/*packed_ofs=*/0, pool);
att_weights.set_scale(attn_vec_einsum_w.scale());
return;
}
// Byte-addressable Weight types: transpose directly in packed form.
for (size_t m = 0; m < model_dim; ++m) {
Weight* HWY_RESTRICT out_row = att_weights.data() + m * heads * qkv_dim;
for (size_t h = 0; h < heads; ++h) {
hwy::CopyBytes(
attn_vec_einsum_w.data() + h * model_dim * qkv_dim + m * qkv_dim,
out_row + h * qkv_dim, qkv_dim * sizeof(Weight));
}
}
// Preserve the original tensor's dequantization scale.
att_weights.set_scale(attn_vec_einsum_w.scale());
}
} // namespace gcpp

View File

@ -179,31 +179,7 @@ struct LayerWeightsPtrs {
// Initializes att_weights from attn_vec_einsum_w, hence this must be called
// after loading weights via ForEachTensor.
// TODO: update compression/convert_weights to bake this in.
// Defined out of line so the implementation can use Highway
// compress/decompress helpers (needed for the NUQ path).
void Reshape(MatStorage* storage);
// Used by ForEachTensor for per-layer tensors.
#define GEMMA_CALL_FUNC(member) \