mirror of https://github.com/google/gemma.cpp.git
Allow conversion, loading and inference with NUQ.
PiperOrigin-RevId: 723507890
This commit is contained in:
parent 8a6edff319
commit 7ccc6abe87
@@ -110,7 +110,12 @@ class MatPtr : public IFields {
   size_t NumElements() const { return num_elements_; }

   // Returns the number of bytes in the array.
-  size_t SizeBytes() const { return num_elements_ * element_size_; }
+  size_t SizeBytes() const {
+    if (this->GetType() == TypeEnum<NuqStream>()) {
+      return NuqStream::PackedEnd(num_elements_);
+    }
+    return num_elements_ * element_size_;
+  }

   // Returns the number of rows in the 2-d array (outer dimension).
   size_t Rows() const { return rows_; }
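With this change, SizeBytes() no longer assumes one fixed-size element per value: NUQ packs each group of values as a small cluster table plus 4-bit indices, so the byte size comes from NuqStream::PackedEnd(). A hedged usage sketch, borrowing the (name, rows, cols) constructor shape from the SbsWriterImpl hunk further down; the exact MatPtrT API may differ:

// Illustration only, not code from this commit.
MatPtrT<SfpStream> sfp("w_sfp", 1, 256);  // one byte per element
MatPtrT<NuqStream> nuq("w_nuq", 1, 256);  // group-packed 4-bit stream
// sfp.SizeBytes() == 256 * sizeof(SfpStream) == 256 bytes.
// nuq.SizeBytes() == NuqStream::PackedEnd(256), the packed stream size,
// which is smaller because each index occupies only half a byte.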
@@ -159,7 +159,7 @@ struct TestShortLengthsT {
       stats.Notify(raw[i], hwy::ConvertScalarTo<float>(dec[i]));
     }

-    if constexpr (false) {
+    if constexpr (true) {
      fprintf(stderr, "%s %s: %zu: %f %f %f %f\n", TypeName<Packed>(),
              TypeName<T>(), num, stats.SumL1(), stats.GeomeanValueDivL1(),
              stats.WeightedAverageL1(), stats.L1().Max());
@@ -81,8 +81,7 @@ class SbsWriterImpl : public WriterInterface {
   template <typename Packed>
   void AllocateAndCompress(const std::string& name,
                            absl::Span<const float> weights) {
-    const size_t num_packed = CompressedArrayElements<Packed>(weights.size());
-    MatPtrT<Packed> storage(name, 1, num_packed);
+    MatPtrT<Packed> storage(name, 1, weights.size());
     model_memory_.push_back(storage);
     model_memory_.back().Allocate();
     storage.SetPtr(model_memory_.back());
@@ -95,7 +94,12 @@ class SbsWriterImpl : public WriterInterface {
                   const TensorInfo& tensor_info, float scale) {
     MatPtrT<Packed> storage(name, &tensor_info);
     storage.set_scale(scale);
-    storage.SetNumElements(CompressedArrayElements<Packed>(weights.size()));
+
+    // Don't reset num_elements for NUQ.
+    if (!hwy::IsSame<hwy::RemoveCvRef<Packed>, NuqStream>()) {
+      storage.SetNumElements(CompressedArrayElements<Packed>(weights.size()));
+    }
+
     model_memory_.push_back(storage);
     if (mode_ == CompressorMode::kTEST_ONLY) return;
     model_memory_.back().Allocate();
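The guard keeps num_elements at the logical value count for NUQ. For byte-per-element formats, CompressedArrayElements<Packed>() converts the float count into the packed element count; for NUQ, the new MatPtr::SizeBytes() already applies PackedEnd(), so rewriting num_elements here would presumably account for the packing twice. A hypothetical helper expressing that intent (the function name is invented for illustration; only the type names come from the diff):

// Hypothetical sketch, not part of the commit.
template <typename Packed>
size_t StoredElementCount(size_t num_floats) {
  if (hwy::IsSame<hwy::RemoveCvRef<Packed>, NuqStream>()) {
    // Keep the logical count; MatPtr::SizeBytes() derives the packed size
    // via NuqStream::PackedEnd() (see the MatPtr hunk above).
    return num_floats;
  }
  return CompressedArrayElements<Packed>(num_floats);
}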
@@ -39,6 +39,18 @@ class CompressionTest(absltest.TestCase):
         tensor_info,
         1.0,
     )
+
+    tensor_info_nuq = configs.TensorInfo()
+    tensor_info_nuq.name = "fooNUQ"
+    tensor_info_nuq.axes = [0]
+    tensor_info_nuq.shape = [256]
+    writer.insert(
+        "fooNUQ",
+        np.array([0.000375] * 128 + [0.00009] * 128, dtype=np.float32),
+        configs.Type.kNUQ,
+        tensor_info_nuq,
+        1.0,
+    )
     writer.insert_sfp(
         "bar", np.array([0.000375] * 128 + [0.00009] * 128, dtype=np.float32)
     )
@@ -51,7 +63,7 @@ class CompressionTest(absltest.TestCase):
     writer.insert_float(
         "quux", np.array([0.000375] * 128 + [0.00006] * 128, dtype=np.float32)
     )
-    self.assertEqual(writer.debug_num_blobs_added(), 5)
+    self.assertEqual(writer.debug_num_blobs_added(), 6)
     self.assertEqual(writer.write(temp_file.full_path), 0)

@@ -168,7 +168,9 @@ struct NuqStream {
   // Returns number of NuqStream to allocate for the stream, which matches its
   // size in bytes.
   static constexpr size_t PackedEnd(size_t capacity) {
-    return PackedStart(capacity) + hwy::DivCeil(capacity, 2);  // 2x 4-bit/byte
+    const size_t num_groups = hwy::DivCeil(capacity, kGroupSize);
+    return (kClusters * num_groups) +
+           hwy::DivCeil(capacity, 2);  // 2x 4-bit/byte
   }

   uint8_t byte;
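For reference, a stand-alone sketch of the packed-size arithmetic above. kClusters and kGroupSize are NuqStream constants referenced by the hunk; the concrete values below (16 clusters, groups of 256 values) are assumptions for illustration only.

// Stand-alone sketch of the new PackedEnd() formula; constants are assumed.
#include <cstddef>
#include <cstdio>

constexpr size_t kClusters = 16;    // assumed: 4-bit indices -> 16 centers
constexpr size_t kGroupSize = 256;  // assumed elements per NUQ group

constexpr size_t DivCeil(size_t a, size_t b) { return (a + b - 1) / b; }

// One cluster table per group plus two 4-bit indices per byte.
constexpr size_t NuqPackedBytes(size_t capacity) {
  const size_t num_groups = DivCeil(capacity, kGroupSize);
  return kClusters * num_groups + DivCeil(capacity, 2);
}

int main() {
  // 256 values -> 16 table bytes + 128 index bytes = 144 bytes total,
  // versus 256 bytes at one byte per element (e.g. SFP).
  printf("%zu\n", NuqPackedBytes(256));
  return 0;
}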
@@ -23,6 +23,7 @@
 #include <vector>

 #include "compression/blob_store.h"
+#include "compression/compress-inl.h"
 #include "compression/compress.h"
 #include "compression/io.h"  // Path
 #include "compression/shared.h"
@@ -31,6 +32,7 @@
 #include "hwy/aligned_allocator.h"
 #include "hwy/base.h"  // HWY_ABORT
 #include "hwy/contrib/thread_pool/thread_pool.h"
+#include "hwy/highway.h"
 #include "hwy/profiler.h"
 #include "hwy/stats.h"

@@ -255,4 +257,63 @@ void ModelWeightsStorage::CreateForType(Type weight_type,
   }
 }

+template <class Weight>
+void LayerWeightsPtrs<Weight>::Reshape(MatStorage* storage) {
+  if (attn_vec_einsum_w.data() == nullptr) return;
+
+  const size_t model_dim = layer_config.model_dim;
+  const size_t heads = layer_config.heads;
+  const size_t qkv_dim = layer_config.qkv_dim;
+
+  // Reshape [kHeads, kModelDim, kQKVDim] to [kModelDim, kHeads * kQKVDim].
+  if (storage != nullptr) {
+    storage->Allocate();
+    att_weights.SetPtr(*storage);
+  }
+
+  if (hwy::IsSame<Weight, NuqStream>()) {
+    const hwy::HWY_NAMESPACE::ScalableTag<float> df;
+
+    hwy::AlignedFreeUniquePtr<float[]> attn_vec_einsum_w_tmp =
+        hwy::AllocateAligned<float>(model_dim * heads * qkv_dim);
+    hwy::AlignedFreeUniquePtr<float[]> att_weights_tmp =
+        hwy::AllocateAligned<float>(model_dim * heads * qkv_dim);
+
+    HWY_NAMESPACE::DecompressAndZeroPad(
+        df, MakeSpan(attn_vec_einsum_w.data(), model_dim * heads * qkv_dim), 0,
+        attn_vec_einsum_w_tmp.get(), model_dim * heads * qkv_dim);
+
+    for (size_t m = 0; m < model_dim; ++m) {
+      float* HWY_RESTRICT out_row = att_weights_tmp.get() + m * heads * qkv_dim;
+      for (size_t h = 0; h < heads; ++h) {
+        hwy::CopyBytes(
+            attn_vec_einsum_w_tmp.get() + h * model_dim * qkv_dim + m * qkv_dim,
+            out_row + h * qkv_dim, qkv_dim * sizeof(float));
+      }
+    }
+
+    CompressWorkingSet work;
+    hwy::ThreadPool pool(0);
+
+    HWY_NAMESPACE::Compress(
+        att_weights_tmp.get(), model_dim * heads * qkv_dim, work,
+        MakeSpan(att_weights.data(), model_dim * heads * qkv_dim),
+        /*packed_ofs=*/0, pool);
+
+    att_weights.set_scale(attn_vec_einsum_w.scale());
+
+    return;
+  }
+
+  for (size_t m = 0; m < model_dim; ++m) {
+    Weight* HWY_RESTRICT out_row = att_weights.data() + m * heads * qkv_dim;
+    for (size_t h = 0; h < heads; ++h) {
+      hwy::CopyBytes(
+          attn_vec_einsum_w.data() + h * model_dim * qkv_dim + m * qkv_dim,
+          out_row + h * qkv_dim, qkv_dim * sizeof(Weight));
+    }
+  }
+  att_weights.set_scale(attn_vec_einsum_w.scale());
+}
+
 }  // namespace gcpp
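The NUQ branch cannot copy raw bytes per head because NUQ values are grouped and share per-group cluster tables, so the code decompresses to float, transposes, and re-compresses. The transposition itself is the same index mapping as the raw-byte path; a float-only stand-alone sketch of that mapping, with names invented for illustration:

// Illustration of the index mapping performed by Reshape(): a row-major
// [heads, model_dim, qkv_dim] tensor is rearranged into
// [model_dim, heads * qkv_dim]. The real code above decompresses NUQ to
// float first, applies this mapping, then re-compresses the result.
#include <cstddef>
#include <cstring>

void ReshapeAttnVecEinsum(const float* in, float* out, size_t heads,
                          size_t model_dim, size_t qkv_dim) {
  for (size_t m = 0; m < model_dim; ++m) {
    float* out_row = out + m * heads * qkv_dim;
    for (size_t h = 0; h < heads; ++h) {
      // Row m of head h lands in columns [h*qkv_dim, (h+1)*qkv_dim) of out row m.
      std::memcpy(out_row + h * qkv_dim,
                  in + h * model_dim * qkv_dim + m * qkv_dim,
                  qkv_dim * sizeof(float));
    }
  }
}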
@@ -179,31 +179,7 @@ struct LayerWeightsPtrs {
   // Initializes att_weights from attn_vec_einsum_w, hence this must be called
   // after loading weights via ForEachTensor.
   // TODO: update compression/convert_weights to bake this in.
-  void Reshape(MatStorage* storage) {
-    if (attn_vec_einsum_w.data() == nullptr) return;
-
-    const size_t model_dim = layer_config.model_dim;
-    const size_t heads = layer_config.heads;
-    const size_t qkv_dim = layer_config.qkv_dim;
-
-    // TODO: implement a CompressTraits::Copy for NUQ.
-    // static_assert(!hwy::IsSame<Weight, NuqStream>());
-
-    // Reshape [kHeads, kModelDim, kQKVDim] to [kModelDim, kHeads * kQKVDim].
-    if (storage != nullptr) {
-      storage->Allocate();
-      att_weights.SetPtr(*storage);
-    }
-    for (size_t m = 0; m < model_dim; ++m) {
-      Weight* HWY_RESTRICT out_row = att_weights.data() + m * heads * qkv_dim;
-      for (size_t h = 0; h < heads; ++h) {
-        hwy::CopyBytes(
-            attn_vec_einsum_w.data() + h * model_dim * qkv_dim + m * qkv_dim,
-            out_row + h * qkv_dim, qkv_dim * sizeof(Weight));
-      }
-    }
-    att_weights.set_scale(attn_vec_einsum_w.scale());
-  }
+  void Reshape(MatStorage* storage);

   // Used by ForEachTensor for per-layer tensors.
 #define GEMMA_CALL_FUNC(member) \