mirror of https://github.com/google/gemma.cpp.git
Add compression/ comments, especially on SFP range
PiperOrigin-RevId: 642238720
commit a0e808e341
parent c557ad23a8
@@ -22,6 +22,7 @@
 #include <stdio.h>
 
 #include <array>
+#include <cmath>  // lroundf, only if COMPRESS_STATS
 
 #include "compression/blob_store.h"
 #include "compression/compress.h"
@@ -55,6 +56,7 @@ namespace hn = hwy::HWY_NAMESPACE;
 template <typename T>  // primary, must specialize
 struct CompressTraits {};
 
+// Useful for backprop/, where weights are currently f32.
 template <>
 struct CompressTraits<float> {
   using MatT = float;
@@ -267,11 +269,14 @@ struct CompressTraits<hwy::bfloat16_t> {
   }
 };
 
+// Switching floating point: 8-bit, 2..3 mantissa bits.
 template <>
 struct CompressTraits<SfpStream> {
   using MatT = SfpStream;
   static constexpr bool kSupportsEvenOdd = true;
 
+  // Callers are responsible for scaling `in` such that its magnitudes do not
+  // exceed 1.875. See CompressedArray::scale().
   template <class DF, HWY_IF_F32_D(DF)>
   static HWY_INLINE void Compress(DF df, const float* HWY_RESTRICT in,
                                   size_t num, CompressPerThread& tls,
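The new comments make the caller responsible for keeping inputs inside the SFP range. A minimal sketch of what that implies on the caller side, using hypothetical helper names rather than gemma.cpp's actual entry points; only the 1.875 bound and the scale()/set_scale() accessors are taken from this diff:

```cpp
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <vector>

// Hypothetical caller-side preparation before handing `weights` to
// CompressTraits<SfpStream>::Compress(). SFP can only encode magnitudes up to
// 1.875, so larger tensors must be pre-scaled and the factor remembered.
float PrepareForSfp(std::vector<float>& weights) {
  float max_abs = 0.0f;
  for (float w : weights) max_abs = std::max(max_abs, std::abs(w));
  // Scale so that the largest magnitude maps to at most 1.875.
  const float scale = (max_abs > 1.875f) ? (max_abs / 1.875f) : 1.0f;
  const float inv = 1.0f / scale;
  for (float& w : weights) w *= inv;
  return scale;  // Caller would store this via CompressedArray::set_scale().
}

// After decoding (or a fused dot product), multiply by the stored scale to
// restore the original range, mirroring `compressed.scale() * dot_result`.
float Restore(float decoded, float scale) { return decoded * scale; }
```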
@@ -351,6 +356,7 @@ struct CompressTraits<SfpStream> {
   }
 };
 
+// Nonuniform quantization, 4.5 bits per element, two separate streams.
 template <>
 struct CompressTraits<NuqStream> {
   using MatT = NuqStream;
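Where the 4.5 bits per element comes from is not spelled out in the diff. A back-of-the-envelope sketch, assuming groups of 256 values sharing a table of 16 one-byte cluster centers plus a 4-bit index per element (the group size and cluster count are assumptions, not taken from this commit):

```cpp
#include <cstdio>

int main() {
  // Assumed parameters for illustration: each group of 256 elements stores
  // 16 one-byte cluster centers plus one 4-bit index per element.
  constexpr int kGroupSize = 256;
  constexpr int kClusters = 16;
  constexpr double table_bits = kClusters * 8.0 / kGroupSize;  // 0.5 bits/elem
  constexpr double index_bits = 4.0;                           // 4 bits/elem
  printf("bits per element: %.1f\n", table_bits + index_bits);  // prints 4.5
  return 0;
}
```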
@@ -525,12 +531,12 @@ HWY_INLINE float Dot(DF df, const CompressedArray<MatT, kCapacity>& compressed,
   return compressed.scale() * dot_result;
 }
 
-// Callback used by ForeachTensor.
+// Functor called for each tensor, which compresses and stores them along with
+// their scaling factors to BlobStore.
 class Compressor {
  public:
   explicit Compressor(hwy::ThreadPool& pool) : pool_(pool) {}
 
-  // Called for each tensor; compresses it and stores to the cache.
   template <typename MatT, size_t kCapacity>
   void operator()(const char* name, const float* weights,
                   CompressedArray<MatT, kCapacity>& compressed) {
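The hunk above shows Dot applying the tensor's scale once to the accumulated result rather than to every decoded element. A scalar reference sketch of that pattern, where `decode1` stands in for the real per-format decompression and is not a gemma.cpp API:

```cpp
#include <cstddef>

// Hypothetical scalar reference: accumulate the dot product over decoded
// (still pre-scale) elements, then apply the per-tensor scale exactly once.
template <typename Compressed, typename Decode1>
float ScalarDot(const Compressed& compressed, Decode1 decode1,
                const float* vec, size_t num) {
  float sum = 0.0f;
  for (size_t i = 0; i < num; ++i) {
    sum += decode1(compressed, i) * vec[i];
  }
  return compressed.scale() * sum;  // matches compressed.scale() * dot_result
}
```

Deferring the multiply keeps the per-element loop free of an extra operation; the scale is exact to apply at the end because it is a common factor of every term.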
@@ -79,6 +79,9 @@ class CompressedArray {
   MatT* data() { return data_.data(); }
   const MatT* data() const { return data_.data(); }
 
+  // Decoded elements should be multiplied by this to restore their original
+  // range. This is required because SfpStream can only encode a limited range
+  // of magnitudes.
   float scale() const { return scale_[0]; }
   void set_scale(float scale) { scale_[0] = scale; }
 
@@ -90,6 +93,7 @@
 
  private:
   std::array<MatT, NumCompressed()> data_;
+  // Blobs are at least kBlobAlign bytes anyway.
   float scale_[kBlobAlign / sizeof(float)];
 };
 
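Padding the single scale out to kBlobAlign bytes means storing it as its own blob costs nothing extra. A sketch of the size relationship this relies on; the value of kBlobAlign is assumed here for illustration and is defined by the library, not by this snippet:

```cpp
#include <cstddef>

// Assumption for illustration only: some power-of-two blob alignment.
constexpr size_t kBlobAlign = 256;

// One aligned unit's worth of floats; only element [0] carries the scale.
struct ScaleBlob {
  float scale[kBlobAlign / sizeof(float)];
};
static_assert(sizeof(ScaleBlob) == kBlobAlign,
              "padded scale fills exactly one aligned blob unit");
```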
@@ -172,6 +176,8 @@ hwy::uint128_t CacheKey(const char* name) {
   return MakeKey((std::string(1, prefix) + name).c_str());
 }
 
+// Functor called for each tensor, which loads them and their scaling factors
+// from BlobStore.
 class CacheLoader {
  public:
   explicit CacheLoader(const Path& blob_filename) {
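Compressor and CacheLoader share the same shape: a functor whose templated operator() is invoked once per tensor by a ForEachTensor-style traversal. A minimal sketch of that pattern with a hypothetical visitor and stand-in types, not the actual gemma.cpp interfaces:

```cpp
#include <cstddef>
#include <cstdio>

// Hypothetical stand-in for CompressedArray<MatT, kCapacity>.
template <typename MatT, size_t kCapacity>
struct FakeCompressed {
  float scale = 1.0f;
};

// Visitor in the style of Compressor/CacheLoader: one templated call per tensor.
struct PrintVisitor {
  template <typename MatT, size_t kCapacity>
  void operator()(const char* name, const float* weights,
                  FakeCompressed<MatT, kCapacity>& compressed) {
    printf("tensor %s: %zu compressed elements, scale %.3f\n", name, kCapacity,
           compressed.scale);
    (void)weights;
  }
};

int main() {
  // A ForEachTensor-style traversal would call the visitor for every tensor;
  // here we hand-roll two calls for illustration.
  float w[4] = {0.1f, -0.2f, 0.3f, -0.4f};
  FakeCompressed<unsigned char, 4> c_embedding;
  FakeCompressed<unsigned char, 4> c_attn;
  PrintVisitor visitor;
  visitor("embedding", w, c_embedding);
  visitor("attn_w", w, c_attn);
  return 0;
}
```

The real Compressor additionally fans work out to a hwy::ThreadPool and writes results plus scaling factors to BlobStore, which this sketch omits.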
@@ -260,7 +260,8 @@ class SfpCodec {
     hi = hn::BitwiseIfThenElse(k80, sign_in_msb, hn::ShiftRight<1>(biased_e));
   }
 
-  // Encodes `num` bf16 values from `in_bf` to `out_packed`.
+  // Encodes `num` bf16 values from `in_bf` to `out_packed`. Their magnitude
+  // must be at most 1.875.
   template <class DBF, HWY_IF_BF16_D(DBF)>
   static HWY_INLINE void Enc(DBF dbf, const hwy::bfloat16_t* HWY_RESTRICT in_bf,
                              size_t num, SfpStream* HWY_RESTRICT out_packed) {
@@ -288,7 +289,8 @@
     }
   }
 
-  // Encodes `num` f32 values from `in_f` to `packed`.
+  // Encodes `num` f32 values from `in_f` to `packed`. Their magnitude
+  // must be at most 1.875.
   template <class DF, HWY_IF_F32_D(DF)>
   static HWY_INLINE void Enc(DF df, const float* HWY_RESTRICT in_f, size_t num,
                              SfpStream* HWY_RESTRICT out_packed) {
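The repeated 1.875 bound is consistent with the "2..3 mantissa bits" description: with three mantissa bits and a top binade of [1, 2), the largest encodable value is binary 1.111 = 1 + 7/8. A one-line check, stated as an inference from the comments rather than read out of the encoder itself:

```cpp
// Inferred, not taken from the SFP encoder: three mantissa bits in the top
// binade give a largest representable magnitude of 1.111b = 1 + 7/8 = 1.875.
static_assert(1.0 + 7.0 / 8.0 == 1.875, "largest 3-mantissa-bit value below 2");
```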