mirror of https://github.com/google/gemma.cpp.git
Merge pull request #130 from veluca93:weight-handling
PiperOrigin-RevId: 622405491
This commit is contained in: commit 325ef06cf9
@@ -29,13 +29,13 @@
 // copybara:import_next_line:gemma_cpp
 #include "compression/blob_store.h"
 
+#include <fcntl.h>     // open
 #include <stdint.h>
 #include <stdio.h>     // SEEK_END - unistd isn't enough for IDE.
 #include <sys/stat.h>  // O_RDONLY
-#include <fcntl.h>     // open
 #if HWY_OS_WIN
-#include <io.h>  // read, write, close
 #include <fileapi.h>
+#include <io.h>  // read, write, close
 #else
 #include <unistd.h>  // read, write, close
 #endif
@@ -113,7 +113,8 @@ hwy::uint128_t MakeKey(const char* string) {
   return ret;
 }
 
-static void EnqueueChunkRequests(uint64_t offset, uint64_t size, uint8_t* data,
+namespace {
+void EnqueueChunkRequests(uint64_t offset, uint64_t size, uint8_t* data,
                                  std::vector<BlobIO>& requests) {
   // Split into chunks for load-balancing even if blob sizes vary.
   constexpr size_t kChunkSize = 4 * 1024 * 1024;
@@ -129,7 +130,7 @@ static void EnqueueChunkRequests(uint64_t offset, uint64_t size, uint8_t* data,
     requests.emplace_back(offset + pos, size - pos, data + pos, 0);
   }
 }
-
+}  // namespace
 
 struct IO {
   // Returns size in bytes or 0.
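Note: the 4 MiB chunk size splits each blob into roughly equal IO requests so a thread pool can balance reads even when blob sizes vary widely. A minimal standalone sketch of the chunking arithmetic (hypothetical helper, not the function above):

#include <cstdint>
#include <cstdio>

// Number of IO requests generated for a blob of `size` bytes,
// assuming the 4 MiB chunk size used in the diff.
constexpr uint64_t kChunkSize = 4 * 1024 * 1024;

uint64_t NumChunkRequests(uint64_t size) {
  return (size + kChunkSize - 1) / kChunkSize;  // ceiling division
}

int main() {
  // A 1 GiB blob becomes 256 independent 4 MiB requests.
  printf("%llu\n", (unsigned long long)NumChunkRequests(1ull << 30));  // 256
}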
@@ -197,12 +198,6 @@ static_assert(HWY_IS_LITTLE_ENDIAN, "Assumes little endian");
 class BlobStore {
   static constexpr uint32_t kMagic = 0x0A534253;  // SBS\n
 
-  // Blob offsets on disk and memory addresses are a multiple of this, because
-  // we pad the header and each blob's size. This matches CUDA alignment and the
-  // maximum SVE vector size, and exceeds typical x86 cache line sizes (64 or
-  // 128), which can help performance.
-  static constexpr size_t kAlign = 256;
-
  public:
   // NOT including padding, so that we can also use ZeroFillPadding after
   // copying the header.
@@ -215,13 +210,13 @@ class BlobStore {
   // blobs. Requires num_blobs_ to already be set, typically by reading
   // sizeof(BlobStore) bytes from disk.
   size_t PaddedHeaderSize() const {
-    return hwy::RoundUpTo(HeaderSize(num_blobs_), kAlign);
+    return hwy::RoundUpTo(HeaderSize(num_blobs_), kBlobAlign);
   }
 
   // Returns aligned offset and zero-fills between that and `offset`.
   uint64_t ZeroFillPadding(uint64_t offset) {
     uint8_t* const bytes = reinterpret_cast<uint8_t*>(this);
-    const uint64_t padded = hwy::RoundUpTo(offset, kAlign);
+    const uint64_t padded = hwy::RoundUpTo(offset, kBlobAlign);
     hwy::ZeroBytes(bytes + offset, padded - offset);
     return padded;
   }
@@ -236,7 +231,7 @@ class BlobStore {
     for (size_t i = 0; i < num_blobs_; ++i) {
       const hwy::uint128_t val = keys_[num_blobs_ + i];
       if (val.lo != offset) return __LINE__;
-      offset = ZeroFillPadding(offset + val.hi);
+      offset = hwy::RoundUpTo(offset + val.hi, kBlobAlign);
     }
 
     if (offset != file_size_) return __LINE__;
@@ -253,25 +248,24 @@ class BlobStore {
 
   static std::vector<BlobIO> PrepareWriteRequests(
       const hwy::uint128_t keys[], const hwy::Span<uint8_t> blobs[],
-      size_t num_blobs) {
+      size_t num_blobs, BlobStore* bs) {
     // Sanity check and ensure the cast below is safe.
     HWY_ASSERT(num_blobs < (1ULL << 20));
 
     // Allocate var-length header.
     const size_t header_size = HeaderSize(num_blobs);
-    const size_t padded_header_size = hwy::RoundUpTo(header_size, kAlign);
-    BlobStorePtr bs = Allocate(padded_header_size);
+    const size_t padded_header_size = hwy::RoundUpTo(header_size, kBlobAlign);
     const uint64_t padded_header_end = bs->ZeroFillPadding(header_size);
     HWY_ASSERT(padded_header_end == padded_header_size);
 
     // All-zero buffer used to write padding to the file without copying the
     // input blobs.
-    static uint8_t zeros[kAlign] = {0};
+    static uint8_t zeros[kBlobAlign] = {0};
 
     // Total file size will be the header plus all padded blobs.
     uint64_t payload = 0;
     for (size_t i = 0; i < num_blobs; ++i) {
-      payload += hwy::RoundUpTo(blobs[i].size(), kAlign);
+      payload += hwy::RoundUpTo(blobs[i].size(), kBlobAlign);
     }
     const size_t total_size = padded_header_size + payload;
 
@@ -285,7 +279,7 @@ class BlobStore {
     std::vector<BlobIO> requests;
     requests.reserve(1 + 2 * num_blobs);
     requests.emplace_back(/*offset=*/0, padded_header_size,
-                          reinterpret_cast<uint8_t*>(bs.get()), 0);
+                          reinterpret_cast<uint8_t*>(bs), 0);
 
     // Fill second half of keys_ with offset/size and prepare IO requests.
     uint64_t offset = padded_header_end;
@@ -295,10 +289,10 @@ class BlobStore {
 
       EnqueueChunkRequests(offset, blobs[i].size(), blobs[i].data(), requests);
       offset += blobs[i].size();
-      const size_t padded_size = hwy::RoundUpTo(blobs[i].size(), kAlign);
+      const size_t padded_size = hwy::RoundUpTo(blobs[i].size(), kBlobAlign);
       if (padded_size != blobs[i].size()) {
         const size_t padding = padded_size - blobs[i].size();
-        HWY_ASSERT(padding <= kAlign);
+        HWY_ASSERT(padding <= kBlobAlign);
         requests.emplace_back(offset, padding, zeros, 0);
         offset += padding;
       }
@@ -418,8 +412,11 @@ BlobError BlobWriter::WriteAll(hwy::ThreadPool& pool,
   HWY_ASSERT(keys_.size() == blobs_.size());
 
   // Concatenate blobs in memory.
+  const size_t header_size = BlobStore::HeaderSize(keys_.size());
+  const size_t padded_header_size = hwy::RoundUpTo(header_size, kBlobAlign);
+  BlobStorePtr bs = BlobStore::Allocate(padded_header_size);
   std::vector<BlobIO> requests = BlobStore::PrepareWriteRequests(
-      keys_.data(), blobs_.data(), keys_.size());
+      keys_.data(), blobs_.data(), keys_.size(), bs.get());
 
   // Create/replace existing file.
 #if HWY_OS_WIN
@@ -40,6 +40,12 @@ using BlobStorePtr = hwy::AlignedFreeUniquePtr<BlobStore>;
 // 0 if successful, otherwise the line number of the failing check.
 using BlobError = int;
 
+// Blob offsets on disk and memory addresses are a multiple of this, because
+// we pad the header and each blob's size. This matches CUDA alignment and the
+// maximum SVE vector size, and exceeds typical x86 cache line sizes (64 or
+// 128), which can help performance.
+static constexpr size_t kBlobAlign = 256;
+
 struct BlobIO {
   BlobIO(uint64_t offset, size_t size, void* data, uint64_t padding)
       : offset(offset), size(size), data(data), padding(padding) {}
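Note: moving the alignment constant into the header (renamed kAlign -> kBlobAlign) lets the blob reader/writer and CompressedArray share it. A minimal sketch of the padding arithmetic, equivalent to hwy::RoundUpTo for this power-of-two alignment (hypothetical standalone helper):

#include <cstddef>
#include <cstdint>

constexpr size_t kBlobAlign = 256;

// Round `offset` up to the next multiple of kBlobAlign.
constexpr uint64_t RoundUp(uint64_t offset) {
  return (offset + kBlobAlign - 1) & ~uint64_t{kBlobAlign - 1};
}

static_assert(RoundUp(0) == 0);
static_assert(RoundUp(1) == 256);    // 255 padding bytes follow the blob
static_assert(RoundUp(256) == 256);  // already aligned: no padding
static_assert(RoundUp(1000) == 1024);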
@@ -381,13 +381,14 @@ HWY_INLINE void Compress(const std::array<float, kCapacity>& in,
 }
 
 // Decompresses `num` values from `compressed` starting at `compressed_ofs`.
-template <typename MatT, size_t kCapacity, typename OutT>
-HWY_NOINLINE void Decompress(const CompressedArray<MatT, kCapacity>& compressed,
-                             size_t compressed_ofs, OutT* out, size_t num) {
-  HWY_DASSERT(compressed_ofs + num <= compressed.NumElements());
+template <typename ArrayT, typename OutT>
+HWY_NOINLINE void Decompress(const ArrayT& compressed, size_t compressed_ofs,
+                             OutT* out, size_t num) {
+  HWY_DASSERT(compressed_ofs + num <= compressed.size());
   const hn::ScalableTag<OutT> d;
-  using Traits = CompressTraits<MatT>;
-  Traits::Decompress(d, kCapacity, compressed.data(), compressed_ofs, out, num);
+  using Traits = CompressTraits<typename ArrayT::value_type>;
+  Traits::Decompress(d, compressed.size(), compressed.data(), compressed_ofs,
+                     out, num);
 }
 
 // As above, but with threading and benchmarking.
@@ -395,7 +396,7 @@ template <typename MatT, size_t kCapacity, typename OutT>
 HWY_INLINE void Decompress(const CompressedArray<MatT, kCapacity>& compressed,
                            size_t compressed_ofs, OutT* out, size_t num,
                            hwy::ThreadPool& pool) {
-  HWY_DASSERT(compressed_ofs + num <= compressed.NumElements());
+  HWY_DASSERT(compressed_ofs + num <= compressed.size());
   const double t0 = hwy::platform::Now();
 
   using Traits = CompressTraits<MatT>;
@@ -407,7 +408,7 @@ HWY_INLINE void Decompress(const CompressedArray<MatT, kCapacity>& compressed,
 
         const size_t ofs = idx_batch * kBatch;
         const size_t num = idx_batch == num_batches - 1 ? (num - ofs) : kBatch;
-        Traits::Decompress(d, compressed.NumElements(), compressed.data(),
+        Traits::Decompress(d, compressed.size(), compressed.data(),
                            compressed_ofs + ofs, out + ofs, num);
       });
 
@@ -417,16 +418,28 @@ HWY_INLINE void Decompress(const CompressedArray<MatT, kCapacity>& compressed,
   fprintf(stderr, "Decompress %.1f MB/s\n", mbps);
 }
 
+// Returns dot product with `vec_aligned` of length `num`.
+template <class DF, typename ArrayT, typename VecT>
+HWY_INLINE float Dot(DF df, const ArrayT& compressed, size_t compressed_ofs,
+                     const VecT* vec_aligned, size_t num) {
+  HWY_DASSERT(compressed_ofs + num <= compressed.size());
+  HWY_DASSERT(hn::IsAligned(df, vec_aligned));
+  using Traits = CompressTraits<typename ArrayT::value_type>;
+  return Traits::Dot(df, compressed.size(), compressed.data(), compressed_ofs,
+                     vec_aligned, num);
+}
+
 // Returns dot product with `vec_aligned` of length `num`.
 template <class DF, typename MatT, size_t kCapacity, typename VecT>
 HWY_INLINE float Dot(DF df, const CompressedArray<MatT, kCapacity>& compressed,
                      size_t compressed_ofs, const VecT* vec_aligned,
                      size_t num) {
-  HWY_DASSERT(compressed_ofs + num <= compressed.NumElements());
+  HWY_DASSERT(compressed_ofs + num <= compressed.size());
   HWY_DASSERT(hn::IsAligned(df, vec_aligned));
   using Traits = CompressTraits<MatT>;
-  return Traits::Dot(df, kCapacity, compressed.data(), compressed_ofs,
-                     vec_aligned, num);
+  return (compressed.scale() * Traits::Dot(df, compressed.size(),
+                                           compressed.data(), compressed_ofs,
+                                           vec_aligned, num));
 }
 
 // Callback used by ForeachTensor.
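Note: the CompressedArray overload now folds the per-tensor scale into the dot product. Tensors whose values exceeded the representable range were divided by `scale` before compression, so multiplying the result by scale() recovers the original magnitudes: with stored weights w' = w / s, dot(w, x) = s * dot(w', x). A tiny scalar sketch (ignoring SIMD and the compressed encoding):

#include <cstddef>

// Scalar model of the scaled dot product: `stored` holds w/s.
float ScaledDot(const float* stored, float s, const float* x, size_t n) {
  float sum = 0.0f;
  for (size_t i = 0; i < n; ++i) sum += stored[i] * x[i];
  return s * sum;  // undo the pre-compression division by s
}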
@@ -445,6 +458,12 @@ class Compressor {
                 compressed.CompressedSize());
   }
 
+  void AddScales(float* scales, size_t len) {
+    if (len) {
+      writer_.Add(CacheKey<float>("scales"), scales, len * sizeof(scales[0]));
+    }
+  }
+
   void WriteAll(hwy::ThreadPool& pool, const char* blob_filename) {
     const BlobError err = writer_.WriteAll(pool, blob_filename);
     if (err != 0) {
@@ -71,10 +71,15 @@ class CompressedArray {
   }
 
  public:
+  using value_type = MatT;
+
   MatT* data() { return data_.data(); }
   const MatT* data() const { return data_.data(); }
 
-  constexpr size_t NumElements() const { return kCapacity; }
+  float scale() const { return scale_[0]; }
+  void set_scale(float scale) { scale_[0] = scale; }
+
+  constexpr size_t size() const { return kCapacity; }
 
   constexpr size_t CompressedSize() const {
     return NumCompressed() * sizeof(MatT);
@@ -82,6 +87,7 @@ class CompressedArray {
 
  private:
   std::array<MatT, NumCompressed()> data_;
+  float scale_[kBlobAlign / sizeof(float)];
 };
 
 #if COMPRESS_STATS
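Note: scale_ stores a single float but is declared as an array of kBlobAlign / sizeof(float) elements, presumably so that sizeof(CompressedArray) stays a multiple of the 256-byte blob alignment and members following a CompressedArray in an enclosing struct remain aligned. A sketch of the invariant this buys (hypothetical type, assuming kBlobAlign = 256):

#include <array>
#include <cstddef>

constexpr size_t kBlobAlign = 256;

template <typename MatT, size_t kCapacity>
struct PaddedArray {
  std::array<MatT, kCapacity> data_;
  float scale_[kBlobAlign / sizeof(float)];  // 64 floats = 256 bytes
};

// When the payload is a multiple of 256 bytes, the trailing pad keeps
// the next member of an enclosing struct blob-aligned as well.
static_assert(sizeof(PaddedArray<float, 1024>) % kBlobAlign == 0);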
@@ -187,11 +193,21 @@ class CacheLoader {
 
     err_ = reader_.Enqueue(CacheKey<MatT>(name), compressed.data(),
                            compressed.CompressedSize());
+    compressed.set_scale(1.0f);
     if (err_ != 0) {
       fprintf(stderr, "Failed to read cache %s (error %d)\n", name, err_);
     }
   }
 
+  void LoadScales(float* scales, size_t len) {
+    if (0 != reader_.Enqueue(CacheKey<float>("scales"), scales,
+                             len * sizeof(scales[0]))) {
+      for (size_t i = 0; i < len; ++i) {
+        scales[i] = 1.0f;
+      }
+    }
+  }
+
   // Returns whether all tensors are successfully loaded from cache.
   bool ReadAll(hwy::ThreadPool& pool) {
     // reader_ invalid or any Enqueue failed
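Note: both paths default to a neutral scale of 1.0f, so weight files written before this change (which contain no "scales" blob) keep working: every tensor is then treated as unscaled, and the multiplication by scale() inside Dot becomes a no-op.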
configs.h (14 changes)
@@ -30,6 +30,16 @@
 
 #include <stddef.h>
 
+// copybara:import_next_line:gemma_cpp
+#include "compression/sfp.h"
+#include "hwy/base.h"  // hwy::bfloat16_t
+
+// Allowable types for GEMMA_WEIGHT_T (can be specified at compilation time):
+// float, hwy::bfloat16_t, SfpStream, NuqStream
+#ifndef GEMMA_WEIGHT_T
+#define GEMMA_WEIGHT_T SfpStream
+#endif  // !GEMMA_WEIGHT_T
+
 namespace gcpp {
 
 static constexpr size_t kSeqLen = GEMMA_MAX_SEQLEN;
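Note: the weight-type macro moves from gemma.h into configs.h so each model config can carry its own WeightT alias. Overriding it at build time might look like the following (a sketch; the exact flag spelling depends on your build system):

// Compile with bfloat16 weights instead of the SfpStream default, e.g.:
//   c++ ... -DGEMMA_WEIGHT_T=hwy::bfloat16_t ...
#ifndef GEMMA_WEIGHT_T
#define GEMMA_WEIGHT_T SfpStream
#endif

struct ConfigExample {             // hypothetical config
  using WeightT = GEMMA_WEIGHT_T;  // picked up by CompressedLayer
};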
@@ -45,6 +55,8 @@ struct ConfigGemma7B {
   static constexpr int kKVHeads = 16;  // standard MHA
   static constexpr int kQKVDim = 256;  // query size == key size == value size
   static constexpr int kTopK = gcpp::kTopK;
+  static constexpr int kNumTensorScales = 0;
+  using WeightT = GEMMA_WEIGHT_T;
 };
 
 struct ConfigGemma2B {
@@ -57,6 +69,8 @@ struct ConfigGemma2B {
   static constexpr int kKVHeads = 1;
   static constexpr int kQKVDim = 256;  // query size == key size == value size
   static constexpr int kTopK = gcpp::kTopK;
+  static constexpr int kNumTensorScales = 0;
+  using WeightT = GEMMA_WEIGHT_T;
 };
 
 }  // namespace gcpp
@@ -19,9 +19,9 @@
 #include "gemma.h"
 // copybara:import_next_line:gemma_cpp
 #include "util/app.h"  // LoaderArgs
+#include "hwy/contrib/thread_pool/thread_pool.h"
 // copybara:import_next_line:gemma_cpp
 #include "util/args.h"
-#include "hwy/contrib/thread_pool/thread_pool.h"
 
 std::vector<int> tokenize(
     const std::string& prompt_string,
@@ -43,8 +43,7 @@ int main(int argc, char** argv) {
   hwy::ThreadPool pool(num_threads);
 
   // Instantiate model and KV Cache
-  gcpp::Gemma model(loader.tokenizer, loader.compressed_weights,
-                    loader.ModelType(), pool);
+  gcpp::Gemma model(loader.tokenizer, loader.weights, loader.ModelType(), pool);
   auto kv_cache = CreateKVCache(loader.ModelType());
   size_t pos = 0;  // KV Cache position
 
gemma.cc (465 changes)
@@ -25,12 +25,12 @@
 #include "compression/compress-inl.h"
 // copybara:import_next_line:gemma_cpp
 #include "ops.h"
-// copybara:import_next_line:gemma_cpp
-#include "util/args.h"  // Path
 #include "hwy/contrib/matvec/matvec-inl.h"
 #include "hwy/highway.h"
 #include "hwy/profiler.h"
 #include "hwy/timer.h"
+// copybara:import_next_line:gemma_cpp
+#include "util/args.h"  // Path
 
 // Non-SIMD includes and types. Note that HWY_ONCE is only true on the last
 // compile pass, whereas we want this defined in the first.
@@ -64,6 +64,12 @@
 // copybara:import_next_line:sentencepiece
 #include "src/sentencepiece_processor.h"
 
+// Setting this to true disables fread() calls that read the model file.
+constexpr bool kDryRunFread = false;
+
+// Setting this to false will load and use uncompressed weights.
+constexpr bool kWeightsAreCompressed = true;
+
 namespace gcpp {
 
 template <class TConfig>
@@ -88,70 +94,145 @@ struct Layer {
   std::array<float, kModelDim> pre_ffw_norm_scale;
 };
 
+float ScaleWeights(float* data, size_t len) {
+  float maxabs = 0.0;
+  for (size_t i = 0; i < len; ++i) {
+    maxabs = std::max(maxabs, std::abs(data[i]));
+  }
+  const float kMaxRange = 1.875f;
+  if (maxabs <= kMaxRange) {
+    return 1.0f;
+  }
+  const float scale = maxabs / kMaxRange;
+  const float inv_scale = 1.0f / scale;
+  for (size_t i = 0; i < len; ++i) {
+    data[i] *= inv_scale;
+  }
+  return scale;
+}
+
+// Array instead of single large allocation for parallel mem init. Split out of
+// Weights so that only these pointers are initialized.
+template <class TConfig>
+struct LayerPointers {
+  explicit LayerPointers(hwy::ThreadPool& pool) {
+    pool.Run(0, TConfig::kLayers, [this](uint64_t task, size_t /*thread*/) {
+      this->layers[task] = hwy::AllocateAligned<Layer<TConfig>>(1);
+    });
+  }
+
+  using TLayer = Layer<TConfig>;
+  std::array<hwy::AlignedFreeUniquePtr<TLayer[]>, TConfig::kLayers> layers;
+};
+
 template <class TConfig>
 struct Weights {
-  Weights() = default;
-
-  hwy::AlignedUniquePtr<Layer<TConfig>[]> layers;  // kLayers
+  // No ctor/dtor, allocated via AllocateAligned.
 
   std::array<float, TConfig::kVocabSize * TConfig::kModelDim>
       embedder_input_embedding;
 
   std::array<float, TConfig::kModelDim> final_norm_scale;
+
+  LayerPointers<TConfig> layer_ptrs;
+
+  std::array<float, TConfig::kNumTensorScales> scales;
+
+  const Layer<TConfig>* GetLayer(size_t layer) const {
+    return layer_ptrs.layers[layer].get();
+  }
+  Layer<TConfig>* GetLayer(size_t layer) {
+    return layer_ptrs.layers[layer].get();
+  }
 };
 
-// Only called if cached loading fails.
 template <typename TConfig>
-hwy::AlignedUniquePtr<Weights<TConfig>> LoadWeights(const Path& checkpoint) {
+hwy::AlignedFreeUniquePtr<uint8_t[]> LoadWeights(
+    const Path& checkpoint, hwy::ThreadPool& pool,
+    bool scale_for_compression = false) {
   PROFILER_ZONE("Startup.LoadWeights");
-  using TWeights = Weights<TConfig>;
-  hwy::AlignedUniquePtr<TWeights> weights = hwy::MakeUniqueAligned<TWeights>();
-  weights->layers =
-      hwy::MakeUniqueAlignedArray<Layer<TConfig>>(TConfig::kLayers);
-
-  if (checkpoint.path.empty()) {
-    HWY_ABORT(
-        "Loading --compressed_weights failed; we require a --weights argument. "
-        "Please see issue #11 on how to create this file.\n");
+  if (!std::filesystem::exists(checkpoint.path)) {
+    HWY_ABORT("The model weights file '%s' does not exist.",
+              checkpoint.path.c_str());
   }
 
+  using TWeights = Weights<TConfig>;
+  hwy::AlignedFreeUniquePtr<uint8_t[]> weights_u8 =
+      hwy::AllocateAligned<uint8_t>(sizeof(TWeights));
+  TWeights* weights = reinterpret_cast<TWeights*>(weights_u8.get());
+  new (&weights->layer_ptrs) LayerPointers<TConfig>(pool);
+
+  size_t scale_pos = 0;
+  FILE* fptr;
+  if constexpr (kDryRunFread) {
+    fprintf(stderr, "Dry-Run, not reading model-file.\n");
+  } else {
+    fptr = fopen(checkpoint.path.c_str(), "rb");
+    if (fptr == nullptr) {
+      HWY_ABORT("Failed to open model file %s - does it exist?",
+                checkpoint.path.c_str());
+    }
+  }
   bool ok = true;
   uint64_t total_size = 0;
-  ok &= 1 == fread(&(weights->embedder_input_embedding),
-                   sizeof(weights->embedder_input_embedding), 1, fptr);
-  ok &= 1 == fread(&(weights->final_norm_scale),
-                   sizeof(weights->final_norm_scale), 1, fptr);
-  total_size += sizeof(weights->embedder_input_embedding) +
-                sizeof(weights->final_norm_scale);
+  auto do_fread = [&](void* var, int layer, const char* name, size_t size) {
+    if (layer == -1) {
+      fprintf(stderr, "Loading Parameters (size %zu): %s\n", size, name);
+    } else {
+      fprintf(stderr, "Loading Parameters (layer=%d, size %zu): %s\n", layer,
+              size, name);
+    }
+    if constexpr (!kDryRunFread) {
+      ok &= 1 == fread(var, size, 1, fptr);
+      total_size += size;
+    }
+  };
+  do_fread(&(weights->embedder_input_embedding), -1, "embedder_input_embedding",
+           sizeof(weights->embedder_input_embedding));
+  do_fread(&(weights->final_norm_scale), -1, "final_norm_scale",
+           sizeof(weights->final_norm_scale));
   for (size_t layer = 0; layer < TConfig::kLayers; ++layer) {
-    Layer<TConfig>* layer_view = &weights->layers[layer];
-    ok &= 1 == fread(&layer_view->attn_vec_einsum_w,
-                     sizeof(layer_view->attn_vec_einsum_w), 1, fptr);
-    ok &= 1 == fread(&layer_view->qkv_einsum_w,
-                     sizeof(layer_view->qkv_einsum_w), 1, fptr);
-    ok &= 1 == fread(&layer_view->gating_einsum_w,
-                     sizeof(layer_view->gating_einsum_w), 1, fptr);
-    ok &= 1 ==
-          fread(&layer_view->linear_w, sizeof(layer_view->linear_w), 1, fptr);
-    ok &= 1 == fread(&layer_view->pre_attention_norm_scale,
-                     sizeof(layer_view->pre_attention_norm_scale), 1, fptr);
-    ok &= 1 == fread(&layer_view->pre_ffw_norm_scale,
-                     sizeof(layer_view->pre_ffw_norm_scale), 1, fptr);
-    total_size += sizeof(*layer_view);
+    Layer<TConfig>* layer_view = weights->GetLayer(layer);
+
+#define READ_WEIGHTS(name)                                                 \
+  do {                                                                     \
+    do_fread(&(layer_view->name), layer, #name, sizeof(layer_view->name)); \
+  } while (0)
+
+#define SCALE_WEIGHTS(name)                                               \
+  do {                                                                    \
+    if (ok && !kDryRunFread && scale_for_compression) {                   \
+      weights->scales[scale_pos++] =                                      \
+          ScaleWeights(layer_view->name.data(), layer_view->name.size()); \
+    }                                                                     \
+  } while (0)
+
+    // Make sure we don't have uninitialized memory.
+    hwy::ZeroBytes(layer_view, sizeof(*layer_view));
+    READ_WEIGHTS(attn_vec_einsum_w);
+    READ_WEIGHTS(qkv_einsum_w);
+    SCALE_WEIGHTS(attn_vec_einsum_w);
+    SCALE_WEIGHTS(qkv_einsum_w);
+    READ_WEIGHTS(gating_einsum_w);
+    READ_WEIGHTS(linear_w);
+    SCALE_WEIGHTS(gating_einsum_w);
+    SCALE_WEIGHTS(linear_w);
+    READ_WEIGHTS(pre_attention_norm_scale);
+    READ_WEIGHTS(pre_ffw_norm_scale);
+#undef READ_WEIGHTS
   }
   if (!ok) {
-    HWY_ABORT("Failed to read from %s - might be a directory, or too small? "
-              "expected size: %d kB", checkpoint.path.c_str(),
-              static_cast<uint32_t>(total_size >> 10));
+    HWY_ABORT(
+        "Failed to read from %s - might be a directory, or too small? "
+        "expected size: %d kB",
+        checkpoint.path.c_str(), static_cast<uint32_t>(total_size >> 10));
   }
-  HWY_ASSERT(0 == fclose(fptr));
-  return weights;
+  if (!kDryRunFread) {
+    HWY_ASSERT(0 == fclose(fptr));
+    if (scale_for_compression) {
+      HWY_ASSERT(scale_pos == TConfig::kNumTensorScales);
+    }
+  }
+  return weights_u8;
 }
 
 template <class TConfig>
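Note: ScaleWeights prepares float tensors for SfpStream compression, whose representable magnitude tops out at 1.875. If a tensor's max-abs value exceeds that range, every element is divided by scale = maxabs / 1.875 before compression, and the scale is recorded so Dot can multiply it back in. Worked example: maxabs = 7.5 gives scale = 4.0, so a weight of 7.5 is stored as 1.875 and recovered at inference as 1.875 * 4.0 = 7.5.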
@@ -159,18 +240,19 @@ struct CompressedLayer {
   // No ctor/dtor, allocated via AllocateAligned.
 
   using TLayer = gcpp::Layer<TConfig>;
+  using WeightT = typename TConfig::WeightT;
 
   static constexpr size_t kModelDim = TConfig::kModelDim;
   static constexpr size_t kFFHiddenDim = TConfig::kFFHiddenDim;
 
   // Compressed Parameters
   // We don't yet have an RMSNorm that accepts all WeightT.
-  CompressedArray<hwy::bfloat16_t, kModelDim> c_pre_attention_norm_scale;
-  CompressedArray<hwy::bfloat16_t, kModelDim> c_pre_ffw_norm_scale;
-  CompressedArray<WeightT, TLayer::kGatingEinsumWSize> c_gating_einsum_w;
-  CompressedArray<WeightT, kModelDim * kFFHiddenDim> c_linear_w;
-  CompressedArray<WeightT, TLayer::kQKVEinsumWSize> c_qkv_einsum_w;
-  CompressedArray<WeightT, TLayer::kAttVecEinsumWSize> c_attn_vec_einsum_w;
+  CompressedArray<hwy::bfloat16_t, kModelDim> pre_attention_norm_scale;
+  CompressedArray<hwy::bfloat16_t, kModelDim> pre_ffw_norm_scale;
+  CompressedArray<WeightT, TLayer::kGatingEinsumWSize> gating_einsum_w;
+  CompressedArray<WeightT, kModelDim * kFFHiddenDim> linear_w;
+  CompressedArray<WeightT, TLayer::kQKVEinsumWSize> qkv_einsum_w;
+  CompressedArray<WeightT, TLayer::kAttVecEinsumWSize> attn_vec_einsum_w;
 };
 
 // Array instead of single large allocation for parallel mem init. Split out of
@@ -193,21 +275,25 @@ struct CompressedWeights {
   // No ctor/dtor, allocated via AllocateAligned.
 
   CompressedArray<EmbedderInputT, TConfig::kVocabSize * TConfig::kModelDim>
-      c_embedder_input_embedding;
+      embedder_input_embedding;
 
-  CompressedArray<hwy::bfloat16_t, TConfig::kModelDim> c_final_norm_scale;
+  CompressedArray<hwy::bfloat16_t, TConfig::kModelDim> final_norm_scale;
 
   // Must be last so that the other arrays remain aligned.
   CompressedLayerPointers<TConfig> c_layer_ptrs;
 
-  const CompressedLayer<TConfig>* CLayer(size_t layer) const {
+  const CompressedLayer<TConfig>* GetLayer(size_t layer) const {
     return c_layer_ptrs.c_layers[layer].get();
   }
-  CompressedLayer<TConfig>* CLayer(size_t layer) {
+  CompressedLayer<TConfig>* GetLayer(size_t layer) {
     return c_layer_ptrs.c_layers[layer].get();
   }
 };
 
+template <class TConfig>
+using WeightsT = hwy::If<kWeightsAreCompressed, CompressedWeights<TConfig>,
+                         Weights<TConfig>>;
+
 // Aligned.
 template <class TConfig, size_t TBatchSize>
 struct Activations {
@@ -272,16 +358,27 @@ KVCache CreateKVCache(Model type) {
   }
 }
 
+namespace {
+template <class Config>
+void DeleteLayersPtrs(CompressedWeights<Config>* c_weights) {
+  c_weights->c_layer_ptrs.~CompressedLayerPointers<Config>();
+}
+template <class Config>
+void DeleteLayersPtrs(Weights<Config>* weights) {
+  weights->layer_ptrs.~LayerPointers<Config>();
+}
+}  // namespace
+
 template <class Config>
 struct GemmaImpl : public GemmaInterface {
   GemmaImpl(std::unique_ptr<sentencepiece::SentencePieceProcessor>& tokenizer,
-            hwy::AlignedFreeUniquePtr<uint8_t[]>& compressed_weights,
+            hwy::AlignedFreeUniquePtr<uint8_t[]>& weights_u8,
             hwy::ThreadPool& pool);
 
   ~GemmaImpl() {
-    using CWeights = CompressedWeights<Config>;
-    CWeights* c_weights = reinterpret_cast<CWeights*>(compressed_weights.get());
-    c_weights->c_layer_ptrs.~CompressedLayerPointers<Config>();
+    WeightsT<Config>* weights =
+        reinterpret_cast<WeightsT<Config>*>(weights_u8.get());
+    DeleteLayersPtrs(weights);
   }
 
   const sentencepiece::SentencePieceProcessor* Tokenizer() const override {
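Note: the weights live in a raw uint8_t allocation so one type-erased pointer can hold either Weights&lt;Config&gt; or CompressedWeights&lt;Config&gt;; only the layer-pointer member is constructed (placement new in LoadWeights) and later destroyed (via the overloaded DeleteLayersPtrs above). A minimal sketch of this pattern with hypothetical names:

#include <new>
#include <vector>

struct Payload {             // hypothetical stand-in for Weights<Config>
  std::vector<int> owned;    // the only member that needs ctor/dtor
};

int main() {
  alignas(Payload) unsigned char bytes[sizeof(Payload)];  // raw storage
  Payload* p = reinterpret_cast<Payload*>(bytes);
  new (&p->owned) std::vector<int>{1, 2, 3};  // construct just that member
  p->owned.~vector();                         // must destroy it explicitly
}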
@@ -296,7 +393,7 @@ struct GemmaImpl : public GemmaInterface {
                 int verbosity) override;
 
   std::unique_ptr<sentencepiece::SentencePieceProcessor> tokenizer;
-  hwy::AlignedFreeUniquePtr<uint8_t[]> compressed_weights;
+  hwy::AlignedFreeUniquePtr<uint8_t[]> weights_u8;
   hwy::AlignedUniquePtr<Activations<Config, kPrefillBatchSize>> prefill;
   hwy::AlignedUniquePtr<Activations<Config, 1>> state;
 };
@@ -309,11 +406,11 @@ HWY_BEFORE_NAMESPACE();
 namespace gcpp {
 namespace HWY_NAMESPACE {
 
-template <class TConfig, size_t kBatchSize>
+template <size_t kBatchSize, typename LayerT, class TConfig>
 HWY_NOINLINE void Attention(size_t batch_start, size_t batch_idx, size_t layer,
                             Activations<TConfig, kBatchSize>& activations,
-                            const CompressedLayer<TConfig>* c_layer,
-                            KVCache& kv_cache, hwy::ThreadPool& pool) {
+                            const LayerT* layer_weights, KVCache& kv_cache,
+                            hwy::ThreadPool& pool) {
   PROFILER_ZONE("Gen.Attention");
   const size_t pos = batch_start + batch_idx;
   HWY_DASSERT(batch_idx < kBatchSize);
@@ -329,26 +426,25 @@ HWY_NOINLINE void Attention(size_t batch_start, size_t batch_idx, size_t layer,
   static const float kQueryScale =
       static_cast<float>(1.0 / sqrt(static_cast<double>(kQKVDim)));
 
-  const size_t batch_offset = batch_idx * kModelDim;
+  float* x = activations.pre_att_rms_out.data() + batch_idx * kModelDim;
 
   auto ProjQ = [&](uint64_t head, size_t head_offset) HWY_ATTR {
     float* HWY_RESTRICT q =
         activations.q.data() + head * kQKVDim + batch_idx * kHeads * kQKVDim;
 
-    MatVecLoop<kQKVDim, kModelDim>(
-        c_layer->c_qkv_einsum_w, head_offset + 0 * kQKVDim * kModelDim,
-        activations.pre_att_rms_out.data() + batch_offset, q);
+    MatVecLoop<kQKVDim, kModelDim>(layer_weights->qkv_einsum_w,
+                                   head_offset + 0 * kQKVDim * kModelDim, x, q);
   };
 
-  auto ProjKV =
-      [&](size_t k_offset, size_t v_offset, size_t kv_offset) HWY_ATTR {
-        TwoOfsMatVecLoop<kQKVDim, kModelDim>(
-            c_layer->c_qkv_einsum_w, k_offset, v_offset,
-            activations.pre_att_rms_out.data() + batch_offset,
-            kv_cache.key_cache.get() + kv_offset,
-            kv_cache.value_cache.get() + kv_offset);
+  auto ProjKV = [&](size_t k_offset, size_t v_offset,
+                    size_t kv_offset) HWY_ATTR {
+    float* HWY_RESTRICT k = kv_cache.key_cache.get() + kv_offset;
+    float* HWY_RESTRICT v = kv_cache.value_cache.get() + kv_offset;
 
-        Rope(kv_cache.key_cache.get() + kv_offset, kQKVDim, pos);
-      };
+    TwoOfsMatVecLoop<kQKVDim, kModelDim>(layer_weights->qkv_einsum_w, k_offset,
+                                         v_offset, x, k, v);
+
+    Rope(k, kQKVDim, pos);
+  };
 
   auto Attn = [&](uint64_t head, size_t head_offset) HWY_ATTR {
@@ -388,7 +484,7 @@ HWY_NOINLINE void Attention(size_t batch_start, size_t batch_idx, size_t layer,
         head == 0
             ? activations.att_post2.data() + batch_idx * kModelDim
             : activations.att_post1.data() + head * kBatchSize * kModelDim;
-    MatVecLoop<kModelDim, kQKVDim>(c_layer->c_attn_vec_einsum_w,
+    MatVecLoop<kModelDim, kQKVDim>(layer_weights->attn_vec_einsum_w,
                                    head * kModelDim * kQKVDim, att_out,
                                    head_out);
   };
@@ -431,9 +527,9 @@ HWY_NOINLINE void Attention(size_t batch_start, size_t batch_idx, size_t layer,
   }
 }
 
-template <typename TConfig, size_t kBatchSize>
+template <size_t kBatchSize, typename LayerT, typename TConfig>
 HWY_NOINLINE void FFW(Activations<TConfig, kBatchSize>& activations,
-                      size_t batch_idx, const CompressedLayer<TConfig>* c_layer,
+                      size_t batch_idx, const LayerT* layer_weights,
                       hwy::ThreadPool& pool) {
   HWY_DASSERT(batch_idx < kBatchSize);
   static constexpr size_t kModelDim = TConfig::kModelDim;
@@ -449,12 +545,12 @@ HWY_NOINLINE void FFW(Activations<TConfig, kBatchSize>& activations,
 
   // Same matrix, first and second half of rows. Could fuse into one MatVec,
   // but separating them could help on NUMA e.g. multiple sockets.
-  MatVec<kFFHiddenDim, kModelDim>(c_layer->c_gating_einsum_w,
+  MatVec<kFFHiddenDim, kModelDim>(layer_weights->gating_einsum_w,
                                   kFFHiddenDim * kModelDim, vec, out_mul,
                                   pool);
 
   // Gate, will go through the nonlinearity.
-  MatVec<kFFHiddenDim, kModelDim>(c_layer->c_gating_einsum_w, 0, vec, out,
+  MatVec<kFFHiddenDim, kModelDim>(layer_weights->gating_einsum_w, 0, vec, out,
                                   pool);
 
   namespace hn = hwy::HWY_NAMESPACE;
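Note: gating_einsum_w packs two [kFFHiddenDim, kModelDim] matrices back to back. Rows [0, kFFHiddenDim) produce the gate that goes through the nonlinearity; rows [kFFHiddenDim, 2*kFFHiddenDim) produce the multiplicative branch, i.e. a gated-GELU (GeGLU) feed-forward. A scalar sketch of what the two MatVecs plus the nonlinearity compute per hidden element (assuming the usual tanh approximation of GELU):

#include <cmath>

// Scalar model: hidden[i] = gelu(gate[i]) * mul[i], where gate and mul
// come from the first and second half of the packed gating matrix.
float GeluApprox(float x) {
  return 0.5f * x *
         (1.0f + std::tanh(0.7978845608f * (x + 0.044715f * x * x * x)));
}

float GatedHidden(float gate, float mul) { return GeluApprox(gate) * mul; }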
@@ -467,7 +563,7 @@ HWY_NOINLINE void FFW(Activations<TConfig, kBatchSize>& activations,
 
   PROFILER_ZONE("Gen.FFW\\GatedGELU");
   MatVec<kModelDim, kFFHiddenDim>(
-      c_layer->c_linear_w, 0, activations.ffw_hidden.data() + hidden_offset,
+      layer_weights->linear_w, 0, activations.ffw_hidden.data() + hidden_offset,
       activations.ffw_out.data() + batch_idx * kModelDim, pool);
 }
 
@@ -486,9 +582,9 @@ GEMMA_CONSTEXPR_EMBSCALING float EmbeddingScaling() {
                                  Sqrt(static_cast<float>(TConfig::kModelDim))));
 }
 
-template <typename TConfig, size_t kBatchSize>
+template <size_t kBatchSize, typename WeightArrayT, typename TConfig>
 HWY_NOINLINE void Prefill(const int* tokens, size_t num_tokens, size_t pos,
-                          const CompressedWeights<TConfig>& c_weights,
+                          const WeightArrayT& weights,
                           Activations<TConfig, kBatchSize>& activations,
                           KVCache& kv_cache, hwy::ThreadPool& pool,
                           hwy::ThreadPool& inner_pool) {
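Note: EmbeddingScaling returns sqrt(kModelDim), the standard transformer embedding scale. For the 2B config with kModelDim = 2048 that is sqrt(2048) ≈ 45.25; each decompressed embedding row is multiplied by this constant (the MulByConst calls below) before entering the first layer.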
@@ -500,22 +596,22 @@ HWY_NOINLINE void Prefill(const int* tokens, size_t num_tokens, size_t pos,
   pool.Run(
       0, num_tokens, [&](const uint64_t token_idx, size_t /*thread*/) HWY_ATTR {
         const int token = tokens[token_idx];
-        Decompress(c_weights.c_embedder_input_embedding, token * kModelDim,
+        Decompress(weights.embedder_input_embedding, token * kModelDim,
                    activations.x.data() + token_idx * kModelDim, kModelDim);
         MulByConst(kEmbScaling, activations.x.data() + token_idx * kModelDim,
                    kModelDim);
       });
 
   for (size_t layer = 0; layer < TConfig::kLayers; ++layer) {
-    const CompressedLayer<TConfig>* c_layer = c_weights.CLayer(layer);
+    const auto* layer_weights = weights.GetLayer(layer);
 
     for (size_t token_idx = 0; token_idx < num_tokens; ++token_idx) {
       RMSNorm(activations.x.data() + token_idx * kModelDim,
-              c_layer->c_pre_attention_norm_scale.data(),
+              layer_weights->pre_attention_norm_scale.data(),
               activations.pre_att_rms_out.data() + token_idx * kModelDim,
               kModelDim);
-      Attention<TConfig, kBatchSize>(pos, token_idx, layer, activations,
-                                     c_layer, kv_cache, pool);
+      Attention<kBatchSize>(pos, token_idx, layer, activations, layer_weights,
+                            kv_cache, pool);
     }
 
     // TODO: sink the loop into these functions, i.e. make them matmuls.
@@ -525,10 +621,10 @@ HWY_NOINLINE void Prefill(const int* tokens, size_t num_tokens, size_t pos,
           AddFrom(activations.att_post2.data() + token_idx * kModelDim,
                   activations.x.data() + token_idx * kModelDim, kModelDim);
           RMSNorm(activations.x.data() + token_idx * kModelDim,
-                  c_layer->c_pre_ffw_norm_scale.data(),
+                  layer_weights->pre_ffw_norm_scale.data(),
                   activations.bf_pre_ffw_rms_out.data() + token_idx * kModelDim,
                   kModelDim);
-          FFW<TConfig, kBatchSize>(activations, token_idx, c_layer, inner_pool);
+          FFW<kBatchSize>(activations, token_idx, layer_weights, inner_pool);
           AddFrom(activations.ffw_out.data() + token_idx * kModelDim,
                   activations.x.data() + token_idx * kModelDim, kModelDim);
         });
@@ -536,21 +632,20 @@ HWY_NOINLINE void Prefill(const int* tokens, size_t num_tokens, size_t pos,
 
   pool.Run(
       0, num_tokens, [&](const uint64_t token_idx, size_t /*thread*/) HWY_ATTR {
-        RMSNormInplace(c_weights.c_final_norm_scale.data(),
+        RMSNormInplace(weights.final_norm_scale.data(),
                        activations.x.data() + token_idx * kModelDim, kModelDim);
       });
 }
 
 // n = 1 specialization
-template <class TConfig>
-void Transformer(int token, size_t pos,
-                 const CompressedWeights<TConfig>& c_weights,
+template <typename WeightArrayT, class TConfig>
+void Transformer(int token, size_t pos, const WeightArrayT& weights,
                  Activations<TConfig, 1>& activations, KVCache& kv_cache,
                  hwy::ThreadPool& pool, hwy::ThreadPool& inner_pool) {
   static constexpr size_t kLayers = TConfig::kLayers;
   static constexpr size_t kModelDim = TConfig::kModelDim;
 
-  Decompress(c_weights.c_embedder_input_embedding, token * kModelDim,
+  Decompress(weights.embedder_input_embedding, token * kModelDim,
              activations.x.data(), kModelDim);
 
   GEMMA_CONSTEXPR_EMBSCALING const float kEmbScaling =
@@ -558,17 +653,18 @@ void Transformer(int token, size_t pos,
   MulByConst(kEmbScaling, activations.x.data(), kModelDim);
 
   for (size_t layer = 0; layer < kLayers; ++layer) {
-    const CompressedLayer<TConfig>* c_layer = c_weights.CLayer(layer);
-    RMSNorm(activations.x.data(), c_layer->c_pre_attention_norm_scale.data(),
+    const auto* layer_weights = weights.GetLayer(layer);
+    RMSNorm(activations.x.data(),
+            layer_weights->pre_attention_norm_scale.data(),
             activations.pre_att_rms_out.data(), kModelDim);
-    Attention<TConfig, 1>(pos, 0, layer, activations, c_layer, kv_cache, pool);
+    Attention<1>(pos, 0, layer, activations, layer_weights, kv_cache, pool);
     AddFrom(activations.att_post2.data(), activations.x.data(), kModelDim);
-    RMSNorm(activations.x.data(), c_layer->c_pre_ffw_norm_scale.data(),
+    RMSNorm(activations.x.data(), layer_weights->pre_ffw_norm_scale.data(),
            activations.bf_pre_ffw_rms_out.data(), kModelDim);
-    FFW<TConfig, 1>(activations, /* batch_idx = */ 0, c_layer, pool);
+    FFW<1>(activations, /* batch_idx = */ 0, layer_weights, pool);
     AddFrom(activations.ffw_out.data(), activations.x.data(), kModelDim);
   }
-  RMSNormInplace(c_weights.c_final_norm_scale.data(), activations.x.data(),
+  RMSNormInplace(weights.final_norm_scale.data(), activations.x.data(),
                  kModelDim);
 }
 
@@ -609,9 +705,9 @@ void GenerateImpl(GemmaImpl<TConfig>& gemma, size_t max_tokens,
   Activations<TConfig, 1>& activations = *gemma.state.get();
   Activations<TConfig, kPrefillBatchSize>& prefill_activations =
       *gemma.prefill.get();
-  const CompressedWeights<TConfig>& c_weights =
-      *reinterpret_cast<CompressedWeights<TConfig>*>(
-          gemma.compressed_weights.get());
+
+  const WeightsT<TConfig>& weights =
+      *reinterpret_cast<WeightsT<TConfig>*>(gemma.weights_u8.get());
 
   size_t prompt_size = prompt.size();
   RangeChecks<TConfig>(max_tokens, max_generated_tokens, prompt_size);
@@ -643,9 +739,8 @@ void GenerateImpl(GemmaImpl<TConfig>& gemma, size_t max_tokens,
     HWY_DASSERT(batch_size <= kPrefillBatchSize);
     HWY_DASSERT(pos_offset + batch_size <= prompt_size - 1);
     const int* batch_tokens = prompt.data() + pos_offset;
-    Prefill<TConfig, kPrefillBatchSize>(batch_tokens, batch_size, pos,
-                                        c_weights, prefill_activations,
-                                        kv_cache, pool, inner_pool);
+    Prefill<kPrefillBatchSize>(batch_tokens, batch_size, pos, weights,
+                               prefill_activations, kv_cache, pool, inner_pool);
     for (size_t idx = 0; idx < batch_size; ++idx) {
       stream_token(batch_tokens[idx], 0.0f);
     }
@@ -672,7 +767,7 @@ void GenerateImpl(GemmaImpl<TConfig>& gemma, size_t max_tokens,
   for (size_t generate_pos = 0;
        pos < max_tokens && generate_pos < max_generated_tokens;
        ++pos, ++pos_offset, ++generate_pos) {
-    Transformer(token, pos, c_weights, activations, kv_cache, pool, inner_pool);
+    Transformer(token, pos, weights, activations, kv_cache, pool, inner_pool);
     float* final_activation = activations.x.data();
     // The condition below is always true if we are doing Prefill above.
     // We keep it here for clarity so that the code is correct even if Prefill
@@ -680,8 +775,8 @@ void GenerateImpl(GemmaImpl<TConfig>& gemma, size_t max_tokens,
     if (pos_offset >= prompt_size - 1) {
       PROFILER_ZONE("Gen.Embedding");
       // Generation phase
-      MatVec<kVocabSize, TConfig::kModelDim>(
-          c_weights.c_embedder_input_embedding, 0, final_activation,
+      MatVec<kVocabSize, TConfig::kModelDim>(weights.embedder_input_embedding,
+                                             0, final_activation,
                                              activations.logits.data(), pool);
       // Barrier: must have all logits so we can subtract max.
       Softmax(activations.logits.data(), kVocabSize);
@@ -743,52 +838,37 @@ void ForEachTensor(const Weights<TConfig>* weights,
                    CompressedWeights<TConfig>& c_weights, Func& func) {
   func("c_embedding",
        weights ? weights->embedder_input_embedding.data() : nullptr,
-       c_weights.c_embedder_input_embedding);
+       c_weights.embedder_input_embedding);
   func("c_final_norm", weights ? weights->final_norm_scale.data() : nullptr,
-       c_weights.c_final_norm_scale);
+       c_weights.final_norm_scale);
 
-  char name[16];
-  for (int layer_idx = 0; layer_idx < static_cast<int>(TConfig::kLayers);
-       ++layer_idx) {
+  char name_buf[16];
+  for (int layer_idx = 0; layer_idx < TConfig::kLayers; ++layer_idx) {
     const size_t idx = static_cast<size_t>(layer_idx);
-    Layer<TConfig>* layer = weights ? &weights->layers[idx] : nullptr;
-    CompressedLayer<TConfig>* c_layer = c_weights.CLayer(idx);
+    const Layer<TConfig>* layer = weights ? weights->GetLayer(idx) : nullptr;
+    CompressedLayer<TConfig>* layer_weights = c_weights.GetLayer(idx);
 
-    snprintf(name, sizeof(name), "pre_ff_ns_%d", layer_idx);
-    func(name, layer ? layer->pre_ffw_norm_scale.data() : nullptr,
-         c_layer->c_pre_ffw_norm_scale);
+#define CALL_FUNC(name, member)                                \
+  snprintf(name_buf, sizeof(name_buf), name "_%d", layer_idx); \
+  func(name_buf, layer ? layer->member.data() : nullptr, layer_weights->member)
 
-    snprintf(name, sizeof(name), "gating_ein_%d", layer_idx);
-    func(name, layer ? layer->gating_einsum_w.data() : nullptr,
-         c_layer->c_gating_einsum_w);
-
-    snprintf(name, sizeof(name), "linear_w_%d", layer_idx);
-    func(name, layer ? layer->linear_w.data() : nullptr, c_layer->c_linear_w);
-    snprintf(name, sizeof(name), "qkv_ein_%d", layer_idx);
-    func(name, layer ? layer->qkv_einsum_w.data() : nullptr,
-         c_layer->c_qkv_einsum_w);
-    snprintf(name, sizeof(name), "att_ein_%d", layer_idx);
-    func(name, layer ? layer->attn_vec_einsum_w.data() : nullptr,
-         c_layer->c_attn_vec_einsum_w);
-
-    snprintf(name, sizeof(name), "pre_att_ns_%d", layer_idx);
-    func(name, layer ? layer->pre_attention_norm_scale.data() : nullptr,
-         c_layer->c_pre_attention_norm_scale);
+    CALL_FUNC("pre_ff_ns", pre_ffw_norm_scale);
+    CALL_FUNC("gating_ein", gating_einsum_w);
+    CALL_FUNC("linear_w", linear_w);
+    CALL_FUNC("qkv_ein", qkv_einsum_w);
+    CALL_FUNC("att_ein", attn_vec_einsum_w);
+    CALL_FUNC("pre_att_ns", pre_attention_norm_scale);
+#undef CALL_FUNC
   }
 }
 
 template <class TConfig>
-hwy::AlignedFreeUniquePtr<uint8_t[]> GetCompressedWeights(
-    const Path& weights_path, const Path& cache, hwy::ThreadPool& pool) {
+hwy::AlignedFreeUniquePtr<uint8_t[]> LoadCompressedWeights(
+    const Path& weights, hwy::ThreadPool& pool) {
   PROFILER_ZONE("Startup.LoadCache");
 
-  if (!std::filesystem::exists(weights_path.path) &&
-      !std::filesystem::exists(cache.path)) {
-    HWY_ABORT(
-        "Either the model weights (--weights) or cached compressed weights "
-        "(--compressed_weights) must exist.");
+  if (!std::filesystem::exists(weights.path)) {
+    HWY_ABORT("The model weights file '%s' does not exist.",
+              weights.path.c_str());
   }
 
   // Allocate compressed weights.
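Note: CALL_FUNC replaces six nearly identical snprintf/func pairs with one X-macro-style helper; the string literal and the member name are spliced together at preprocessing time. One expansion, for illustration:

// CALL_FUNC("qkv_ein", qkv_einsum_w) expands (roughly) to:
snprintf(name_buf, sizeof(name_buf), "qkv_ein" "_%d", layer_idx);
func(name_buf, layer ? layer->qkv_einsum_w.data() : nullptr,
     layer_weights->qkv_einsum_w);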
@@ -798,32 +878,49 @@ hwy::AlignedFreeUniquePtr<uint8_t[]> GetCompressedWeights(
   CWeights* c_weights = reinterpret_cast<CWeights*>(c_weights_u8.get());
   new (&c_weights->c_layer_ptrs) CompressedLayerPointers<TConfig>(pool);
 
-  // First attempt to load them from cache, without requiring weights.
-  CacheLoader loader(cache.path.c_str());
+  std::array<float, TConfig::kNumTensorScales> scales;
+  CacheLoader loader(weights.path.c_str());
   ForEachTensor<TConfig>(nullptr, *c_weights, loader);
-  if (loader.ReadAll(pool)) return c_weights_u8;
-
-  // Get weights, compress, and store in cache.
-  const hwy::AlignedUniquePtr<Weights<TConfig>> weights =
-      LoadWeights<TConfig>(weights_path);
-  Compressor compressor(pool);
-  ForEachTensor<TConfig>(weights.get(), *c_weights, compressor);
-  compressor.WriteAll(pool, cache.path.c_str());
-
+  loader.LoadScales(scales.data(), scales.size());
+  if (!loader.ReadAll(pool)) {
+    HWY_ABORT("Failed to load model weights.");
+  }
+  if (TConfig::kNumTensorScales > 0) {
+    size_t scale_pos = 0;
+    for (int layer_idx = 0; layer_idx < TConfig::kLayers; ++layer_idx) {
+      const size_t idx = static_cast<size_t>(layer_idx);
+      CompressedLayer<TConfig>* layer_weights = c_weights->GetLayer(idx);
+      layer_weights->attn_vec_einsum_w.set_scale(scales[scale_pos++]);
+      layer_weights->qkv_einsum_w.set_scale(scales[scale_pos++]);
+      layer_weights->gating_einsum_w.set_scale(scales[scale_pos++]);
+      layer_weights->linear_w.set_scale(scales[scale_pos++]);
+    }
+    HWY_ASSERT(scale_pos == TConfig::kNumTensorScales);
+  }
   return c_weights_u8;
 }
 
 // Type-erased because this function is called via a function pointer.
-hwy::AlignedFreeUniquePtr<uint8_t[]> GetCompressedWeightsT(
-    gcpp::Model model, const Path& weights, const Path& compressed_weights,
-    hwy::ThreadPool& pool) {
+hwy::AlignedFreeUniquePtr<uint8_t[]> LoadCompressedWeightsT(
+    gcpp::Model model, const Path& weights, hwy::ThreadPool& pool) {
+  switch (model) {
+    case Model::GEMMA_2B:
+      return LoadCompressedWeights<ConfigGemma2B>(weights, pool);
+    case Model::GEMMA_7B:
+      return LoadCompressedWeights<ConfigGemma7B>(weights, pool);
+    default:
+      HWY_ABORT("Model type %d unknown.", static_cast<int>(model));
+  }
+}
+
+hwy::AlignedFreeUniquePtr<uint8_t[]> LoadWeightsT(gcpp::Model model,
+                                                  const Path& weights,
+                                                  hwy::ThreadPool& pool) {
   switch (model) {
     case Model::GEMMA_2B:
-      return GetCompressedWeights<ConfigGemma2B>(weights, compressed_weights,
-                                                 pool);
+      return LoadWeights<ConfigGemma2B>(weights, pool);
     case Model::GEMMA_7B:
-      return GetCompressedWeights<ConfigGemma7B>(weights, compressed_weights,
-                                                 pool);
+      return LoadWeights<ConfigGemma7B>(weights, pool);
     default:
       HWY_ABORT("Model type %d unknown.", static_cast<int>(model));
   }
@@ -846,18 +943,22 @@ void CompressWeights(const Path& weights_path,
   new (&c_weights->c_layer_ptrs) CompressedLayerPointers<TConfig>(pool);
 
   // Get weights, compress, and store.
-  const hwy::AlignedUniquePtr<Weights<TConfig>> weights =
-      LoadWeights<TConfig>(weights_path);
+  const bool scale_for_compression = TConfig::kNumTensorScales > 0;
+  const hwy::AlignedFreeUniquePtr<uint8_t[]> weights_u8 =
+      LoadWeights<TConfig>(weights_path, pool, scale_for_compression);
+  Weights<TConfig>* weights =
+      reinterpret_cast<Weights<TConfig>*>(weights_u8.get());
   Compressor compressor(pool);
-  ForEachTensor<TConfig>(weights.get(), *c_weights, compressor);
+  ForEachTensor<TConfig>(weights, *c_weights, compressor);
+  compressor.AddScales(weights->scales.data(), weights->scales.size());
   compressor.WriteAll(pool, compressed_weights_path.path.c_str());
 
+  weights->layer_ptrs.~LayerPointers<TConfig>();
   c_weights->c_layer_ptrs.~CompressedLayerPointers<TConfig>();
 }
 
 void CompressWeightsT(gcpp::Model model, const Path& weights,
-                      const Path& compressed_weights,
-                      hwy::ThreadPool& pool) {
+                      const Path& compressed_weights, hwy::ThreadPool& pool) {
   switch (model) {
     case Model::GEMMA_2B:
       CompressWeights<ConfigGemma2B>(weights, compressed_weights, pool);
@@ -877,7 +978,8 @@ HWY_AFTER_NAMESPACE();
 #if HWY_ONCE
 namespace gcpp {
 
-HWY_EXPORT(GetCompressedWeightsT);
+HWY_EXPORT(LoadCompressedWeightsT);
+HWY_EXPORT(LoadWeightsT);
 HWY_EXPORT(CompressWeightsT);
 HWY_EXPORT(Generate2B);
 HWY_EXPORT(Generate7B);
@@ -892,10 +994,9 @@ KVCache CreateKVCache(size_t size_cache_pos, size_t seq_len) {
 template <class Config>
 GemmaImpl<Config>::GemmaImpl(
     std::unique_ptr<sentencepiece::SentencePieceProcessor>& tokenizer,
-    hwy::AlignedFreeUniquePtr<uint8_t[]>& compressed_weights,
-    hwy::ThreadPool& pool)
+    hwy::AlignedFreeUniquePtr<uint8_t[]>& weights_u8, hwy::ThreadPool& pool)
     : tokenizer(std::move(tokenizer)),
-      compressed_weights(std::move(compressed_weights)),
+      weights_u8(std::move(weights_u8)),
       prefill(hwy::MakeUniqueAligned<Activations<Config, kPrefillBatchSize>>()),
       state(hwy::MakeUniqueAligned<Activations<Config, 1>>()) {}
 
@@ -922,10 +1023,8 @@ void GemmaImpl<ConfigGemma7B>::Generate(
       kv_cache, pool, inner_pool, stream_token, accept_token, gen, verbosity);
 }
 
-Gemma::Gemma(const Path& tokenizer_path, const Path& compressed_weights_path,
-             const Path& weights_path, Model model_type, ModelTraining training,
-             hwy::ThreadPool& pool)
-    : model_training(training) {
+Gemma::Gemma(const Path& tokenizer_path, const Path& weights, Model model_type,
+             hwy::ThreadPool& pool) {
   std::unique_ptr<sentencepiece::SentencePieceProcessor> tokenizer;
   {
     PROFILER_ZONE("Startup.tokenizer");
@@ -934,16 +1033,21 @@ Gemma::Gemma(const Path& tokenizer_path, const Path& compressed_weights_path,
       HWY_ABORT("Failed to load the tokenizer file.");
     }
   }
-  auto compressed_weights = HWY_DYNAMIC_DISPATCH(GetCompressedWeightsT)(
-      model_type, weights_path, compressed_weights_path, pool);
+
+  hwy::AlignedFreeUniquePtr<uint8_t[]> weights_u8;
+  if constexpr (kWeightsAreCompressed) {
+    weights_u8 =
+        HWY_DYNAMIC_DISPATCH(LoadCompressedWeightsT)(model_type, weights, pool);
+  } else {
+    weights_u8 = HWY_DYNAMIC_DISPATCH(LoadWeightsT)(model_type, weights, pool);
+  }
+
   switch (model_type) {
     case Model::GEMMA_2B:
-      impl_.reset(
-          new GemmaImpl<ConfigGemma2B>(tokenizer, compressed_weights, pool));
+      impl_.reset(new GemmaImpl<ConfigGemma2B>(tokenizer, weights_u8, pool));
       break;
     case Model::GEMMA_7B:
-      impl_.reset(
-          new GemmaImpl<ConfigGemma7B>(tokenizer, compressed_weights, pool));
+      impl_.reset(new GemmaImpl<ConfigGemma7B>(tokenizer, weights_u8, pool));
       break;
     default:
       HWY_ABORT("Model type %d unknown.", static_cast<int>(model_type));
@@ -981,10 +1085,9 @@ void GenerateGemma(Gemma& gemma, RuntimeConfig runtime_config,
 }
 
 void CompressWeights(gcpp::Model model, const Path& weights,
-                     const Path& compressed_weights,
-                     hwy::ThreadPool& pool) {
-  HWY_DYNAMIC_DISPATCH(CompressWeightsT)(
-      model, weights, compressed_weights, pool);
+                     const Path& compressed_weights, hwy::ThreadPool& pool) {
+  HWY_DYNAMIC_DISPATCH(CompressWeightsT)
+  (model, weights, compressed_weights, pool);
 }
 
 }  // namespace gcpp
gemma.h (19 changes)
@@ -24,22 +24,18 @@
 // copybara:import_next_line:gemma_cpp
 #include "compression/compress.h"  // SfpStream/NuqStream
-// copybara:import_next_line:gemma_cpp
-#include "util/args.h"  // Path
 #include "configs.h"
 #include "hwy/aligned_allocator.h"
 #include "hwy/base.h"  // hwy::bfloat16_t
 #include "hwy/contrib/thread_pool/thread_pool.h"
+// copybara:import_next_line:gemma_cpp
+#include "util/args.h"  // Path
 // copybara:import_next_line:sentencepiece
 #include "src/sentencepiece_processor.h"
 
 namespace gcpp {
 
-// Allowable types for GEMMA_WEIGHT_T (can be specified at compilation time):
-// float, hwy::bfloat16_t, SfpStream, NuqStream
-#ifndef GEMMA_WEIGHT_T
-#define GEMMA_WEIGHT_T SfpStream
-#endif  // !GEMMA_WEIGHT_T
-using WeightT = GEMMA_WEIGHT_T;
-
+using GemmaWeightT = GEMMA_WEIGHT_T;
 using EmbedderInputT = hwy::bfloat16_t;
 constexpr size_t kPrefillBatchSize = 16;
 constexpr bool kSystemPrompt = false;
@@ -65,13 +61,11 @@ struct RuntimeConfig {
 struct GemmaInterface;
 
 struct Gemma {
-  Gemma(const Path& tokenizer_path, const Path& compressed_weights_path,
-        const Path& weights_path, Model model_type, ModelTraining training,
+  Gemma(const Path& tokenizer_path, const Path& weights, Model model_type,
         hwy::ThreadPool& pool);
   ~Gemma();  // must be defined after GemmaInterface's dtor is defined.
   const sentencepiece::SentencePieceProcessor* Tokenizer() const;
   std::unique_ptr<GemmaInterface> impl_;
-  gcpp::ModelTraining model_training;
 };
 
 KVCache CreateKVCache(Model type);  // convenient workaround for now
@@ -99,8 +93,7 @@ void GenerateGemma(Gemma& gemma, RuntimeConfig runtime_config,
                    const StreamFunc& stream_token, std::mt19937& gen);
 
 void CompressWeights(gcpp::Model model, const Path& weights,
-                     const Path& compressed_weights,
-                     hwy::ThreadPool& pool);
+                     const Path& compressed_weights, hwy::ThreadPool& pool);
 
 constexpr int EOS_ID = 1;
 
@@ -369,6 +369,7 @@ CompressedArray<float, kOuter * kInner> GenerateMat(size_t offset) {
     }
   }
   Compress(content, ws, mat, pool);
+  mat.set_scale(1.0f);
   return mat;
 }
 
run.cc (23 changes)
@@ -29,14 +29,14 @@
 #include "gemma.h"  // Gemma
 // copybara:import_next_line:gemma_cpp
 #include "util/app.h"
-// copybara:import_next_line:gemma_cpp
-#include "util/args.h"  // HasHelp
 #include "hwy/base.h"
 #include "hwy/contrib/thread_pool/thread_pool.h"
 #include "hwy/highway.h"
 #include "hwy/per_target.h"
 #include "hwy/profiler.h"
 #include "hwy/timer.h"
+// copybara:import_next_line:gemma_cpp
+#include "util/args.h"  // HasHelp
 
 namespace gcpp {
 
@@ -66,7 +66,7 @@ void ShowConfig(LoaderArgs& loader, InferenceArgs& inference, AppArgs& app) {
             << hwy::VectorBytes() * 8 << " bits)" << "\n"
             << "Compiled config : " << CompiledConfig() << "\n"
             << "Weight Type : "
-            << gcpp::TypeName(gcpp::WeightT()) << "\n"
+            << gcpp::TypeName(gcpp::GemmaWeightT()) << "\n"
             << "EmbedderInput Type : "
             << gcpp::TypeName(gcpp::EmbedderInputT()) << "\n";
 }
@@ -93,10 +93,11 @@ void ShowHelp(gcpp::LoaderArgs& loader, gcpp::InferenceArgs& inference,
   std::cerr << "\n";
 }
 
-void ReplGemma(gcpp::Gemma& model, gcpp::KVCache& kv_cache,
-               hwy::ThreadPool& pool, hwy::ThreadPool& inner_pool,
-               const InferenceArgs& args, int verbosity,
-               const gcpp::AcceptFunc& accept_token, std::string& eot_line) {
+void ReplGemma(gcpp::Gemma& model, ModelTraining training,
+               gcpp::KVCache& kv_cache, hwy::ThreadPool& pool,
+               hwy::ThreadPool& inner_pool, const InferenceArgs& args,
+               int verbosity, const gcpp::AcceptFunc& accept_token,
+               std::string& eot_line) {
   PROFILER_ZONE("Gen.misc");
   int abs_pos = 0;      // absolute token index over all turns
   int current_pos = 0;  // token index within the current turn
@@ -177,7 +178,7 @@ void ReplGemma(gcpp::Gemma& model, gcpp::KVCache& kv_cache,
       continue;
     }
 
-    if (model.model_training == ModelTraining::GEMMA_IT) {
+    if (training == ModelTraining::GEMMA_IT) {
       // For instruction-tuned models: add control tokens.
       prompt_string = "<start_of_turn>user\n" + prompt_string +
                       "<end_of_turn>\n<start_of_turn>model\n";
@@ -232,8 +233,7 @@ void Run(LoaderArgs& loader, InferenceArgs& inference, AppArgs& app) {
         [](uint64_t /*task*/, size_t thread) { PinThreadToCore(thread); });
   }
 
-  gcpp::Gemma model(loader.tokenizer, loader.compressed_weights, loader.weights,
-                    loader.ModelType(), loader.ModelTraining(), pool);
+  gcpp::Gemma model(loader.tokenizer, loader.weights, loader.ModelType(), pool);
 
   auto kv_cache = CreateKVCache(loader.ModelType());
 
@@ -265,7 +265,8 @@ void Run(LoaderArgs& loader, InferenceArgs& inference, AppArgs& app) {
   }
 
   ReplGemma(
-      model, kv_cache, pool, inner_pool, inference, app.verbosity,
+      model, loader.ModelTraining(), kv_cache, pool, inner_pool, inference,
+      app.verbosity,
       /*accept_token=*/[](int) { return true; }, app.eot_line);
 }
 
util/app.h (39 changes)
@@ -36,9 +36,9 @@
 #include "configs.h"
 // copybara:import_next_line:gemma_cpp
 #include "gemma.h"
-#include "hwy/base.h"  // HWY_ASSERT
 // copybara:import_next_line:gemma_cpp
 #include "util/args.h"
+#include "hwy/base.h"  // HWY_ASSERT
 
 namespace gcpp {
 
@@ -151,7 +151,7 @@ struct LoaderArgs : public ArgsBase<LoaderArgs> {
   }
 
   // Returns error string or nullptr if OK.
-  const char* Validate() const {
+  const char* Validate() {
     const std::string model_type_lc = ToLower(model_type);
     if (model_type.empty()) {
       return "Missing --model flag, need to specify either 2b-pt, 7b-pt, "
@@ -165,37 +165,42 @@ struct LoaderArgs : public ArgsBase<LoaderArgs> {
     if (tokenizer.path.empty()) {
       return "Missing --tokenizer flag, a file for the tokenizer is required.";
     }
-    if (compressed_weights.path.empty()) {
-      return "Missing --compressed_weights flag, a file for the compressed "
-             "model.";
+    if (!compressed_weights.path.empty()) {
+      if (weights.path.empty()) {
+        weights = compressed_weights;
+      } else {
+        return "Only one of --weights and --compressed_weights can be "
+               "specified. To create compressed weights use the "
+               "compress_weights tool.";
+      }
+    }
+    if (weights.path.empty()) {
+      return "Missing --weights flag, a file for the model weights.";
+    }
+    if (!weights.exists()) {
+      return "Can't open file specified with --weights flag.";
     }
     return nullptr;
   }
 
   Path tokenizer;
-  Path weights;             // uncompressed weights file location
-  Path compressed_weights;  // compressed weights file location
+  Path weights;  // weights file location
+  Path compressed_weights;
   std::string model_type;
 
   template <class Visitor>
   void ForEach(const Visitor& visitor) {
     visitor(tokenizer, "tokenizer", Path(),
             "Path name of tokenizer model file.\n    Required argument.");
-    visitor(
-        compressed_weights, "compressed_weights", Path(),
-        "Path name of compressed weights file, regenerated from `--weights` "
-        "file if "
-        "the compressed weights file does not exist.\n    Required argument.");
+    visitor(weights, "weights", Path(),
+            "Path name of model weights (.sbs) file.\n    Required argument.");
+    visitor(compressed_weights, "compressed_weights", Path(),
+            "Alias for --weights.");
     visitor(model_type, "model", std::string(),
             "Model type\n    2b-it = 2B parameters, instruction-tuned\n    "
             "2b-pt = 2B parameters, pretrained\n    7b-it = 7B parameters "
             "instruction-tuned\n    7b-pt = 7B parameters, pretrained\n"
            "    Required argument.");
-    visitor(weights, "weights", Path(),
-            "Path name of model weights (.sbs) file. Only required if "
-            "compressed_weights file is not present and needs to be "
-            "regenerated. This parameter is only required for compressing "
-            "new model weight exports, otherwise it is not needed.");
   }
 };
 
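Note: with this change --compressed_weights becomes a pure alias for --weights, and passing both is an error. A typical invocation after this PR (file names are illustrative):

  ./gemma --tokenizer tokenizer.spm --weights 2b-it-sfp.sbs --model 2b-it

Uncompressed checkpoints are no longer consumed directly by the runner; they are converted once with the compress_weights tool (named in the error message above) and then loaded as a single .sbs file.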