Fix file mapping: was letting the smart pointer go out of scope

Also save+print the IO mode used.

PiperOrigin-RevId: 788848165
Jan Wassenberg 2025-07-30 04:29:27 -07:00 committed by Copybara-Service
parent 2141d4788d
commit d831ddce5b
8 changed files with 95 additions and 58 deletions

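The headline fix addresses a classic lifetime bug: `MapOrReadAll` obtained the file mapping as a local smart pointer, so the mapping was destroyed when the function returned while the weight tensors still pointed into it. Below is a minimal standalone sketch of the bug pattern and the fix, with `std::unique_ptr` standing in for the real `MapPtr` type (illustrative only, not the gemma.cpp code itself):

#include <cstdio>
#include <memory>

using MapPtr = std::unique_ptr<int[]>;  // stand-in for the real mapping type

int* g_tensor = nullptr;  // points into the mapping, like the weight tensors

// Buggy: `mapped` is destroyed on return, so `g_tensor` dangles afterwards.
void MapAllBuggy() {
  MapPtr mapped(new int[4]{1, 2, 3, 4});
  g_tensor = &mapped[2];
}  // mapping released here

// Fixed: return the smart pointer so the caller owns the mapping's lifetime;
// this commit stores it in the new `WeightsPtrs::mapped_` member.
MapPtr MapAllFixed() {
  MapPtr mapped(new int[4]{1, 2, 3, 4});
  g_tensor = &mapped[2];
  return mapped;
}

int main() {
  MapPtr keep_alive = MapAllFixed();
  std::printf("%d\n", *g_tensor);  // valid: the mapping is still alive
  return 0;
}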
View File

@@ -56,7 +56,8 @@ GemmaEnv::GemmaEnv(const LoaderArgs& loader, const ThreadingArgs& threading,
   kv_caches_.push_back(KVCache(config, inference, ctx_.allocator));
   if (inference.verbosity >= 2) {
-    ShowConfig(loader, threading, inference, config, ctx_);
+    ShowConfig(loader, threading, inference, config, gemma_.WeightReadMode(),
+               ctx_);
   }
   InitGenerator(inference, gen_);
@@ -229,13 +230,15 @@ static constexpr const char* CompiledConfig() {
 void ShowConfig(const LoaderArgs& loader, const ThreadingArgs& threading,
                 const InferenceArgs& inference, const ModelConfig& config,
+                const WeightsPtrs::Mode weight_read_mode,
                 const ThreadingContext& ctx) {
   threading.Print(inference.verbosity);
   loader.Print(inference.verbosity);
   inference.Print(inference.verbosity);
-  fprintf(stderr, "Model : %s, to_bf16 %d, mmap %d\n",
-          config.Specifier().c_str(), static_cast<int>(loader.to_bf16),
-          static_cast<int>(loader.map));
+  fprintf(
+      stderr, "Model : %s, to_bf16 %d, mmap %d => %s\n",
+      config.Specifier().c_str(), static_cast<int>(loader.to_bf16),
+      static_cast<int>(loader.map), WeightsPtrs::ToString(weight_read_mode));
   if (inference.verbosity >= 2) {
     time_t now = time(nullptr);

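For reference, the new log line appends the resolved I/O mode after `=>`. With a placeholder for the model specifier and illustrative values for the Tristate flags, the output would look like:

Model : <specifier>, to_bf16 0, mmap 1 => Map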
View File

@@ -125,6 +125,7 @@ void LogSpeedStats(double time_start, size_t total_tokens);
 void ShowConfig(const LoaderArgs& loader, const ThreadingArgs& threading,
                 const InferenceArgs& inference, const ModelConfig& config,
+                WeightsPtrs::Mode weight_read_mode,
                 const ThreadingContext& ctx);
 void ShowHelp(const LoaderArgs& loader, const ThreadingArgs& threading,
               const InferenceArgs& inference);

View File

@@ -609,7 +609,9 @@ Gemma::Gemma(const LoaderArgs& loader, const InferenceArgs& inference,
       weights_(model_.Config()),
       chat_template_(model_.Tokenizer(), model_.Config().model),
       inference_(inference) {
-  weights_.ReadFromBlobs(model_, reader_, loader, inference, mat_owners_, ctx);
+  weight_read_mode_ = weights_.ReadFromBlobs(model_, reader_, loader,
+                                             inference, mat_owners_, ctx);
+  // Read everything into memory, or `weights_.mapped_` keeps the mapping alive.
   reader_.CloseFile();
 }

View File

@@ -239,6 +239,7 @@ class Gemma {
   const ModelConfig& Config() const { return model_.Config(); }
   const GemmaTokenizer& Tokenizer() const { return model_.Tokenizer(); }
   const WeightsPtrs& Weights() const { return weights_; }
+  WeightsPtrs::Mode WeightReadMode() const { return weight_read_mode_; }
   const GemmaChatTemplate& ChatTemplate() const { return chat_template_; }
   const InferenceArgs& Inference() const { return inference_; }
@@ -271,6 +272,7 @@
   ModelStore model_;
   std::vector<MatOwner> mat_owners_;
   WeightsPtrs weights_;
+  WeightsPtrs::Mode weight_read_mode_;
   GemmaChatTemplate chat_template_;
   InferenceArgs inference_;
 };

View File

@@ -285,7 +285,8 @@ void Run(const LoaderArgs& loader, const ThreadingArgs& threading,
   if (inference.IsInteractive()) {
     std::cout << "\033[2J\033[1;1H"  // clear screen
               << kAsciiArtBanner << "\n\n";
-    ShowConfig(loader, threading, inference, gemma.Config(), ctx);
+    ShowConfig(loader, threading, inference, gemma.Config(),
+               gemma.WeightReadMode(), ctx);
     std::cout << "\n" << instructions << "\n";
   }
 }

View File

@@ -223,7 +223,7 @@ void WeightsPtrs::CopyFrom(const WeightsPtrs& other) {
 }
 
 // For reshaping file tensors to the shape expected by the code. This would
-// ideally already happen in the importer. Called by WeightsOwner::Fixup.
+// ideally already happen in the importer. Called by `ReadFromBlobs`.
 void WeightsPtrs::Fixup(std::vector<MatOwner>& mat_owners,
                         ThreadingContext& ctx) {
   // TODO: use 1D parallel-for helper function
@@ -251,21 +251,11 @@ std::vector<uint32_t> WeightsPtrs::AddTensorDataToWriter(
   return serialized_mat_ptrs;
 }
 
-enum class Mode {
-  // Parallel I/O, decompress to BF16. Best for large batch sizes.
-  kReadBF16,
-  // Parallel I/O, insert row-wise padding. Safe default.
-  kRead,
-  // Best for large weights relative to available memory, especially for
-  // frequent invocations of small batches and short sequences. Adds noise to
-  // performance measurements due to I/O variability.
-  kMap
-};
-
 // Decides whether to read or map based on heuristics and user override.
-static Mode ChooseMode(uint64_t file_bytes, const LoaderArgs& loader,
-                       const InferenceArgs& inference,
-                       const Allocator& allocator) {
+static WeightsPtrs::Mode ChooseMode(uint64_t file_bytes,
+                                    const LoaderArgs& loader,
+                                    const InferenceArgs& inference,
+                                    const Allocator& allocator) {
   Tristate to_bf16 = loader.to_bf16;
   Tristate map = loader.map;
@@ -283,8 +273,8 @@ static Mode ChooseMode(uint64_t file_bytes, const LoaderArgs& loader,
   if (to_bf16 == Tristate::kTrue && map == Tristate::kTrue) {
     HWY_WARN("Cannot have to_bf16 && map, to_bf16 takes precedence.");
   }
-  if (to_bf16 == Tristate::kTrue) return Mode::kReadBF16;
-  if (map == Tristate::kTrue) return Mode::kMap;
+  if (to_bf16 == Tristate::kTrue) return WeightsPtrs::Mode::kReadBF16;
+  if (map == Tristate::kTrue) return WeightsPtrs::Mode::kMap;
 
   if (to_bf16 == Tristate::kDefault) {
     // Heuristic: sub-bf16 compression is not helpful if compute-bound.
@@ -307,8 +297,9 @@ static Mode ChooseMode(uint64_t file_bytes, const LoaderArgs& loader,
   }
 
   // If the `map` heuristic triggers, use that for safety.
-  if (map == Tristate::kTrue) return Mode::kMap;
-  return (to_bf16 == Tristate::kTrue) ? Mode::kReadBF16 : Mode::kRead;
+  if (map == Tristate::kTrue) return WeightsPtrs::Mode::kMap;
+  return (to_bf16 == Tristate::kTrue) ? WeightsPtrs::Mode::kReadBF16
+                                      : WeightsPtrs::Mode::kRead;
 }
 
 struct TensorToRead {
@@ -324,7 +315,8 @@ struct TensorToRead {
 // Allocates multiple in parallel and binds to NUMA nodes.
 static void AllocateAndBindAll(std::vector<TensorToRead>& tensors,
-                               const Mode mode, std::vector<MatOwner>& owners,
+                               const WeightsPtrs::Mode mode,
+                               std::vector<MatOwner>& owners,
                                ThreadingContext& ctx) {
   const size_t start = owners.size();
   owners.resize(start + tensors.size());
@@ -342,7 +334,7 @@
     if (tensor.prev_type == Type::kF32 || mat.Rows() < 1024) {
       tensor.keep_type = true;
       tensor.padding = MatPadding::kPacked;  // single I/O for simplicity
-    } else if (mode == Mode::kReadBF16) {
+    } else if (mode == WeightsPtrs::Mode::kReadBF16) {
       mat.SetType(Type::kBF16);
     }
@@ -354,7 +346,7 @@
 // Mode == kMap
 static void MapAll(const std::vector<TensorToRead>& tensors,
-                   const MapPtr& mapped) {
+                   const MapPtr& mapped, uint64_t file_bytes) {
   PROFILER_ZONE("Startup.Weights.Map");
   for (size_t i = 0; i < tensors.size(); ++i) {
     // SetPtr does not change the stride, but it is expected to be packed
@@ -362,10 +354,12 @@
     const size_t mat_bytes = tensors[i].mat->PackedBytes();
     // Ensure blob size matches that computed from metadata.
     HWY_ASSERT_M(mat_bytes == tensors[i].range.bytes, tensors[i].mat->Name());
-    tensors[i].mat->SetPtr(
-        const_cast<uint8_t*>(mapped.get() + tensors[i].range.offset),
-        tensors[i].mat->Stride());
+    // Ensure the blob lies within the file mapping.
+    const uint64_t offset = tensors[i].range.offset;
+    HWY_ASSERT_M(offset + mat_bytes <= file_bytes, tensors[i].mat->Name());
+    tensors[i].mat->SetPtr(const_cast<uint8_t*>(mapped.get() + offset),
+                           tensors[i].mat->Stride());
   }
 }
@@ -484,40 +478,49 @@ static void ReadBatches(const BlobReader& reader,
   });
 }
 
-// Aborts on error.
-static void MapOrReadAll(std::vector<TensorToRead>& tensors, BlobReader& reader,
-                         Mode mode, std::vector<MatOwner>& mat_owners,
-                         ThreadingContext& ctx) {
-  if (mode == Mode::kMap) {
-    MapPtr mapped = reader.file().Map();
-    if (mapped) return MapAll(tensors, mapped);
+// Aborts on error. Updates `mode` to the actual mode used. Returns mapped
+// memory or nullptr if `kMap` was not used.
+static MapPtr MapOrReadAll(std::vector<TensorToRead>& tensors,
+                           BlobReader& reader, WeightsPtrs::Mode* mode,
+                           std::vector<MatOwner>& mat_owners,
+                           ThreadingContext& ctx) {
+  if (*mode == WeightsPtrs::Mode::kMap) {
+    if (MapPtr mapped = reader.Map()) {
+      MapAll(tensors, mapped, reader.file().FileSize());
+      return mapped;
+    }
     HWY_WARN("Failed to map file (%zu KiB), reading instead.",
              static_cast<size_t>(reader.file_bytes() >> 10));
     // If we wanted to map but failed, memory is probably not plentiful, so
     // fall through to kRead because kReadBF16 requires more memory.
-    mode = Mode::kRead;
+    *mode = WeightsPtrs::Mode::kRead;
   }
 
   {
     PROFILER_ZONE("Startup.Weights.Allocate");
     // NOTE: this changes the stride of `mats`!
-    AllocateAndBindAll(tensors, mode, mat_owners, ctx);
+    AllocateAndBindAll(tensors, *mode, mat_owners, ctx);
   }
 
   hwy::ThreadPool& pool = ctx.pools.Pool();
-  if (mode == Mode::kReadBF16) return ReadAllToBF16(tensors, reader, pool);
+  if (*mode == WeightsPtrs::Mode::kReadBF16) {
+    ReadAllToBF16(tensors, reader, pool);
+    return MapPtr();
+  }
 
   const std::vector<IOBatch> batches =
       MakeBatches(tensors, reader.file_bytes());
   ReadBatches(reader, batches, pool);
+  return MapPtr();
 }
 
-void WeightsPtrs::ReadFromBlobs(const ModelStore& model, BlobReader& reader,
-                                const LoaderArgs& loader,
-                                const InferenceArgs& inference,
-                                std::vector<MatOwner>& mat_owners,
-                                ThreadingContext& ctx) {
+WeightsPtrs::Mode WeightsPtrs::ReadFromBlobs(const ModelStore& model,
                                             BlobReader& reader,
+                                             const LoaderArgs& loader,
+                                             const InferenceArgs& inference,
+                                             std::vector<MatOwner>& mat_owners,
+                                             ThreadingContext& ctx) {
   // List of tensors to read/map, and where from.
   std::vector<TensorToRead> tensors;
@@ -536,15 +539,14 @@ void WeightsPtrs::ReadFromBlobs(const ModelStore& model, BlobReader& reader,
     HWY_ABORT("Tensor %s is required but not found in file.", t.mat.Name());
   });
 
-  const Mode mode =
-      ChooseMode(reader.file_bytes(), loader, inference, ctx.allocator);
-  MapOrReadAll(tensors, reader, mode, mat_owners, ctx);
+  Mode mode = ChooseMode(reader.file_bytes(), loader, inference, ctx.allocator);
+  mapped_ = MapOrReadAll(tensors, reader, &mode, mat_owners, ctx);
 
   {
     PROFILER_ZONE("Startup.Fixup");
     Fixup(mat_owners, ctx);
   }
+  return mode;
 }
 
 }  // namespace gcpp

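To summarize the selection logic above: explicit user overrides win, with `to_bf16` taking precedence over `map`, and `kRead` is the safe default. Here is a simplified standalone sketch with the `Tristate` arguments reduced to booleans (the real `ChooseMode` also applies heuristics based on file size, available memory, and batch size for the `kDefault` cases):

#include <cstdio>

enum class Mode { kReadBF16, kRead, kMap };

Mode ChooseModeSketch(bool want_bf16, bool want_map) {
  if (want_bf16 && want_map) {
    std::fprintf(stderr,
                 "Cannot have to_bf16 && map, to_bf16 takes precedence.\n");
  }
  if (want_bf16) return Mode::kReadBF16;  // decompress to BF16 while reading
  if (want_map) return Mode::kMap;        // map the file instead of reading it
  return Mode::kRead;                     // safe default: parallel read
}

int main() {
  // Both overrides set: warns, then to_bf16 wins (prints 0 == kReadBF16).
  std::printf("%d\n", static_cast<int>(ChooseModeSketch(true, true)));
  return 0;
}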
View File

@@ -90,7 +90,7 @@ class MatFinder {
 };
 
 // Per-layer weight metadata and pointers. The tensor data is owned by
-// `WeightsOwner`.
+// `MatOwner`.
 struct LayerWeightsPtrs {
   // Initializes tensor metadata without allocating.
   // NOTE: do not store layer_idx, TransformerLayer and Attention may use
@@ -314,7 +314,7 @@
 };
 
 // Holds layer-independent weight metadata and pointers plus per-layer
-// `LayerWeightsPtrs`. The tensor data is owned by `WeightsOwner`.
+// `LayerWeightsPtrs`. The tensor data is owned by `MatOwner`.
 struct WeightsPtrs {
   explicit WeightsPtrs(const ModelConfig& config)
       : config_(config),
@@ -423,9 +423,34 @@
   // Copies only the allocated tensors in `*this` from tensors in `other`.
   void CopyFrom(const WeightsPtrs& other);
 
+  enum class Mode {
+    // Parallel I/O, decompress to BF16. Best for large batch sizes.
+    kReadBF16,
+    // Parallel I/O, insert row-wise padding. Safe default.
+    kRead,
+    // Best for large weights relative to available memory, especially for
+    // frequent invocations of small batches and short sequences. Adds noise to
+    // performance measurements due to I/O variability.
+    kMap
+  };
+
+  static const char* ToString(Mode mode) {
+    switch (mode) {
+      case Mode::kReadBF16:
+        return "ReadBF16";
+      case Mode::kRead:
+        return "Read";
+      case Mode::kMap:
+        return "Map";
+      default:
+        HWY_DASSERT(false);
+        return "?";
+    }
+  }
+
   // Reads tensor data from `BlobStore` or aborts on error. `map` is a user
-  // override for whether to map blobs or read them.
-  void ReadFromBlobs(const ModelStore& model, BlobReader& reader,
+  // override for whether to map blobs or read them. Returns the mode used.
+  Mode ReadFromBlobs(const ModelStore& model, BlobReader& reader,
                      const LoaderArgs& loader, const InferenceArgs& inference,
                      std::vector<MatOwner>& mat_owners, ThreadingContext& ctx);
@@ -436,6 +461,8 @@
   // For reshaping file tensors to the shape expected by the code. This would
   // ideally already happen in the importer. Called by ReadFromBlobs.
   void Fixup(std::vector<MatOwner>& mat_owners, ThreadingContext& ctx);
 
+  MapPtr mapped_;
 };  // `WeightsPtrs`
 
 #undef TENSOR_ARGS

View File

@@ -47,7 +47,7 @@ struct BlobRange {
 // Reads `BlobStore` header, converts keys to strings and creates a hash map for
 // faster lookups.
 // TODO(janwas): rename to BlobFinder or similar.
-// Thread-safe: it is safe to concurrently call all methods.
+// Thread-safe: it is safe to concurrently call all methods except `CloseFile`.
 class BlobReader {
  public:
   // Acquires ownership of `file` (which must be non-null) and reads its header.
@@ -56,11 +56,10 @@ class BlobReader {
   const Path& blob_path() const { return blob_path_; }
 
-  // Non-const version required for File::Map().
-  File& file() { return *file_; }
   const File& file() const { return *file_; }
   uint64_t file_bytes() const { return file_bytes_; }
 
+  MapPtr Map() { return file_->Map(); }
+  // OK to call if Map() was called; the smart pointer keeps the mapping alive.
   void CloseFile() { file_.reset(); }
 
   const std::vector<std::string>& Keys() const { return keys_; }
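Taken together with `CloseFile`, the intended call sequence for the new `Map` accessor is as follows (a sketch; `reader` construction is omitted, and the types are those declared above):

// BlobReader reader(...);       // assumed already constructed
MapPtr mapped = reader.Map();    // the smart pointer owns the mapping
reader.CloseFile();              // OK: the mapping outlives the file handle
// Blob data reached through `mapped` stays valid until `mapped` is destroyed.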