Remove branching in llama-model-loader.cpp and reduce code duplication in llama-mmap.cpp
This commit is contained in:
parent
d2acc3a8a8
commit
f6d79fe1b1
|
|
@ -75,7 +75,7 @@ struct llama_file::impl {
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
impl(const char * fname, const char * mode) {
|
impl(const char * fname, const char * mode, const bool use_direct_io = false) {
|
||||||
fp = ggml_fopen(fname, mode);
|
fp = ggml_fopen(fname, mode);
|
||||||
if (fp == NULL) {
|
if (fp == NULL) {
|
||||||
throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno)));
|
throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno)));
|
||||||
|
|
@ -154,43 +154,50 @@ struct llama_file::impl {
|
||||||
write_raw(&val, sizeof(val));
|
write_raw(&val, sizeof(val));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bool has_direct_io() const {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
void read_aligned_chunk(size_t offset, void * dest, size_t size, size_t alignment) const {
|
||||||
|
throw std::runtime_error("DirectIO is not implemented on Windows.");
|
||||||
|
}
|
||||||
|
|
||||||
~impl() {
|
~impl() {
|
||||||
if (fp) {
|
if (fp) {
|
||||||
std::fclose(fp);
|
std::fclose(fp);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
#elif defined(__linux__)
|
#else
|
||||||
impl(const char * fname, const char * mode) : impl(fname, mode, false) {}
|
impl(const char * fname, const char * mode, const bool use_direct_io = false) {
|
||||||
|
#ifdef __linux__
|
||||||
impl(const char * fname, const char * mode, bool uncached_read) {
|
// Try unbuffered I/O for read only
|
||||||
if (uncached_read) {
|
if (use_direct_io && std::strcmp(mode, "rb") == 0) {
|
||||||
fd = open(fname, O_RDONLY | O_DIRECT);
|
fd = open(fname, O_RDONLY | O_DIRECT);
|
||||||
if (fd == -1 && (errno == EINVAL || errno == EOPNOTSUPP)) {
|
|
||||||
fd = open(fname, O_RDONLY); // retry without O_DIRECT
|
if (fd != -1) {
|
||||||
|
struct stat file_stats{};
|
||||||
|
fstat(fd, &file_stats);
|
||||||
|
|
||||||
|
size = file_stats.st_size;
|
||||||
|
|
||||||
|
off_t ret = lseek(fd, 0, SEEK_SET);
|
||||||
|
if (ret == -1) {
|
||||||
|
throw std::runtime_error(format("seek error: %s", strerror(errno)));
|
||||||
|
}
|
||||||
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (fd == -1) {
|
LLAMA_LOG_WARN("Failed to open model %s with error: %s. Falling back to buffered I/O",
|
||||||
throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno)));
|
fname, strerror(errno));
|
||||||
}
|
|
||||||
|
|
||||||
struct stat file_stats{};
|
|
||||||
fstat(fd, &file_stats);
|
|
||||||
|
|
||||||
size = file_stats.st_size;
|
|
||||||
|
|
||||||
off_t ret = lseek(fd, 0, SEEK_SET);
|
|
||||||
if (ret == -1) {
|
|
||||||
throw std::runtime_error(format("seek error: %s", strerror(errno)));
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
fp = ggml_fopen(fname, mode);
|
|
||||||
if (fp == NULL) {
|
|
||||||
throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno)));
|
|
||||||
}
|
|
||||||
seek(0, SEEK_END);
|
|
||||||
size = tell();
|
|
||||||
seek(0, SEEK_SET);
|
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
|
fp = ggml_fopen(fname, mode);
|
||||||
|
if (fp == NULL) {
|
||||||
|
throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno)));
|
||||||
|
}
|
||||||
|
seek(0, SEEK_END);
|
||||||
|
size = tell();
|
||||||
|
seek(0, SEEK_SET);
|
||||||
}
|
}
|
||||||
|
|
||||||
size_t tell() const {
|
size_t tell() const {
|
||||||
|
|
@ -226,8 +233,8 @@ struct llama_file::impl {
|
||||||
if (len == 0) {
|
if (len == 0) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
errno = 0;
|
||||||
if (fd == -1) {
|
if (fd == -1) {
|
||||||
errno = 0;
|
|
||||||
std::size_t ret = std::fread(ptr, len, 1, fp);
|
std::size_t ret = std::fread(ptr, len, 1, fp);
|
||||||
if (ferror(fp)) {
|
if (ferror(fp)) {
|
||||||
throw std::runtime_error(format("read error: %s", strerror(errno)));
|
throw std::runtime_error(format("read error: %s", strerror(errno)));
|
||||||
|
|
@ -255,86 +262,27 @@ struct llama_file::impl {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
uint32_t read_u32() const {
|
void read_aligned_chunk(size_t offset, void * dest, size_t size, size_t alignment) const {
|
||||||
uint32_t ret;
|
off_t aligned_offset = offset & ~(alignment - 1);
|
||||||
read_raw(&ret, sizeof(ret));
|
off_t offset_from_alignment = offset - aligned_offset;
|
||||||
return ret;
|
size_t bytes_to_read = (offset_from_alignment + size + alignment - 1) & ~(alignment - 1);
|
||||||
}
|
|
||||||
|
|
||||||
void write_raw(const void * ptr, size_t len) const {
|
void * raw_buffer = nullptr;
|
||||||
if (len == 0) {
|
int ret = posix_memalign(&raw_buffer, alignment, bytes_to_read);
|
||||||
return;
|
|
||||||
}
|
|
||||||
errno = 0;
|
|
||||||
size_t ret = std::fwrite(ptr, len, 1, fp);
|
|
||||||
if (ret != 1) {
|
|
||||||
throw std::runtime_error(format("write error: %s", strerror(errno)));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
void write_u32(uint32_t val) const {
|
|
||||||
write_raw(&val, sizeof(val));
|
|
||||||
}
|
|
||||||
|
|
||||||
~impl() {
|
|
||||||
if (fp) {
|
|
||||||
std::fclose(fp);
|
|
||||||
} else if (fd != -1) {
|
|
||||||
close(fd);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
int fd = -1;
|
|
||||||
|
|
||||||
#else
|
|
||||||
impl(const char * fname, const char * mode) {
|
|
||||||
fp = ggml_fopen(fname, mode);
|
|
||||||
if (fp == NULL) {
|
|
||||||
throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno)));
|
|
||||||
}
|
|
||||||
seek(0, SEEK_END);
|
|
||||||
size = tell();
|
|
||||||
seek(0, SEEK_SET);
|
|
||||||
}
|
|
||||||
|
|
||||||
size_t tell() const {
|
|
||||||
// TODO: this ifdef is never true?
|
|
||||||
#ifdef _WIN32
|
|
||||||
__int64 ret = _ftelli64(fp);
|
|
||||||
#else
|
|
||||||
long ret = std::ftell(fp);
|
|
||||||
#endif
|
|
||||||
if (ret == -1) {
|
|
||||||
throw std::runtime_error(format("ftell error: %s", strerror(errno)));
|
|
||||||
}
|
|
||||||
|
|
||||||
return (size_t) ret;
|
|
||||||
}
|
|
||||||
|
|
||||||
void seek(size_t offset, int whence) const {
|
|
||||||
// TODO: this ifdef is never true?
|
|
||||||
#ifdef _WIN32
|
|
||||||
int ret = _fseeki64(fp, (__int64) offset, whence);
|
|
||||||
#else
|
|
||||||
int ret = std::fseek(fp, (long) offset, whence);
|
|
||||||
#endif
|
|
||||||
if (ret != 0) {
|
if (ret != 0) {
|
||||||
throw std::runtime_error(format("seek error: %s", strerror(errno)));
|
throw std::runtime_error(format("posix_memalign failed with error %d", ret));
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
void read_raw(void * ptr, size_t len) const {
|
struct aligned_buffer_deleter {
|
||||||
if (len == 0) {
|
void operator()(void * p) const { free(p); }
|
||||||
return;
|
};
|
||||||
}
|
std::unique_ptr<void, aligned_buffer_deleter> buffer(raw_buffer);
|
||||||
errno = 0;
|
|
||||||
std::size_t ret = std::fread(ptr, len, 1, fp);
|
seek(aligned_offset, SEEK_SET);
|
||||||
if (ferror(fp)) {
|
read_raw(buffer.get(), bytes_to_read);
|
||||||
throw std::runtime_error(format("read error: %s", strerror(errno)));
|
|
||||||
}
|
uintptr_t actual_data = reinterpret_cast<uintptr_t>(buffer.get()) + offset_from_alignment;
|
||||||
if (ret != 1) {
|
memcpy(dest, reinterpret_cast<void *>(actual_data), size);
|
||||||
throw std::runtime_error("unexpectedly reached end of file");
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
uint32_t read_u32() const {
|
uint32_t read_u32() const {
|
||||||
|
|
@ -358,26 +306,33 @@ struct llama_file::impl {
|
||||||
write_raw(&val, sizeof(val));
|
write_raw(&val, sizeof(val));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bool has_direct_io() const {
|
||||||
|
return fd != -1;
|
||||||
|
}
|
||||||
|
|
||||||
~impl() {
|
~impl() {
|
||||||
if (fp) {
|
if (fd != -1) {
|
||||||
|
close(fd);
|
||||||
|
} else {
|
||||||
std::fclose(fp);
|
std::fclose(fp);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
int fd = -1;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
FILE * fp{};
|
FILE * fp{};
|
||||||
size_t size{};
|
size_t size{};
|
||||||
};
|
};
|
||||||
|
|
||||||
llama_file::llama_file(const char * fname, const char * mode) : pimpl(std::make_unique<impl>(fname, mode)) {}
|
llama_file::llama_file(const char * fname, const char * mode, const bool use_direct_io) :
|
||||||
#if defined(__linux__)
|
pimpl(std::make_unique<impl>(fname, mode, use_direct_io)) {}
|
||||||
llama_file::llama_file(const char * fname, const char * mode, bool uncached_read) : pimpl(std::make_unique<impl>(fname, mode, uncached_read)) {}
|
|
||||||
#endif
|
|
||||||
llama_file::~llama_file() = default;
|
llama_file::~llama_file() = default;
|
||||||
|
|
||||||
size_t llama_file::tell() const { return pimpl->tell(); }
|
size_t llama_file::tell() const { return pimpl->tell(); }
|
||||||
size_t llama_file::size() const { return pimpl->size; }
|
size_t llama_file::size() const { return pimpl->size; }
|
||||||
|
|
||||||
|
bool llama_file::has_direct_io() const { return pimpl->has_direct_io(); }
|
||||||
|
|
||||||
int llama_file::file_id() const {
|
int llama_file::file_id() const {
|
||||||
#ifdef _WIN32
|
#ifdef _WIN32
|
||||||
return _fileno(pimpl->fp);
|
return _fileno(pimpl->fp);
|
||||||
|
|
@ -392,6 +347,9 @@ int llama_file::file_id() const {
|
||||||
|
|
||||||
void llama_file::seek(size_t offset, int whence) const { pimpl->seek(offset, whence); }
|
void llama_file::seek(size_t offset, int whence) const { pimpl->seek(offset, whence); }
|
||||||
void llama_file::read_raw(void * ptr, size_t len) const { pimpl->read_raw(ptr, len); }
|
void llama_file::read_raw(void * ptr, size_t len) const { pimpl->read_raw(ptr, len); }
|
||||||
|
void llama_file::read_aligned_chunk(size_t offset, void * dest, size_t size, size_t alignment) const
|
||||||
|
{ pimpl->read_aligned_chunk(offset, dest, size, alignment); }
|
||||||
|
|
||||||
|
|
||||||
uint32_t llama_file::read_u32() const { return pimpl->read_u32(); }
|
uint32_t llama_file::read_u32() const { return pimpl->read_u32(); }
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -13,10 +13,7 @@ using llama_mmaps = std::vector<std::unique_ptr<llama_mmap>>;
|
||||||
using llama_mlocks = std::vector<std::unique_ptr<llama_mlock>>;
|
using llama_mlocks = std::vector<std::unique_ptr<llama_mlock>>;
|
||||||
|
|
||||||
struct llama_file {
|
struct llama_file {
|
||||||
llama_file(const char * fname, const char * mode);
|
llama_file(const char * fname, const char * mode, bool use_direct_io = false);
|
||||||
#if defined(__linux__)
|
|
||||||
llama_file(const char * fname, const char * mode, bool uncached_read);
|
|
||||||
#endif
|
|
||||||
~llama_file();
|
~llama_file();
|
||||||
|
|
||||||
size_t tell() const;
|
size_t tell() const;
|
||||||
|
|
@ -27,11 +24,13 @@ struct llama_file {
|
||||||
void seek(size_t offset, int whence) const;
|
void seek(size_t offset, int whence) const;
|
||||||
|
|
||||||
void read_raw(void * ptr, size_t len) const;
|
void read_raw(void * ptr, size_t len) const;
|
||||||
|
void read_aligned_chunk(size_t offset, void * dest, size_t size, size_t alignment) const;
|
||||||
uint32_t read_u32() const;
|
uint32_t read_u32() const;
|
||||||
|
|
||||||
void write_raw(const void * ptr, size_t len) const;
|
void write_raw(const void * ptr, size_t len) const;
|
||||||
void write_u32(uint32_t val) const;
|
void write_u32(uint32_t val) const;
|
||||||
|
|
||||||
|
bool has_direct_io() const;
|
||||||
private:
|
private:
|
||||||
struct impl;
|
struct impl;
|
||||||
std::unique_ptr<impl> pimpl;
|
std::unique_ptr<impl> pimpl;
|
||||||
|
|
|
||||||
|
|
@ -503,11 +503,7 @@ llama_model_loader::llama_model_loader(
|
||||||
get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false);
|
get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false);
|
||||||
llm_kv = LLM_KV(llm_arch_from_string(arch_name));
|
llm_kv = LLM_KV(llm_arch_from_string(arch_name));
|
||||||
|
|
||||||
#if defined(__linux__)
|
|
||||||
files.emplace_back(new llama_file(fname.c_str(), "rb", !use_mmap));
|
files.emplace_back(new llama_file(fname.c_str(), "rb", !use_mmap));
|
||||||
#else
|
|
||||||
files.emplace_back(new llama_file(fname.c_str(), "rb"));
|
|
||||||
#endif
|
|
||||||
contexts.emplace_back(ctx);
|
contexts.emplace_back(ctx);
|
||||||
|
|
||||||
// Save tensors data offset of the main file.
|
// Save tensors data offset of the main file.
|
||||||
|
|
@ -575,11 +571,7 @@ llama_model_loader::llama_model_loader(
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#if defined(__linux__)
|
|
||||||
files.emplace_back(new llama_file(fname_split, "rb", !use_mmap));
|
|
||||||
#else
|
|
||||||
files.emplace_back(new llama_file(fname_split, "rb"));
|
files.emplace_back(new llama_file(fname_split, "rb"));
|
||||||
#endif
|
|
||||||
contexts.emplace_back(ctx);
|
contexts.emplace_back(ctx);
|
||||||
|
|
||||||
// Save tensors data offset info of the shard.
|
// Save tensors data offset info of the shard.
|
||||||
|
|
@ -941,14 +933,17 @@ bool llama_model_loader::load_all_data(
|
||||||
// 4 staging buffers for async uploads, each sized 1MB seems to be a good default for single NVMe drives.
|
// 4 staging buffers for async uploads, each sized 1MB seems to be a good default for single NVMe drives.
|
||||||
// NVMe raid configurations might require more / larger buffers.
|
// NVMe raid configurations might require more / larger buffers.
|
||||||
constexpr size_t n_buffers = 4;
|
constexpr size_t n_buffers = 4;
|
||||||
#if defined(__linux__)
|
|
||||||
constexpr size_t alignment = 4 * 1024; // 4 KiB for Direct I/O
|
|
||||||
|
bool direct_io = false;
|
||||||
|
for (const auto& file : files) {
|
||||||
|
direct_io |= file->has_direct_io();
|
||||||
|
}
|
||||||
|
|
||||||
|
constexpr size_t alignment = 4 * 1024; // 4 KB for Direct I/O
|
||||||
// Buffer size: balance between memory usage and I/O efficiency
|
// Buffer size: balance between memory usage and I/O efficiency
|
||||||
// 64MB works well for NVMe drives
|
// 64MB works well for NVMe drives
|
||||||
constexpr size_t buffer_size = 64 * 1024 * 1024; // 64 MiB
|
const size_t buffer_size = direct_io ? 64 * 1024 * 1024 + 2 * alignment : 1 * 1024 * 1024;
|
||||||
#else
|
|
||||||
constexpr size_t buffer_size = 1 * 1024 * 1024; // 1MB
|
|
||||||
#endif
|
|
||||||
|
|
||||||
std::vector<ggml_backend_buffer_t> host_buffers;
|
std::vector<ggml_backend_buffer_t> host_buffers;
|
||||||
std::vector<ggml_backend_event_t> events;
|
std::vector<ggml_backend_event_t> events;
|
||||||
|
|
@ -997,11 +992,8 @@ bool llama_model_loader::load_all_data(
|
||||||
|
|
||||||
// If the backend is supported, create pinned memory buffers and events for synchronisation.
|
// If the backend is supported, create pinned memory buffers and events for synchronisation.
|
||||||
for (size_t idx = 0; idx < n_buffers; ++idx) {
|
for (size_t idx = 0; idx < n_buffers; ++idx) {
|
||||||
#if defined(__linux__)
|
|
||||||
auto * buf = ggml_backend_buft_alloc_buffer(host_buft, buffer_size + 2 * alignment);
|
|
||||||
#else
|
|
||||||
auto * buf = ggml_backend_buft_alloc_buffer(host_buft, buffer_size);
|
auto * buf = ggml_backend_buft_alloc_buffer(host_buft, buffer_size);
|
||||||
#endif
|
|
||||||
if (!buf) {
|
if (!buf) {
|
||||||
LLAMA_LOG_DEBUG("%s: failed to allocate host buffer for async uploads for device %s\n", func,
|
LLAMA_LOG_DEBUG("%s: failed to allocate host buffer for async uploads for device %s\n", func,
|
||||||
ggml_backend_dev_name(dev));
|
ggml_backend_dev_name(dev));
|
||||||
|
|
@ -1038,35 +1030,6 @@ bool llama_model_loader::load_all_data(
|
||||||
ggml_backend_name(upload_backend));
|
ggml_backend_name(upload_backend));
|
||||||
}
|
}
|
||||||
|
|
||||||
#if defined(__linux__)
|
|
||||||
auto read_aligned_chunk = [](const llama_file * file,
|
|
||||||
size_t offset,
|
|
||||||
void * dest,
|
|
||||||
size_t size,
|
|
||||||
size_t alignment) {
|
|
||||||
off_t aligned_offset = offset & ~(alignment - 1);
|
|
||||||
off_t offset_from_alignment = offset - aligned_offset;
|
|
||||||
size_t bytes_to_read = (offset_from_alignment + size + alignment - 1) & ~(alignment - 1);
|
|
||||||
|
|
||||||
void * raw_buffer = nullptr;
|
|
||||||
int ret = posix_memalign(&raw_buffer, alignment, bytes_to_read);
|
|
||||||
if (ret != 0) {
|
|
||||||
throw std::runtime_error(format("posix_memalign failed with error %d", ret));
|
|
||||||
}
|
|
||||||
|
|
||||||
struct aligned_buffer_deleter {
|
|
||||||
void operator()(void * p) const { free(p); }
|
|
||||||
};
|
|
||||||
std::unique_ptr<void, aligned_buffer_deleter> buffer(raw_buffer);
|
|
||||||
|
|
||||||
file->seek(aligned_offset, SEEK_SET);
|
|
||||||
file->read_raw(buffer.get(), bytes_to_read);
|
|
||||||
|
|
||||||
uintptr_t actual_data = reinterpret_cast<uintptr_t>(buffer.get()) + offset_from_alignment;
|
|
||||||
memcpy(dest, reinterpret_cast<void *>(actual_data), size);
|
|
||||||
};
|
|
||||||
#endif
|
|
||||||
|
|
||||||
for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx); cur != NULL; cur = ggml_get_next_tensor(ctx, cur)) {
|
for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx); cur != NULL; cur = ggml_get_next_tensor(ctx, cur)) {
|
||||||
const auto * weight = get_weight(ggml_get_name(cur));
|
const auto * weight = get_weight(ggml_get_name(cur));
|
||||||
if (weight == nullptr) {
|
if (weight == nullptr) {
|
||||||
|
|
@ -1112,100 +1075,97 @@ bool llama_model_loader::load_all_data(
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
const auto & file = files.at(weight->idx);
|
const auto & file = files.at(weight->idx);
|
||||||
#if defined(__linux__)
|
|
||||||
auto offset = (off_t) weight->offs;
|
|
||||||
off_t aligned_offset = offset & ~(alignment - 1);
|
|
||||||
off_t offset_from_alignment = offset - aligned_offset;
|
|
||||||
#endif
|
|
||||||
if (ggml_backend_buffer_is_host(cur->buffer)) {
|
if (ggml_backend_buffer_is_host(cur->buffer)) {
|
||||||
#if defined(__linux__)
|
if (file->has_direct_io()) {
|
||||||
read_aligned_chunk(file.get(), weight->offs, cur->data, n_size, alignment);
|
file->read_aligned_chunk(weight->offs, cur->data, n_size, alignment);
|
||||||
#else
|
} else {
|
||||||
file->seek(weight->offs, SEEK_SET);
|
file->seek(weight->offs, SEEK_SET);
|
||||||
file->read_raw(cur->data, n_size);
|
file->read_raw(cur->data, n_size);
|
||||||
#endif
|
}
|
||||||
if (check_tensors) {
|
if (check_tensors) {
|
||||||
validation_result.emplace_back(std::async(std::launch::async, [cur, n_size] {
|
validation_result.emplace_back(std::async(std::launch::async, [cur, n_size] {
|
||||||
return std::make_pair(cur, ggml_validate_row_data(cur->type, cur->data, n_size));
|
return std::make_pair(cur, ggml_validate_row_data(cur->type, cur->data, n_size));
|
||||||
}));
|
}));
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
|
file->seek(weight->offs, SEEK_SET);
|
||||||
// If upload_backend is valid load the tensor in chunks to pinned memory and upload the buffers asynchronously to the GPU.
|
// If upload_backend is valid load the tensor in chunks to pinned memory and upload the buffers asynchronously to the GPU.
|
||||||
if (upload_backend) {
|
if (upload_backend) {
|
||||||
#if defined(__linux__)
|
if (file->has_direct_io()) {
|
||||||
// Calculate aligned read boundaries
|
auto offset = (off_t) weight->offs;
|
||||||
size_t read_start = aligned_offset;
|
off_t aligned_offset = offset & ~(alignment - 1);
|
||||||
size_t read_end = (offset + n_size + alignment - 1) & ~(alignment - 1);
|
off_t offset_from_alignment = offset - aligned_offset;
|
||||||
|
|
||||||
size_t bytes_read = 0;
|
// Calculate aligned read boundaries
|
||||||
size_t data_read = 0; // Actual tensor data copied (excluding padding)
|
size_t read_start = aligned_offset;
|
||||||
|
size_t read_end = (offset + n_size + alignment - 1) & ~(alignment - 1);
|
||||||
|
|
||||||
file->seek(aligned_offset, SEEK_SET);
|
size_t bytes_read = 0;
|
||||||
|
size_t data_read = 0; // Actual tensor data copied (excluding padding)
|
||||||
|
|
||||||
while (bytes_read < read_end - read_start) {
|
while (bytes_read < read_end - read_start) {
|
||||||
size_t read_size = std::min<size_t>(buffer_size, read_end - read_start - bytes_read);
|
size_t read_size = std::min<size_t>(buffer_size, read_end - read_start - bytes_read);
|
||||||
|
|
||||||
// Align the destination pointer within the pinned buffer
|
// Align the destination pointer within the pinned buffer
|
||||||
uintptr_t ptr_dest_aligned = (reinterpret_cast<uintptr_t>(host_ptrs[buffer_idx]) + alignment - 1) & ~(alignment - 1);
|
uintptr_t ptr_dest_aligned = (reinterpret_cast<uintptr_t>(host_ptrs[buffer_idx]) + alignment - 1) & ~(alignment - 1);
|
||||||
|
|
||||||
// Wait for previous upload to complete before reusing buffer
|
// Wait for previous upload to complete before reusing buffer
|
||||||
ggml_backend_event_synchronize(events[buffer_idx]);
|
ggml_backend_event_synchronize(events[buffer_idx]);
|
||||||
|
|
||||||
// Read aligned chunk from file
|
// Read aligned chunk from file
|
||||||
file->read_raw(reinterpret_cast<void *>(ptr_dest_aligned), read_size);
|
file->read_raw(reinterpret_cast<void *>(ptr_dest_aligned), read_size);
|
||||||
|
|
||||||
// Calculate actual data portion (excluding alignment padding)
|
// Calculate actual data portion (excluding alignment padding)
|
||||||
uintptr_t ptr_data = ptr_dest_aligned;
|
uintptr_t ptr_data = ptr_dest_aligned;
|
||||||
size_t data_to_copy = read_size;
|
size_t data_to_copy = read_size;
|
||||||
|
|
||||||
// Skip alignment padding at start of first chunk
|
// Skip alignment padding at start of first chunk
|
||||||
if (bytes_read == 0) {
|
if (bytes_read == 0) {
|
||||||
ptr_data += offset_from_alignment;
|
ptr_data += offset_from_alignment;
|
||||||
data_to_copy -= offset_from_alignment;
|
data_to_copy -= offset_from_alignment;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Trim alignment padding at end of last chunk
|
||||||
|
if (aligned_offset + bytes_read + read_size > offset + n_size) {
|
||||||
|
data_to_copy -= (read_end - (offset + n_size));
|
||||||
|
}
|
||||||
|
|
||||||
|
// Async upload actual data to GPU
|
||||||
|
ggml_backend_tensor_set_async(upload_backend, cur,
|
||||||
|
reinterpret_cast<void *>(ptr_data), data_read, data_to_copy);
|
||||||
|
ggml_backend_event_record(events[buffer_idx], upload_backend);
|
||||||
|
|
||||||
|
data_read += data_to_copy;
|
||||||
|
bytes_read += read_size;
|
||||||
|
|
||||||
|
++buffer_idx;
|
||||||
|
buffer_idx %= n_buffers;
|
||||||
}
|
}
|
||||||
|
} else {
|
||||||
|
size_t bytes_read = 0;
|
||||||
|
|
||||||
// Trim alignment padding at end of last chunk
|
while (bytes_read < n_size) {
|
||||||
if (aligned_offset + bytes_read + read_size > offset + n_size) {
|
size_t read_iteration = std::min<size_t>(buffer_size, n_size - bytes_read);
|
||||||
data_to_copy -= (read_end - (offset + n_size));
|
|
||||||
|
ggml_backend_event_synchronize(events[buffer_idx]);
|
||||||
|
file->read_raw(host_ptrs[buffer_idx], read_iteration);
|
||||||
|
ggml_backend_tensor_set_async(upload_backend, cur, host_ptrs[buffer_idx], bytes_read, read_iteration);
|
||||||
|
ggml_backend_event_record(events[buffer_idx], upload_backend);
|
||||||
|
|
||||||
|
bytes_read += read_iteration;
|
||||||
|
++buffer_idx;
|
||||||
|
buffer_idx %= n_buffers;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Async upload actual data to GPU
|
|
||||||
ggml_backend_tensor_set_async(upload_backend, cur,
|
|
||||||
reinterpret_cast<void *>(ptr_data), data_read, data_to_copy);
|
|
||||||
ggml_backend_event_record(events[buffer_idx], upload_backend);
|
|
||||||
|
|
||||||
data_read += data_to_copy;
|
|
||||||
bytes_read += read_size;
|
|
||||||
|
|
||||||
++buffer_idx;
|
|
||||||
buffer_idx %= n_buffers;
|
|
||||||
}
|
}
|
||||||
#else
|
|
||||||
file->seek(weight->offs, SEEK_SET);
|
|
||||||
|
|
||||||
size_t bytes_read = 0;
|
|
||||||
|
|
||||||
while (bytes_read < n_size) {
|
|
||||||
size_t read_iteration = std::min<size_t>(buffer_size, n_size - bytes_read);
|
|
||||||
|
|
||||||
ggml_backend_event_synchronize(events[buffer_idx]);
|
|
||||||
file->read_raw(host_ptrs[buffer_idx], read_iteration);
|
|
||||||
ggml_backend_tensor_set_async(upload_backend, cur, host_ptrs[buffer_idx], bytes_read, read_iteration);
|
|
||||||
ggml_backend_event_record(events[buffer_idx], upload_backend);
|
|
||||||
|
|
||||||
bytes_read += read_iteration;
|
|
||||||
++buffer_idx;
|
|
||||||
buffer_idx %= n_buffers;
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
} else {
|
} else {
|
||||||
read_buf.resize(n_size);
|
read_buf.resize(n_size);
|
||||||
#if defined(__linux__)
|
if (file->has_direct_io()) {
|
||||||
read_aligned_chunk(file.get(), weight->offs, read_buf.data(), n_size, alignment);
|
file->read_aligned_chunk(weight->offs, read_buf.data(), n_size, alignment);
|
||||||
#else
|
} else {
|
||||||
file->seek(weight->offs, SEEK_SET);
|
file->seek(weight->offs, SEEK_SET);
|
||||||
file->read_raw(read_buf.data(), n_size);
|
file->read_raw(read_buf.data(), n_size);
|
||||||
#endif
|
}
|
||||||
ggml_backend_tensor_set(cur, read_buf.data(), 0, n_size);
|
ggml_backend_tensor_set(cur, read_buf.data(), 0, n_size);
|
||||||
if (check_tensors && !ggml_validate_row_data(cur->type, read_buf.data(), n_size)) {
|
if (check_tensors && !ggml_validate_row_data(cur->type, read_buf.data(), n_size)) {
|
||||||
throw std::runtime_error(format("tensor '%s' has invalid data", ggml_get_name(cur)));
|
throw std::runtime_error(format("tensor '%s' has invalid data", ggml_get_name(cur)));
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue