From 2038101bd9b1dcf45b5410b969fbc5206e25d993 Mon Sep 17 00:00:00 2001 From: Julius Tischbein Date: Thu, 8 Jan 2026 07:35:30 +0100 Subject: [PATCH] llama : add `use_direct_io` flag for model loading (#18166) * Adding --direct-io flag for model loading * Fixing read_raw() calls * Fixing Windows read_raw_at * Changing type off_t to size_t for windows and Renaming functions * disable direct io when mmap is explicitly enabled * Use read_raw_unsafe when upload_backend is available, not functional on some devices with Vulkan and SYCL * Fallback to std::fread in case O_DIRECT fails due to bad address * Windows: remove const keywords and unused functions * Update src/llama-mmap.cpp Co-authored-by: Georgi Gerganov --------- Co-authored-by: jtischbein Co-authored-by: Georgi Gerganov --- common/arg.cpp | 13 +++- common/common.cpp | 1 + common/common.h | 3 +- examples/diffusion/diffusion-cli.cpp | 1 + include/llama.h | 1 + src/llama-mmap.cpp | 111 +++++++++++++++++---------- src/llama-mmap.h | 9 ++- src/llama-model-loader.cpp | 22 ++++-- src/llama-model-loader.h | 2 + src/llama-model.cpp | 4 +- src/llama-quant.cpp | 2 +- src/llama.cpp | 2 +- 12 files changed, 118 insertions(+), 53 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index e7966d9d5c..26c790c7e0 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -2088,11 +2088,22 @@ common_params_context common_params_parser_init(common_params & params, llama_ex add_opt(common_arg( {"--mmap"}, {"--no-mmap"}, - string_format("whether to memory-map model (if disabled, slower load but may reduce pageouts if not using mlock) (default: %s)", params.use_mmap ? "enabled" : "disabled"), + string_format("whether to memory-map model. Explicitly enabling mmap disables direct-io. (if mmap disabled, slower load but may reduce pageouts if not using mlock) (default: %s)", params.use_mmap ? "enabled" : "disabled"), [](common_params & params, bool value) { params.use_mmap = value; + if (value) { + params.use_direct_io = false; // disable direct io when mmap is explicitly enabled + } } ).set_env("LLAMA_ARG_MMAP")); + add_opt(common_arg( + {"-dio", "--direct-io"}, + {"-ndio", "--no-direct-io"}, + string_format("use DirectIO if available. Takes precedence over --mmap (default: %s)", params.use_direct_io ? "enabled" : "disabled"), + [](common_params & params, bool value) { + params.use_direct_io = value; + } + ).set_env("LLAMA_ARG_DIO")); add_opt(common_arg( {"--numa"}, "TYPE", "attempt optimizations that help on some NUMA systems\n" diff --git a/common/common.cpp b/common/common.cpp index 41b2b6833e..34fa3b5a42 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1366,6 +1366,7 @@ struct llama_model_params common_model_params_to_llama(common_params & params) { mparams.split_mode = params.split_mode; mparams.tensor_split = params.tensor_split; mparams.use_mmap = params.use_mmap; + mparams.use_direct_io = params.use_direct_io; mparams.use_mlock = params.use_mlock; mparams.check_tensors = params.check_tensors; mparams.use_extra_bufts = !params.no_extra_bufts; diff --git a/common/common.h b/common/common.h index d6fd0d37a9..d55a6b71fb 100644 --- a/common/common.h +++ b/common/common.h @@ -428,7 +428,8 @@ struct common_params { bool kv_unified = false; // enable unified KV cache bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix - bool use_mmap = true; // use mmap for faster loads + bool use_mmap = true; // enable mmap to use filesystem cache + bool use_direct_io = true; // read from disk without buffering for faster model loading bool use_mlock = false; // use mlock to keep model in memory bool verbose_prompt = false; // print prompt tokens before generation bool display_prompt = true; // print prompt before generation diff --git a/examples/diffusion/diffusion-cli.cpp b/examples/diffusion/diffusion-cli.cpp index 273942a165..d50f754092 100644 --- a/examples/diffusion/diffusion-cli.cpp +++ b/examples/diffusion/diffusion-cli.cpp @@ -553,6 +553,7 @@ int main(int argc, char ** argv) { model_params.n_gpu_layers = params.n_gpu_layers; model_params.devices = params.devices.data(); model_params.use_mmap = params.use_mmap; + model_params.use_direct_io = params.use_direct_io; model_params.use_mlock = params.use_mlock; model_params.check_tensors = params.check_tensors; diff --git a/include/llama.h b/include/llama.h index 05cb653254..edc4c871a1 100644 --- a/include/llama.h +++ b/include/llama.h @@ -309,6 +309,7 @@ extern "C" { // Keep the booleans together to avoid misalignment during copy-by-value. bool vocab_only; // only load the vocabulary, no weights bool use_mmap; // use mmap if possible + bool use_direct_io; // use direct io, takes precedence over use_mmap bool use_mlock; // force system to keep model in RAM bool check_tensors; // validate model tensor data bool use_extra_bufts; // use extra buffer types (used for weight repacking) diff --git a/src/llama-mmap.cpp b/src/llama-mmap.cpp index 232005e140..2da857b3aa 100644 --- a/src/llama-mmap.cpp +++ b/src/llama-mmap.cpp @@ -110,7 +110,7 @@ struct llama_file::impl { } } - void read_raw(void * ptr, size_t len) const { + void read_raw(void * ptr, size_t len) { size_t bytes_read = 0; while (bytes_read < len) { size_t chunk_size = std::min(len - bytes_read, 64*1024*1024); @@ -127,7 +127,7 @@ struct llama_file::impl { } } - uint32_t read_u32() const { + uint32_t read_u32() { uint32_t val; read_raw(&val, sizeof(val)); return val; @@ -154,8 +154,8 @@ struct llama_file::impl { write_raw(&val, sizeof(val)); } - void read_aligned_chunk(size_t offset, void * dest, size_t size) const { - throw std::runtime_error("DirectIO is not implemented on Windows."); + bool has_direct_io() const { + return true; } ~impl() { @@ -164,33 +164,45 @@ struct llama_file::impl { } } #else - impl(const char * fname, const char * mode, [[maybe_unused]] const bool use_direct_io = false) { + impl(const char * fname, const char * mode, [[maybe_unused]] const bool use_direct_io = false) : fname(fname) { #ifdef __linux__ // Try unbuffered I/O for read only if (use_direct_io && std::strcmp(mode, "rb") == 0) { - fd = open(fname, O_RDONLY | O_DIRECT); - - if (fd != -1) { - struct stat file_stats{}; - fstat(fd, &file_stats); - - size = file_stats.st_size; - alignment = file_stats.st_blksize; - - off_t ret = lseek(fd, 0, SEEK_SET); - if (ret == -1) { - throw std::runtime_error(format("seek error: %s", strerror(errno))); - } + if (init_fd()) { return; } - - LLAMA_LOG_WARN("Failed to open model %s with error: %s. Falling back to buffered I/O", - fname, strerror(errno)); + LLAMA_LOG_WARN("Failed to open file '%s' with error: %s. Falling back to buffered I/O", + fname, strerror(errno)); } #endif - fp = ggml_fopen(fname, mode); + init_fp(mode); + } + +#ifdef __linux__ + bool init_fd() { + fd = open(fname.c_str(), O_RDONLY | O_DIRECT); + + if (fd != -1) { + struct stat file_stats{}; + fstat(fd, &file_stats); + + size = file_stats.st_size; + alignment = file_stats.st_blksize; + + off_t ret = lseek(fd, 0, SEEK_SET); + if (ret == -1) { + throw std::runtime_error(format("seek error: %s", strerror(errno))); + } + return true; + } + return false; + } +#endif + + void init_fp(const char * mode) { + fp = ggml_fopen(fname.c_str(), mode); if (fp == NULL) { - throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno))); + throw std::runtime_error(format("failed to open %s: %s", fname.c_str(), strerror(errno))); } seek(0, SEEK_END); size = tell(); @@ -226,7 +238,7 @@ struct llama_file::impl { } } - void read_raw(void * ptr, size_t len) const { + void read_raw_unsafe(void * ptr, size_t len) { if (len == 0) { return; } @@ -249,6 +261,17 @@ struct llama_file::impl { if (errno == EINTR) { continue; // Interrupted by signal, retry } + // Fallback to std::fread in case the DMA controller cannot access the buffer + if (errno == EFAULT) { + auto curr_off = tell(); + close(fd); + fd = -1; + alignment = 1; + init_fp("rb"); + seek(curr_off, SEEK_SET); + read_raw_unsafe(ptr, len); + return; + } throw std::runtime_error(format("read error: %s", strerror(errno))); } if (ret == 0) { @@ -266,7 +289,8 @@ struct llama_file::impl { } } - void read_aligned_chunk(size_t offset, void * dest, size_t size) const { + void read_aligned_chunk(void * dest, size_t size) { + size_t offset = tell(); off_t aligned_offset = offset & ~(alignment - 1); off_t offset_from_alignment = offset - aligned_offset; size_t bytes_to_read = (offset_from_alignment + size + alignment - 1) & ~(alignment - 1); @@ -283,13 +307,21 @@ struct llama_file::impl { std::unique_ptr buffer(raw_buffer); seek(aligned_offset, SEEK_SET); - read_raw(buffer.get(), bytes_to_read); + read_raw_unsafe(buffer.get(), bytes_to_read); uintptr_t actual_data = reinterpret_cast(buffer.get()) + offset_from_alignment; memcpy(dest, reinterpret_cast(actual_data), size); } - uint32_t read_u32() const { + void read_raw(void * ptr, size_t len) { + if (has_direct_io()) { + read_aligned_chunk(ptr, len); + } else { + read_raw_unsafe(ptr, len); + } + } + + uint32_t read_u32() { uint32_t ret; read_raw(&ret, sizeof(ret)); return ret; @@ -310,6 +342,10 @@ struct llama_file::impl { write_raw(&val, sizeof(val)); } + bool has_direct_io() const { + return fd != -1 && alignment > 1; + } + ~impl() { if (fd != -1) { close(fd); @@ -318,17 +354,9 @@ struct llama_file::impl { } } int fd = -1; + std::string fname; #endif - void read_raw_at(void * ptr, size_t len, size_t offset) const { - if (alignment != 1) { - read_aligned_chunk(offset, ptr, len); - } else { - seek(offset, SEEK_SET); - read_raw(ptr, len); - } - } - size_t read_alignment() const { return alignment; } @@ -347,6 +375,7 @@ size_t llama_file::tell() const { return pimpl->tell(); } size_t llama_file::size() const { return pimpl->size; } size_t llama_file::read_alignment() const { return pimpl->read_alignment(); } +bool llama_file::has_direct_io() const { return pimpl->has_direct_io(); } int llama_file::file_id() const { #ifdef _WIN32 @@ -361,10 +390,14 @@ int llama_file::file_id() const { } void llama_file::seek(size_t offset, int whence) const { pimpl->seek(offset, whence); } -void llama_file::read_raw(void * ptr, size_t len) const { pimpl->read_raw(ptr, len); } -void llama_file::read_raw_at(void * ptr, size_t len, size_t offset) const { pimpl->read_raw_at(ptr, len, offset); } +void llama_file::read_raw(void * ptr, size_t len) { pimpl->read_raw(ptr, len); } +#ifdef _WIN32 +void llama_file::read_raw_unsafe(void * ptr, size_t len) { pimpl->read_raw(ptr, len); } +#else +void llama_file::read_raw_unsafe(void * ptr, size_t len) { pimpl->read_raw_unsafe(ptr, len); } +#endif -uint32_t llama_file::read_u32() const { return pimpl->read_u32(); } +uint32_t llama_file::read_u32() { return pimpl->read_u32(); } void llama_file::write_raw(const void * ptr, size_t len) const { pimpl->write_raw(ptr, len); } void llama_file::write_u32(uint32_t val) const { pimpl->write_u32(val); } diff --git a/src/llama-mmap.h b/src/llama-mmap.h index 729aac164b..29ce4d2468 100644 --- a/src/llama-mmap.h +++ b/src/llama-mmap.h @@ -24,15 +24,16 @@ struct llama_file { void seek(size_t offset, int whence) const; - void read_raw(void * ptr, size_t len) const; - void read_raw_at(void * ptr, size_t len, size_t offset) const; - void read_aligned_chunk(size_t offset, void * dest, size_t size) const; - uint32_t read_u32() const; + void read_raw(void * ptr, size_t len); + void read_raw_unsafe(void * ptr, size_t len); + void read_aligned_chunk(void * dest, size_t size); + uint32_t read_u32(); void write_raw(const void * ptr, size_t len) const; void write_u32(uint32_t val) const; size_t read_alignment() const; + bool has_direct_io() const; private: struct impl; std::unique_ptr pimpl; diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp index 5003b4fbf5..e66febaa02 100644 --- a/src/llama-model-loader.cpp +++ b/src/llama-model-loader.cpp @@ -495,6 +495,7 @@ llama_model_loader::llama_model_loader( const std::string & fname, std::vector & splits, bool use_mmap, + bool use_direct_io, bool check_tensors, bool no_alloc, const llama_model_kv_override * param_overrides_p, @@ -527,9 +528,17 @@ llama_model_loader::llama_model_loader( get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false); llm_kv = LLM_KV(llm_arch_from_string(arch_name)); - files.emplace_back(new llama_file(fname.c_str(), "rb", !use_mmap)); + files.emplace_back(new llama_file(fname.c_str(), "rb", use_direct_io)); contexts.emplace_back(ctx); + use_direct_io = use_direct_io && files.back()->has_direct_io(); + + // Disable mmap in case Direct I/O is enabled and available + if (use_direct_io && use_mmap) { + use_mmap = false; + LLAMA_LOG_WARN("%s: direct I/O is enabled, disabling mmap\n", __func__); + } + // Save tensors data offset of the main file. // For subsidiary files, `meta` tensor data offset must not be used, // so we build a unified tensors index for weights. @@ -595,7 +604,7 @@ llama_model_loader::llama_model_loader( } } - files.emplace_back(new llama_file(fname_split, "rb", !use_mmap)); + files.emplace_back(new llama_file(fname_split, "rb", use_direct_io)); contexts.emplace_back(ctx); // Save tensors data offset info of the shard. @@ -739,6 +748,7 @@ llama_model_loader::llama_model_loader( } this->use_mmap = use_mmap; + this->use_direct_io = use_direct_io; this->check_tensors = check_tensors; this->no_alloc = no_alloc; } @@ -1100,7 +1110,8 @@ bool llama_model_loader::load_all_data( const auto & file = files.at(weight->idx); if (ggml_backend_buffer_is_host(cur->buffer)) { - file->read_raw_at(cur->data, n_size, weight->offs); + file->seek(weight->offs, SEEK_SET); + file->read_raw(cur->data, n_size); if (check_tensors) { validation_result.emplace_back(std::async(std::launch::async, [cur, n_size] { return std::make_pair(cur, ggml_validate_row_data(cur->type, cur->data, n_size)); @@ -1132,7 +1143,7 @@ bool llama_model_loader::load_all_data( ggml_backend_event_synchronize(events[buffer_idx]); // Read aligned chunk from file - file->read_raw(reinterpret_cast(ptr_dest_aligned), read_size); + file->read_raw_unsafe(reinterpret_cast(ptr_dest_aligned), read_size); // Calculate actual data portion (excluding alignment padding) uintptr_t ptr_data = ptr_dest_aligned; @@ -1162,7 +1173,8 @@ bool llama_model_loader::load_all_data( } } else { read_buf.resize(n_size); - file->read_raw_at(read_buf.data(), n_size, weight->offs); + file->seek(weight->offs, SEEK_SET); + file->read_raw(read_buf.data(), n_size); ggml_backend_tensor_set(cur, read_buf.data(), 0, n_size); if (check_tensors && !ggml_validate_row_data(cur->type, read_buf.data(), n_size)) { throw std::runtime_error(format("tensor '%s' has invalid data", ggml_get_name(cur))); diff --git a/src/llama-model-loader.h b/src/llama-model-loader.h index d13299ad3f..65953dd3d5 100644 --- a/src/llama-model-loader.h +++ b/src/llama-model-loader.h @@ -70,6 +70,7 @@ struct llama_model_loader { size_t n_bytes = 0; bool use_mmap = false; + bool use_direct_io = false; bool check_tensors; bool no_alloc; @@ -97,6 +98,7 @@ struct llama_model_loader { const std::string & fname, std::vector & splits, // optional, only need if the split does not follow naming scheme bool use_mmap, + bool use_direct_io, bool check_tensors, bool no_alloc, const llama_model_kv_override * param_overrides_p, diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 04c48b5fd3..7ac59846bb 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -2440,7 +2440,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) { const bool use_mmap_buffer = true; - LLAMA_LOG_INFO("%s: loading model tensors, this can take a while... (mmap = %s)\n", __func__, ml.use_mmap ? "true" : "false"); + LLAMA_LOG_INFO("%s: loading model tensors, this can take a while... (mmap = %s, direct_io = %s)\n", + __func__, ml.use_mmap ? "true" : "false", ml.use_direct_io ? "true" : "false"); // build a list of buffer types for the CPU and GPU devices pimpl->cpu_buft_list = make_cpu_buft_list(devices, params.use_extra_bufts, params.no_host); @@ -7973,6 +7974,7 @@ llama_model_params llama_model_default_params() { /*.kv_overrides =*/ nullptr, /*.vocab_only =*/ false, /*.use_mmap =*/ true, + /*.use_direct_io =*/ true, /*.use_mlock =*/ false, /*.check_tensors =*/ false, /*.use_extra_bufts =*/ true, diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index bc4b05c3b5..048d65a75c 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -596,7 +596,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: } std::vector splits = {}; - llama_model_loader ml(fname_inp, splits, use_mmap, /*check_tensors*/ true, /*no_alloc*/ false, kv_overrides, nullptr); + llama_model_loader ml(fname_inp, splits, use_mmap, /*use_direct_io*/ true, /*check_tensors*/ true, /*no_alloc*/ false, kv_overrides, nullptr); ml.init_mappings(false); // no prefetching llama_model model(llama_model_default_params()); diff --git a/src/llama.cpp b/src/llama.cpp index 0162ae8d58..dfefb3d2b5 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -794,7 +794,7 @@ static int llama_model_load(const std::string & fname, std::vector model.t_start_us = tm.t_start_us; try { - llama_model_loader ml(fname, splits, params.use_mmap, params.check_tensors, params.no_alloc, params.kv_overrides, params.tensor_buft_overrides); + llama_model_loader ml(fname, splits, params.use_mmap, params.use_direct_io, params.check_tensors, params.no_alloc, params.kv_overrides, params.tensor_buft_overrides); ml.print_info();