diff --git a/common/arg.cpp b/common/arg.cpp
index 04fd375d56..2f68bdc1c0 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -2198,18 +2198,15 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     add_opt(common_arg(
         {"--mmap"}, {"--no-mmap"},
-        string_format("whether to memory-map model. Explicitly enabling mmap disables direct-io. (if mmap disabled, slower load but may reduce pageouts if not using mlock) (default: %s)", params.use_mmap ? "enabled" : "disabled"),
+        string_format("whether to memory-map model. (if mmap disabled, slower load but may reduce pageouts if not using mlock) (default: %s)", params.use_mmap ? "enabled" : "disabled"),
         [](common_params & params, bool value) {
             params.use_mmap = value;
-            if (value) {
-                params.use_direct_io = false; // disable direct io when mmap is explicitly enabled
-            }
         }
     ).set_env("LLAMA_ARG_MMAP"));
     add_opt(common_arg(
         {"-dio", "--direct-io"},
         {"-ndio", "--no-direct-io"},
-        string_format("use DirectIO if available. Takes precedence over --mmap (default: %s)", params.use_direct_io ? "enabled" : "disabled"),
+        string_format("use DirectIO if available. (default: %s)", params.use_direct_io ? "enabled" : "disabled"),
         [](common_params & params, bool value) {
             params.use_direct_io = value;
         }
diff --git a/common/common.h b/common/common.h
index 96c990c05d..21c11f457d 100644
--- a/common/common.h
+++ b/common/common.h
@@ -438,7 +438,7 @@ struct common_params {
     bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
     bool use_mmap = true; // enable mmap to use filesystem cache
-    bool use_direct_io = true; // read from disk without buffering for faster model loading
+    bool use_direct_io = false; // read from disk without buffering
     bool use_mlock = false; // use mlock to keep model in memory
     bool verbose_prompt = false; // print prompt tokens before generation
     bool display_prompt = true; // print prompt before generation
diff --git a/include/llama.h b/include/llama.h
index c3360ae57c..bf4e28a8be 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -309,7 +309,7 @@ extern "C" {
         // Keep the booleans together to avoid misalignment during copy-by-value.
         bool vocab_only; // only load the vocabulary, no weights
         bool use_mmap; // use mmap if possible
-        bool use_direct_io; // use direct io, takes precedence over use_mmap
+        bool use_direct_io; // use direct io, takes precedence over use_mmap when supported
         bool use_mlock; // force system to keep model in RAM
         bool check_tensors; // validate model tensor data
         bool use_extra_bufts; // use extra buffer types (used for weight repacking)
diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp
index 383b8dc761..1501e392ca 100644
--- a/src/llama-model-loader.cpp
+++ b/src/llama-model-loader.cpp
@@ -541,15 +541,15 @@ llama_model_loader::llama_model_loader(
 
     if (use_mmap && use_direct_io) {
         if (files.back()->has_direct_io()) {
-            // Disable mmap, as DirectIO is available
-            use_mmap = false;
             LLAMA_LOG_WARN("%s: direct I/O is enabled, disabling mmap\n", __func__);
+            use_mmap = false;
         } else {
-            // Disable DirectIO and reopen file using std::fopen for mmap
+            LLAMA_LOG_WARN("%s: direct I/O is not available, using mmap\n", __func__);
             use_direct_io = false;
+
+            // reopen file using std::fopen for mmap
             files.pop_back();
             files.emplace_back(new llama_file(fname.c_str(), "rb", false));
-            LLAMA_LOG_WARN("%s: direct I/O is not available, using mmap\n", __func__);
         }
     }
 
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index cc784e1cb0..72490a89b5 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -8125,7 +8125,7 @@ llama_model_params llama_model_default_params() {
         /*.kv_overrides =*/ nullptr,
         /*.vocab_only =*/ false,
         /*.use_mmap =*/ true,
-        /*.use_direct_io =*/ true,
+        /*.use_direct_io =*/ false,
        /*.use_mlock =*/ false,
         /*.check_tensors =*/ false,
         /*.use_extra_bufts =*/ true,
diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index a2b8d4e56c..776222cb6f 100644
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -545,7 +545,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
     }
 
     std::vector<std::string> splits = {};
-    llama_model_loader ml(fname_inp, splits, use_mmap, /*use_direct_io*/ true, /*check_tensors*/ true, /*no_alloc*/ false, kv_overrides, nullptr);
+    llama_model_loader ml(fname_inp, splits, use_mmap, /*use_direct_io*/ false, /*check_tensors*/ true, /*no_alloc*/ false, kv_overrides, nullptr);
     ml.init_mappings(false); // no prefetching
 
     llama_model model(llama_model_default_params());
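Note: with this patch `use_direct_io` defaults to false everywhere, so callers that still want unbuffered loading must opt in explicitly, either via `-dio`/`--direct-io` (or `LLAMA_ARG_MMAP`/flag combinations in the common args) or by setting the field on the model params. Below is a minimal illustrative sketch of the opt-in path, not part of the patch; it assumes the public entry points `llama_model_default_params`, `llama_model_load_from_file`, and `llama_model_free` from llama.h, whose names may differ between versions.

#include "llama.h"

#include <cstdio>

int main(int argc, char ** argv) {
    if (argc < 2) {
        std::fprintf(stderr, "usage: %s <model.gguf>\n", argv[0]);
        return 1;
    }

    llama_model_params mparams = llama_model_default_params();

    // direct I/O is now off by default; request it explicitly
    // (per the loader change above, it warns and falls back to mmap when unsupported)
    mparams.use_direct_io = true;
    mparams.use_mmap      = true; // kept enabled as the fallback path

    llama_model * model = llama_model_load_from_file(argv[1], mparams);
    if (model == nullptr) {
        std::fprintf(stderr, "failed to load model\n");
        return 1;
    }

    llama_model_free(model);
    return 0;
}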