From db79dc06b14d8d4c5f6b49c0191444689e6a5db3 Mon Sep 17 00:00:00 2001
From: Ruben Ortlam
Date: Tue, 13 Jan 2026 08:49:10 +0100
Subject: [PATCH] llama-bench: add direct_io parameter (#18778)

---
 tools/llama-bench/llama-bench.cpp | 53 +++++++++++++++++++++++++------
 1 file changed, 43 insertions(+), 10 deletions(-)

diff --git a/tools/llama-bench/llama-bench.cpp b/tools/llama-bench/llama-bench.cpp
index a98ede0a57..aed97e77e5 100644
--- a/tools/llama-bench/llama-bench.cpp
+++ b/tools/llama-bench/llama-bench.cpp
@@ -334,6 +334,7 @@ struct cmd_params {
     std::vector<std::vector<float>>                             tensor_split;
     std::vector<std::vector<llama_model_tensor_buft_override>> tensor_buft_overrides;
     std::vector<bool>                                           use_mmap;
+    std::vector<bool>                                           use_direct_io;
     std::vector<bool>                                           embeddings;
     std::vector<bool>                                           no_op_offload;
     std::vector<bool>                                           no_host;
@@ -372,6 +373,7 @@ static const cmd_params cmd_params_defaults = {
    /* tensor_split         */ { std::vector<float>(llama_max_devices(), 0.0f) },
    /* tensor_buft_overrides*/ { std::vector<llama_model_tensor_buft_override>{ { nullptr, nullptr } } },
    /* use_mmap             */ { true },
+   /* use_direct_io        */ { true },
    /* embeddings           */ { false },
    /* no_op_offload        */ { false },
    /* no_host              */ { false },
@@ -449,6 +451,8 @@ static void print_usage(int /* argc */, char ** argv) {
     printf("  -dev, --device <dev1,dev2,..>           (default: auto)\n");
     printf("  -mmp, --mmap <0|1>                      (default: %s)\n",
            join(cmd_params_defaults.use_mmap, ",").c_str());
+    printf("  -dio, --direct-io <0|1>                 (default: %s)\n",
+           join(cmd_params_defaults.use_direct_io, ",").c_str());
     printf("  -embd, --embeddings <0|1>               (default: %s)\n",
            join(cmd_params_defaults.embeddings, ",").c_str());
     printf("  -ts, --tensor-split <ts0/ts1/..>        (default: 0)\n");
@@ -772,6 +776,13 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
             }
             auto p = string_split<bool>(argv[i], split_delim);
             params.use_mmap.insert(params.use_mmap.end(), p.begin(), p.end());
+        } else if (arg == "-dio" || arg == "--direct-io") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            auto p = string_split<bool>(argv[i], split_delim);
+            params.use_direct_io.insert(params.use_direct_io.end(), p.begin(), p.end());
         } else if (arg == "-embd" || arg == "--embeddings") {
             if (++i >= argc) {
                 invalid_param = true;
@@ -1008,6 +1019,9 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
     if (params.use_mmap.empty()) {
         params.use_mmap = cmd_params_defaults.use_mmap;
     }
+    if (params.use_direct_io.empty()) {
+        params.use_direct_io = cmd_params_defaults.use_direct_io;
+    }
     if (params.embeddings.empty()) {
         params.embeddings = cmd_params_defaults.embeddings;
     }
@@ -1056,6 +1070,7 @@ struct cmd_params_instance {
     std::vector<float>                            tensor_split;
     std::vector<llama_model_tensor_buft_override> tensor_buft_overrides;
     bool                                          use_mmap;
+    bool                                          use_direct_io;
     bool                                          embeddings;
     bool                                          no_op_offload;
     bool                                          no_host;
@@ -1067,11 +1082,12 @@ struct cmd_params_instance {
         if (!devices.empty()) {
             mparams.devices = const_cast<ggml_backend_dev_t *>(devices.data());
         }
-        mparams.split_mode   = split_mode;
-        mparams.main_gpu     = main_gpu;
-        mparams.tensor_split = tensor_split.data();
-        mparams.use_mmap     = use_mmap;
-        mparams.no_host      = no_host;
+        mparams.split_mode    = split_mode;
+        mparams.main_gpu      = main_gpu;
+        mparams.tensor_split  = tensor_split.data();
+        mparams.use_mmap      = use_mmap;
+        mparams.use_direct_io = use_direct_io;
+        mparams.no_host       = no_host;
 
         if (n_cpu_moe <= 0) {
             if (tensor_buft_overrides.empty()) {
@@ -1115,7 +1131,8 @@ struct cmd_params_instance {
     bool equal_mparams(const cmd_params_instance & other) const {
         return model == other.model && n_gpu_layers == other.n_gpu_layers && n_cpu_moe == other.n_cpu_moe &&
                split_mode == other.split_mode &&
-               main_gpu == other.main_gpu && use_mmap == other.use_mmap && tensor_split == other.tensor_split &&
+               main_gpu == other.main_gpu && tensor_split == other.tensor_split &&
+               use_mmap == other.use_mmap && use_direct_io == other.use_direct_io &&
                devices == other.devices && no_host == other.no_host &&
                vec_tensor_buft_override_equal(tensor_buft_overrides, other.tensor_buft_overrides);
     }
@@ -1153,6 +1170,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
     for (const auto & ts : params.tensor_split)
     for (const auto & ot : params.tensor_buft_overrides)
     for (const auto & mmp : params.use_mmap)
+    for (const auto & dio : params.use_direct_io)
     for (const auto & noh : params.no_host)
     for (const auto & embd : params.embeddings)
     for (const auto & nopo : params.no_op_offload)
@@ -1194,6 +1212,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
                 /* .tensor_split = */ ts,
                 /* .tensor_buft_overrides = */ ot,
                 /* .use_mmap     = */ mmp,
+                /* .use_direct_io= */ dio,
                 /* .embeddings   = */ embd,
                 /* .no_op_offload= */ nopo,
                 /* .no_host      = */ noh,
@@ -1228,6 +1247,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
                 /* .tensor_split = */ ts,
                 /* .tensor_buft_overrides = */ ot,
                 /* .use_mmap     = */ mmp,
+                /* .use_direct_io= */ dio,
                 /* .embeddings   = */ embd,
                 /* .no_op_offload= */ nopo,
                 /* .no_host      = */ noh,
@@ -1262,6 +1282,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
                 /* .tensor_split = */ ts,
                 /* .tensor_buft_overrides = */ ot,
                 /* .use_mmap     = */ mmp,
+                /* .use_direct_io= */ dio,
                 /* .embeddings   = */ embd,
                 /* .no_op_offload= */ nopo,
                 /* .no_host      = */ noh,
@@ -1301,6 +1322,7 @@ struct test {
     std::vector<float>                            tensor_split;
     std::vector<llama_model_tensor_buft_override> tensor_buft_overrides;
     bool                                          use_mmap;
+    bool                                          use_direct_io;
     bool                                          embeddings;
     bool                                          no_op_offload;
     bool                                          no_host;
@@ -1338,6 +1360,7 @@ struct test {
         tensor_split          = inst.tensor_split;
         tensor_buft_overrides = inst.tensor_buft_overrides;
         use_mmap              = inst.use_mmap;
+        use_direct_io         = inst.use_direct_io;
         embeddings            = inst.embeddings;
         no_op_offload         = inst.no_op_offload;
         no_host               = inst.no_host;
@@ -1397,9 +1420,9 @@ struct test {
             "n_ubatch",     "n_threads",     "cpu_mask",     "cpu_strict",    "poll",
             "type_k",       "type_v",        "n_gpu_layers", "n_cpu_moe",     "split_mode",
             "main_gpu",     "no_kv_offload", "flash_attn",   "devices",       "tensor_split",
-            "tensor_buft_overrides", "use_mmap", "embeddings", "no_op_offload",
-            "no_host", "n_prompt", "n_gen", "n_depth", "test_time",
-            "avg_ns", "stddev_ns", "avg_ts", "stddev_ts"
+            "tensor_buft_overrides", "use_mmap", "use_direct_io", "embeddings",
+            "no_op_offload", "no_host", "n_prompt", "n_gen", "n_depth",
+            "test_time", "avg_ns", "stddev_ns", "avg_ts", "stddev_ts"
         };
         return fields;
     }
@@ -1414,7 +1437,7 @@ struct test {
             return INT;
         }
         if (field == "f16_kv" || field == "no_kv_offload" || field == "cpu_strict" || field == "flash_attn" ||
-            field == "use_mmap" || field == "embeddings" || field == "no_host") {
+            field == "use_mmap" || field == "use_direct_io" || field == "embeddings" || field == "no_host") {
             return BOOL;
         }
         if (field == "avg_ts" || field == "stddev_ts") {
@@ -1487,6 +1510,7 @@ struct test {
             tensor_split_str,
             tensor_buft_overrides_str,
             std::to_string(use_mmap),
+            std::to_string(use_direct_io),
             std::to_string(embeddings),
             std::to_string(no_op_offload),
             std::to_string(no_host),
@@ -1672,6 +1696,9 @@ struct markdown_printer : public printer {
         if (field == "use_mmap") {
             return 4;
         }
+        if (field == "use_direct_io") {
+            return 3;
+        }
         if (field == "test") {
             return 15;
         }
@@ -1709,6 +1736,9 @@ struct markdown_printer : public printer {
         if (field == "use_mmap") {
             return "mmap";
         }
+        if (field == "use_direct_io") {
+            return "dio";
+        }
         if (field == "embeddings") {
             return "embd";
         }
@@ -1793,6 +1823,9 @@ struct markdown_printer : public printer {
         if (params.use_mmap.size() > 1 || params.use_mmap != cmd_params_defaults.use_mmap) {
             fields.emplace_back("use_mmap");
         }
+        if (params.use_direct_io.size() > 1 || params.use_direct_io != cmd_params_defaults.use_direct_io) {
+            fields.emplace_back("use_direct_io");
+        }
         if (params.embeddings.size() > 1 || params.embeddings != cmd_params_defaults.embeddings) {
             fields.emplace_back("embeddings");
         }
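Usage sketch, assuming the patch is applied and llama-bench is built; the model path is a placeholder. The new toggle is passed like the other boolean parameters and accepts a comma-separated list, so direct I/O can be benchmarked on and off in a single run:

    ./llama-bench -m model.gguf -mmp 0,1 -dio 0,1

Per the last hunk, a "dio" column is added to the markdown output whenever more than one value is given or the value differs from the default, mirroring the existing "mmap" column.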