diff --git a/common/arg.cpp b/common/arg.cpp
index 18f953a38e..d1b8cb43d9 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -2331,11 +2331,12 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     ).set_env("LLAMA_ARG_N_GPU_LAYERS"));
     add_opt(common_arg(
-        {"-sm", "--split-mode"}, "{none,layer,row}",
+        {"-sm", "--split-mode"}, "{none,layer,row,group}",
         "how to split the model across multiple GPUs, one of:\n"
         "- none: use one GPU only\n"
         "- layer (default): split layers and KV across GPUs\n"
-        "- row: split rows across GPUs",
+        "- row: split rows across GPUs\n"
+        "- group: group GPUs, using the minimum number needed based on available memory",
         [](common_params & params, const std::string & value) {
             std::string arg_next = value;
             if (arg_next == "none") {
@@ -2344,6 +2345,8 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
                 params.split_mode = LLAMA_SPLIT_MODE_LAYER;
             } else if (arg_next == "row") {
                 params.split_mode = LLAMA_SPLIT_MODE_ROW;
+            } else if (arg_next == "group") {
+                params.split_mode = LLAMA_SPLIT_MODE_GROUP;
             } else {
                 throw std::invalid_argument("invalid value");
             }
diff --git a/include/llama.h b/include/llama.h
index 305623127c..d021acee9e 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -192,6 +192,7 @@ extern "C" {
         LLAMA_SPLIT_MODE_NONE  = 0, // single GPU
         LLAMA_SPLIT_MODE_LAYER = 1, // split layers and KV across GPUs
         LLAMA_SPLIT_MODE_ROW   = 2, // split layers and KV across GPUs, use tensor parallelism if supported
+        LLAMA_SPLIT_MODE_GROUP = 3, // group GPUs, using the minimum number needed based on available memory
     };
 
     // TODO: simplify (https://github.com/ggml-org/llama.cpp/pull/9294#pullrequestreview-2286561979)
diff --git a/src/llama.cpp b/src/llama.cpp
index 6da90d6f1f..81c0323f4e 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -21,6 +21,7 @@
 #include
 #include
 #include
+#include <filesystem>
 #include
 
 #if defined(_MSC_VER)
@@ -142,6 +143,87 @@ static std::vector llama_get_device_memory_data(
     return ret;
 }
 
+static std::vector<ggml_backend_dev_t> select_min_gpu_subset(
+        const std::vector<ggml_backend_dev_t> & available_gpus,
+        const char * path_model) {
+    // ratio of estimated runtime memory to GGUF file size (accounts for dequant/overhead)
+    constexpr double MEMORY_ESTIMATE_RATIO = 1.5;
+    constexpr int64_t MiB = 1024*1024;
+
+    if (available_gpus.empty()) {
+        return available_gpus;
+    }
+
+    std::vector<ggml_backend_dev_t> gpu_devices;
+    for (auto dev : available_gpus) {
+        if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_GPU) {
+            gpu_devices.push_back(dev);
+        }
+    }
+    if (gpu_devices.empty()) {
+        LLAMA_LOG_INFO("%s: no GPU devices found, using all devices\n", __func__);
+        return available_gpus;
+    }
+
+    std::vector<ggml_backend_dev_t> sorted_gpus = gpu_devices;
+    std::sort(sorted_gpus.begin(), sorted_gpus.end(), [](ggml_backend_dev_t a, ggml_backend_dev_t b) {
+        size_t free_a, total_a, free_b, total_b;
+        ggml_backend_dev_memory(a, &free_a, &total_a);
+        ggml_backend_dev_memory(b, &free_b, &total_b);
+        (void)total_a;
+        (void)total_b;
+        return free_a > free_b;
+    });
+
+    size_t file_size = 0;
+    try {
+        file_size = static_cast<size_t>(std::filesystem::file_size(path_model));
+    } catch (const std::exception & e) {
+        LLAMA_LOG_ERROR("%s: failed to get file size for '%s': %s\n", __func__, path_model, e.what());
+        LLAMA_LOG_INFO("%s: using all available devices as fallback\n", __func__);
+        return available_gpus;
+    } catch (...) {
+        LLAMA_LOG_ERROR("%s: failed to get file size for '%s': unknown error\n", __func__, path_model);
+        LLAMA_LOG_INFO("%s: using all available devices as fallback\n", __func__);
+        return available_gpus;
+    }
+    if (file_size == 0) {
+        LLAMA_LOG_ERROR("%s: model file '%s' appears to be empty\n", __func__, path_model);
+        LLAMA_LOG_INFO("%s: using all available devices as fallback\n", __func__);
+        return available_gpus;
+    }
+
+    size_t estimated_model_mem = static_cast<size_t>(file_size * MEMORY_ESTIMATE_RATIO);
+    LLAMA_LOG_DEBUG("%s: model file size: %zu MiB\n", __func__, file_size / MiB);
+    LLAMA_LOG_DEBUG("%s: estimated memory required: %zu MiB\n", __func__, estimated_model_mem / MiB);
+
+    std::vector<ggml_backend_dev_t> selected_gpus;
+    size_t cumulative_free = 0;
+
+    for (auto dev : sorted_gpus) {
+        size_t free, total;
+        ggml_backend_dev_memory(dev, &free, &total);
+        (void)total;
+        selected_gpus.push_back(dev);
+        cumulative_free += free;
+        if (cumulative_free >= estimated_model_mem) {
+            LLAMA_LOG_DEBUG("%s: selected %zu device(s) for estimated %zu MiB model memory\n",
+                __func__, selected_gpus.size(), estimated_model_mem / MiB);
+            return selected_gpus;
+        }
+    }
+
+    LLAMA_LOG_DEBUG("%s: selected all %zu device(s) for estimated %zu MiB model memory\n",
+        __func__, selected_gpus.size(), estimated_model_mem / MiB);
+    if (cumulative_free < estimated_model_mem) {
+        LLAMA_LOG_WARN("%s: combined free memory (%zu MiB) is less than estimated model memory (%zu MiB)\n",
+            __func__, cumulative_free / MiB, estimated_model_mem / MiB);
+        LLAMA_LOG_WARN("%s: model load may fail or run out of memory\n", __func__);
+    }
+
+    return selected_gpus;
+}
+
 // enum to identify part of a layer for distributing its tensors:
 enum layer_fraction_t {
     LAYER_FRACTION_NONE = 0, // nothing
@@ -978,6 +1060,11 @@ static struct llama_model * llama_model_load_from_file_impl(
         }
     }
 
+    // if using group mode, select the minimum GPU subset based on free memory
+    if (params.split_mode == LLAMA_SPLIT_MODE_GROUP) {
+        model->devices = select_min_gpu_subset(model->devices, path_model.c_str());
+    }
+
     // if using single GPU mode, remove all except the main GPU
     if (params.split_mode == LLAMA_SPLIT_MODE_NONE) {
         if (params.main_gpu < 0) {
diff --git a/tools/cli/README.md b/tools/cli/README.md
index 4a15cbad9d..2977fc49bd 100644
--- a/tools/cli/README.md
+++ b/tools/cli/README.md
@@ -66,7 +66,7 @@
 | `-cmoe, --cpu-moe` | keep all Mixture of Experts (MoE) weights in the CPU<br/>(env: LLAMA_ARG_CPU_MOE) |
 | `-ncmoe, --n-cpu-moe N` | keep the Mixture of Experts (MoE) weights of the first N layers in the CPU<br/>(env: LLAMA_ARG_N_CPU_MOE) |
 | `-ngl, --gpu-layers, --n-gpu-layers N` | max. number of layers to store in VRAM, either an exact number, 'auto', or 'all' (default: auto)<br/>(env: LLAMA_ARG_N_GPU_LAYERS) |
-| `-sm, --split-mode {none,layer,row}` | how to split the model across multiple GPUs, one of:<br/>- none: use one GPU only<br/>- layer (default): split layers and KV across GPUs<br/>- row: split rows across GPUs<br/>(env: LLAMA_ARG_SPLIT_MODE) |
+| `-sm, --split-mode {none,layer,row,group}` | how to split the model across multiple GPUs, one of:<br/>- none: use one GPU only<br/>- layer (default): split layers and KV across GPUs<br/>- row: split rows across GPUs<br/>- group: group GPUs, using the minimum number needed based on available memory<br/>(env: LLAMA_ARG_SPLIT_MODE) |
 | `-ts, --tensor-split N0,N1,N2,...` | fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1<br/>(env: LLAMA_ARG_TENSOR_SPLIT) |
 | `-mg, --main-gpu INDEX` | the GPU to use for the model (with split-mode = none), or for intermediate results and KV (with split-mode = row) (default: 0)<br/>(env: LLAMA_ARG_MAIN_GPU) |
 | `-fit, --fit [on\|off]` | whether to adjust unset arguments to fit in device memory ('on' or 'off', default: 'on')<br/>(env: LLAMA_ARG_FIT) |
diff --git a/tools/completion/README.md b/tools/completion/README.md
index 3ca3e68454..4e23ed3181 100644
--- a/tools/completion/README.md
+++ b/tools/completion/README.md
@@ -149,7 +149,7 @@ llama-completion.exe -m models\gemma-1.1-7b-it.Q4_K_M.gguf --ignore-eos -n -1
 | `-cmoe, --cpu-moe` | keep all Mixture of Experts (MoE) weights in the CPU<br/>(env: LLAMA_ARG_CPU_MOE) |
 | `-ncmoe, --n-cpu-moe N` | keep the Mixture of Experts (MoE) weights of the first N layers in the CPU<br/>(env: LLAMA_ARG_N_CPU_MOE) |
 | `-ngl, --gpu-layers, --n-gpu-layers N` | max. number of layers to store in VRAM, either an exact number, 'auto', or 'all' (default: auto)<br/>(env: LLAMA_ARG_N_GPU_LAYERS) |
-| `-sm, --split-mode {none,layer,row}` | how to split the model across multiple GPUs, one of:<br/>- none: use one GPU only<br/>- layer (default): split layers and KV across GPUs<br/>- row: split rows across GPUs<br/>(env: LLAMA_ARG_SPLIT_MODE) |
+| `-sm, --split-mode {none,layer,row,group}` | how to split the model across multiple GPUs, one of:<br/>- none: use one GPU only<br/>- layer (default): split layers and KV across GPUs<br/>- row: split rows across GPUs<br/>- group: group GPUs, using the minimum number needed based on available memory<br/>(env: LLAMA_ARG_SPLIT_MODE) |
 | `-ts, --tensor-split N0,N1,N2,...` | fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1<br/>(env: LLAMA_ARG_TENSOR_SPLIT) |
 | `-mg, --main-gpu INDEX` | the GPU to use for the model (with split-mode = none), or for intermediate results and KV (with split-mode = row) (default: 0)<br/>(env: LLAMA_ARG_MAIN_GPU) |
 | `-fit, --fit [on\|off]` | whether to adjust unset arguments to fit in device memory ('on' or 'off', default: 'on')<br/>(env: LLAMA_ARG_FIT) |
diff --git a/tools/server/README.md b/tools/server/README.md
index 0b56ca1e27..5d2f580d1d 100644
--- a/tools/server/README.md
+++ b/tools/server/README.md
@@ -83,7 +83,7 @@ For the full list of features, please refer to [server's changelog](https://gith
 | `-cmoe, --cpu-moe` | keep all Mixture of Experts (MoE) weights in the CPU<br/>(env: LLAMA_ARG_CPU_MOE) |
 | `-ncmoe, --n-cpu-moe N` | keep the Mixture of Experts (MoE) weights of the first N layers in the CPU<br/>(env: LLAMA_ARG_N_CPU_MOE) |
 | `-ngl, --gpu-layers, --n-gpu-layers N` | max. number of layers to store in VRAM, either an exact number, 'auto', or 'all' (default: auto)<br/>(env: LLAMA_ARG_N_GPU_LAYERS) |
-| `-sm, --split-mode {none,layer,row}` | how to split the model across multiple GPUs, one of:<br/>- none: use one GPU only<br/>- layer (default): split layers and KV across GPUs<br/>- row: split rows across GPUs<br/>(env: LLAMA_ARG_SPLIT_MODE) |
+| `-sm, --split-mode {none,layer,row,group}` | how to split the model across multiple GPUs, one of:<br/>- none: use one GPU only<br/>- layer (default): split layers and KV across GPUs<br/>- row: split rows across GPUs<br/>- group: group GPUs, using the minimum number needed based on available memory<br/>(env: LLAMA_ARG_SPLIT_MODE) |
 | `-ts, --tensor-split N0,N1,N2,...` | fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1<br/>(env: LLAMA_ARG_TENSOR_SPLIT) |
 | `-mg, --main-gpu INDEX` | the GPU to use for the model (with split-mode = none), or for intermediate results and KV (with split-mode = row) (default: 0)<br/>(env: LLAMA_ARG_MAIN_GPU) |
 | `-fit, --fit [on\|off]` | whether to adjust unset arguments to fit in device memory ('on' or 'off', default: 'on')<br/>(env: LLAMA_ARG_FIT) |
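
For reference, a minimal sketch (not part of the patch) of how the new mode could be selected through the public llama.h C API once the change lands. The model path and the `n_gpu_layers` value below are placeholders, and error handling is kept to a minimum:

```cpp
// Demonstrates parameter setup for LLAMA_SPLIT_MODE_GROUP; link against llama.cpp as usual.
#include "llama.h"

#include <cstdio>

int main() {
    llama_backend_init();

    llama_model_params mparams = llama_model_default_params();
    mparams.split_mode   = LLAMA_SPLIT_MODE_GROUP; // let the loader pick the smallest GPU subset that fits
    mparams.n_gpu_layers = 999;                    // placeholder: offload as many layers as possible

    // placeholder path; any GGUF model works
    llama_model * model = llama_model_load_from_file("model.gguf", mparams);
    if (model == NULL) {
        fprintf(stderr, "failed to load model\n");
        llama_backend_free();
        return 1;
    }

    // ... create a context and run inference as usual ...

    llama_model_free(model);
    llama_backend_free();
    return 0;
}
```

The CLI equivalent is simply `-sm group` (or `--split-mode group`), as documented in the README tables above.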