NCCL support
parent c531444411
commit 8de41b5b40

@@ -7,6 +7,8 @@ set(GGML_VERSION_MINOR 9)
set(GGML_VERSION_PATCH 5)
set(GGML_VERSION_BASE "${GGML_VERSION_MAJOR}.${GGML_VERSION_MINOR}.${GGML_VERSION_PATCH}")

list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/")

find_program(GIT_EXE NAMES git git.exe NO_CMAKE_FIND_ROOT_PATH)
if(GIT_EXE)
    # Get current git commit hash

@@ -203,6 +205,7 @@ option(GGML_CUDA_NO_VMM "ggml: do not try to use CUDA VMM"
option(GGML_CUDA_FA "ggml: compile ggml FlashAttention CUDA kernels" ON)
option(GGML_CUDA_FA_ALL_QUANTS "ggml: compile all quants for FlashAttention" OFF)
option(GGML_CUDA_GRAPHS "ggml: use CUDA graphs (llama.cpp only)" ${GGML_CUDA_GRAPHS_DEFAULT})
option(GGML_CUDA_NCCL "ggml: use NVIDIA Collective Comm. Library" ON)
set (GGML_CUDA_COMPRESSION_MODE "size" CACHE STRING
     "ggml: cuda link binary compression mode; requires cuda 12.8+")
set_property(CACHE GGML_CUDA_COMPRESSION_MODE PROPERTY STRINGS "none;speed;balance;size")

@@ -0,0 +1,34 @@
# cmake/FindNCCL.cmake

find_path(NCCL_INCLUDE_DIR
    NAMES nccl.h
    HINTS ${NCCL_ROOT} $ENV{NCCL_ROOT} $ENV{CUDA_HOME} /usr/local/cuda
    PATH_SUFFIXES include
)

find_library(NCCL_LIBRARY
    NAMES nccl
    HINTS ${NCCL_ROOT} $ENV{NCCL_ROOT} $ENV{CUDA_HOME} /usr/local/cuda
    PATH_SUFFIXES lib lib64
)

include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(NCCL
    DEFAULT_MSG
    NCCL_LIBRARY NCCL_INCLUDE_DIR
)

if(NCCL_FOUND)
    set(NCCL_LIBRARIES ${NCCL_LIBRARY})
    set(NCCL_INCLUDE_DIRS ${NCCL_INCLUDE_DIR})

    if(NOT TARGET NCCL::NCCL)
        add_library(NCCL::NCCL UNKNOWN IMPORTED)
        set_target_properties(NCCL::NCCL PROPERTIES
            IMPORTED_LOCATION "${NCCL_LIBRARY}"
            INTERFACE_INCLUDE_DIRECTORIES "${NCCL_INCLUDE_DIR}"
        )
    endif()
endif()

mark_as_advanced(NCCL_INCLUDE_DIR NCCL_LIBRARY)

@@ -209,7 +209,9 @@ extern "C" {

// Common functions that may be obtained using ggml_backend_reg_get_proc_address

-// Split buffer type for tensor parallelism
+// AllReduce operation for tensor parallelism (meta backend)
+typedef bool (*ggml_backend_allreduce_tensor_t)(ggml_backend_t * backends, struct ggml_tensor ** tensors, size_t n_backends);
+// Split buffer type for tensor parallelism (old)
typedef ggml_backend_buffer_type_t (*ggml_backend_split_buffer_type_t)(int main_device, const float * tensor_split);
// Set the number of threads for the backend
typedef void (*ggml_backend_set_n_threads_t)(ggml_backend_t backend, int n_threads);

@@ -27,6 +27,9 @@ GGML_BACKEND_API bool ggml_backend_is_cuda(ggml_backend_t backend);
// device buffer
GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device);

// conduct allreduce operation between devices
GGML_BACKEND_API bool ggml_backend_cuda_allreduce_tensor(ggml_backend_t * backends, struct ggml_tensor ** tensors, size_t n_backends);

// split tensor buffer that splits matrices by rows across multiple devices
GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(int main_device, const float * tensor_split);

@@ -946,9 +946,11 @@ static enum ggml_status ggml_backend_meta_graph_compute(ggml_backend_t backend,
            }
        }

-       if (i < n_subgraphs - 1) {
+       if (n_backends > 1 && i < n_subgraphs - 1) {
            bool backend_allreduce_success = false;
-           if (backend_ctx->backend_configs[0].backend->iface.allreduce_tensor_async) {
+           ggml_backend_allreduce_tensor_t allreduce_tensor = (ggml_backend_allreduce_tensor_t) ggml_backend_reg_get_proc_address(
+               ggml_backend_dev_backend_reg(ggml_backend_get_device(backend_ctx->backend_configs[0].backend)), "ggml_backend_allreduce_tensor");
+           if (allreduce_tensor) {
                std::vector<ggml_backend_t> backends;
                backends.reserve(n_backends);
                std::vector<ggml_tensor *> nodes;

@@ -957,11 +959,8 @@ static enum ggml_status ggml_backend_meta_graph_compute(ggml_backend_t backend,
                    auto & bcj = backend_ctx->backend_configs[j];
                    backends.push_back(bcj.backend);
                    nodes.push_back(bcj.cgraphs[i].cgraph_main.nodes[bcj.cgraphs[i].cgraph_main.n_nodes-1]);
                    GGML_ASSERT(nodes.back()->type == GGML_TYPE_F32);
                    GGML_ASSERT(ggml_is_contiguous(nodes.back()));
                }
-               backend_allreduce_success = backend_ctx->backend_configs[0].backend->iface.allreduce_tensor_async(
-                   backends.data(), nodes.data(), n_backends);
+               backend_allreduce_success = allreduce_tensor(backends.data(), nodes.data(), n_backends);
            }

            if (!backend_allreduce_success) {

@@ -182,6 +182,16 @@ if (CUDAToolkit_FOUND)
        target_link_libraries(ggml-cuda PRIVATE CUDA::cuda_driver)
    endif()

    if (GGML_CUDA_NCCL)
        find_package(NCCL)
        if (NCCL_FOUND)
            add_compile_definitions(GGML_USE_NCCL)
            target_link_libraries(ggml-cuda PRIVATE NCCL::NCCL)
        else()
            message(STATUS "Warning: NCCL not found, performance for multiple CUDA GPUs will be suboptimal")
        endif()
    endif()

    set(CUDA_CXX_FLAGS "")

    set(CUDA_FLAGS -use_fast_math -extended-lambda)

@@ -186,6 +186,10 @@ void ggml_cuda_error(const char * stmt, const char * func, const char * file, in

#define CUBLAS_CHECK(err) CUDA_CHECK_GEN(err, CUBLAS_STATUS_SUCCESS, cublas_get_error_str)

#ifdef GGML_USE_NCCL
#define NCCL_CHECK(err) CUDA_CHECK_GEN(err, ncclSuccess, ncclGetErrorString)
#endif // GGML_USE_NCCL

#if !defined(GGML_USE_HIP) && !defined(GGML_CUDA_NO_VMM)
static const char * cu_get_error_str(CUresult err) {
    const char * err_str;

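The NCCL_CHECK macro above plugs ncclSuccess and ncclGetErrorString into ggml's generic CUDA_CHECK_GEN helper. A rough standalone equivalent of that check-and-report pattern, assuming a hypothetical MY_NCCL_CHECK macro outside of ggml (illustrative sketch only):

#include <nccl.h>
#include <cstdio>
#include <cstdlib>

// Hypothetical stand-in for CUDA_CHECK_GEN(err, ncclSuccess, ncclGetErrorString):
// evaluate the call, and on failure print the NCCL error string and abort.
#define MY_NCCL_CHECK(call)                                                     \
    do {                                                                        \
        ncclResult_t err_ = (call);                                             \
        if (err_ != ncclSuccess) {                                              \
            fprintf(stderr, "NCCL error: %s failed at %s:%d: %s\n",             \
                    #call, __FILE__, __LINE__, ncclGetErrorString(err_));       \
            exit(EXIT_FAILURE);                                                 \
        }                                                                       \
    } while (0)

int main() {
    int version = 0;
    MY_NCCL_CHECK(ncclGetVersion(&version));
    printf("NCCL version: %d\n", version);
    return 0;
}
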
@@ -1050,6 +1054,8 @@ struct ggml_cuda_device_info {
    cuda_device_info devices[GGML_CUDA_MAX_DEVICES] = {};

    std::array<float, GGML_CUDA_MAX_DEVICES> default_tensor_split = {};

    ncclComm_t comms[GGML_CUDA_MAX_DEVICES];
};

const ggml_cuda_device_info & ggml_cuda_info();

@@ -322,6 +322,13 @@ static ggml_cuda_device_info ggml_cuda_init() {
            }
        }
    }

    int dev_ids[GGML_CUDA_MAX_DEVICES];
    for (int id = 0; id < info.device_count; ++id) {
        dev_ids[id] = id;
    }
    NCCL_CHECK(ncclCommInitAll(info.comms, info.device_count, dev_ids));

    return info;
}

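The addition above initializes one NCCL communicator per device with a single ncclCommInitAll() call over an identity device list. A minimal standalone sketch of just that communicator lifecycle, assuming a toy main() outside of ggml (error checks omitted for brevity):

#include <cuda_runtime.h>
#include <nccl.h>
#include <vector>
#include <cstdio>

int main() {
    int device_count = 0;
    cudaGetDeviceCount(&device_count);

    // Identity mapping, as in the dev_ids[] loop above.
    std::vector<int> dev_ids(device_count);
    for (int id = 0; id < device_count; ++id) {
        dev_ids[id] = id;
    }

    // One communicator per device, all created by a single call from one process.
    std::vector<ncclComm_t> comms(device_count);
    ncclCommInitAll(comms.data(), device_count, dev_ids.data());
    printf("initialized %d NCCL communicator(s)\n", device_count);

    for (int id = 0; id < device_count; ++id) {
        ncclCommDestroy(comms[id]);
    }
    return 0;
}
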
@@ -1077,6 +1084,33 @@ static const ggml_backend_buffer_type_i ggml_backend_cuda_split_buffer_type_inte
    /* .is_host = */ ggml_backend_cuda_split_buffer_type_is_host,
};

bool ggml_backend_cuda_allreduce_tensor(ggml_backend_t * backends, struct ggml_tensor ** tensors, size_t n_backends) {
#ifdef GGML_USE_NCCL
    const ggml_cuda_device_info info = ggml_cuda_info();

    const size_t ne = ggml_nelements(tensors[0]);

    NCCL_CHECK(ncclGroupStart());
    for (size_t i = 0; i < n_backends; ++i) {
        ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *) backends[i]->context;
        NCCL_CHECK(ncclAllReduce(tensors[i]->data, tensors[i]->data, ne, ncclFloat, ncclSum, info.comms[cuda_ctx->device], cuda_ctx->stream()));
    }
    NCCL_CHECK(ncclGroupEnd());

    return true;
#else
#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
    static bool warning_printed = false;
    if (!warning_printed) {
        GGML_LOG_WARN("%s: NVIDIA Collective Communications Library (NCCL) is unavailable, multi GPU performance will be suboptimal\n", __func__);
        warning_printed = true;
    }
    GGML_UNUSED_VARS(backends, tensors, n_backends);
    return false;
#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
#endif // GGML_USE_NCCL
}

ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(int main_device, const float * tensor_split) {
    static std::mutex mutex;
    std::lock_guard<std::mutex> lock(mutex);

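ggml_backend_cuda_allreduce_tensor() above issues an in-place sum allreduce on each device's communicator and stream, with the per-device calls fused inside an ncclGroupStart()/ncclGroupEnd() group. A minimal standalone sketch of the same pattern, assuming toy per-device buffers in place of the tensors' data pointers (buffer size and names are illustrative; error checks omitted):

#include <cuda_runtime.h>
#include <nccl.h>
#include <vector>
#include <cstdio>

int main() {
    int n_dev = 0;
    cudaGetDeviceCount(&n_dev);

    // One NCCL communicator per visible device, as in ggml_cuda_init() above.
    std::vector<int> dev_ids(n_dev);
    for (int i = 0; i < n_dev; ++i) dev_ids[i] = i;
    std::vector<ncclComm_t> comms(n_dev);
    ncclCommInitAll(comms.data(), n_dev, dev_ids.data());

    // Hypothetical per-device float buffers standing in for the tensors' data.
    const size_t ne = 1024;
    std::vector<float *> bufs(n_dev);
    std::vector<cudaStream_t> streams(n_dev);
    for (int i = 0; i < n_dev; ++i) {
        cudaSetDevice(i);
        cudaMalloc(&bufs[i], ne * sizeof(float));
        cudaMemset(bufs[i], 0, ne * sizeof(float));
        cudaStreamCreate(&streams[i]);
    }

    // Group the per-device calls so NCCL launches them as one collective;
    // passing the same pointer as send and recv makes the allreduce in-place.
    ncclGroupStart();
    for (int i = 0; i < n_dev; ++i) {
        ncclAllReduce(bufs[i], bufs[i], ne, ncclFloat, ncclSum, comms[i], streams[i]);
    }
    ncclGroupEnd();

    // The collective is asynchronous on each stream; synchronize before reuse.
    for (int i = 0; i < n_dev; ++i) {
        cudaSetDevice(i);
        cudaStreamSynchronize(streams[i]);
        cudaFree(bufs[i]);
        cudaStreamDestroy(streams[i]);
        ncclCommDestroy(comms[i]);
    }
    printf("in-place sum allreduce completed on %d device(s)\n", n_dev);
    return 0;
}
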
@@ -5049,6 +5083,9 @@ static ggml_backend_feature * ggml_backend_cuda_get_features(ggml_backend_reg_t

static void * ggml_backend_cuda_reg_get_proc_address(ggml_backend_reg_t reg, const char * name) {
    GGML_UNUSED(reg);
    if (strcmp(name, "ggml_backend_allreduce_tensor") == 0) {
        return (void *)ggml_backend_cuda_allreduce_tensor;
    }
    if (strcmp(name, "ggml_backend_split_buffer_type") == 0) {
        return (void *)ggml_backend_cuda_split_buffer_type;
    }

@@ -6,6 +6,10 @@
#include <cuda_bf16.h>
#include <cuda_fp16.h>

#ifdef GGML_USE_NCCL
#include <nccl.h>
#endif // GGML_USE_NCCL

#if CUDART_VERSION >= 12050
#include <cuda_fp8.h>
#endif // CUDART_VERSION >= 12050