NCCL support
This commit is contained in:
parent
c531444411
commit
8de41b5b40
|
|
@ -7,6 +7,8 @@ set(GGML_VERSION_MINOR 9)
|
||||||
set(GGML_VERSION_PATCH 5)
|
set(GGML_VERSION_PATCH 5)
|
||||||
set(GGML_VERSION_BASE "${GGML_VERSION_MAJOR}.${GGML_VERSION_MINOR}.${GGML_VERSION_PATCH}")
|
set(GGML_VERSION_BASE "${GGML_VERSION_MAJOR}.${GGML_VERSION_MINOR}.${GGML_VERSION_PATCH}")
|
||||||
|
|
||||||
|
list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/")
|
||||||
|
|
||||||
find_program(GIT_EXE NAMES git git.exe NO_CMAKE_FIND_ROOT_PATH)
|
find_program(GIT_EXE NAMES git git.exe NO_CMAKE_FIND_ROOT_PATH)
|
||||||
if(GIT_EXE)
|
if(GIT_EXE)
|
||||||
# Get current git commit hash
|
# Get current git commit hash
|
||||||
|
|
@ -203,6 +205,7 @@ option(GGML_CUDA_NO_VMM "ggml: do not try to use CUDA VMM"
|
||||||
option(GGML_CUDA_FA "ggml: compile ggml FlashAttention CUDA kernels" ON)
|
option(GGML_CUDA_FA "ggml: compile ggml FlashAttention CUDA kernels" ON)
|
||||||
option(GGML_CUDA_FA_ALL_QUANTS "ggml: compile all quants for FlashAttention" OFF)
|
option(GGML_CUDA_FA_ALL_QUANTS "ggml: compile all quants for FlashAttention" OFF)
|
||||||
option(GGML_CUDA_GRAPHS "ggml: use CUDA graphs (llama.cpp only)" ${GGML_CUDA_GRAPHS_DEFAULT})
|
option(GGML_CUDA_GRAPHS "ggml: use CUDA graphs (llama.cpp only)" ${GGML_CUDA_GRAPHS_DEFAULT})
|
||||||
|
option(GGML_CUDA_NCCL "ggml: use NVIDIA Collective Comm. Library" ON)
|
||||||
set (GGML_CUDA_COMPRESSION_MODE "size" CACHE STRING
|
set (GGML_CUDA_COMPRESSION_MODE "size" CACHE STRING
|
||||||
"ggml: cuda link binary compression mode; requires cuda 12.8+")
|
"ggml: cuda link binary compression mode; requires cuda 12.8+")
|
||||||
set_property(CACHE GGML_CUDA_COMPRESSION_MODE PROPERTY STRINGS "none;speed;balance;size")
|
set_property(CACHE GGML_CUDA_COMPRESSION_MODE PROPERTY STRINGS "none;speed;balance;size")
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,34 @@
|
||||||
|
# cmake/FindNCCL.cmake
#
# Locate the NVIDIA Collective Communications Library (NCCL).
#
# Search hints (tried before the system locations):
#   NCCL_ROOT  - CMake variable or environment variable naming an NCCL install prefix
#   CUDA_HOME  - environment variable naming a CUDA toolkit prefix
#
# Result variables:
#   NCCL_FOUND         - TRUE when both nccl.h and the nccl library were located
#   NCCL_INCLUDE_DIRS  - directory containing nccl.h
#   NCCL_LIBRARIES     - full path of the nccl library
#
# Imported target (created only when NCCL is found):
#   NCCL::NCCL
#
# Cache variables (marked advanced):
#   NCCL_INCLUDE_DIR, NCCL_LIBRARY

# /usr/local/cuda is a hard-coded guess, so it is listed under PATHS (searched
# after the system locations) rather than under HINTS.
find_path(NCCL_INCLUDE_DIR
    NAMES nccl.h
    HINTS ${NCCL_ROOT} $ENV{NCCL_ROOT} $ENV{CUDA_HOME}
    PATHS /usr/local/cuda
    PATH_SUFFIXES include
)

find_library(NCCL_LIBRARY
    NAMES nccl
    HINTS ${NCCL_ROOT} $ENV{NCCL_ROOT} $ENV{CUDA_HOME}
    PATHS /usr/local/cuda
    PATH_SUFFIXES lib lib64
)

include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(NCCL
    REQUIRED_VARS NCCL_LIBRARY NCCL_INCLUDE_DIR
)

if(NCCL_FOUND)
    set(NCCL_LIBRARIES "${NCCL_LIBRARY}")
    set(NCCL_INCLUDE_DIRS "${NCCL_INCLUDE_DIR}")

    # Guard against a second find_package(NCCL) call redefining the target.
    if(NOT TARGET NCCL::NCCL)
        add_library(NCCL::NCCL UNKNOWN IMPORTED)
        set_target_properties(NCCL::NCCL PROPERTIES
            IMPORTED_LOCATION "${NCCL_LIBRARY}"
            INTERFACE_INCLUDE_DIRECTORIES "${NCCL_INCLUDE_DIR}"
        )
    endif()
endif()

mark_as_advanced(NCCL_INCLUDE_DIR NCCL_LIBRARY)
|
||||||
|
|
@ -209,7 +209,9 @@ extern "C" {
|
||||||
|
|
||||||
// Common functions that may be obtained using ggml_backend_reg_get_proc_address
|
// Common functions that may be obtained using ggml_backend_reg_get_proc_address
|
||||||
|
|
||||||
// Split buffer type for tensor parallelism
|
// AllReduce operation for tensor parallelism (meta backend)
|
||||||
|
typedef bool (*ggml_backend_allreduce_tensor_t)(ggml_backend_t * backends, struct ggml_tensor ** tensors, size_t n_backends);
|
||||||
|
// Split buffer type for tensor parallelism (old)
|
||||||
typedef ggml_backend_buffer_type_t (*ggml_backend_split_buffer_type_t)(int main_device, const float * tensor_split);
|
typedef ggml_backend_buffer_type_t (*ggml_backend_split_buffer_type_t)(int main_device, const float * tensor_split);
|
||||||
// Set the number of threads for the backend
|
// Set the number of threads for the backend
|
||||||
typedef void (*ggml_backend_set_n_threads_t)(ggml_backend_t backend, int n_threads);
|
typedef void (*ggml_backend_set_n_threads_t)(ggml_backend_t backend, int n_threads);
|
||||||
|
|
|
||||||
|
|
@ -27,6 +27,9 @@ GGML_BACKEND_API bool ggml_backend_is_cuda(ggml_backend_t backend);
|
||||||
// device buffer
|
// device buffer
|
||||||
GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device);
|
GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device);
|
||||||
|
|
||||||
|
// sum tensors[i] across the given backends in place (allreduce); returns false when built without NCCL
|
||||||
|
GGML_BACKEND_API bool ggml_backend_cuda_allreduce_tensor(ggml_backend_t * backends, struct ggml_tensor ** tensors, size_t n_backends);
|
||||||
|
|
||||||
// split tensor buffer that splits matrices by rows across multiple devices
|
// split tensor buffer that splits matrices by rows across multiple devices
|
||||||
GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(int main_device, const float * tensor_split);
|
GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(int main_device, const float * tensor_split);
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -946,9 +946,11 @@ static enum ggml_status ggml_backend_meta_graph_compute(ggml_backend_t backend,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (i < n_subgraphs - 1) {
|
if (n_backends > 1 && i < n_subgraphs - 1) {
|
||||||
bool backend_allreduce_success = false;
|
bool backend_allreduce_success = false;
|
||||||
if (backend_ctx->backend_configs[0].backend->iface.allreduce_tensor_async) {
|
ggml_backend_allreduce_tensor_t allreduce_tensor = (ggml_backend_allreduce_tensor_t) ggml_backend_reg_get_proc_address(
|
||||||
|
ggml_backend_dev_backend_reg(ggml_backend_get_device(backend_ctx->backend_configs[0].backend)), "ggml_backend_allreduce_tensor");
|
||||||
|
if (allreduce_tensor) {
|
||||||
std::vector<ggml_backend_t> backends;
|
std::vector<ggml_backend_t> backends;
|
||||||
backends.reserve(n_backends);
|
backends.reserve(n_backends);
|
||||||
std::vector<ggml_tensor *> nodes;
|
std::vector<ggml_tensor *> nodes;
|
||||||
|
|
@ -957,11 +959,8 @@ static enum ggml_status ggml_backend_meta_graph_compute(ggml_backend_t backend,
|
||||||
auto & bcj = backend_ctx->backend_configs[j];
|
auto & bcj = backend_ctx->backend_configs[j];
|
||||||
backends.push_back(bcj.backend);
|
backends.push_back(bcj.backend);
|
||||||
nodes.push_back(bcj.cgraphs[i].cgraph_main.nodes[bcj.cgraphs[i].cgraph_main.n_nodes-1]);
|
nodes.push_back(bcj.cgraphs[i].cgraph_main.nodes[bcj.cgraphs[i].cgraph_main.n_nodes-1]);
|
||||||
GGML_ASSERT(nodes.back()->type == GGML_TYPE_F32);
|
|
||||||
GGML_ASSERT(ggml_is_contiguous(nodes.back()));
|
|
||||||
}
|
}
|
||||||
backend_allreduce_success = backend_ctx->backend_configs[0].backend->iface.allreduce_tensor_async(
|
backend_allreduce_success = allreduce_tensor(backends.data(), nodes.data(), n_backends);
|
||||||
backends.data(), nodes.data(), n_backends);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!backend_allreduce_success) {
|
if (!backend_allreduce_success) {
|
||||||
|
|
|
||||||
|
|
@ -182,6 +182,16 @@ if (CUDAToolkit_FOUND)
|
||||||
target_link_libraries(ggml-cuda PRIVATE CUDA::cuda_driver)
|
target_link_libraries(ggml-cuda PRIVATE CUDA::cuda_driver)
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
|
# Optional NCCL support for ggml-cuda. When NCCL cannot be located the build
# continues without it and GGML_USE_NCCL stays undefined.
if (GGML_CUDA_NCCL)
    find_package(NCCL)
    if (NCCL_FOUND)
        # Scope the definition to ggml-cuda rather than to every target in this directory.
        target_compile_definitions(ggml-cuda PRIVATE GGML_USE_NCCL)
        target_link_libraries(ggml-cuda PRIVATE NCCL::NCCL)
    else()
        message(STATUS "Warning: NCCL not found, performance for multiple CUDA GPUs will be suboptimal")
    endif()
endif()
|
||||||
|
|
||||||
set(CUDA_CXX_FLAGS "")
|
set(CUDA_CXX_FLAGS "")
|
||||||
|
|
||||||
set(CUDA_FLAGS -use_fast_math -extended-lambda)
|
set(CUDA_FLAGS -use_fast_math -extended-lambda)
|
||||||
|
|
|
||||||
|
|
@ -186,6 +186,10 @@ void ggml_cuda_error(const char * stmt, const char * func, const char * file, in
|
||||||
|
|
||||||
#define CUBLAS_CHECK(err) CUDA_CHECK_GEN(err, CUBLAS_STATUS_SUCCESS, cublas_get_error_str)
|
#define CUBLAS_CHECK(err) CUDA_CHECK_GEN(err, CUBLAS_STATUS_SUCCESS, cublas_get_error_str)
|
||||||
|
|
||||||
|
#ifdef GGML_USE_NCCL
|
||||||
|
#define NCCL_CHECK(err) CUDA_CHECK_GEN(err, ncclSuccess, ncclGetErrorString)
|
||||||
|
#endif // GGML_USE_NCCL
|
||||||
|
|
||||||
#if !defined(GGML_USE_HIP) && !defined(GGML_CUDA_NO_VMM)
|
#if !defined(GGML_USE_HIP) && !defined(GGML_CUDA_NO_VMM)
|
||||||
static const char * cu_get_error_str(CUresult err) {
|
static const char * cu_get_error_str(CUresult err) {
|
||||||
const char * err_str;
|
const char * err_str;
|
||||||
|
|
@ -1050,6 +1054,8 @@ struct ggml_cuda_device_info {
|
||||||
cuda_device_info devices[GGML_CUDA_MAX_DEVICES] = {};
|
cuda_device_info devices[GGML_CUDA_MAX_DEVICES] = {};
|
||||||
|
|
||||||
std::array<float, GGML_CUDA_MAX_DEVICES> default_tensor_split = {};
|
std::array<float, GGML_CUDA_MAX_DEVICES> default_tensor_split = {};
|
||||||
|
|
||||||
|
#ifdef GGML_USE_NCCL
    // One NCCL communicator per CUDA device, indexed by device id.
    // Guarded because ncclComm_t is only declared when <nccl.h> is included.
    ncclComm_t comms[GGML_CUDA_MAX_DEVICES];
#endif // GGML_USE_NCCL
|
||||||
};
|
};
|
||||||
|
|
||||||
const ggml_cuda_device_info & ggml_cuda_info();
|
const ggml_cuda_device_info & ggml_cuda_info();
|
||||||
|
|
|
||||||
|
|
@ -322,6 +322,13 @@ static ggml_cuda_device_info ggml_cuda_init() {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#ifdef GGML_USE_NCCL
    // Create one NCCL communicator for each of the info.device_count devices;
    // communicator i is bound to CUDA device i.
    // Guarded so that builds without NCCL, where NCCL_CHECK is not defined, still compile.
    int dev_ids[GGML_CUDA_MAX_DEVICES];
    for (int id = 0; id < info.device_count; ++id) {
        dev_ids[id] = id;
    }
    NCCL_CHECK(ncclCommInitAll(info.comms, info.device_count, dev_ids));
#endif // GGML_USE_NCCL
|
||||||
|
|
||||||
return info;
|
return info;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -1077,6 +1084,33 @@ static const ggml_backend_buffer_type_i ggml_backend_cuda_split_buffer_type_inte
|
||||||
/* .is_host = */ ggml_backend_cuda_split_buffer_type_is_host,
|
/* .is_host = */ ggml_backend_cuda_split_buffer_type_is_host,
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// Sum-allreduce tensors[i] in place across n_backends CUDA backends.
//
//   backends   - array of n_backends CUDA backends; backends[i] supplies the device and stream for tensors[i]
//   tensors    - array of n_backends tensors; each is both the input and the output of the reduction
//   n_backends - number of entries in both arrays
//
// Returns true when the reduction was enqueued through NCCL, false when this build has no NCCL support.
// With NCCL, all tensors must be contiguous F32 with the same number of elements.
bool ggml_backend_cuda_allreduce_tensor(ggml_backend_t * backends, struct ggml_tensor ** tensors, size_t n_backends) {
#ifdef GGML_USE_NCCL
    // bind by reference: the device info struct does not need to be copied on every call
    const ggml_cuda_device_info & info = ggml_cuda_info();

    const size_t ne = ggml_nelements(tensors[0]);

    // The reduction below is hard-coded to ncclFloat over ne elements of a flat buffer,
    // so reject anything that does not match instead of silently reducing garbage.
    for (size_t i = 0; i < n_backends; ++i) {
        GGML_ASSERT(tensors[i]->type == GGML_TYPE_F32);
        GGML_ASSERT(ggml_is_contiguous(tensors[i]));
        GGML_ASSERT((size_t) ggml_nelements(tensors[i]) == ne);
    }

    // Group the per-device calls so that NCCL treats them as a single collective.
    NCCL_CHECK(ncclGroupStart());
    for (size_t i = 0; i < n_backends; ++i) {
        ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *) backends[i]->context;
        NCCL_CHECK(ncclAllReduce(tensors[i]->data, tensors[i]->data, ne, ncclFloat, ncclSum, info.comms[cuda_ctx->device], cuda_ctx->stream()));
    }
    NCCL_CHECK(ncclGroupEnd());

    return true;
#else
#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
    // Warn once on CUDA builds; the NCCL hint does not apply to HIP/MUSA builds.
    static bool warning_printed = false;
    if (!warning_printed) {
        GGML_LOG_WARN("%s: NVIDIA Collective Communications Library (NCCL) is unavailable, multi GPU performance will be suboptimal\n", __func__);
        warning_printed = true;
    }
#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
    // Always return a value, including on HIP/MUSA builds.
    GGML_UNUSED_VARS(backends, tensors, n_backends);
    return false;
#endif // GGML_USE_NCCL
}
|
||||||
|
|
||||||
ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(int main_device, const float * tensor_split) {
|
ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(int main_device, const float * tensor_split) {
|
||||||
static std::mutex mutex;
|
static std::mutex mutex;
|
||||||
std::lock_guard<std::mutex> lock(mutex);
|
std::lock_guard<std::mutex> lock(mutex);
|
||||||
|
|
@ -5049,6 +5083,9 @@ static ggml_backend_feature * ggml_backend_cuda_get_features(ggml_backend_reg_t
|
||||||
|
|
||||||
static void * ggml_backend_cuda_reg_get_proc_address(ggml_backend_reg_t reg, const char * name) {
|
static void * ggml_backend_cuda_reg_get_proc_address(ggml_backend_reg_t reg, const char * name) {
|
||||||
GGML_UNUSED(reg);
|
GGML_UNUSED(reg);
|
||||||
|
if (strcmp(name, "ggml_backend_allreduce_tensor") == 0) {
|
||||||
|
return (void *)ggml_backend_cuda_allreduce_tensor;
|
||||||
|
}
|
||||||
if (strcmp(name, "ggml_backend_split_buffer_type") == 0) {
|
if (strcmp(name, "ggml_backend_split_buffer_type") == 0) {
|
||||||
return (void *)ggml_backend_cuda_split_buffer_type;
|
return (void *)ggml_backend_cuda_split_buffer_type;
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -6,6 +6,10 @@
|
||||||
#include <cuda_bf16.h>
|
#include <cuda_bf16.h>
|
||||||
#include <cuda_fp16.h>
|
#include <cuda_fp16.h>
|
||||||
|
|
||||||
|
#ifdef GGML_USE_NCCL
|
||||||
|
#include <nccl.h>
|
||||||
|
#endif // GGML_USE_NCCL
|
||||||
|
|
||||||
#if CUDART_VERSION >= 12050
|
#if CUDART_VERSION >= 12050
|
||||||
#include <cuda_fp8.h>
|
#include <cuda_fp8.h>
|
||||||
#endif // CUDART_VERSION >= 12050
|
#endif // CUDART_VERSION >= 12050
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue