Remove shfl and AllReduce from backend interface

Johannes Gäßler 2026-02-11 14:51:37 +01:00
parent 29c5327d01
commit 4dc3d10e80
19 changed files with 4 additions and 132 deletions

View File

@@ -1,5 +1,7 @@
# cmake/FindNCCL.cmake
# NVIDIA does not distribute CMake files with NCCL, therefore use this file to find it instead.
find_path(NCCL_INCLUDE_DIR
NAMES nccl.h
HINTS ${NCCL_ROOT} $ENV{NCCL_ROOT} $ENV{CUDA_HOME} /usr/local/cuda

View File

@@ -111,17 +111,6 @@ extern "C" {
// automatic fallback to sync copy if async is not supported
GGML_API void ggml_backend_tensor_copy_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, const struct ggml_tensor * src, struct ggml_tensor * dst);
// asynchronous tensor shuffle
// - src1, dst1 belong to backend_1
// - src2, dst2 belong to backend_2
// - src1 is copied to dst2
// - src2 is copied to dst1
// - both backends wait until both copies have completed
GGML_API void ggml_backend_tensor_shfl_async(
ggml_backend_t backend_1, ggml_backend_t backend_2,
const struct ggml_tensor * src1, const struct ggml_tensor * src2,
struct ggml_tensor * dst1, struct ggml_tensor * dst2);
GGML_API ggml_backend_dev_t ggml_backend_get_device(ggml_backend_t backend);
//
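For context: the removed ggml_backend_tensor_shfl_async exchanged a pair of tensors between two backends in a single call and documented that both backends wait for both copies. After this commit the same data movement is expressed as two plain async copies, as the meta backend change further down shows. A minimal sketch, assuming two already-initialized backends and four suitably allocated tensors (the names mirror the removed declaration and are illustrative only):

// Old single-call exchange (removed by this commit):
//     ggml_backend_tensor_shfl_async(backend_1, backend_2, src1, src2, dst1, dst2);
// Equivalent data movement with the remaining API:
ggml_backend_tensor_copy_async(backend_1, backend_2, src1, dst2); // src1 -> dst2
ggml_backend_tensor_copy_async(backend_2, backend_1, src2, dst1); // src2 -> dst1
// If a caller needs the "both backends wait" guarantee that the old call
// documented, it can synchronize explicitly:
ggml_backend_synchronize(backend_1);
ggml_backend_synchronize(backend_2);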

View File

@@ -97,11 +97,6 @@ extern "C" {
void (*set_tensor_2d_async)(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size, size_t n_copies, size_t stride_tensor, size_t stride_data);
void (*get_tensor_2d_async)(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size, size_t n_copies, size_t stride_tensor, size_t stride_data);
bool (*cpy_tensor_async)(ggml_backend_t backend_src, ggml_backend_t backend_dst, const struct ggml_tensor * src, struct ggml_tensor * dst);
bool (*shfl_tensor_async)(ggml_backend_t backend_1, ggml_backend_t backend_2,
const struct ggml_tensor * src1, const struct ggml_tensor * src2, struct ggml_tensor * dst1, struct ggml_tensor * dst2);
// (optional) backend-specific AllReduce operation for meta backend
bool (*allreduce_tensor_async)(ggml_backend_t * backends, struct ggml_tensor ** tensors, size_t n_backends);
// (optional) complete all pending operations (required if the backend supports async operations)
void (*synchronize)(ggml_backend_t backend);
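The removed allreduce_tensor_async hook took the same tensor replicated across n backends and was meant to combine the replicas in place for the meta backend. For reference, a naive synchronous equivalent can be built on the public tensor get/set API alone; the sketch below assumes F32 tensors of identical shape and a sum reduction (the usual AllReduce operation), and every name in it is illustrative rather than taken from the codebase:

#include <vector>

#include "ggml.h"
#include "ggml-backend.h"

// Naive AllReduce reference: read each replica, sum element-wise on the host,
// then write the combined result back to every replica. Sketch only.
static void allreduce_sum_f32_ref(ggml_backend_t * backends, struct ggml_tensor ** tensors, size_t n_backends) {
    GGML_UNUSED(backends); // the synchronous get/set API does not need the backend handles
    const int64_t ne = ggml_nelements(tensors[0]);
    std::vector<float> acc(ne, 0.0f);
    std::vector<float> tmp(ne);
    for (size_t b = 0; b < n_backends; ++b) {
        ggml_backend_tensor_get(tensors[b], tmp.data(), 0, ggml_nbytes(tensors[b]));
        for (int64_t i = 0; i < ne; ++i) {
            acc[i] += tmp[i];
        }
    }
    for (size_t b = 0; b < n_backends; ++b) {
        ggml_backend_tensor_set(tensors[b], acc.data(), 0, ggml_nbytes(tensors[b]));
    }
}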

View File

@@ -894,7 +894,8 @@ static enum ggml_status ggml_backend_meta_graph_compute(ggml_backend_t backend,
bcj1.cgraphs[i].nodes_aux.push_back(node_tmp_1);
bcj2.cgraphs[i].nodes_aux.push_back(node_tmp_2);
ggml_backend_tensor_shfl_async(bcj1.backend, bcj2.backend, node1, node2, node_tmp_1, node_tmp_2);
ggml_backend_tensor_copy_async(bcj1.backend, bcj2.backend, node1, node_tmp_2);
ggml_backend_tensor_copy_async(bcj2.backend, bcj1.backend, node2, node_tmp_1);
ggml_tensor * node_red_1 = backend_ctx->get_next_tensor(j, tensors, node1);
ggml_tensor * node_red_2 = backend_ctx->get_next_tensor(j_other, tensors, node2);
@@ -982,8 +983,6 @@ static const ggml_backend_i ggml_backend_meta_i = {
/* .get_tensor_2d_async = */ nullptr,
/* .set_tensor_2d_async = */ nullptr,
/* .cpy_tensor_async = */ nullptr,
/* .shfl_tensor_async = */ nullptr,
/* .allreduce_tensor_async = */ nullptr,
/* .synchronize = */ ggml_backend_meta_synchronize,
/* .graph_plan_create = */ nullptr,
/* .graph_plan_free = */ nullptr,

View File

@@ -432,20 +432,6 @@ void ggml_backend_tensor_copy_async(ggml_backend_t backend_src, ggml_backend_t b
ggml_backend_tensor_copy(src, dst);
}
void ggml_backend_tensor_shfl_async(
ggml_backend_t backend_1, ggml_backend_t backend_2,
const struct ggml_tensor * src1, const struct ggml_tensor * src2,
struct ggml_tensor * dst1, struct ggml_tensor * dst2) {
GGML_ASSERT(ggml_are_same_layout(src1, dst1) && "cannot shuffle tensors with different layouts");
GGML_ASSERT(ggml_are_same_layout(src2, dst2) && "cannot shuffle tensors with different layouts");
if (backend_1->iface.shfl_tensor_async != NULL) {
if (backend_1->iface.shfl_tensor_async(backend_1, backend_2, src1, src2, dst1, dst2)) {
return;
}
}
ggml_backend_tensor_copy_async(backend_1, backend_2, src1, dst2);
ggml_backend_tensor_copy_async(backend_2, backend_1, src2, dst1);
}
// events
ggml_backend_event_t ggml_backend_event_new(ggml_backend_dev_t device) {

View File

@@ -264,8 +264,6 @@ static struct ggml_backend_i blas_backend_i = {
/* .set_tensor_2d_async = */ NULL,
/* .get_tensor_async = */ NULL,
/* .cpy_tensor_async = */ NULL,
/* .shfl_tensor_async = */ NULL,
/* .allreduce_tensor_async = */ NULL,
/* .synchronize = */ NULL,
/* .graph_plan_create = */ NULL,
/* .graph_plan_free = */ NULL,

View File

@@ -2570,8 +2570,6 @@ static const ggml_backend_i ggml_backend_cann_interface = {
/* .get_tensor_2d_async = */ NULL,
/* .set_tensor_2d_async = */ NULL,
/* .cpy_tensor_async = */ ggml_backend_cann_cpy_tensor_async,
/* .shfl_tensor_async = */ NULL,
/* .allreduce_tensor_async = */ NULL,
/* .synchronize = */ ggml_backend_cann_synchronize,
/* .graph_plan_create = */ NULL,
/* .graph_plan_free = */ NULL,

View File

@@ -198,8 +198,6 @@ static const struct ggml_backend_i ggml_backend_cpu_i = {
/* .get_tensor_2d_async = */ NULL,
/* .set_tensor_2d_async = */ NULL,
/* .cpy_tensor_async = */ NULL,
/* .shfl_tensor_async = */ NULL,
/* .allreduce_tensor_async = */ NULL,
/* .synchronize = */ NULL,
/* .graph_plan_create = */ ggml_backend_cpu_graph_plan_create,
/* .graph_plan_free = */ ggml_backend_cpu_graph_plan_free,

View File

@@ -2841,77 +2841,6 @@ static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_src, ggml_
return true;
}
static bool ggml_backend_cuda_shfl_tensor_async(
ggml_backend_t backend_1, ggml_backend_t backend_2, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst1, ggml_tensor * dst2) {
ggml_backend_buffer_t buf_src1 = src1->view_src ? src1->view_src->buffer : src1->buffer;
ggml_backend_buffer_t buf_src2 = src2->view_src ? src2->view_src->buffer : src2->buffer;
ggml_backend_buffer_t buf_dst1 = dst1->view_src ? dst1->view_src->buffer : dst1->buffer;
ggml_backend_buffer_t buf_dst2 = dst2->view_src ? dst2->view_src->buffer : dst2->buffer;
if (!ggml_backend_is_cuda(backend_1) || !ggml_backend_is_cuda(backend_2)) {
return false;
}
if (!ggml_backend_buffer_is_cuda(buf_src1) || !ggml_backend_buffer_is_cuda(buf_src2) ||
!ggml_backend_buffer_is_cuda(buf_dst1) || !ggml_backend_buffer_is_cuda(buf_dst2)) {
return false;
}
// device -> device copy
ggml_backend_cuda_context * cuda_ctx_1 = (ggml_backend_cuda_context *) backend_1->context;
ggml_backend_cuda_context * cuda_ctx_2 = (ggml_backend_cuda_context *) backend_2->context;
ggml_backend_cuda_buffer_context * buf_ctx_src1 = (ggml_backend_cuda_buffer_context *) buf_src1->context;
ggml_backend_cuda_buffer_context * buf_ctx_src2 = (ggml_backend_cuda_buffer_context *) buf_src2->context;
ggml_backend_cuda_buffer_context * buf_ctx_dst1 = (ggml_backend_cuda_buffer_context *) buf_dst1->context;
ggml_backend_cuda_buffer_context * buf_ctx_dst2 = (ggml_backend_cuda_buffer_context *) buf_dst2->context;
if (cuda_ctx_1->device != buf_ctx_src1->device || cuda_ctx_2->device != buf_ctx_src2->device ||
cuda_ctx_1->device != buf_ctx_dst1->device || cuda_ctx_2->device != buf_ctx_dst2->device) {
#ifndef NDEBUG
GGML_LOG_DEBUG("%s: backend and buffer devices do not match\n", __func__);
#endif // NDEBUG
return false;
}
if (backend_1 != backend_2) {
// Copies under control of src streams:
if (cuda_ctx_1->device == cuda_ctx_2->device) {
CUDA_CHECK(cudaMemcpyAsync(dst2->data, src1->data, ggml_nbytes(dst2), cudaMemcpyDeviceToDevice, cuda_ctx_1->stream()));
CUDA_CHECK(cudaMemcpyAsync(dst1->data, src2->data, ggml_nbytes(dst1), cudaMemcpyDeviceToDevice, cuda_ctx_2->stream()));
} else {
#ifdef GGML_CUDA_NO_PEER_COPY
return false;
#else
CUDA_CHECK(cudaMemcpyPeerAsync(dst2->data, cuda_ctx_2->device, src1->data, cuda_ctx_1->device, ggml_nbytes(dst2), cuda_ctx_1->stream()));
CUDA_CHECK(cudaMemcpyPeerAsync(dst1->data, cuda_ctx_1->device, src2->data, cuda_ctx_2->device, ggml_nbytes(dst1), cuda_ctx_2->stream()));
#endif // GGML_CUDA_NO_PEER_COPY
}
// Record event on src streams after the copy:
if (!cuda_ctx_1->copy_event) {
ggml_cuda_set_device(cuda_ctx_1->device);
CUDA_CHECK(cudaEventCreateWithFlags(&cuda_ctx_1->copy_event, cudaEventDisableTiming));
}
if (!cuda_ctx_2->copy_event) {
ggml_cuda_set_device(cuda_ctx_2->device);
CUDA_CHECK(cudaEventCreateWithFlags(&cuda_ctx_2->copy_event, cudaEventDisableTiming));
}
CUDA_CHECK(cudaEventRecord(cuda_ctx_1->copy_event, cuda_ctx_1->stream()));
CUDA_CHECK(cudaEventRecord(cuda_ctx_2->copy_event, cuda_ctx_2->stream()));
// Wait on dst stream for the copies to complete:
CUDA_CHECK(cudaStreamWaitEvent(cuda_ctx_2->stream(), cuda_ctx_1->copy_event, 0));
CUDA_CHECK(cudaStreamWaitEvent(cuda_ctx_1->stream(), cuda_ctx_2->copy_event, 0));
} else {
// srcs and dsts are on the same backend:
CUDA_CHECK(cudaMemcpyAsync(dst2->data, src1->data, ggml_nbytes(dst2), cudaMemcpyDeviceToDevice, cuda_ctx_1->stream()));
CUDA_CHECK(cudaMemcpyAsync(dst1->data, src2->data, ggml_nbytes(dst1), cudaMemcpyDeviceToDevice, cuda_ctx_2->stream()));
}
return true;
}
static void ggml_backend_cuda_synchronize(ggml_backend_t backend) {
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
@@ -4314,8 +4243,6 @@ static const ggml_backend_i ggml_backend_cuda_interface = {
/* .get_tensor_2d_async = */ NULL,
/* .set_tensor_2d_async = */ NULL,
/* .cpy_tensor_async = */ ggml_backend_cuda_cpy_tensor_async,
/* .shfl_tensor_async = */ ggml_backend_cuda_shfl_tensor_async,
/* .allreduce_tensor_async = */ NULL,
/* .synchronize = */ ggml_backend_cuda_synchronize,
/* .graph_plan_create = */ NULL,
/* .graph_plan_free = */ NULL,
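The CUDA implementation deleted above overlapped the two copies on the two source-device streams and then cross-waited with events so that neither stream could run ahead of the other copy. Stripped of the ggml plumbing, the handshake reduces to the pattern sketched here with plain CUDA runtime calls; the names are illustrative, the events are assumed to be pre-created, and error checking is omitted:

#include <cuda_runtime.h>

// Issue one copy per source stream, then make each stream wait on the event
// recorded by the other, so both streams observe both copies as complete.
static void exchange_peer_async(
        void * dst1, int dev1, cudaStream_t stream1, cudaEvent_t ev1,
        void * dst2, int dev2, cudaStream_t stream2, cudaEvent_t ev2,
        const void * src1, const void * src2, size_t nbytes1, size_t nbytes2) {
    cudaMemcpyPeerAsync(dst2, dev2, src1, dev1, nbytes2, stream1); // src1 -> dst2 on stream1
    cudaMemcpyPeerAsync(dst1, dev1, src2, dev2, nbytes1, stream2); // src2 -> dst1 on stream2

    cudaEventRecord(ev1, stream1); // ev1 fires once the src1 -> dst2 copy is done
    cudaEventRecord(ev2, stream2); // ev2 fires once the src2 -> dst1 copy is done

    cudaStreamWaitEvent(stream2, ev1, 0); // stream2 waits for src1 -> dst2
    cudaStreamWaitEvent(stream1, ev2, 0); // stream1 waits for src2 -> dst1
}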

View File

@@ -2844,8 +2844,6 @@ static struct ggml_backend_i hexagon_backend_i = {
/* .get_tensor_2d_async = */ NULL,
/* .set_tensor_2d_async = */ NULL,
/* .cpy_tensor_async = */ NULL,
/* .shfl_tensor_async = */ NULL,
/* .allreduce_tensor_async = */ NULL,
/* .synchronize = */ ggml_backend_hexagon_synchronize,
/* .graph_plan_create = */ NULL,
/* .graph_plan_free = */ NULL,

View File

@@ -566,8 +566,6 @@ static ggml_backend_i ggml_backend_metal_i = {
/* .get_tensor_2d_async = */ NULL,
/* .set_tensor_2d_async = */ NULL,
/* .cpy_tensor_async = */ ggml_backend_metal_cpy_tensor_async, // only needed for multi-GPU setups
/* .shfl_tensor_async = */ NULL,
/* .allreduce_tensor_async = */ NULL,
/* .synchronize = */ ggml_backend_metal_synchronize,
/* .graph_plan_create = */ NULL,
/* .graph_plan_free = */ NULL,

View File

@@ -3480,8 +3480,6 @@ static ggml_backend_i ggml_backend_opencl_i = {
/* .cpy_tensor_async = */ NULL, /* ggml_backend_opencl_cpy_tensor_async */
/* .get_tensor_2d_async = */ NULL,
/* .set_tensor_2d_async = */ NULL,
/* .shfl_tensor_async = */ NULL,
/* .allreduce_tensor_async = */ NULL,
/* .synchronize = */ ggml_backend_opencl_synchronize,
/* .graph_plan_create = */ NULL,
/* .graph_plan_free = */ NULL,

View File

@@ -895,8 +895,6 @@ static ggml_backend_i ggml_backend_rpc_interface = {
/* .cpy_tensor_async = */ NULL,
/* .get_tensor_2d_async = */ NULL,
/* .set_tensor_2d_async = */ NULL,
/* .shfl_tensor_async = */ NULL,
/* .allreduce_tensor_async = */ NULL,
/* .synchronize = */ ggml_backend_rpc_synchronize,
/* .graph_plan_create = */ NULL,
/* .graph_plan_free = */ NULL,

View File

@@ -4460,8 +4460,6 @@ static ggml_backend_i ggml_backend_sycl_interface = {
/* .cpy_tensor_async = */ NULL, // ggml_backend_sycl_cpy_tensor_async,
// // TODO: update for the new
// interface
/* .shfl_tensor_async = */ NULL,
/* .allreduce_tensor_async = */ NULL,
/* .synchronize = */ ggml_backend_sycl_synchronize,
/* .graph_plan_create = */ NULL,
/* .graph_plan_free = */ NULL,

View File

@@ -37,8 +37,6 @@ static ggml_backend_i ggml_backend_remoting_interface = {
/* .get_tensor_2d_async = */ NULL,
/* .set_tensor_2d_async = */ NULL,
/* .cpy_tensor_async = */ NULL, // ggml_backend_remoting_cpy_tensor_async,
/* .shfl_tensor_async = */ NULL,
/* .allreduce_tensor_async = */ NULL,
/* .synchronize = */ NULL, // ggml_backend_remoting_synchronize,
/* .graph_plan_create = */ NULL,
/* .graph_plan_free = */ NULL,

View File

@@ -14377,8 +14377,6 @@ static ggml_backend_i ggml_backend_vk_interface = {
/* .get_tensor_2d_async = */ NULL,
/* .set_tensor_2d_async = */ NULL,
/* .cpy_tensor_async = */ NULL, // ggml_backend_vk_cpy_tensor_async,
/* .shfl_tensor_async = */ NULL,
/* .allreduce_tensor_async = */ NULL,
/* .synchronize = */ ggml_backend_vk_synchronize,
/* .graph_plan_create = */ NULL,
/* .graph_plan_free = */ NULL,

View File

@@ -2200,8 +2200,6 @@ static ggml_backend_i ggml_backend_webgpu_i = {
/* .get_tensor_2d_async = */ NULL,
/* .set_tensor_2d_async = */ NULL,
/* .cpy_tensor_async = */ NULL,
/* .shfl_tensor_async = */ NULL,
/* .allreduce_tensor_async = */ NULL,
/* .synchronize = */ NULL,
/* .graph_plan_create = */ NULL,
/* .graph_plan_free = */ NULL,

View File

@@ -424,8 +424,6 @@ static ggml_backend_i ggml_backend_zdnn_i = {
/* .get_tensor_2d_async = */ NULL,
/* .set_tensor_2d_async = */ NULL,
/* .cpy_tensor_async = */ NULL,
/* .shfl_tensor_async = */ NULL,
/* .allreduce_tensor_async = */ NULL,
/* .synchronize = */ NULL,
/* .graph_plan_create = */ NULL,
/* .graph_plan_free = */ NULL,

View File

@@ -243,8 +243,6 @@ static struct ggml_backend_i ggml_backend_zendnn_i = {
/* .get_tensor_2d_async = */ NULL,
/* .set_tensor_2d_async = */ NULL,
/* .cpy_tensor_async = */ NULL,
/* .shfl_tensor_async = */ NULL,
/* .allreduce_tensor_async = */ NULL,
/* .synchronize = */ NULL,
/* .graph_plan_create = */ NULL,
/* .graph_plan_free = */ NULL,