From 4dc3d10e802a691a7c4cb30ddde9b56f217c366a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?=
Date: Wed, 11 Feb 2026 14:51:37 +0100
Subject: [PATCH] Remove shfl and AllReduce from backend interface

---
 ggml/cmake/FindNCCL.cmake              |  2 +
 ggml/include/ggml-backend.h            | 11 ----
 ggml/src/ggml-backend-impl.h           |  5 --
 ggml/src/ggml-backend-meta.cpp         |  5 +-
 ggml/src/ggml-backend.cpp              | 14 -----
 ggml/src/ggml-blas/ggml-blas.cpp       |  2 -
 ggml/src/ggml-cann/ggml-cann.cpp       |  2 -
 ggml/src/ggml-cpu/ggml-cpu.cpp         |  2 -
 ggml/src/ggml-cuda/ggml-cuda.cu        | 73 --------------------------
 ggml/src/ggml-hexagon/ggml-hexagon.cpp |  2 -
 ggml/src/ggml-metal/ggml-metal.cpp     |  2 -
 ggml/src/ggml-opencl/ggml-opencl.cpp   |  2 -
 ggml/src/ggml-rpc/ggml-rpc.cpp         |  2 -
 ggml/src/ggml-sycl/ggml-sycl.cpp       |  2 -
 ggml/src/ggml-virtgpu/ggml-backend.cpp |  2 -
 ggml/src/ggml-vulkan/ggml-vulkan.cpp   |  2 -
 ggml/src/ggml-webgpu/ggml-webgpu.cpp   |  2 -
 ggml/src/ggml-zdnn/ggml-zdnn.cpp       |  2 -
 ggml/src/ggml-zendnn/ggml-zendnn.cpp   |  2 -
 19 files changed, 4 insertions(+), 132 deletions(-)

diff --git a/ggml/cmake/FindNCCL.cmake b/ggml/cmake/FindNCCL.cmake
index fc704de686..67511e2d56 100644
--- a/ggml/cmake/FindNCCL.cmake
+++ b/ggml/cmake/FindNCCL.cmake
@@ -1,5 +1,7 @@
 # cmake/FindNCCL.cmake
 
+# NVIDIA does not distribute CMake files with NCCL, so use this file to find it instead.
+
 find_path(NCCL_INCLUDE_DIR
   NAMES nccl.h
   HINTS ${NCCL_ROOT} $ENV{NCCL_ROOT} $ENV{CUDA_HOME} /usr/local/cuda
diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h
index cb379bca56..6962e9a74c 100644
--- a/ggml/include/ggml-backend.h
+++ b/ggml/include/ggml-backend.h
@@ -111,17 +111,6 @@ extern "C" {
     // automatic fallback to sync copy if async is not supported
     GGML_API void ggml_backend_tensor_copy_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, const struct ggml_tensor * src, struct ggml_tensor * dst);
 
-    // asynchronous tensor shuffle
-    // - src1, dst1 belong to backend_1
-    // - src2, dst2 belong to backend_2
-    // - src1 is copied to dst2
-    // - src2 is copied to dst1
-    // - both backends wait until both copies have completed
-    GGML_API void ggml_backend_tensor_shfl_async(
-            ggml_backend_t backend_1, ggml_backend_t backend_2,
-            const struct ggml_tensor * src1, const struct ggml_tensor * src2,
-            struct ggml_tensor * dst1, struct ggml_tensor * dst2);
-
     GGML_API ggml_backend_dev_t ggml_backend_get_device(ggml_backend_t backend);
 
     //
diff --git a/ggml/src/ggml-backend-impl.h b/ggml/src/ggml-backend-impl.h
index 6500758414..6d92a9c06d 100644
--- a/ggml/src/ggml-backend-impl.h
+++ b/ggml/src/ggml-backend-impl.h
@@ -97,11 +97,6 @@ extern "C" {
         void (*set_tensor_2d_async)(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size, size_t n_copies, size_t stride_tensor, size_t stride_data);
         void (*get_tensor_2d_async)(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size, size_t n_copies, size_t stride_tensor, size_t stride_data);
         bool (*cpy_tensor_async)(ggml_backend_t backend_src, ggml_backend_t backend_dst, const struct ggml_tensor * src, struct ggml_tensor * dst);
-        bool (*shfl_tensor_async)(ggml_backend_t backend_1, ggml_backend_t backend_2,
-                const struct ggml_tensor * src1, const struct ggml_tensor * src2, struct ggml_tensor * dst1, struct ggml_tensor * dst2);
-
-        // (optional) backend-specific AllReduce operation for meta backend
-        bool (*allreduce_tensor_async)(ggml_backend_t * backends, struct ggml_tensor ** tensors, size_t n_backends);
 
         // (optional) complete all pending operations (required if the backend supports async operations)
         void (*synchronize)(ggml_backend_t backend);
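Note: the removed ggml_backend_tensor_shfl_async() guaranteed that both backends
wait until both copies have completed. allreduce_tensor_async had no
implementation in any backend (every interface below initializes it to NULL), so
only the shuffle needs a replacement pattern. A minimal sketch of that
replacement using only the remaining public API; the explicit synchronization is
stronger than the old mutual wait, but always safe:

    // Two plain async copies replace the removed shuffle. Names follow the
    // removed API's documentation: src1/dst1 live on backend_1, src2/dst2 on
    // backend_2, and the copies go src1 -> dst2 and src2 -> dst1.
    ggml_backend_tensor_copy_async(backend_1, backend_2, src1, dst2);
    ggml_backend_tensor_copy_async(backend_2, backend_1, src2, dst1);
    // Conservatively restore the "both backends wait" guarantee:
    ggml_backend_synchronize(backend_1);
    ggml_backend_synchronize(backend_2);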
diff --git a/ggml/src/ggml-backend-meta.cpp b/ggml/src/ggml-backend-meta.cpp
index 6e7925026c..3cc203594d 100644
--- a/ggml/src/ggml-backend-meta.cpp
+++ b/ggml/src/ggml-backend-meta.cpp
@@ -894,7 +894,8 @@ static enum ggml_status ggml_backend_meta_graph_compute(ggml_backend_t backend,
         bcj1.cgraphs[i].nodes_aux.push_back(node_tmp_1);
         bcj2.cgraphs[i].nodes_aux.push_back(node_tmp_2);
 
-        ggml_backend_tensor_shfl_async(bcj1.backend, bcj2.backend, node1, node2, node_tmp_1, node_tmp_2);
+        ggml_backend_tensor_copy_async(bcj1.backend, bcj2.backend, node1, node_tmp_2);
+        ggml_backend_tensor_copy_async(bcj2.backend, bcj1.backend, node2, node_tmp_1);
 
         ggml_tensor * node_red_1 = backend_ctx->get_next_tensor(j,       tensors, node1);
         ggml_tensor * node_red_2 = backend_ctx->get_next_tensor(j_other, tensors, node2);
@@ -982,8 +983,6 @@ static const ggml_backend_i ggml_backend_meta_i = {
     /* .get_tensor_2d_async    = */ nullptr,
     /* .set_tensor_2d_async    = */ nullptr,
     /* .cpy_tensor_async       = */ nullptr,
-    /* .shfl_tensor_async      = */ nullptr,
-    /* .allreduce_tensor_async = */ nullptr,
     /* .synchronize            = */ ggml_backend_meta_synchronize,
     /* .graph_plan_create      = */ nullptr,
     /* .graph_plan_free        = */ nullptr,
diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
index f16471484b..c3e517e33f 100644
--- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp
@@ -432,20 +432,6 @@ void ggml_backend_tensor_copy_async(ggml_backend_t backend_src, ggml_backend_t b
     ggml_backend_tensor_copy(src, dst);
 }
 
-void ggml_backend_tensor_shfl_async(
-        ggml_backend_t backend_1, ggml_backend_t backend_2,
-        const struct ggml_tensor * src1, const struct ggml_tensor * src2,
-        struct ggml_tensor * dst1, struct ggml_tensor * dst2) {
-    GGML_ASSERT(ggml_are_same_layout(src1, dst1) && "cannot shuffle tensors with different layouts");
-    GGML_ASSERT(ggml_are_same_layout(src2, dst2) && "cannot shuffle tensors with different layouts");
-    if (backend_1->iface.shfl_tensor_async != NULL) {
-        if (backend_1->iface.shfl_tensor_async(backend_1, backend_2, src1, src2, dst1, dst2)) {
-            return;
-        }
-    }
-    ggml_backend_tensor_copy_async(backend_1, backend_2, src1, dst2);
-    ggml_backend_tensor_copy_async(backend_2, backend_1, src2, dst1);
-}
 // events
 
 ggml_backend_event_t ggml_backend_event_new(ggml_backend_dev_t device) {
diff --git a/ggml/src/ggml-blas/ggml-blas.cpp b/ggml/src/ggml-blas/ggml-blas.cpp
index 8c4e8e4f15..0bf295677e 100644
--- a/ggml/src/ggml-blas/ggml-blas.cpp
+++ b/ggml/src/ggml-blas/ggml-blas.cpp
@@ -264,8 +264,6 @@ static struct ggml_backend_i blas_backend_i = {
     /* .set_tensor_2d_async    = */ NULL,
     /* .get_tensor_2d_async    = */ NULL,
     /* .cpy_tensor_async       = */ NULL,
-    /* .shfl_tensor_async      = */ NULL,
-    /* .allreduce_tensor_async = */ NULL,
     /* .synchronize            = */ NULL,
     /* .graph_plan_create      = */ NULL,
     /* .graph_plan_free        = */ NULL,
diff --git a/ggml/src/ggml-cann/ggml-cann.cpp b/ggml/src/ggml-cann/ggml-cann.cpp
index 5ab78b9b42..73fcc879de 100644
--- a/ggml/src/ggml-cann/ggml-cann.cpp
+++ b/ggml/src/ggml-cann/ggml-cann.cpp
@@ -2570,8 +2570,6 @@ static const ggml_backend_i ggml_backend_cann_interface = {
     /* .get_tensor_2d_async    = */ NULL,
     /* .set_tensor_2d_async    = */ NULL,
     /* .cpy_tensor_async       = */ ggml_backend_cann_cpy_tensor_async,
-    /* .shfl_tensor_async      = */ NULL,
-    /* .allreduce_tensor_async = */ NULL,
     /* .synchronize            = */ ggml_backend_cann_synchronize,
     /* .graph_plan_create      = */ NULL,
     /* .graph_plan_free        = */ NULL,
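The meta backend above now issues the two copies directly and thereby loses the
shuffle's built-in barrier. Where a full ggml_backend_synchronize() is too
coarse, the portable event API can restore the mutual wait. A sketch under two
assumptions: both devices support events, and each copy is driven by the source
backend's queue, as in the removed CUDA path:

    // One event per backend, recorded after its outgoing copy:
    ggml_backend_event_t ev1 = ggml_backend_event_new(ggml_backend_get_device(backend_1));
    ggml_backend_event_t ev2 = ggml_backend_event_new(ggml_backend_get_device(backend_2));

    ggml_backend_tensor_copy_async(backend_1, backend_2, src1, dst2); // on backend_1's queue
    ggml_backend_tensor_copy_async(backend_2, backend_1, src2, dst1); // on backend_2's queue
    ggml_backend_event_record(ev1, backend_1);
    ggml_backend_event_record(ev2, backend_2);
    // Each backend waits for the other's copy before reading its dst tensor:
    ggml_backend_event_wait(backend_1, ev2);
    ggml_backend_event_wait(backend_2, ev1);

In practice the events would be created once and reused, as the removed CUDA
implementation did with its cached copy_event.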
diff --git a/ggml/src/ggml-cpu/ggml-cpu.cpp b/ggml/src/ggml-cpu/ggml-cpu.cpp
index 62e273c93f..49f840be20 100644
--- a/ggml/src/ggml-cpu/ggml-cpu.cpp
+++ b/ggml/src/ggml-cpu/ggml-cpu.cpp
@@ -198,8 +198,6 @@ static const struct ggml_backend_i ggml_backend_cpu_i = {
     /* .get_tensor_2d_async    = */ NULL,
     /* .set_tensor_2d_async    = */ NULL,
     /* .cpy_tensor_async       = */ NULL,
-    /* .shfl_tensor_async      = */ NULL,
-    /* .allreduce_tensor_async = */ NULL,
     /* .synchronize            = */ NULL,
     /* .graph_plan_create      = */ ggml_backend_cpu_graph_plan_create,
     /* .graph_plan_free        = */ ggml_backend_cpu_graph_plan_free,
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index a90e370560..5da55512dd 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -2841,77 +2841,6 @@ static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_src, ggml_
     return true;
 }
 
-static bool ggml_backend_cuda_shfl_tensor_async(
-        ggml_backend_t backend_1, ggml_backend_t backend_2, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst1, ggml_tensor * dst2) {
-    ggml_backend_buffer_t buf_src1 = src1->view_src ? src1->view_src->buffer : src1->buffer;
-    ggml_backend_buffer_t buf_src2 = src2->view_src ? src2->view_src->buffer : src2->buffer;
-    ggml_backend_buffer_t buf_dst1 = dst1->view_src ? dst1->view_src->buffer : dst1->buffer;
-    ggml_backend_buffer_t buf_dst2 = dst2->view_src ? dst2->view_src->buffer : dst2->buffer;
-
-    if (!ggml_backend_is_cuda(backend_1) || !ggml_backend_is_cuda(backend_2)) {
-        return false;
-    }
-
-    if (!ggml_backend_buffer_is_cuda(buf_src1) || !ggml_backend_buffer_is_cuda(buf_src2) ||
-        !ggml_backend_buffer_is_cuda(buf_dst1) || !ggml_backend_buffer_is_cuda(buf_dst2)) {
-        return false;
-    }
-
-    // device -> device copy
-    ggml_backend_cuda_context * cuda_ctx_1 = (ggml_backend_cuda_context *) backend_1->context;
-    ggml_backend_cuda_context * cuda_ctx_2 = (ggml_backend_cuda_context *) backend_2->context;
-
-    ggml_backend_cuda_buffer_context * buf_ctx_src1 = (ggml_backend_cuda_buffer_context *) buf_src1->context;
-    ggml_backend_cuda_buffer_context * buf_ctx_src2 = (ggml_backend_cuda_buffer_context *) buf_src2->context;
-    ggml_backend_cuda_buffer_context * buf_ctx_dst1 = (ggml_backend_cuda_buffer_context *) buf_dst1->context;
-    ggml_backend_cuda_buffer_context * buf_ctx_dst2 = (ggml_backend_cuda_buffer_context *) buf_dst2->context;
-
-    if (cuda_ctx_1->device != buf_ctx_src1->device || cuda_ctx_2->device != buf_ctx_src2->device ||
-        cuda_ctx_1->device != buf_ctx_dst1->device || cuda_ctx_2->device != buf_ctx_dst2->device) {
-#ifndef NDEBUG
-        GGML_LOG_DEBUG("%s: backend and buffer devices do not match\n", __func__);
-#endif // NDEBUG
-        return false;
-    }
-
-    if (backend_1 != backend_2) {
-        // Copies under control of src streams:
-        if (cuda_ctx_1->device == cuda_ctx_2->device) {
-            CUDA_CHECK(cudaMemcpyAsync(dst2->data, src1->data, ggml_nbytes(dst2), cudaMemcpyDeviceToDevice, cuda_ctx_1->stream()));
-            CUDA_CHECK(cudaMemcpyAsync(dst1->data, src2->data, ggml_nbytes(dst1), cudaMemcpyDeviceToDevice, cuda_ctx_2->stream()));
-        } else {
-#ifdef GGML_CUDA_NO_PEER_COPY
-            return false;
-#else
-            CUDA_CHECK(cudaMemcpyPeerAsync(dst2->data, cuda_ctx_2->device, src1->data, cuda_ctx_1->device, ggml_nbytes(dst2), cuda_ctx_1->stream()));
-            CUDA_CHECK(cudaMemcpyPeerAsync(dst1->data, cuda_ctx_1->device, src2->data, cuda_ctx_2->device, ggml_nbytes(dst1), cuda_ctx_2->stream()));
-#endif // GGML_CUDA_NO_PEER_COPY
-        }
-
-        // Record event on src streams after the copy:
-        if (!cuda_ctx_1->copy_event) {
-            ggml_cuda_set_device(cuda_ctx_1->device);
-            CUDA_CHECK(cudaEventCreateWithFlags(&cuda_ctx_1->copy_event, cudaEventDisableTiming));
-        }
-        if (!cuda_ctx_2->copy_event) {
-            ggml_cuda_set_device(cuda_ctx_2->device);
-            CUDA_CHECK(cudaEventCreateWithFlags(&cuda_ctx_2->copy_event, cudaEventDisableTiming));
-        }
-
-        CUDA_CHECK(cudaEventRecord(cuda_ctx_1->copy_event, cuda_ctx_1->stream()));
-        CUDA_CHECK(cudaEventRecord(cuda_ctx_2->copy_event, cuda_ctx_2->stream()));
-
-        // Wait on dst stream for the copies to complete:
-        CUDA_CHECK(cudaStreamWaitEvent(cuda_ctx_2->stream(), cuda_ctx_1->copy_event, 0));
-        CUDA_CHECK(cudaStreamWaitEvent(cuda_ctx_1->stream(), cuda_ctx_2->copy_event, 0));
-    } else {
-        // srcs and dsts are on the same backend:
-        CUDA_CHECK(cudaMemcpyAsync(dst2->data, src1->data, ggml_nbytes(dst2), cudaMemcpyDeviceToDevice, cuda_ctx_1->stream()));
-        CUDA_CHECK(cudaMemcpyAsync(dst1->data, src2->data, ggml_nbytes(dst1), cudaMemcpyDeviceToDevice, cuda_ctx_2->stream()));
-    }
-    return true;
-}
-
 static void ggml_backend_cuda_synchronize(ggml_backend_t backend) {
     ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
 
@@ -4314,8 +4243,6 @@ static const ggml_backend_i ggml_backend_cuda_interface = {
     /* .get_tensor_2d_async    = */ NULL,
     /* .set_tensor_2d_async    = */ NULL,
     /* .cpy_tensor_async       = */ ggml_backend_cuda_cpy_tensor_async,
-    /* .shfl_tensor_async      = */ ggml_backend_cuda_shfl_tensor_async,
-    /* .allreduce_tensor_async = */ NULL,
     /* .synchronize            = */ ggml_backend_cuda_synchronize,
     /* .graph_plan_create      = */ NULL,
     /* .graph_plan_free        = */ NULL,
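The heart of the removed CUDA function is the standard record/wait idiom for
ordering work across two streams, applied symmetrically in both directions. A
one-directional sketch with illustrative parameter names (CUDA_CHECK is the
existing ggml-cuda error-checking macro):

    // Enqueue the copy on the producing stream, record an event there, and
    // make the consuming stream wait on it before it touches the destination.
    static void copy_peer_async_ordered(void * dst, int dst_device, const void * src, int src_device,
                                        size_t nbytes, cudaStream_t src_stream, cudaStream_t dst_stream) {
        cudaEvent_t ev;
        CUDA_CHECK(cudaEventCreateWithFlags(&ev, cudaEventDisableTiming));
        CUDA_CHECK(cudaMemcpyPeerAsync(dst, dst_device, src, src_device, nbytes, src_stream));
        CUDA_CHECK(cudaEventRecord(ev, src_stream));
        CUDA_CHECK(cudaStreamWaitEvent(dst_stream, ev, 0));
        // Destroying an in-flight event is legal; resources are released once it completes.
        CUDA_CHECK(cudaEventDestroy(ev));
    }

The deleted code instead cached one copy_event per backend context so that no
event is created on every call, which is the better choice on a hot path.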
diff --git a/ggml/src/ggml-hexagon/ggml-hexagon.cpp b/ggml/src/ggml-hexagon/ggml-hexagon.cpp
index 9766b90acf..11e0f1c3cf 100644
--- a/ggml/src/ggml-hexagon/ggml-hexagon.cpp
+++ b/ggml/src/ggml-hexagon/ggml-hexagon.cpp
@@ -2844,8 +2844,6 @@ static struct ggml_backend_i hexagon_backend_i = {
     /* .get_tensor_2d_async    = */ NULL,
     /* .set_tensor_2d_async    = */ NULL,
     /* .cpy_tensor_async       = */ NULL,
-    /* .shfl_tensor_async      = */ NULL,
-    /* .allreduce_tensor_async = */ NULL,
     /* .synchronize            = */ ggml_backend_hexagon_synchronize,
     /* .graph_plan_create      = */ NULL,
     /* .graph_plan_free        = */ NULL,
diff --git a/ggml/src/ggml-metal/ggml-metal.cpp b/ggml/src/ggml-metal/ggml-metal.cpp
index 5c0da80e62..a1e46b1884 100644
--- a/ggml/src/ggml-metal/ggml-metal.cpp
+++ b/ggml/src/ggml-metal/ggml-metal.cpp
@@ -566,8 +566,6 @@ static ggml_backend_i ggml_backend_metal_i = {
     /* .get_tensor_2d_async    = */ NULL,
     /* .set_tensor_2d_async    = */ NULL,
     /* .cpy_tensor_async       = */ ggml_backend_metal_cpy_tensor_async, // only needed for multi-GPU setups
-    /* .shfl_tensor_async      = */ NULL,
-    /* .allreduce_tensor_async = */ NULL,
     /* .synchronize            = */ ggml_backend_metal_synchronize,
     /* .graph_plan_create      = */ NULL,
     /* .graph_plan_free        = */ NULL,
diff --git a/ggml/src/ggml-opencl/ggml-opencl.cpp b/ggml/src/ggml-opencl/ggml-opencl.cpp
index e8a654aef5..be58e8ba08 100644
--- a/ggml/src/ggml-opencl/ggml-opencl.cpp
+++ b/ggml/src/ggml-opencl/ggml-opencl.cpp
@@ -3480,8 +3480,6 @@ static ggml_backend_i ggml_backend_opencl_i = {
     /* .cpy_tensor_async       = */ NULL, /* ggml_backend_opencl_cpy_tensor_async */
     /* .get_tensor_2d_async    = */ NULL,
     /* .set_tensor_2d_async    = */ NULL,
-    /* .shfl_tensor_async      = */ NULL,
-    /* .allreduce_tensor_async = */ NULL,
     /* .synchronize            = */ ggml_backend_opencl_synchronize,
     /* .graph_plan_create      = */ NULL,
     /* .graph_plan_free        = */ NULL,
diff --git a/ggml/src/ggml-rpc/ggml-rpc.cpp b/ggml/src/ggml-rpc/ggml-rpc.cpp
index c7e078fb13..7acf868c72 100644
--- a/ggml/src/ggml-rpc/ggml-rpc.cpp
+++ b/ggml/src/ggml-rpc/ggml-rpc.cpp
@@ -895,8 +895,6 @@ static ggml_backend_i ggml_backend_rpc_interface = {
     /* .cpy_tensor_async       = */ NULL,
     /* .get_tensor_2d_async    = */ NULL,
     /* .set_tensor_2d_async    = */ NULL,
-    /* .shfl_tensor_async      = */ NULL,
-    /* .allreduce_tensor_async = */ NULL,
     /* .synchronize            = */ ggml_backend_rpc_synchronize,
     /* .graph_plan_create      = */ NULL,
     /* .graph_plan_free        = */ NULL,
diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp
index 33fee035b4..da15f968f8 100644
--- a/ggml/src/ggml-sycl/ggml-sycl.cpp
+++ b/ggml/src/ggml-sycl/ggml-sycl.cpp
@@ -4460,8 +4460,6 @@ static ggml_backend_i ggml_backend_sycl_interface = {
     /* .cpy_tensor_async       = */ NULL, // ggml_backend_sycl_cpy_tensor_async,
                                           // TODO: update for the new interface
-    /* .shfl_tensor_async      = */ NULL,
-    /* .allreduce_tensor_async = */ NULL,
     /* .synchronize            = */ ggml_backend_sycl_synchronize,
     /* .graph_plan_create      = */ NULL,
     /* .graph_plan_free        = */ NULL,
diff --git a/ggml/src/ggml-virtgpu/ggml-backend.cpp b/ggml/src/ggml-virtgpu/ggml-backend.cpp
index 6ee685f15e..f22ce4113d 100644
--- a/ggml/src/ggml-virtgpu/ggml-backend.cpp
+++ b/ggml/src/ggml-virtgpu/ggml-backend.cpp
@@ -37,8 +37,6 @@ static ggml_backend_i ggml_backend_remoting_interface = {
     /* .get_tensor_2d_async    = */ NULL,
    /* .set_tensor_2d_async    = */ NULL,
     /* .cpy_tensor_async       = */ NULL, // ggml_backend_remoting_cpy_tensor_async,
-    /* .shfl_tensor_async      = */ NULL,
-    /* .allreduce_tensor_async = */ NULL,
     /* .synchronize            = */ NULL, // ggml_backend_remoting_synchronize,
     /* .graph_plan_create      = */ NULL,
     /* .graph_plan_free        = */ NULL,
diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
index d7fff442b2..504f24028c 100644
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -14377,8 +14377,6 @@ static ggml_backend_i ggml_backend_vk_interface = {
     /* .get_tensor_2d_async    = */ NULL,
     /* .set_tensor_2d_async    = */ NULL,
     /* .cpy_tensor_async       = */ NULL, // ggml_backend_vk_cpy_tensor_async,
-    /* .shfl_tensor_async      = */ NULL,
-    /* .allreduce_tensor_async = */ NULL,
     /* .synchronize            = */ ggml_backend_vk_synchronize,
     /* .graph_plan_create      = */ NULL,
     /* .graph_plan_free        = */ NULL,
diff --git a/ggml/src/ggml-webgpu/ggml-webgpu.cpp b/ggml/src/ggml-webgpu/ggml-webgpu.cpp
index d5fcc64b92..396159d367 100644
--- a/ggml/src/ggml-webgpu/ggml-webgpu.cpp
+++ b/ggml/src/ggml-webgpu/ggml-webgpu.cpp
@@ -2200,8 +2200,6 @@ static ggml_backend_i ggml_backend_webgpu_i = {
     /* .get_tensor_2d_async    = */ NULL,
     /* .set_tensor_2d_async    = */ NULL,
     /* .cpy_tensor_async       = */ NULL,
-    /* .shfl_tensor_async      = */ NULL,
-    /* .allreduce_tensor_async = */ NULL,
     /* .synchronize            = */ NULL,
     /* .graph_plan_create      = */ NULL,
     /* .graph_plan_free        = */ NULL,
diff --git a/ggml/src/ggml-zdnn/ggml-zdnn.cpp b/ggml/src/ggml-zdnn/ggml-zdnn.cpp
index 4a18f0e969..9d801483d2 100644
--- a/ggml/src/ggml-zdnn/ggml-zdnn.cpp
+++ b/ggml/src/ggml-zdnn/ggml-zdnn.cpp
@@ -424,8 +424,6 @@ static ggml_backend_i ggml_backend_zdnn_i = {
     /* .get_tensor_2d_async    = */ NULL,
     /* .set_tensor_2d_async    = */ NULL,
     /* .cpy_tensor_async       = */ NULL,
-    /* .shfl_tensor_async      = */ NULL,
-    /* .allreduce_tensor_async = */ NULL,
     /* .synchronize            = */ NULL,
     /* .graph_plan_create      = */ NULL,
     /* .graph_plan_free        = */ NULL,
diff --git a/ggml/src/ggml-zendnn/ggml-zendnn.cpp b/ggml/src/ggml-zendnn/ggml-zendnn.cpp
index 9cc43fa35f..346450e603 100644
--- a/ggml/src/ggml-zendnn/ggml-zendnn.cpp
+++ b/ggml/src/ggml-zendnn/ggml-zendnn.cpp
@@ -243,8 +243,6 @@ static struct ggml_backend_i ggml_backend_zendnn_i = {
     /* .get_tensor_2d_async    = */ NULL,
     /* .set_tensor_2d_async    = */ NULL,
     /* .cpy_tensor_async       = */ NULL,
-    /* .shfl_tensor_async      = */ NULL,
-    /* .allreduce_tensor_async = */ NULL,
     /* .synchronize            = */ NULL,
     /* .graph_plan_create      = */ NULL,
     /* .graph_plan_free        = */ NULL,