ggml: backend-agnostic tensor parallelism (experimental) (#19378)

* ggml: backend-agnostic tensor parallelism

* support for GPT-OSS, Qwen 3 MoE

* partial Vulkan fix

* add support for 4/8 GPUs

* unconditional peer access

* re-use buffers + ggml contexts

* fix output pattern

* NCCL support

* GGML: HIP: add RCCL support

* Remove shfl and AllReduce from backend interface

* move allocation workaround out of ggml-alloc.c

* 2d tensor set/get support

* Fix the seg fault without NCCL

* Apply suggestion from JohannesGaessler

* support for tensor dims % n_devs != 0

* fix view_offs scaling

* arbitrary num. of GPUs/tensor split

* fix compilation

* better granularity estimate

* Support device-specific host buffer types if all underlying backends expose the same type. This allows using pinned memory instead of pageable memory for CUDA.

Fix compilation errors.
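
A rough sketch of the host-buffer-type check described above (illustrative only; the helper name is made up, but ggml_backend_dev_host_buffer_type() is an existing ggml API):

// return a device-specific host buffer type only if all underlying devices report the same one,
// otherwise fall back to pageable host memory (NULL)
static ggml_backend_buffer_type_t common_host_buft(ggml_backend_dev_t * devs, size_t n_devs) {
    ggml_backend_buffer_type_t buft = ggml_backend_dev_host_buffer_type(devs[0]);
    for (size_t i = 1; i < n_devs; ++i) {
        if (ggml_backend_dev_host_buffer_type(devs[i]) != buft) {
            return NULL;
        }
    }
    return buft; // e.g. CUDA pinned memory when all underlying devices are CUDA
}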

* partial Qwen 3 Next support

* Fix qwen3 30b (#8)

* Fix crash with Qwen-30B-A3B Q4_0

Qwen-30B-A3B Q4_0 has an intermediate dimension of 768. Using a granularity of 256 forces an uneven split between GPUs, which is not supported by the current implementation.

* Decide block size based on tensor quantization type
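
A rough sketch of the arithmetic behind the two notes above (illustrative helper, not part of the patch): a fixed granularity of 256 leaves 768/256 = 3 blocks, which cannot be divided evenly between 2 GPUs, while a granularity derived from the Q4_0 block size of 32 gives 24 blocks and an even 384/384 split.

#include <stdint.h>
#include <stdio.h>

// rows assigned to one device when splitting n_rows in units of `granularity`
static int64_t rows_for_device(int64_t n_rows, int64_t granularity, int n_devs, int dev) {
    const int64_t n_blocks = n_rows / granularity;            // assumes n_rows % granularity == 0
    const int64_t base     = n_blocks / n_devs;               // whole blocks per device
    const int64_t extra    = dev < n_blocks % n_devs ? 1 : 0; // leftover blocks go to the first devices
    return (base + extra) * granularity;
}

int main(void) {
    for (int dev = 0; dev < 2; ++dev) { // granularity 256 -> 512 + 256 (uneven, previously a crash)
        printf("granularity 256: device %d gets %3lld rows\n", dev, (long long) rows_for_device(768, 256, 2, dev));
    }
    for (int dev = 0; dev < 2; ++dev) { // granularity  32 -> 384 + 384 (even)
        printf("granularity  32: device %d gets %3lld rows\n", dev, (long long) rows_for_device(768,  32, 2, dev));
    }
    return 0;
}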

* Fix crashes due to KV cache serialization (#9)

KV cache serialization requires non-zero offsets on the tensor. Add support in the meta backend to set/get a tensor with a non-zero offset.
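
A minimal sketch of the access pattern this enables (function and variable names are illustrative, not from the patch): reading and writing a tensor slice through a non-zero offset, which the meta backend previously did not support.

#include <stdint.h>
#include <stdlib.h>
#include "ggml.h"
#include "ggml-backend.h"

// round-trip the second half of a tensor's data through host memory, as the
// KV cache serialization path does, using a non-zero offset into tensor->data
static void roundtrip_second_half(struct ggml_tensor * t) {
    const size_t half = ggml_nbytes(t)/2;
    uint8_t * buf = malloc(half);
    ggml_backend_tensor_get(t, buf, /*offset =*/ half, /*size =*/ half); // read at non-zero offset
    ggml_backend_tensor_set(t, buf, /*offset =*/ half, /*size =*/ half); // write back at the same offset
    free(buf);
}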

* metal : fix build (#7)

* static memory allocations, fix usage count

* fix tensor granularity

* more even memory distribution

* use BF16 for allreduce

* rebase fixup

* better error message for unsupported architectures

* Fix device mismatch during scatter of allReduce. (#11)

There is a mismatch between the dst buffer device and the backend device, which forces the use of synchronous copies.

* Enable the previous allreduce implementation. It is better in both perf and stability (#12)

* delay AllReduce for MoE to reduce I/O

* build : clean-up compile warnings

* backend : move most of the meta backend API to ggml-backend-impl.h

* cont : hide unused public API in the implementation

* llama : use llama_device + remove ggml_backend_dev_is_meta()

* ggml-backend : remove unused alloc include

* minor : remove regex include

* ggml : introduce ggml-ext.h for staging new APIs

* rebase fixup

* fix tests

* llama : more robust logic for determining Meta devices (#16)

* llama : more robust logic for determining Meta devices

* cont : fix devs size check

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cont : fix log type

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* disable roundtrip for meta backend

* fix arch selection

* Qwen 3.5 support

* fix Gemma 4 MoE

* fix OpenVino, SYCL

* fix test-llama-archs for CPU-only builds

* Fix Qwen 3.5 MoE

* disable meta backend tests for WebGPU

* tests : filter CPU-based devices from the Meta backend tests (#17)

* meta : formatting, naming, indentation (#18)

* formatting : llama-model.cpp

* formatting : ggml-ext.h

* formatting : ggml-backend-meta.cpp

* meta : add TODO

* add documentation

* better error messages

* fix GPT-OSS

---------

Co-authored-by: Carl Philipp Klemm <carl@uvos.xyz>
Co-authored-by: Gaurav Garg <gaugarg@nvidia.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Johannes Gäßler 2026-04-09 16:42:19 +02:00 committed by GitHub
parent 009a113326
commit d6f3030047
48 changed files with 3198 additions and 342 deletions

View File

@ -2348,19 +2348,21 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
}
).set_env("LLAMA_ARG_N_GPU_LAYERS"));
add_opt(common_arg(
{"-sm", "--split-mode"}, "{none,layer,row}",
{"-sm", "--split-mode"}, "{none,layer,row,tensor}",
"how to split the model across multiple GPUs, one of:\n"
"- none: use one GPU only\n"
"- layer (default): split layers and KV across GPUs\n"
"- row: split rows across GPUs",
"- layer (default): split layers and KV across GPUs (pipelined)\n"
"- row: split weight across GPUs by rows (parallelized)\n"
"- tensor: split weights and KV across GPUs (parallelized)",
[](common_params & params, const std::string & value) {
std::string arg_next = value;
if (arg_next == "none") {
if (value == "none") {
params.split_mode = LLAMA_SPLIT_MODE_NONE;
} else if (arg_next == "layer") {
} else if (value == "layer") {
params.split_mode = LLAMA_SPLIT_MODE_LAYER;
} else if (arg_next == "row") {
} else if (value == "row") {
params.split_mode = LLAMA_SPLIT_MODE_ROW;
} else if (value == "tensor") {
params.split_mode = LLAMA_SPLIT_MODE_TENSOR;
} else {
throw std::invalid_argument("invalid value");
}
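
With the new mode, tensor parallelism can be requested from the command line, for example (assumed invocation; the flags shown already exist in llama.cpp):

llama-cli -m model.gguf -ngl 99 -sm tensor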

View File

@ -7,6 +7,8 @@ set(GGML_VERSION_MINOR 9)
set(GGML_VERSION_PATCH 11)
set(GGML_VERSION_BASE "${GGML_VERSION_MAJOR}.${GGML_VERSION_MINOR}.${GGML_VERSION_PATCH}")
list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/")
find_program(GIT_EXE NAMES git git.exe NO_CMAKE_FIND_ROOT_PATH)
if(GIT_EXE)
# Get current git commit hash
@ -204,12 +206,14 @@ option(GGML_CUDA_NO_VMM "ggml: do not try to use CUDA VMM"
option(GGML_CUDA_FA "ggml: compile ggml FlashAttention CUDA kernels" ON)
option(GGML_CUDA_FA_ALL_QUANTS "ggml: compile all quants for FlashAttention" OFF)
option(GGML_CUDA_GRAPHS "ggml: use CUDA graphs (llama.cpp only)" ${GGML_CUDA_GRAPHS_DEFAULT})
option(GGML_CUDA_NCCL "ggml: use NVIDIA Collective Comm. Library" ON)
set (GGML_CUDA_COMPRESSION_MODE "size" CACHE STRING
"ggml: cuda link binary compression mode; requires cuda 12.8+")
set_property(CACHE GGML_CUDA_COMPRESSION_MODE PROPERTY STRINGS "none;speed;balance;size")
option(GGML_HIP "ggml: use HIP" OFF)
option(GGML_HIP_GRAPHS "ggml: use HIP graph, experimental, slow" OFF)
option(GGML_HIP_RCCL "ggml: use ROCm Collective Comm. Library" OFF)
option(GGML_HIP_NO_VMM "ggml: do not try to use HIP VMM" ON)
option(GGML_HIP_ROCWMMA_FATTN "ggml: enable rocWMMA for FlashAttention" OFF)
option(GGML_HIP_MMQ_MFMA "ggml: enable MFMA MMA for CDNA in MMQ" ON)

ggml/cmake/FindNCCL.cmake (new file, 36 lines added)
View File

@ -0,0 +1,36 @@
# cmake/FindNCCL.cmake
# NVIDIA does not distribute CMake files with NCCL, therefore use this file to find it instead.
find_path(NCCL_INCLUDE_DIR
NAMES nccl.h
HINTS ${NCCL_ROOT} $ENV{NCCL_ROOT} $ENV{CUDA_HOME} /usr/local/cuda
PATH_SUFFIXES include
)
find_library(NCCL_LIBRARY
NAMES nccl
HINTS ${NCCL_ROOT} $ENV{NCCL_ROOT} $ENV{CUDA_HOME} /usr/local/cuda
PATH_SUFFIXES lib lib64
)
include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(NCCL
DEFAULT_MSG
NCCL_LIBRARY NCCL_INCLUDE_DIR
)
if(NCCL_FOUND)
set(NCCL_LIBRARIES ${NCCL_LIBRARY})
set(NCCL_INCLUDE_DIRS ${NCCL_INCLUDE_DIR})
if(NOT TARGET NCCL::NCCL)
add_library(NCCL::NCCL UNKNOWN IMPORTED)
set_target_properties(NCCL::NCCL PROPERTIES
IMPORTED_LOCATION "${NCCL_LIBRARY}"
INTERFACE_INCLUDE_DIRECTORIES "${NCCL_INCLUDE_DIR}"
)
endif()
endif()
mark_as_advanced(NCCL_INCLUDE_DIR NCCL_LIBRARY)

View File

@ -68,7 +68,7 @@ extern "C" {
GGML_API void ggml_backend_buffer_reset (ggml_backend_buffer_t buffer);
// tensor copy between different backends
GGML_API void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst);
GGML_API void ggml_backend_tensor_copy(const struct ggml_tensor * src, struct ggml_tensor * dst);
//
// Backend (stream)
@ -83,13 +83,17 @@ extern "C" {
GGML_API size_t ggml_backend_get_alignment(ggml_backend_t backend);
GGML_API size_t ggml_backend_get_max_size(ggml_backend_t backend);
GGML_API void ggml_backend_tensor_set_async(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
GGML_API void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
GGML_API void ggml_backend_tensor_set_async (ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
GGML_API void ggml_backend_tensor_get_async (ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
GGML_API void ggml_backend_tensor_set_2d_async(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size, size_t n_copies, size_t stride_tensor, size_t stride_data);
GGML_API void ggml_backend_tensor_get_2d_async(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size, size_t n_copies, size_t stride_tensor, size_t stride_data);
// "offset" refers to the offset in tensor->data for setting/getting data
GGML_API void ggml_backend_tensor_set( struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
GGML_API void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
GGML_API void ggml_backend_tensor_memset( struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size);
GGML_API void ggml_backend_tensor_set ( struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
GGML_API void ggml_backend_tensor_get (const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
GGML_API void ggml_backend_tensor_set_2d( struct ggml_tensor * tensor, const void * data, size_t offset, size_t size, size_t n_copies, size_t stride_tensor, size_t stride_data);
GGML_API void ggml_backend_tensor_get_2d(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size, size_t n_copies, size_t stride_tensor, size_t stride_data);
GGML_API void ggml_backend_tensor_memset( struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size);
GGML_API void ggml_backend_synchronize(ggml_backend_t backend);
@ -109,7 +113,7 @@ extern "C" {
// the copy is performed after all the currently queued operations in backend_src
// backend_dst will wait for the copy to complete before performing other operations
// automatic fallback to sync copy if async is not supported
GGML_API void ggml_backend_tensor_copy_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, struct ggml_tensor * src, struct ggml_tensor * dst);
GGML_API void ggml_backend_tensor_copy_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, const struct ggml_tensor * src, struct ggml_tensor * dst);
GGML_API ggml_backend_dev_t ggml_backend_get_device(ggml_backend_t backend);
@ -135,7 +139,9 @@ extern "C" {
// integrated GPU device using host memory
GGML_BACKEND_DEVICE_TYPE_IGPU,
// accelerator devices intended to be used together with the CPU backend (e.g. BLAS or AMX)
GGML_BACKEND_DEVICE_TYPE_ACCEL
GGML_BACKEND_DEVICE_TYPE_ACCEL,
// "meta" device wrapping multiple other devices for tensor parallelism
GGML_BACKEND_DEVICE_TYPE_META,
};
// functionality supported by the device
@ -196,7 +202,9 @@ extern "C" {
// Common functions that may be obtained using ggml_backend_reg_get_proc_address
// Split buffer type for tensor parallelism
// AllReduce operation for tensor parallelism (meta backend)
typedef bool (*ggml_backend_allreduce_tensor_t)(ggml_backend_t * backends, struct ggml_tensor ** tensors, size_t n_backends);
// Split buffer type for tensor parallelism (old)
typedef ggml_backend_buffer_type_t (*ggml_backend_split_buffer_type_t)(int main_device, const float * tensor_split);
// Set the number of threads for the backend
typedef void (*ggml_backend_set_n_threads_t)(ggml_backend_t backend, int n_threads);
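
The 2D set/get variants added to this header are intended to collapse a loop of strided copies into a single call; the dispatcher in ggml-backend.cpp falls back to per-slice set/get when a backend does not implement them. A minimal usage sketch (the helper and its parameters are illustrative, not part of the patch):

#include "ggml.h"
#include "ggml-backend.h"

// upload n_rows row slices of row_bytes bytes each from a tightly packed host buffer
// into a device tensor whose rows are t->nb[1] bytes apart
static void upload_row_slices(struct ggml_tensor * t, const void * host, size_t n_rows, size_t row_bytes) {
    const size_t stride_tensor = t->nb[1];   // byte distance between consecutive rows in the tensor
    const size_t stride_data   = row_bytes;  // rows are packed tightly in the host buffer

    // equivalent to calling ggml_backend_tensor_set() once per row, but a single 2D copy on
    // backends that implement set_tensor_2d (e.g. cudaMemcpy2DAsync in the CUDA backend)
    ggml_backend_tensor_set_2d(t, host, /*offset =*/ 0, /*size =*/ row_bytes,
                               /*n_copies =*/ n_rows, stride_tensor, stride_data);
}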

View File

@ -27,6 +27,9 @@ GGML_BACKEND_API bool ggml_backend_is_cuda(ggml_backend_t backend);
// device buffer
GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device);
// conduct allreduce operation between devices
GGML_BACKEND_API bool ggml_backend_cuda_allreduce_tensor(ggml_backend_t * backends, struct ggml_tensor ** tensors, size_t n_backends);
// split tensor buffer that splits matrices by rows across multiple devices
GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(int main_device, const float * tensor_split);

View File

@ -200,6 +200,7 @@ add_library(ggml-base
ggml.cpp
ggml-alloc.c
ggml-backend.cpp
ggml-backend-meta.cpp
ggml-opt.cpp
ggml-threading.cpp
ggml-threading.h

View File

@ -1236,6 +1236,9 @@ size_t ggml_backend_alloc_ctx_tensors_from_buft_size(struct ggml_context * ctx,
ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft) {
size_t nbytes_total = 0;
if (ggml_backend_buft_is_meta(buft)) {
return ggml_backend_meta_alloc_ctx_tensors_from_buft(ctx, buft);
}
return ggml_backend_alloc_ctx_tensors_from_buft_impl(ctx, buft, &nbytes_total, /*no_alloc =*/ false);
}

View File

@ -49,6 +49,10 @@ extern "C" {
void (*memset_tensor)(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size);
void (*set_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
void (*get_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
// (optional) 2d data copies
void (*set_tensor_2d)(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size, size_t n_copies, size_t stride_tensor, size_t stride_data);
void (*get_tensor_2d)(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size, size_t n_copies, size_t stride_tensor, size_t stride_data);
// (optional) tensor copy: dst is in the buffer, src may be in any buffer, including buffers from a different backend (return false if not supported)
bool (*cpy_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst);
// clear the entire buffer
@ -80,6 +84,20 @@ extern "C" {
GGML_API bool ggml_backend_buffer_is_multi_buffer(ggml_backend_buffer_t buffer);
GGML_API void ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage);
//
// Backend (meta)
//
GGML_API bool ggml_backend_is_meta (ggml_backend_t backend);
GGML_API bool ggml_backend_buffer_is_meta(ggml_backend_buffer_t buf);
GGML_API bool ggml_backend_buft_is_meta (ggml_backend_buffer_type_t buft);
GGML_API size_t ggml_backend_meta_n_backends (ggml_backend_t meta_backend);
GGML_API ggml_backend_t ggml_backend_meta_simple_backend(ggml_backend_t meta_backend, size_t index);
// temporary workaround to statically allocate tensors from a context in a deduplicated way:
GGML_API struct ggml_backend_buffer * ggml_backend_meta_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft);
//
// Backend (stream)
//
@ -90,8 +108,10 @@ extern "C" {
void (*free)(ggml_backend_t backend);
// (optional) asynchronous tensor data access
void (*set_tensor_async)(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
void (*get_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
void (*set_tensor_async) (ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
void (*get_tensor_async) (ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
void (*set_tensor_2d_async)(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size, size_t n_copies, size_t stride_tensor, size_t stride_data);
void (*get_tensor_2d_async)(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size, size_t n_copies, size_t stride_tensor, size_t stride_data);
bool (*cpy_tensor_async)(ggml_backend_t backend_src, ggml_backend_t backend_dst, const struct ggml_tensor * src, struct ggml_tensor * dst);
// (optional) complete all pending operations (required if the backend supports async operations)

File diff suppressed because it is too large.

View File

@ -123,7 +123,7 @@ size_t ggml_backend_buffer_get_size(ggml_backend_buffer_t buffer) {
void * ggml_backend_buffer_get_base(ggml_backend_buffer_t buffer) {
GGML_ASSERT(buffer);
// get_base is optional if the buffer is zero-sized
if (buffer->size == 0) {
if (!ggml_backend_buffer_is_meta(buffer) && buffer->size == 0) {
return NULL;
}
@ -279,15 +279,57 @@ void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_ten
}
}
void ggml_backend_tensor_set_2d_async(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size,
size_t n_copies, size_t stride_tensor, size_t stride_data) {
GGML_ASSERT(backend);
GGML_ASSERT(tensor);
GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
if (n_copies <= 1 || backend->iface.set_tensor_2d_async == NULL) {
for (size_t i = 0; i < n_copies; i++) {
ggml_backend_tensor_set_async(backend, tensor, (const char *) data + i*stride_data, offset + i*stride_tensor, size);
}
return;
}
if (size == 0) {
return;
}
GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
GGML_ASSERT(offset + (n_copies-1)*stride_tensor + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
backend->iface.set_tensor_2d_async(backend, tensor, data, offset, size, n_copies, stride_tensor, stride_data);
}
void ggml_backend_tensor_get_2d_async(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size,
size_t n_copies, size_t stride_tensor, size_t stride_data) {
GGML_ASSERT(backend);
GGML_ASSERT(tensor);
GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
if (n_copies <= 1 || backend->iface.get_tensor_2d_async == NULL) {
for (size_t i = 0; i < n_copies; i++) {
ggml_backend_tensor_get_async(backend, tensor, (char *) data + i*stride_data, offset + i*stride_tensor, size);
}
return;
}
if (size == 0) {
return;
}
GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
GGML_ASSERT(offset + (n_copies-1)*stride_tensor + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
backend->iface.get_tensor_2d_async(backend, tensor, data, offset, size, n_copies, stride_tensor, stride_data);
}
void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
GGML_ASSERT(tensor);
ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
GGML_ASSERT(buf != NULL && "tensor buffer not set");
if (size == 0) {
return;
}
GGML_ASSERT(buf != NULL && "tensor buffer not set");
GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
@ -297,18 +339,62 @@ void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void * data, siz
void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
GGML_ASSERT(tensor);
ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
GGML_ASSERT(buf != NULL && "tensor buffer not set");
if (size == 0) {
return;
}
GGML_ASSERT(buf != NULL && "tensor buffer not set");
GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
buf->iface.get_tensor(buf, tensor, data, offset, size);
}
void ggml_backend_tensor_set_2d(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size,
size_t n_copies, size_t stride_tensor, size_t stride_data) {
GGML_ASSERT(tensor);
ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
GGML_ASSERT(buf != NULL && "tensor buffer not set");
if (n_copies <= 1 || buf->iface.set_tensor_2d == NULL) {
for (size_t i = 0; i < n_copies; i++) {
ggml_backend_tensor_set(tensor, (const char *) data + i*stride_data, offset + i*stride_tensor, size);
}
return;
}
if (size == 0) {
return;
}
GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
GGML_ASSERT(offset + (n_copies-1)*stride_tensor + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
buf->iface.set_tensor_2d(buf, tensor, data, offset, size, n_copies, stride_tensor, stride_data);
}
void ggml_backend_tensor_get_2d(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size,
size_t n_copies, size_t stride_tensor, size_t stride_data) {
GGML_ASSERT(tensor);
ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
GGML_ASSERT(buf != NULL && "tensor buffer not set");
if (n_copies <= 1 || buf->iface.get_tensor_2d == NULL) {
for (size_t i = 0; i < n_copies; i++) {
ggml_backend_tensor_get(tensor, (char *) data + i*stride_data, offset + i*stride_tensor, size);
}
return;
}
if (size == 0) {
return;
}
GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
GGML_ASSERT(offset + (n_copies-1)*stride_tensor + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
buf->iface.get_tensor_2d(buf, tensor, data, offset, size, n_copies, stride_tensor, stride_data);
}
void ggml_backend_tensor_memset(struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
GGML_ASSERT(tensor);
ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
@ -388,7 +474,7 @@ ggml_backend_dev_t ggml_backend_get_device(ggml_backend_t backend) {
// backend copy
void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst) {
void ggml_backend_tensor_copy(const struct ggml_tensor * src, struct ggml_tensor * dst) {
GGML_ASSERT(ggml_are_same_layout(src, dst) && "cannot copy tensors with different layouts");
if (src == dst) {
@ -402,7 +488,7 @@ void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst
} else if (!ggml_backend_buffer_copy_tensor(src, dst)) {
#ifndef NDEBUG
GGML_LOG_DEBUG("%s: warning: slow copy from %s to %s\n", __func__, ggml_backend_buffer_name(src->buffer), ggml_backend_buffer_name(dst->buffer));
#endif
#endif // NDEBUG
size_t nbytes = ggml_nbytes(src);
void * data = malloc(nbytes);
ggml_backend_tensor_get(src, data, 0, nbytes);
@ -411,7 +497,7 @@ void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst
}
}
void ggml_backend_tensor_copy_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, struct ggml_tensor * src, struct ggml_tensor * dst) {
void ggml_backend_tensor_copy_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, const struct ggml_tensor * src, struct ggml_tensor * dst) {
GGML_ASSERT(ggml_are_same_layout(src, dst) && "cannot copy tensors with different layouts");
if (src == dst) {
@ -500,6 +586,7 @@ enum ggml_backend_dev_type ggml_backend_dev_type(ggml_backend_dev_t device) {
}
void ggml_backend_dev_get_props(ggml_backend_dev_t device, struct ggml_backend_dev_props * props) {
GGML_ASSERT(device);
memset(props, 0, sizeof(*props));
device->iface.get_props(device, props);
}
@ -610,6 +697,8 @@ static const struct ggml_backend_buffer_i ggml_backend_multi_buffer_i = {
/* .memset_tensor = */ NULL,
/* .set_tensor = */ NULL,
/* .get_tensor = */ NULL,
/* .set_tensor_2d = */ NULL,
/* .get_tensor_2d = */ NULL,
/* .cpy_tensor = */ NULL,
/* .clear = */ ggml_backend_multi_buffer_clear,
/* .reset = */ NULL,
@ -1899,8 +1988,9 @@ enum ggml_status ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct
GGML_ASSERT(tensor->data == NULL);
GGML_ASSERT(tensor->view_src == NULL);
GGML_ASSERT(addr >= ggml_backend_buffer_get_base(buffer));
GGML_ASSERT((char *)addr + ggml_backend_buffer_get_alloc_size(buffer, tensor) <=
(char *)ggml_backend_buffer_get_base(buffer) + ggml_backend_buffer_get_size(buffer));
GGML_ASSERT(ggml_backend_buffer_is_meta(buffer) ||
(char *) addr + ggml_backend_buffer_get_alloc_size(buffer, tensor) <=
(char *) ggml_backend_buffer_get_base(buffer) + ggml_backend_buffer_get_size(buffer));
tensor->buffer = buffer;
tensor->data = addr;
@ -2174,6 +2264,8 @@ static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_i = {
/* .memset_tensor = */ ggml_backend_cpu_buffer_memset_tensor,
/* .set_tensor = */ ggml_backend_cpu_buffer_set_tensor,
/* .get_tensor = */ ggml_backend_cpu_buffer_get_tensor,
/* .set_tensor_2d = */ NULL,
/* .get_tensor_2d = */ NULL,
/* .cpy_tensor = */ ggml_backend_cpu_buffer_cpy_tensor,
/* .clear = */ ggml_backend_cpu_buffer_clear,
/* .reset = */ NULL,
@ -2186,6 +2278,8 @@ static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_from_ptr_i = {
/* .memset_tensor = */ ggml_backend_cpu_buffer_memset_tensor,
/* .set_tensor = */ ggml_backend_cpu_buffer_set_tensor,
/* .get_tensor = */ ggml_backend_cpu_buffer_get_tensor,
/* .set_tensor_2d = */ NULL,
/* .get_tensor_2d = */ NULL,
/* .cpy_tensor = */ ggml_backend_cpu_buffer_cpy_tensor,
/* .clear = */ ggml_backend_cpu_buffer_clear,
/* .reset = */ NULL,

View File

@ -262,6 +262,8 @@ static struct ggml_backend_i blas_backend_i = {
/* .get_name = */ ggml_backend_blas_get_name,
/* .free = */ ggml_backend_blas_free,
/* .set_tensor_async = */ NULL,
/* .get_tensor_async = */ NULL,
/* .set_tensor_2d_async = */ NULL,
/* .get_tensor_2d_async = */ NULL,
/* .cpy_tensor_async = */ NULL,
/* .synchronize = */ NULL,

View File

@ -1457,6 +1457,8 @@ static const ggml_backend_buffer_i ggml_backend_cann_buffer_interface = {
/* .memset_tensor = */ NULL,
/* .set_tensor = */ ggml_backend_cann_buffer_set_tensor,
/* .get_tensor = */ ggml_backend_cann_buffer_get_tensor,
/* .set_tensor_2d = */ NULL,
/* .get_tensor_2d = */ NULL,
/* .cpy_tensor = */ ggml_backend_cann_buffer_cpy_tensor,
/* .clear = */ ggml_backend_cann_buffer_clear,
/* .reset = */ NULL,
@ -2698,6 +2700,8 @@ static const ggml_backend_i ggml_backend_cann_interface = {
/* .free = */ ggml_backend_cann_free,
/* .set_tensor_async = */ ggml_backend_cann_set_tensor_async,
/* .get_tensor_async = */ ggml_backend_cann_get_tensor_async,
/* .set_tensor_2d_async = */ NULL,
/* .get_tensor_2d_async = */ NULL,
/* .cpy_tensor_async = */ ggml_backend_cann_cpy_tensor_async,
/* .synchronize = */ ggml_backend_cann_synchronize,
/* .graph_plan_create = */ NULL,

View File

@ -111,6 +111,8 @@ static ggml_backend_buffer_i ggml_backend_amx_buffer_interface = {
/* .memset_tensor = */ ggml_backend_amx_buffer_memset_tensor,
/* .set_tensor = */ ggml_backend_amx_buffer_set_tensor,
/* .get_tensor = */ nullptr,
/* .set_tensor_2d = */ nullptr,
/* .get_tensor_2d = */ nullptr,
/* .cpy_tensor = */ nullptr,
/* .clear = */ ggml_backend_amx_buffer_clear,
/* .reset = */ nullptr,

View File

@ -195,6 +195,8 @@ static const struct ggml_backend_i ggml_backend_cpu_i = {
/* .free = */ ggml_backend_cpu_free,
/* .set_tensor_async = */ NULL,
/* .get_tensor_async = */ NULL,
/* .set_tensor_2d_async = */ NULL,
/* .get_tensor_2d_async = */ NULL,
/* .cpy_tensor_async = */ NULL,
/* .synchronize = */ NULL,
/* .graph_plan_create = */ ggml_backend_cpu_graph_plan_create,

View File

@ -181,6 +181,16 @@ if (CUDAToolkit_FOUND)
target_link_libraries(ggml-cuda PRIVATE CUDA::cuda_driver)
endif()
if (GGML_CUDA_NCCL)
find_package(NCCL)
if (NCCL_FOUND)
add_compile_definitions(GGML_USE_NCCL)
target_link_libraries(ggml-cuda PRIVATE NCCL::NCCL)
else()
message(STATUS "Warning: NCCL not found, performance for multiple CUDA GPUs will be suboptimal")
endif()
endif()
set(CUDA_CXX_FLAGS "")
set(CUDA_FLAGS -use_fast_math -extended-lambda)

View File

@ -186,6 +186,10 @@ void ggml_cuda_error(const char * stmt, const char * func, const char * file, in
#define CUBLAS_CHECK(err) CUDA_CHECK_GEN(err, CUBLAS_STATUS_SUCCESS, cublas_get_error_str)
#ifdef GGML_USE_NCCL
#define NCCL_CHECK(err) CUDA_CHECK_GEN(err, ncclSuccess, ncclGetErrorString)
#endif // GGML_USE_NCCL
#if !defined(GGML_USE_HIP) && !defined(GGML_CUDA_NO_VMM)
static const char * cu_get_error_str(CUresult err) {
const char * err_str;
@ -1086,6 +1090,10 @@ struct ggml_cuda_device_info {
cuda_device_info devices[GGML_CUDA_MAX_DEVICES] = {};
std::array<float, GGML_CUDA_MAX_DEVICES> default_tensor_split = {};
#ifdef GGML_USE_NCCL
ncclComm_t comms[GGML_CUDA_MAX_DEVICES];
#endif // GGML_USE_NCCL
};
const ggml_cuda_device_info & ggml_cuda_info();

View File

@ -324,6 +324,28 @@ static ggml_cuda_device_info ggml_cuda_init() {
// configure logging to stdout
// CUBLAS_CHECK(cublasLoggerConfigure(1, 1, 0, nullptr));
for (int id = 0; id < info.device_count; ++id) {
ggml_cuda_set_device(id);
for (int id_other = 0; id_other < info.device_count; ++id_other) {
if (id == id_other) {
continue;
}
int can_access_peer;
CUDA_CHECK(cudaDeviceCanAccessPeer(&can_access_peer, id, id_other));
if (can_access_peer) {
CUDA_CHECK(cudaDeviceEnablePeerAccess(id_other, 0));
}
}
}
#ifdef GGML_USE_NCCL
int dev_ids[GGML_CUDA_MAX_DEVICES];
for (int id = 0; id < info.device_count; ++id) {
dev_ids[id] = id;
}
NCCL_CHECK(ncclCommInitAll(info.comms, info.device_count, dev_ids));
#endif // GGML_USE_NCCL
return info;
}
@ -632,26 +654,46 @@ static enum ggml_status ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer
}
static void ggml_backend_cuda_buffer_memset_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *) buffer->context;
ggml_cuda_set_device(ctx->device);
CUDA_CHECK(cudaMemsetAsync((char *)tensor->data + offset, value, size, cudaStreamPerThread));
CUDA_CHECK(cudaMemsetAsync((char *) tensor->data + offset, value, size, cudaStreamPerThread));
CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
}
static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *) buffer->context;
ggml_cuda_set_device(ctx->device);
CUDA_CHECK(cudaMemcpyAsync((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice, cudaStreamPerThread));
CUDA_CHECK(cudaMemcpyAsync((char *) tensor->data + offset, data, size, cudaMemcpyHostToDevice, cudaStreamPerThread));
CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
}
static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *) buffer->context;
ggml_cuda_set_device(ctx->device);
CUDA_CHECK(cudaMemcpyAsync(data, (const char *) tensor->data + offset, size, cudaMemcpyDeviceToHost, cudaStreamPerThread));
CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
}
static void ggml_backend_cuda_buffer_set_tensor_2d(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data,
size_t offset, size_t size, size_t n_copies, size_t stride_tensor, size_t stride_data) {
ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *) buffer->context;
ggml_cuda_set_device(ctx->device);
CUDA_CHECK(cudaMemcpy2DAsync(
(char *) tensor->data + offset, stride_tensor, data, stride_data, size, n_copies, cudaMemcpyHostToDevice, cudaStreamPerThread));
CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
}
static void ggml_backend_cuda_buffer_get_tensor_2d(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data,
size_t offset, size_t size, size_t n_copies, size_t stride_tensor, size_t stride_data) {
ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
ggml_cuda_set_device(ctx->device);
CUDA_CHECK(cudaMemcpyAsync(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost, cudaStreamPerThread));
CUDA_CHECK(cudaMemcpy2DAsync(
data, stride_data, (const char *) tensor->data + offset, stride_tensor, size, n_copies, cudaMemcpyDeviceToHost, cudaStreamPerThread));
CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
}
@ -691,6 +733,8 @@ static const ggml_backend_buffer_i ggml_backend_cuda_buffer_interface = {
/* .memset_tensor = */ ggml_backend_cuda_buffer_memset_tensor,
/* .set_tensor = */ ggml_backend_cuda_buffer_set_tensor,
/* .get_tensor = */ ggml_backend_cuda_buffer_get_tensor,
/* .set_tensor_2d = */ ggml_backend_cuda_buffer_set_tensor_2d,
/* .get_tensor_2d = */ ggml_backend_cuda_buffer_get_tensor_2d,
/* .cpy_tensor = */ ggml_backend_cuda_buffer_cpy_tensor,
/* .clear = */ ggml_backend_cuda_buffer_clear,
/* .reset = */ NULL,
@ -1003,6 +1047,8 @@ static const ggml_backend_buffer_i ggml_backend_cuda_split_buffer_interface = {
/* .memset_tensor = */ NULL,
/* .set_tensor = */ ggml_backend_cuda_split_buffer_set_tensor,
/* .get_tensor = */ ggml_backend_cuda_split_buffer_get_tensor,
/* .set_tensor_2d = */ NULL,
/* .get_tensor_2d = */ NULL,
/* .cpy_tensor = */ NULL,
/* .clear = */ ggml_backend_cuda_split_buffer_clear,
/* .reset = */ NULL,
@ -1079,6 +1125,83 @@ static const ggml_backend_buffer_type_i ggml_backend_cuda_split_buffer_type_inte
/* .is_host = */ ggml_backend_cuda_split_buffer_type_is_host,
};
bool ggml_backend_cuda_allreduce_tensor(ggml_backend_t * backends, struct ggml_tensor ** tensors, size_t n_backends) {
#ifdef GGML_USE_NCCL
const int64_t ne = ggml_nelements(tensors[0]);
// FIXME the input of llm_graph_context::build_in_out_ids can produce a tensor with 0 elements if n_outputs == 0
// This then causes a crash in this function
if (ne == 0) {
return true;
}
for (size_t i = 0; i < n_backends; ++i) {
GGML_ASSERT(tensors[i] != nullptr);
GGML_ASSERT(ggml_nelements(tensors[i]) == ne);
GGML_ASSERT(ggml_is_contiguously_allocated(tensors[i]));
}
const ggml_cuda_device_info info = ggml_cuda_info();
// For small tensors, simply reduce them as FP32.
// The following heuristic for how "small" a tensor should be is based on RTX 4090s connected via 16x PCIe 4.0.
if ((n_backends <= 2 && ne < 32768) || (n_backends == 3 && ne < 131072) || (n_backends >= 4 && ne < 262144)) {
NCCL_CHECK(ncclGroupStart());
for (size_t i = 0; i < n_backends; ++i) {
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *) backends[i]->context;
NCCL_CHECK(ncclAllReduce(tensors[i]->data, tensors[i]->data, ne, ncclFloat, ncclSum, info.comms[cuda_ctx->device], cuda_ctx->stream()));
}
NCCL_CHECK(ncclGroupEnd());
return true;
}
// For large tensors it's faster to compress them to BF16 for the reduction:
to_bf16_cuda_t to_bf16 = ggml_get_to_bf16_cuda(GGML_TYPE_F32);
to_fp32_cuda_t to_fp32 = ggml_get_to_fp32_cuda(GGML_TYPE_BF16);
ggml_cuda_pool_alloc<nv_bfloat16> tmp[GGML_CUDA_MAX_DEVICES];
for (size_t i = 0; i < n_backends; ++i) {
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *) backends[i]->context;
tmp[i].pool = &cuda_ctx->pool();
tmp[i].alloc(ne);
ggml_cuda_set_device(i);
to_bf16(tensors[i]->data, tmp[i].get(), ne, cuda_ctx->stream());
CUDA_CHECK(cudaGetLastError());
}
NCCL_CHECK(ncclGroupStart());
for (size_t i = 0; i < n_backends; ++i) {
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *) backends[i]->context;
NCCL_CHECK(ncclAllReduce(tmp[i].get(), tmp[i].get(), ne, ncclBfloat16, ncclSum, info.comms[cuda_ctx->device], cuda_ctx->stream()));
}
NCCL_CHECK(ncclGroupEnd());
for (size_t i = 0; i < n_backends; ++i) {
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *) backends[i]->context;
ggml_cuda_set_device(i);
to_fp32(tmp[i].get(), (float *) tensors[i]->data, ne, cuda_ctx->stream());
CUDA_CHECK(cudaGetLastError());
}
return true;
#else
// If NCCL is installed it is used by default for optimal performance.
// However, NVIDIA does not distribute NCCL with CUDA so users may be unwittingly missing this package.
// RCCL is disabled by default, so users explicitly opt in.
// Therefore, print no warning for RCCL.
#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
static bool warning_printed = false;
if (!warning_printed) {
GGML_LOG_WARN("%s: NVIDIA Collective Communications Library (NCCL) is unavailable, multi GPU performance will be suboptimal\n", __func__);
warning_printed = true;
}
#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
GGML_UNUSED_VARS(backends, tensors, n_backends);
return false;
#endif // GGML_USE_NCCL
}
ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(int main_device, const float * tensor_split) {
static std::mutex mutex;
std::lock_guard<std::mutex> lock(mutex);
@ -1425,64 +1548,6 @@ static void ggml_cuda_op_mul_mat_cublas(
GGML_UNUSED_VARS(dst, src1_ddq_i, src1_padded_row_size);
}
static void ggml_cuda_set_peer_access(const int n_tokens, int main_device) {
static bool peer_access_enabled = false;
const bool enable_peer_access = n_tokens <= GGML_CUDA_PEER_MAX_BATCH_SIZE;
if (peer_access_enabled == enable_peer_access) {
return;
}
#ifdef NDEBUG
for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
ggml_cuda_set_device(id);
CUDA_CHECK(cudaDeviceSynchronize());
}
for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
ggml_cuda_set_device(id);
for (int id_other = 0; id_other < ggml_backend_cuda_get_device_count(); ++id_other) {
if (id == id_other) {
continue;
}
if (id != main_device && id_other != main_device) {
continue;
}
int can_access_peer;
CUDA_CHECK(cudaDeviceCanAccessPeer(&can_access_peer, id, id_other));
if (can_access_peer) {
if (enable_peer_access) {
cudaError_t err = cudaDeviceEnablePeerAccess(id_other, 0);
if (err != cudaErrorPeerAccessAlreadyEnabled) {
CUDA_CHECK(err);
} else {
// reset the error
(void)cudaGetLastError();
}
} else {
cudaError_t err = cudaDeviceDisablePeerAccess(id_other);
if (err != cudaErrorPeerAccessNotEnabled) {
CUDA_CHECK(err);
} else {
// reset the error
(void)cudaGetLastError();
}
}
}
}
}
ggml_cuda_set_device(main_device);
#endif // NDEBUG
peer_access_enabled = enable_peer_access;
GGML_UNUSED(main_device);
}
static cudaError_t ggml_cuda_Memcpy2DPeerAsync(
void * dst, int dstDevice, size_t dpitch, void * src, int srcDevice, size_t spitch, size_t width, size_t height, cudaStream_t stream) {
@ -2483,11 +2548,6 @@ static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor *
}
static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct ggml_tensor * dst) {
// why is this here instead of mul_mat?
if (dst->src[0] != nullptr && ggml_backend_buft_is_cuda_split(dst->src[0]->buffer->buft)) {
ggml_cuda_set_peer_access(dst->src[1]->ne[1], ctx.device);
}
switch (dst->op) {
case GGML_OP_ARGMAX:
ggml_cuda_argmax(ctx, dst);
@ -2845,21 +2905,43 @@ static void ggml_backend_cuda_free(ggml_backend_t backend) {
}
static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *) backend->context;
ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
GGML_ASSERT(buf->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
CUDA_CHECK(cudaMemcpyAsync((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice, cuda_ctx->stream()));
CUDA_CHECK(cudaMemcpyAsync((char *) tensor->data + offset, data, size, cudaMemcpyHostToDevice, cuda_ctx->stream()));
}
static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *) backend->context;
ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
GGML_ASSERT(buf->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
CUDA_CHECK(cudaMemcpyAsync(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost, cuda_ctx->stream()));
CUDA_CHECK(cudaMemcpyAsync(data, (const char *) tensor->data + offset, size, cudaMemcpyDeviceToHost, cuda_ctx->stream()));
}
static void ggml_backend_cuda_set_tensor_2d_async(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data,
size_t offset, size_t size, size_t n_copies, size_t stride_tensor, size_t stride_data) {
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *) backend->context;
ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
GGML_ASSERT(buf->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
CUDA_CHECK(cudaMemcpy2DAsync(
(char *) tensor->data + offset, stride_tensor, data, stride_data, size, n_copies, cudaMemcpyHostToDevice, cuda_ctx->stream()));
}
static void ggml_backend_cuda_get_tensor_2d_async(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data,
size_t offset, size_t size, size_t n_copies, size_t stride_tensor, size_t stride_data) {
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *) backend->context;
ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
GGML_ASSERT(buf->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
CUDA_CHECK(cudaMemcpy2DAsync(
data, stride_data, (const char *) tensor->data + offset, stride_tensor, size, n_copies, cudaMemcpyDeviceToHost, cuda_ctx->stream()));
}
static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, const ggml_tensor * src, ggml_tensor * dst) {
@ -2870,21 +2952,21 @@ static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_src, ggml_
return false;
}
if (!ggml_backend_buffer_is_cuda(src->buffer) || !ggml_backend_buffer_is_cuda(dst->buffer)) {
if (!ggml_backend_buffer_is_cuda(buf_src) || !ggml_backend_buffer_is_cuda(buf_dst)) {
return false;
}
// device -> device copy
ggml_backend_cuda_context * cuda_ctx_src = (ggml_backend_cuda_context *)backend_src->context;
ggml_backend_cuda_context * cuda_ctx_dst = (ggml_backend_cuda_context *)backend_dst->context;
ggml_backend_cuda_context * cuda_ctx_src = (ggml_backend_cuda_context *) backend_src->context;
ggml_backend_cuda_context * cuda_ctx_dst = (ggml_backend_cuda_context *) backend_dst->context;
ggml_backend_cuda_buffer_context * buf_ctx_src = (ggml_backend_cuda_buffer_context *)buf_src->context;
ggml_backend_cuda_buffer_context * buf_ctx_dst = (ggml_backend_cuda_buffer_context *)buf_dst->context;
ggml_backend_cuda_buffer_context * buf_ctx_src = (ggml_backend_cuda_buffer_context *) buf_src->context;
ggml_backend_cuda_buffer_context * buf_ctx_dst = (ggml_backend_cuda_buffer_context *) buf_dst->context;
if (cuda_ctx_src->device != buf_ctx_src->device || cuda_ctx_dst->device != buf_ctx_dst->device) {
#ifndef NDEBUG
GGML_LOG_DEBUG("%s: backend and buffer devices do not match\n", __func__);
#endif
#endif // NDEBUG
return false;
}
@ -2897,7 +2979,7 @@ static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_src, ggml_
return false;
#else
CUDA_CHECK(cudaMemcpyPeerAsync(dst->data, cuda_ctx_dst->device, src->data, cuda_ctx_src->device, ggml_nbytes(dst), cuda_ctx_src->stream()));
#endif
#endif // GGML_CUDA_NO_PEER_COPY
}
// record event on src stream after the copy
@ -4343,6 +4425,8 @@ static const ggml_backend_i ggml_backend_cuda_interface = {
/* .free = */ ggml_backend_cuda_free,
/* .set_tensor_async = */ ggml_backend_cuda_set_tensor_async,
/* .get_tensor_async = */ ggml_backend_cuda_get_tensor_async,
/* .set_tensor_2d_async = */ ggml_backend_cuda_set_tensor_2d_async,
/* .get_tensor_2d_async = */ ggml_backend_cuda_get_tensor_2d_async,
/* .cpy_tensor_async = */ ggml_backend_cuda_cpy_tensor_async,
/* .synchronize = */ ggml_backend_cuda_synchronize,
/* .graph_plan_create = */ NULL,
@ -5130,6 +5214,9 @@ static ggml_backend_feature * ggml_backend_cuda_get_features(ggml_backend_reg_t
static void * ggml_backend_cuda_reg_get_proc_address(ggml_backend_reg_t reg, const char * name) {
GGML_UNUSED(reg);
if (strcmp(name, "ggml_backend_allreduce_tensor") == 0) {
return (void *)ggml_backend_cuda_allreduce_tensor;
}
if (strcmp(name, "ggml_backend_split_buffer_type") == 0) {
return (void *)ggml_backend_cuda_split_buffer_type;
}

View File

@ -6,6 +6,10 @@
#include <cuda_bf16.h>
#include <cuda_fp16.h>
#ifdef GGML_USE_NCCL
#include <nccl.h>
#endif // GGML_USE_NCCL
#if CUDART_VERSION >= 11080
#include <cuda_fp8.h>
#define FP8_AVAILABLE

View File

@ -10,6 +10,11 @@
#include <rocwmma/rocwmma-version.hpp>
#endif // defined(GGML_HIP_ROCWMMA_FATTN)
#ifdef GGML_USE_NCCL
#include <rccl/rccl.h>
#endif // GGML_USE_NCCL
#define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT
#define CUBLAS_GEMM_DEFAULT_TENSOR_OP HIPBLAS_GEMM_DEFAULT
#define CUBLAS_OP_N HIPBLAS_OP_N
@ -28,6 +33,7 @@
#define CU_MEM_LOCATION_TYPE_DEVICE hipMemLocationTypeDevice
#define CU_MEM_ACCESS_FLAGS_PROT_READWRITE hipMemAccessFlagsProtReadWrite
#define CU_CHECK(fn) {hipError_t err = fn; if(err != hipSuccess) { GGML_ABORT("HipVMM Failure: %s\n", hipGetErrorString(err)); }}
#define NCCL_CHECK(fn) {ncclResult_t err = fn; if(err != ncclSuccess) { GGML_ABORT("RCCL failure, RCCL returned: %i\n", err); }}
#define __shfl_sync(mask, var, laneMask, width) __shfl(var, laneMask, width)
#define __shfl_up_sync(mask, var, laneMask, width) __shfl_up(var, laneMask, width)
#define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width)

ggml/src/ggml-ext.h (new file, 56 lines added)
View File

@ -0,0 +1,56 @@
#pragma once
#include "ggml.h"
#include "ggml-backend.h"
// This is a "staging" header for new ggml API
// It is not publicly available and it should not be used by 3rd party projects
//
// When the API matures enough, it will be moved to the official public API
//
// Meta backend
//
#define GGML_BACKEND_META_MAX_DEVICES 16
enum ggml_backend_meta_split_axis {
// tensor split by tensor dimensions:
GGML_BACKEND_SPLIT_AXIS_0 = 0,
GGML_BACKEND_SPLIT_AXIS_1 = 1,
GGML_BACKEND_SPLIT_AXIS_2 = 2,
GGML_BACKEND_SPLIT_AXIS_3 = 3,
GGML_BACKEND_SPLIT_AXIS_MIRRORED = 10, // all values on all backends
GGML_BACKEND_SPLIT_AXIS_PARTIAL = 11, // each backend has a partial sum
// for internal bookkeeping only:
GGML_BACKEND_SPLIT_AXIS_NONE = 98,
GGML_BACKEND_SPLIT_AXIS_UNKNOWN = 99,
};
GGML_API const char * ggml_backend_meta_split_axis_name(enum ggml_backend_meta_split_axis split_axis);
struct ggml_backend_meta_split_state {
enum ggml_backend_meta_split_axis axis;
// for tensors with axis >= 0 && axis < GGML_MAX_DIMS:
// - each device has a slice of the tensor along the split axis
// - most tensors have n_segments == 1 and a contiguous slice of the tensor data
// - some tensors have an inhomogeneous data layout along the split axis,
// those tensors are divided into segments which are each individually split across devices
// - ne has one entry per segment and device that add up to ggml_tensor::ne for that axis,
// the outer/inner loops are over segments/devices like [seg0_dev0, seg0_dev1, seg1_dev0, seg1_dev1],
// - for example, a transformer may have a fused QKV matrix rather than 3 matrices, those would be 3 separate segments
// that each need to be split individually across devices so that each device gets a slice of Q, K, and V
int64_t ne[16*GGML_BACKEND_META_MAX_DEVICES];
uint32_t n_segments;
};
// function to assign split states for statically allocated tensors, compute tensor split states will be assigned to be compatible:
typedef struct ggml_backend_meta_split_state (*ggml_backend_meta_get_split_state_t)(const struct ggml_tensor * tensor, void * userdata);
// create a new meta device from "simple" devices, meta buffer type/buffer/backend is then derived from this:
// TODO: this looks a bit strange - a backend API creates a device. I think we should try
// express this as a backend registry functionality instead
GGML_API ggml_backend_dev_t ggml_backend_meta_device(
ggml_backend_dev_t * devs, size_t n_devs, ggml_backend_meta_get_split_state_t get_split_state, void * get_split_state_ud);
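
A minimal sketch of wiring the meta device up from the underlying "simple" devices (illustrative only: the callback splits every tensor along ne[1] into a single segment and ignores the granularity constraints the real implementation enforces):

#include "ggml-backend.h"
#include "ggml-ext.h"

static struct ggml_backend_meta_split_state split_rows(const struct ggml_tensor * tensor, void * userdata) {
    const int64_t n_devs = *(const int64_t *) userdata;
    struct ggml_backend_meta_split_state state = {0};
    state.axis       = GGML_BACKEND_SPLIT_AXIS_1;
    state.n_segments = 1;
    // one ne entry per device for the single segment; the entries add up to tensor->ne[1]
    for (int64_t d = 0; d < n_devs; ++d) {
        state.ne[d] = tensor->ne[1]/n_devs + (d < tensor->ne[1] % n_devs ? 1 : 0);
    }
    return state;
}

ggml_backend_dev_t make_meta_device(ggml_backend_dev_t * devs, size_t n_devs) {
    static int64_t n_devs_ud;          // userdata must outlive the meta device in this sketch
    n_devs_ud = (int64_t) n_devs;
    return ggml_backend_meta_device(devs, n_devs, split_rows, &n_devs_ud);
}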

View File

@ -1491,6 +1491,8 @@ static ggml_backend_buffer_i ggml_backend_hexagon_buffer_interface = {
/* .memset_tensor = */ NULL,
/* .set_tensor = */ ggml_backend_hexagon_buffer_set_tensor,
/* .get_tensor = */ ggml_backend_hexagon_buffer_get_tensor,
/* .set_tensor_2d = */ NULL,
/* .get_tensor_2d = */ NULL,
/* .cpy_tensor = */ ggml_backend_hexagon_buffer_cpy_tensor,
/* .clear = */ ggml_backend_hexagon_buffer_clear,
/* .reset = */ NULL,
@ -3002,6 +3004,8 @@ static struct ggml_backend_i hexagon_backend_i = {
/* .free = */ ggml_backend_hexagon_free,
/* .set_tensor_async = */ NULL,
/* .get_tensor_async = */ NULL,
/* .set_tensor_2d_async = */ NULL,
/* .get_tensor_2d_async = */ NULL,
/* .cpy_tensor_async = */ NULL,
/* .synchronize = */ ggml_backend_hexagon_synchronize,
/* .graph_plan_create = */ NULL,

View File

@ -47,6 +47,10 @@ find_package(hip REQUIRED)
find_package(hipblas REQUIRED)
find_package(rocblas REQUIRED)
if (GGML_HIP_RCCL)
find_package(rccl REQUIRED)
endif()
if (${hip_VERSION} VERSION_LESS 6.1)
message(FATAL_ERROR "At least ROCM/HIP V6.1 is required")
endif()
@ -118,6 +122,10 @@ if (NOT GGML_HIP_MMQ_MFMA)
add_compile_definitions(GGML_HIP_NO_MMQ_MFMA)
endif()
if (GGML_HIP_RCCL)
add_compile_definitions(GGML_USE_NCCL) # RCCL has the same interface as NCCL.
endif()
if (GGML_HIP_EXPORT_METRICS)
set(CMAKE_HIP_FLAGS "${CMAKE_HIP_FLAGS} -Rpass-analysis=kernel-resource-usage --save-temps")
endif()
@ -142,4 +150,8 @@ if (GGML_STATIC)
message(FATAL_ERROR "Static linking not supported for HIP/ROCm")
endif()
if (GGML_HIP_RCCL)
target_link_libraries(ggml-hip PRIVATE ggml-base roc::rccl)
endif()
target_link_libraries(ggml-hip PRIVATE ggml-base hip::host roc::rocblas roc::hipblas)

View File

@ -90,6 +90,8 @@ static ggml_backend_buffer_i ggml_backend_metal_buffer_shared_i = {
/* .memset_tensor = */ ggml_backend_metal_buffer_shared_memset_tensor,
/* .set_tensor = */ ggml_backend_metal_buffer_shared_set_tensor,
/* .get_tensor = */ ggml_backend_metal_buffer_shared_get_tensor,
/* .set_tensor_2d = */ NULL,
/* .get_tensor_2d = */ NULL,
/* .cpy_tensor = */ ggml_backend_metal_buffer_shared_cpy_tensor,
/* .clear = */ ggml_backend_metal_buffer_shared_clear,
/* .reset = */ NULL,
@ -158,15 +160,17 @@ static void ggml_backend_metal_buffer_private_clear(ggml_backend_buffer_t buffer
}
static ggml_backend_buffer_i ggml_backend_metal_buffer_private_i = {
/* .free_buffer = */ ggml_backend_metal_buffer_private_free_buffer,
/* .get_base = */ ggml_backend_metal_buffer_private_get_base,
/* .init_tensor = */ NULL,
/* .memset_tensor = */ ggml_backend_metal_buffer_private_memset_tensor,
/* .set_tensor = */ ggml_backend_metal_buffer_private_set_tensor,
/* .get_tensor = */ ggml_backend_metal_buffer_private_get_tensor,
/* .cpy_tensor = */ ggml_backend_metal_buffer_private_cpy_tensor,
/* .clear = */ ggml_backend_metal_buffer_private_clear,
/* .reset = */ NULL,
/* .free_buffer = */ ggml_backend_metal_buffer_private_free_buffer,
/* .get_base = */ ggml_backend_metal_buffer_private_get_base,
/* .init_tensor = */ NULL,
/* .memset_tensor = */ ggml_backend_metal_buffer_private_memset_tensor,
/* .set_tensor = */ ggml_backend_metal_buffer_private_set_tensor,
/* .get_tensor = */ ggml_backend_metal_buffer_private_get_tensor,
/* .set_tensor_2d = */ NULL,
/* .get_tensor_2d = */ NULL,
/* .cpy_tensor = */ ggml_backend_metal_buffer_private_cpy_tensor,
/* .clear = */ ggml_backend_metal_buffer_private_clear,
/* .reset = */ NULL,
};
static bool ggml_backend_buffer_is_metal(ggml_backend_buffer_t buffer) {
@ -563,6 +567,8 @@ static ggml_backend_i ggml_backend_metal_i = {
/* .free = */ ggml_backend_metal_free,
/* .set_tensor_async = */ ggml_backend_metal_set_tensor_async,
/* .get_tensor_async = */ ggml_backend_metal_get_tensor_async,
/* .set_tensor_2d_async = */ NULL,
/* .get_tensor_2d_async = */ NULL,
/* .cpy_tensor_async = */ ggml_backend_metal_cpy_tensor_async, // only needed for multi-GPU setups
/* .synchronize = */ ggml_backend_metal_synchronize,
/* .graph_plan_create = */ NULL,

View File

@ -4063,6 +4063,8 @@ static ggml_backend_i ggml_backend_opencl_i = {
/* .set_tensor_async = */ NULL, /* ggml_backend_opencl_set_tensor_async */
/* .get_tensor_async = */ NULL, /* ggml_backend_opencl_get_tensor_async */
/* .set_tensor_2d_async = */ NULL,
/* .get_tensor_2d_async = */ NULL,
/* .cpy_tensor_async = */ NULL, /* ggml_backend_opencl_cpy_tensor_async */
/* .synchronize = */ ggml_backend_opencl_synchronize,
/* .graph_plan_create = */ NULL,
/* .graph_plan_free = */ NULL,
@ -5778,6 +5780,8 @@ static ggml_backend_buffer_i ggml_backend_opencl_buffer_interface = {
/* .memset_tensor = */ NULL,
/* .set_tensor = */ ggml_backend_opencl_buffer_set_tensor,
/* .get_tensor = */ ggml_backend_opencl_buffer_get_tensor,
/* .set_tensor_2d = */ NULL,
/* .get_tensor_2d = */ NULL,
/* .cpy_tensor = */ NULL,
/* .clear = */ ggml_backend_opencl_buffer_clear,
/* .reset = */ ggml_backend_opencl_buffer_reset,

View File

@ -412,6 +412,8 @@ static const ggml_backend_buffer_i ggml_backend_openvino_buffer_interface = {
/* .memset_tensor = */ ggml_backend_openvino_buffer_memset_tensor,
/* .set_tensor = */ ggml_backend_openvino_buffer_set_tensor,
/* .get_tensor = */ ggml_backend_openvino_buffer_get_tensor,
/* .set_tensor_2d = */ NULL,
/* .get_tensor_2d = */ NULL,
/* .cpy_tensor = */ ggml_backend_openvino_buffer_cpy_tensor,
/* .clear = */ ggml_backend_openvino_buffer_clear,
/* .reset = */ NULL,
@ -617,6 +619,8 @@ static const ggml_backend_i ggml_backend_openvino_interface = {
/* .free = */ ggml_backend_openvino_free,
/* .set_tensor_async = */ NULL,
/* .get_tensor_async = */ NULL,
/* .set_tensor_2d_async = */ NULL,
/* .get_tensor_2d_async = */ NULL,
/* .cpy_tensor_async = */ NULL,
/* .synchronize = */ NULL,
/* .graph_plan_create = */ NULL,

View File

@ -706,6 +706,8 @@ static ggml_backend_buffer_i ggml_backend_rpc_buffer_interface = {
/* .memset_tensor = */ NULL,
/* .set_tensor = */ ggml_backend_rpc_buffer_set_tensor,
/* .get_tensor = */ ggml_backend_rpc_buffer_get_tensor,
/* .set_tensor_2d = */ NULL,
/* .get_tensor_2d = */ NULL,
/* .cpy_tensor = */ ggml_backend_rpc_buffer_cpy_tensor,
/* .clear = */ ggml_backend_rpc_buffer_clear,
/* .reset = */ NULL,
@ -894,6 +896,8 @@ static ggml_backend_i ggml_backend_rpc_interface = {
/* .set_tensor_async = */ NULL,
/* .get_tensor_async = */ NULL,
/* .set_tensor_2d_async = */ NULL,
/* .get_tensor_2d_async = */ NULL,
/* .cpy_tensor_async = */ NULL,
/* .synchronize = */ ggml_backend_rpc_synchronize,
/* .graph_plan_create = */ NULL,
/* .graph_plan_free = */ NULL,

View File

@ -638,6 +638,8 @@ static const ggml_backend_buffer_i ggml_backend_sycl_buffer_interface = {
/* .memset_tensor = */ ggml_backend_sycl_buffer_memset_tensor,
/* .set_tensor = */ ggml_backend_sycl_buffer_set_tensor,
/* .get_tensor = */ ggml_backend_sycl_buffer_get_tensor,
/* .set_tensor_2d = */ NULL,
/* .get_tensor_2d = */ NULL,
/* .cpy_tensor = */ ggml_backend_sycl_buffer_cpy_tensor,
/* .clear = */ ggml_backend_sycl_buffer_clear,
/* .reset = */ ggml_backend_sycl_buffer_reset,
@ -1084,6 +1086,8 @@ static struct ggml_backend_buffer_i ggml_backend_sycl_split_buffer_interface = {
/* .memset_tensor = */ NULL,
/* .set_tensor = */ ggml_backend_sycl_split_buffer_set_tensor,
/* .get_tensor = */ ggml_backend_sycl_split_buffer_get_tensor,
/* .set_tensor_2d = */ NULL,
/* .get_tensor_2d = */ NULL,
/* .cpy_tensor = */ NULL,
/* .clear = */ ggml_backend_sycl_split_buffer_clear,
/* .reset = */ NULL,
@ -4553,6 +4557,8 @@ static ggml_backend_i ggml_backend_sycl_interface = {
/* .free = */ ggml_backend_sycl_free,
/* .set_tensor_async = */ ggml_backend_sycl_set_tensor_async,
/* .get_tensor_async = */ ggml_backend_sycl_get_tensor_async,
/* .set_tensor_2d_async = */ NULL,
/* .get_tensor_2d_async = */ NULL,
/* .cpy_tensor_async = */ NULL, // ggml_backend_sycl_cpy_tensor_async,
// TODO: update for the new interface

View File

@ -101,6 +101,8 @@ const ggml_backend_buffer_i ggml_backend_remoting_buffer_interface = {
/* .memset_tensor = */ NULL,
/* .set_tensor = */ ggml_backend_remoting_buffer_set_tensor,
/* .get_tensor = */ ggml_backend_remoting_buffer_get_tensor,
/* .set_tensor_2d = */ NULL,
/* .get_tensor_2d = */ NULL,
/* .cpy_tensor = */ ggml_backend_remoting_buffer_cpy_tensor,
/* .clear = */ ggml_backend_remoting_buffer_clear,
/* .reset = */ NULL,
@ -113,6 +115,8 @@ const ggml_backend_buffer_i ggml_backend_remoting_buffer_from_ptr_interface = {
/* .memset_tensor = */ NULL,
/* .set_tensor = */ ggml_backend_remoting_buffer_set_tensor_from_ptr,
/* .get_tensor = */ ggml_backend_remoting_buffer_get_tensor_from_ptr,
/* .set_tensor_2d = */ NULL,
/* .get_tensor_2d = */ NULL,
/* .cpy_tensor = */ ggml_backend_remoting_buffer_cpy_tensor,
/* .clear = */ ggml_backend_remoting_buffer_clear,
/* .reset = */ NULL,

View File

@ -34,6 +34,8 @@ static ggml_backend_i ggml_backend_remoting_interface = {
/* .free = */ ggml_backend_remoting_free,
/* .set_tensor_async = */ NULL, // ggml_backend_remoting_set_tensor_async,
/* .get_tensor_async = */ NULL, // ggml_backend_remoting_get_tensor_async,
/* .set_tensor_2d_async = */ NULL,
/* .get_tensor_2d_async = */ NULL,
/* .cpy_tensor_async = */ NULL, // ggml_backend_remoting_cpy_tensor_async,
/* .synchronize = */ NULL, // ggml_backend_remoting_synchronize,
/* .graph_plan_create = */ NULL,

View File

@ -13521,6 +13521,8 @@ static ggml_backend_buffer_i ggml_backend_vk_buffer_interface = {
/* .memset_tensor = */ ggml_backend_vk_buffer_memset_tensor,
/* .set_tensor = */ ggml_backend_vk_buffer_set_tensor,
/* .get_tensor = */ ggml_backend_vk_buffer_get_tensor,
/* .set_tensor_2d = */ NULL,
/* .get_tensor_2d = */ NULL,
/* .cpy_tensor = */ ggml_backend_vk_buffer_cpy_tensor,
/* .clear = */ ggml_backend_vk_buffer_clear,
/* .reset = */ NULL,
@ -14979,6 +14981,8 @@ static ggml_backend_i ggml_backend_vk_interface = {
/* .free = */ ggml_backend_vk_free,
/* .set_tensor_async = */ ggml_backend_vk_set_tensor_async,
/* .get_tensor_async = */ ggml_backend_vk_get_tensor_async,
/* .get_tensor_2d_async = */ NULL,
/* .set_tensor_2d_async = */ NULL,
/* .cpy_tensor_async = */ ggml_backend_vk_cpy_tensor_async,
/* .synchronize = */ ggml_backend_vk_synchronize,
/* .graph_plan_create = */ NULL,

View File

@ -3013,6 +3013,8 @@ static ggml_backend_i ggml_backend_webgpu_i = {
/* .free = */ ggml_backend_webgpu_free,
/* .set_tensor_async = */ NULL,
/* .get_tensor_async = */ NULL,
/* .get_tensor_2d_async = */ NULL,
/* .set_tensor_2d_async = */ NULL,
/* .cpy_tensor_async = */ NULL,
/* .synchronize = */ NULL,
/* .graph_plan_create = */ NULL,
@ -3170,6 +3172,8 @@ static ggml_backend_buffer_i ggml_backend_webgpu_buffer_interface = {
/* .memset_tensor = */ ggml_backend_webgpu_buffer_memset_tensor,
/* .set_tensor = */ ggml_backend_webgpu_buffer_set_tensor,
/* .get_tensor = */ ggml_backend_webgpu_buffer_get_tensor,
/* .set_tensor_2d = */ NULL,
/* .get_tensor_2d = */ NULL,
/* .cpy_tensor = */ NULL, // TODO: optional, implement this
/* .clear = */ ggml_backend_webgpu_buffer_clear,
/* .reset = */ NULL, // TODO: optional, think it coordinates with

View File

@ -313,6 +313,8 @@ static ggml_backend_buffer_i ggml_backend_zdnn_buffer_i = {
/* .memset_tensor = */ ggml_backend_zdnn_buffer_memset_tensor,
/* .set_tensor = */ ggml_backend_zdnn_buffer_set_tensor,
/* .get_tensor = */ ggml_backend_zdnn_buffer_get_tensor,
/* .set_tensor_2d = */ NULL,
/* .get_tensor_2d = */ NULL,
/* .cpy_tensor = */ NULL,
/* .clear = */ ggml_backend_zdnn_buffer_clear,
/* .reset = */ NULL,
@ -417,20 +419,22 @@ static enum ggml_status ggml_backend_zdnn_graph_compute(ggml_backend_t backend,
}
static ggml_backend_i ggml_backend_zdnn_i = {
/* .get_name = */ ggml_backend_zdnn_name,
/* .free = */ ggml_backend_zdnn_free,
/* .set_tensor_async = */ NULL,
/* .get_tensor_async = */ NULL,
/* .cpy_tensor_async = */ NULL,
/* .synchronize = */ NULL,
/* .graph_plan_create = */ NULL,
/* .graph_plan_free = */ NULL,
/* .graph_plan_update = */ NULL,
/* .graph_plan_compute = */ NULL,
/* .graph_compute = */ ggml_backend_zdnn_graph_compute,
/* .event_record = */ NULL,
/* .event_wait = */ NULL,
/* .graph_optimize = */ NULL,
/* .get_name = */ ggml_backend_zdnn_name,
/* .free = */ ggml_backend_zdnn_free,
/* .set_tensor_async = */ NULL,
/* .get_tensor_async = */ NULL,
/* .get_tensor_2d_async = */ NULL,
/* .set_tensor_2d_async = */ NULL,
/* .cpy_tensor_async = */ NULL,
/* .synchronize = */ NULL,
/* .graph_plan_create = */ NULL,
/* .graph_plan_free = */ NULL,
/* .graph_plan_update = */ NULL,
/* .graph_plan_compute = */ NULL,
/* .graph_compute = */ ggml_backend_zdnn_graph_compute,
/* .event_record = */ NULL,
/* .event_wait = */ NULL,
/* .graph_optimize = */ NULL,
};
static ggml_guid_t ggml_backend_zdnn_guid(void) {

View File

@ -407,6 +407,8 @@ static struct ggml_backend_i ggml_backend_zendnn_i = {
/* .free = */ ggml_backend_zendnn_free,
/* .set_tensor_async = */ NULL,
/* .get_tensor_async = */ NULL,
/* .get_tensor_2d_async = */ NULL,
/* .set_tensor_2d_async = */ NULL,
/* .cpy_tensor_async = */ NULL,
/* .synchronize = */ NULL,
/* .graph_plan_create = */ NULL,

View File

@ -192,9 +192,10 @@ extern "C" {
LLAMA_API const char * llama_flash_attn_type_name(enum llama_flash_attn_type flash_attn_type);
enum llama_split_mode {
LLAMA_SPLIT_MODE_NONE = 0, // single GPU
LLAMA_SPLIT_MODE_LAYER = 1, // split layers and KV across GPUs
LLAMA_SPLIT_MODE_ROW = 2, // split layers and KV across GPUs, use tensor parallelism if supported
LLAMA_SPLIT_MODE_NONE = 0, // single GPU
LLAMA_SPLIT_MODE_LAYER = 1, // split layers and KV across GPUs
LLAMA_SPLIT_MODE_ROW = 2, // split layers and KV across GPUs, use tensor parallelism if supported
LLAMA_SPLIT_MODE_TENSOR = 3,
};
// TODO: simplify (https://github.com/ggml-org/llama.cpp/pull/9294#pullrequestreview-2286561979)
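
Editor's note: the new enum value is requested through the existing llama_model_params API. A minimal sketch, assuming the public llama.h entry points shown here ("model.gguf" is a placeholder path):

    #include "llama.h"

    int main() {
        // Sketch: request tensor parallelism across all visible GPUs.
        llama_model_params mparams = llama_model_default_params();
        mparams.split_mode = LLAMA_SPLIT_MODE_TENSOR;

        llama_model * model = llama_model_load_from_file("model.gguf", mparams); // placeholder path
        if (model == nullptr) {
            return 1; // load fails e.g. for architectures without tensor-parallel support
        }
        llama_model_free(model);
        return 0;
    }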

View File

@ -873,3 +873,34 @@ bool llm_arch_is_diffusion(const llm_arch & arch) {
return false;
}
}
bool llm_arch_supports_sm_tensor(const llm_arch & arch) {
switch (arch) {
case LLM_ARCH_GROK:
case LLM_ARCH_MPT:
case LLM_ARCH_PLAMO2:
case LLM_ARCH_MINICPM3:
case LLM_ARCH_GEMMA3N:
case LLM_ARCH_MAMBA:
case LLM_ARCH_MAMBA2:
case LLM_ARCH_JAMBA:
case LLM_ARCH_FALCON_H1:
case LLM_ARCH_OLMO2:
case LLM_ARCH_OLMOE:
case LLM_ARCH_DEEPSEEK2:
case LLM_ARCH_GLM_DSA:
case LLM_ARCH_BITNET:
case LLM_ARCH_T5:
case LLM_ARCH_NEMOTRON_H:
case LLM_ARCH_NEMOTRON_H_MOE:
case LLM_ARCH_GRANITE_HYBRID:
case LLM_ARCH_LFM2:
case LLM_ARCH_LFM2MOE:
case LLM_ARCH_MINIMAX_M2:
case LLM_ARCH_MISTRAL4:
case LLM_ARCH_KIMI_LINEAR:
return false;
default:
return true;
}
}
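
Editor's note: the helper works as a blocklist, so any architecture not listed above is treated as supporting tensor-parallel split mode by default. A minimal illustration using the internal enum (values taken from the list above):

    // not listed above -> supported by default
    GGML_ASSERT( llm_arch_supports_sm_tensor(LLM_ARCH_LLAMA));
    // listed above -> explicitly unsupported
    GGML_ASSERT(!llm_arch_supports_sm_tensor(LLM_ARCH_MAMBA));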

View File

@ -630,6 +630,7 @@ llm_arch llm_arch_from_string(const std::string & name);
const llm_tensor_info & llm_tensor_info_for(llm_tensor tensor);
bool llm_arch_is_recurrent(const llm_arch & arch);
bool llm_arch_is_hybrid (const llm_arch & arch);
bool llm_arch_is_diffusion(const llm_arch & arch);
bool llm_arch_is_recurrent (const llm_arch & arch);
bool llm_arch_is_hybrid (const llm_arch & arch);
bool llm_arch_is_diffusion (const llm_arch & arch);
bool llm_arch_supports_sm_tensor(const llm_arch & arch);

View File

@ -1,5 +1,6 @@
#include "llama-context.h"
#include "ggml.h"
#include "llama-arch.h"
#include "llama-impl.h"
#include "llama-batch.h"
@ -8,6 +9,7 @@
#include "llama-mmap.h"
#include "llama-model.h"
#include "llama-ext.h"
#include "llama.h"
#include <cinttypes>
#include <cmath>
@ -217,10 +219,10 @@ llama_context::llama_context(
if (!hparams.vocab_only) {
// GPU backends
for (auto * dev : model.devices) {
ggml_backend_t backend = ggml_backend_dev_init(dev, nullptr);
for (const auto & dev : model.devices) {
ggml_backend_t backend = ggml_backend_dev_init(dev.dev, nullptr);
if (backend == nullptr) {
throw std::runtime_error(format("failed to initialize %s backend", ggml_backend_dev_name(dev)));
throw std::runtime_error(format("failed to initialize %s backend", ggml_backend_dev_name(dev.dev)));
}
backends.emplace_back(backend);
}
@ -295,8 +297,8 @@ llama_context::llama_context(
if (backend_type == GGML_BACKEND_DEVICE_TYPE_CPU && !model.devices.empty()) {
// use the host buffer of the first device CPU for faster transfer of the intermediate state
auto * dev = model.devices[0];
auto * host_buft = ggml_backend_dev_host_buffer_type(dev);
const auto & dev = model.devices[0];
auto * host_buft = ggml_backend_dev_host_buffer_type(dev.dev);
if (host_buft) {
buft = host_buft;
}
@ -1020,9 +1022,11 @@ void llama_context::set_abort_callback(bool (*abort_callback)(void * data), void
for (auto & backend : backends) {
auto * reg = ggml_backend_dev_backend_reg(ggml_backend_get_device(backend.get()));
auto * set_abort_callback_fn = (ggml_backend_set_abort_callback_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_abort_callback");
if (set_abort_callback_fn) {
set_abort_callback_fn(backend.get(), this->abort_callback, this->abort_callback_data);
if (reg) {
auto * set_abort_callback_fn = (ggml_backend_set_abort_callback_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_abort_callback");
if (set_abort_callback_fn) {
set_abort_callback_fn(backend.get(), this->abort_callback, this->abort_callback_data);
}
}
}
}
@ -2942,6 +2946,21 @@ llama_context * llama_init_from_model(
params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_DISABLED;
}
if (model->split_mode() == LLAMA_SPLIT_MODE_TENSOR) {
if (params.flash_attn_type == LLAMA_FLASH_ATTN_TYPE_AUTO) {
LLAMA_LOG_INFO("%s: enabling flash_attn since it is required for SPLIT_MODE_TENSOR\n", __func__);
params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_ENABLED;
}
if (params.flash_attn_type != LLAMA_FLASH_ATTN_TYPE_ENABLED) {
LLAMA_LOG_ERROR("%s: SPLIT_MODE_TENSOR requires flash_attn to be enabled\n", __func__);
return nullptr;
}
if (ggml_is_quantized(params.type_k) || ggml_is_quantized(params.type_v)) {
LLAMA_LOG_ERROR("%s: simultaneous use of SPLIT_MODE_TENSOR and KV cache quantization not implemented\n", __func__);
return nullptr;
}
}
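
Editor's note: the checks above imply a specific context configuration for tensor split mode. A sketch of parameters that pass them, assuming "model" was loaded with LLAMA_SPLIT_MODE_TENSOR:

    llama_context_params cparams = llama_context_default_params();
    cparams.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_ENABLED; // AUTO would be promoted to ENABLED
    cparams.type_k = GGML_TYPE_F16; // quantized KV cache types are rejected in this mode
    cparams.type_v = GGML_TYPE_F16;
    llama_context * ctx = llama_init_from_model(model, cparams);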
if (params.flash_attn_type != LLAMA_FLASH_ATTN_TYPE_DISABLED && ggml_is_quantized(params.type_k)) {
const uint32_t blck_size = ggml_blck_size(params.type_k);
for (uint32_t il = 0; il < model->hparams.n_layer; ++il) {
@ -3475,7 +3494,7 @@ void llama_perf_context_reset(llama_context * ctx) {
}
void llama_memory_breakdown_print(const struct llama_context * ctx) {
const std::vector<ggml_backend_dev_t> & devices = ctx->get_model().devices;
const auto & devices = ctx->get_model().devices;
std::map<ggml_backend_buffer_type_t, llama_memory_breakdown_data> memory_breakdown = ctx->memory_breakdown();
@ -3511,7 +3530,7 @@ void llama_memory_breakdown_print(const struct llama_context * ctx) {
if (dev) {
int i_dev = -1;
for (size_t i = 0; i < devices.size(); i++) {
if (devices[i] == dev) {
if (devices[i].dev == dev) {
i_dev = i;
break;
}
@ -3528,7 +3547,7 @@ void llama_memory_breakdown_print(const struct llama_context * ctx) {
// print memory breakdown for each device:
for (size_t i = 0; i < devices.size(); i++) {
ggml_backend_dev_t dev = devices[i];
ggml_backend_dev_t dev = devices[i].dev;
llama_memory_breakdown_data mb = mb_dev[i];
const std::string name = ggml_backend_dev_name(dev);

View File

@ -1586,6 +1586,8 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
cb(experts, "ffn_moe_weighted", il);
}
ggml_build_forward_expand(gf, experts);
ggml_tensor * cur_experts[LLAMA_MAX_EXPERTS] = { nullptr };
assert(n_expert_used > 0);
@ -1605,6 +1607,8 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
for (uint32_t i = 1; i < hparams.n_expert_used; ++i) {
moe_out = ggml_add(ctx0, moe_out, cur_experts[i]);
ggml_build_forward_expand(gf, moe_out);
}
if (hparams.n_expert_used == 1) {
@ -2443,7 +2447,7 @@ ggml_tensor * llm_graph_context::build_rs(
ggml_build_forward_expand(gf,
ggml_cpy(ctx0,
states_extra,
ggml_view_1d(ctx0, s, state_size*(n_rs - n_seqs), (rs_head + n_seqs)*state_size*ggml_element_size(s))));
ggml_view_2d(ctx0, s, state_size, (n_rs - n_seqs), s->nb[1], (rs_head + n_seqs)*s->nb[1])));
return output_states;
}
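
Editor's note: the state copy switches from a 1d view to a 2d view. For a contiguous state tensor both forms address the same bytes; the 2d form additionally keeps the per-cell row structure needed to split the state dimension per device. A sketch of the equivalence (ctx0, s, state_size, n and head stand in for the surrounding variables):

    // old form: one flat range of state_size*n elements starting at cell `head`
    ggml_tensor * v1 = ggml_view_1d(ctx0, s, state_size*n, head*state_size*ggml_element_size(s));
    // new form: n rows of state_size elements each, same starting offset
    ggml_tensor * v2 = ggml_view_2d(ctx0, s, state_size, n, s->nb[1], head*s->nb[1]);
    // for a contiguous s (s->nb[1] == state_size*ggml_element_size(s)), v1 and v2 cover identical memory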

View File

@ -1,5 +1,6 @@
#include "llama-memory-recurrent.h"
#include "ggml-backend.h"
#include "llama-impl.h"
#include "llama-io.h"
#include "llama-batch.h"
@ -91,8 +92,8 @@ llama_memory_recurrent::llama_memory_recurrent(
throw std::runtime_error("failed to create ggml context for rs cache");
}
ggml_tensor * r = ggml_new_tensor_1d(ctx, type_r, hparams.n_embd_r()*mem_size);
ggml_tensor * s = ggml_new_tensor_1d(ctx, type_s, hparams.n_embd_s()*mem_size);
ggml_tensor * r = ggml_new_tensor_2d(ctx, type_r, hparams.n_embd_r(), mem_size);
ggml_tensor * s = ggml_new_tensor_2d(ctx, type_s, hparams.n_embd_s(), mem_size);
ggml_format_name(r, "cache_r_l%d", i);
ggml_format_name(s, "cache_s_l%d", i);
r_l[i] = r;

View File

@ -1,6 +1,7 @@
#include "llama-model.h"
#include "ggml.h"
#include "llama-arch.h"
#include "llama-hparams.h"
#include "llama-impl.h"
#include "llama-mmap.h"
#include "llama-cparams.h"
@ -12,9 +13,13 @@
#include "llama-memory-hybrid-iswa.h"
#include "llama-memory-recurrent.h"
#include "models/models.h"
#include "ggml.h"
#include "ggml-cpp.h"
#include "models/models.h"
// TODO: tmp until the ggml meta backend matures and becomes public
#include "../src/ggml-ext.h"
#include <algorithm>
#include <cassert>
@ -24,9 +29,330 @@
#include <cmath>
#include <functional>
#include <map>
#include <numeric>
#include <regex>
#include <sstream>
#include <stdexcept>
#include <string>
#include <vector>
struct ggml_backend_meta_split_state llama_meta_device_get_split_state(const struct ggml_tensor * tensor, void * userdata) {
const llama_meta_device_get_split_state_userdata * ud = (const llama_meta_device_get_split_state_userdata *) userdata;
const llama_hparams & hparams = ud->model->hparams;
const std::string tensor_name = tensor->name;
const std::regex pattern_q_weight ("blk\\.\\d*\\.attn_q.weight");
const std::regex pattern_kv_weight ("blk\\.\\d*\\.attn_(k|v).weight");
const std::regex pattern_qkv_weight ("blk\\.\\d*\\.attn_qkv.weight");
const std::regex pattern_q_bias ("blk\\.\\d*\\.attn_q\\.bias");
const std::regex pattern_kv_bias ("blk\\.\\d*\\.attn_(k|v)\\.bias");
const std::regex pattern_qkv_bias ("blk\\.\\d*\\.attn_qkv.bias");
const std::regex pattern_qk_norm ("blk\\.\\d*\\.attn_(q|k)_norm\\.weight");
const std::regex pattern_kv_cache ("cache_(k|v)_l\\d*");
const std::regex pattern_attn_sinks ("blk\\.\\d*\\.attn_sinks.weight");
const std::regex pattern_attn_out_weight ("blk\\.\\d*\\.attn_output.weight");
const std::regex pattern_attn_out_bias ("blk\\.\\d*\\.attn_output.bias");
const std::regex pattern_attn_gate_weight("blk\\.\\d*\\.attn_gate.weight");
const std::regex pattern_ssm_dt ("blk\\.\\d*\\.ssm_dt.bias");
const std::regex pattern_ssm_a ("blk\\.\\d*\\.ssm_a");
const std::regex pattern_ssm_alpha ("blk\\.\\d*\\.ssm_alpha.weight");
const std::regex pattern_ssm_beta ("blk\\.\\d*\\.ssm_beta.weight");
const std::regex pattern_ssm_beta_alpha ("blk\\.\\d*\\.ssm_ba.weight");
const std::regex pattern_r_cache ("cache_r_l\\d*");
const std::regex pattern_s_cache ("cache_s_l\\d*");
const std::regex pattern_ssm_conv1d ("blk\\.\\d*\\.ssm_conv1d.weight");
const std::regex pattern_ssm_out_weight ("blk\\.\\d*\\.ssm_out.weight");
const std::regex pattern_ffn_up_gate_weight("blk\\.\\d*\\.ffn_(up|gate)(_exps)?.weight");
const std::regex pattern_ffn_up_gate_bias ("blk\\.\\d*\\.ffn_(up|gate)(_exps)?.bias");
const std::regex pattern_ffn_gate_up_weight("blk\\.\\d*\\.ffn_gate_up(_exps)?.weight");
const std::regex pattern_ffn_down_weight ("blk\\.\\d*\\.ffn_down(_exps)?.weight");
const std::regex pattern_ffn_down_bias ("blk\\.\\d*\\.ffn_down.bias");
const std::regex pattern_ffn_down_exps_bias("blk\\.\\d*\\.ffn_down_exps.bias");
const std::regex pattern_output_weight("output\\.weight");
const std::regex pattern_output_bias ("output\\.bias");
struct tensor_config {
ggml_backend_meta_split_axis axis;
const ggml_tensor * tensor_axis_0;
uint32_t il;
size_t rotation;
};
auto get_tensor_config_impl = [&](
const ggml_backend_meta_split_axis axis, const std::string & suffix = "", const std::string & suffix_fallback = "") -> tensor_config {
uint32_t il;
std::string prefix;
size_t rotation;
if (tensor_name.substr(0, 4) == "blk.") {
const size_t length_prefix = tensor_name.find('.', 4);
GGML_ASSERT(length_prefix != std::string::npos);
prefix = tensor_name.substr(0, length_prefix + 1);
il = std::stoull(tensor_name.substr(4, length_prefix - 4));
rotation = il % ud->n_devices;
} else if (tensor_name.substr(0, 6) == "cache_") {
const size_t layer_index_start = tensor_name.find("_l", 6);
GGML_ASSERT(layer_index_start != std::string::npos);
il = std::stoull(tensor_name.substr(layer_index_start + 2));
prefix = "blk." + std::to_string(il) + ".";
rotation = il % ud->n_devices;
} else {
il = 0;
rotation = hparams.n_layer % ud->n_devices;
}
const ggml_tensor * tensor_axis_0 = suffix.empty() ? tensor : ud->model->get_tensor((prefix + suffix).c_str());
if (tensor_axis_0 == nullptr) {
GGML_ASSERT(!suffix_fallback.empty());
tensor_axis_0 = ud->model->get_tensor((prefix + suffix_fallback).c_str());
}
GGML_ASSERT(tensor_axis_0 != nullptr);
return {axis, tensor_axis_0, il, rotation};
};
auto get_tensor_config = [&]() -> tensor_config {
// standard attention
if (std::regex_match(tensor_name, pattern_q_weight) || std::regex_match(tensor_name, pattern_kv_weight)) {
return get_tensor_config_impl(GGML_BACKEND_SPLIT_AXIS_1, "attn_output.weight");
}
if (std::regex_match(tensor_name, pattern_q_bias) || std::regex_match(tensor_name, pattern_kv_bias)) {
return get_tensor_config_impl(GGML_BACKEND_SPLIT_AXIS_0, "attn_output.weight");
}
if (std::regex_match(tensor_name, pattern_qkv_weight)) {
return get_tensor_config_impl(GGML_BACKEND_SPLIT_AXIS_1);
}
if ( std::regex_match(tensor_name, pattern_qkv_bias)) {
return get_tensor_config_impl(GGML_BACKEND_SPLIT_AXIS_0);
}
if (std::regex_match(tensor_name, pattern_qk_norm)) {
return get_tensor_config_impl(tensor->ne[1] == 1 ? GGML_BACKEND_SPLIT_AXIS_MIRRORED : GGML_BACKEND_SPLIT_AXIS_1, "attn_output.weight");
}
if (std::regex_match(tensor_name, pattern_kv_cache) || std::regex_match(tensor_name, pattern_attn_sinks)) {
return get_tensor_config_impl(GGML_BACKEND_SPLIT_AXIS_0, "attn_output.weight");
}
if (std::regex_match(tensor_name, pattern_attn_out_weight)) {
return get_tensor_config_impl(GGML_BACKEND_SPLIT_AXIS_0);
}
if (std::regex_match(tensor_name, pattern_attn_out_bias)) {
return get_tensor_config_impl(GGML_BACKEND_SPLIT_AXIS_MIRRORED);
}
if (std::regex_match(tensor_name, pattern_attn_gate_weight)) {
return get_tensor_config_impl(GGML_BACKEND_SPLIT_AXIS_1);
}
if (std::regex_match(tensor_name, pattern_ssm_dt) || std::regex_match(tensor_name, pattern_ssm_a)) {
return get_tensor_config_impl(GGML_BACKEND_SPLIT_AXIS_0, "ssm_out.weight");
}
if (std::regex_match(tensor_name, pattern_ssm_alpha) || std::regex_match(tensor_name, pattern_ssm_beta) ||
std::regex_match(tensor_name, pattern_ssm_beta_alpha)) {
return get_tensor_config_impl(GGML_BACKEND_SPLIT_AXIS_1, "ssm_out.weight");
}
if (std::regex_match(tensor_name, pattern_r_cache) || std::regex_match(tensor_name, pattern_s_cache)) {
return get_tensor_config_impl(GGML_BACKEND_SPLIT_AXIS_0, "ssm_out.weight");
}
if (std::regex_match(tensor_name, pattern_ssm_conv1d)) {
return get_tensor_config_impl(GGML_BACKEND_SPLIT_AXIS_1, "ssm_out.weight");
}
if (std::regex_match(tensor_name, pattern_ssm_out_weight)) {
return get_tensor_config_impl(GGML_BACKEND_SPLIT_AXIS_0);
}
// FFN
if (std::regex_match(tensor_name, pattern_ffn_up_gate_weight)) {
return get_tensor_config_impl(GGML_BACKEND_SPLIT_AXIS_1, "ffn_down.weight", "ffn_down_exps.weight");
}
if (std::regex_match(tensor_name, pattern_ffn_up_gate_bias)) {
return get_tensor_config_impl(GGML_BACKEND_SPLIT_AXIS_0, "ffn_down.weight", "ffn_down_exps.weight");
}
if (std::regex_match(tensor_name, pattern_ffn_gate_up_weight)) {
return get_tensor_config_impl(GGML_BACKEND_SPLIT_AXIS_1, "ffn_down.weight", "ffn_down_exps.weight");
}
if (std::regex_match(tensor_name, pattern_ffn_down_weight)) {
return get_tensor_config_impl(GGML_BACKEND_SPLIT_AXIS_0, "ffn_down.weight", "ffn_down_exps.weight");
}
if (std::regex_match(tensor_name, pattern_ffn_down_bias)) {
return get_tensor_config_impl(GGML_BACKEND_SPLIT_AXIS_MIRRORED);
}
if (std::regex_match(tensor_name, pattern_ffn_down_exps_bias)) {
return get_tensor_config_impl(GGML_BACKEND_SPLIT_AXIS_PARTIAL);
}
// output
if (std::regex_match(tensor_name, pattern_output_weight)) {
return get_tensor_config_impl(GGML_BACKEND_SPLIT_AXIS_1);
}
if (std::regex_match(tensor_name, pattern_output_bias)) {
const ggml_tensor * output_weight = ud->model->get_tensor("output.weight");
GGML_ASSERT(output_weight != nullptr);
return get_tensor_config_impl(GGML_BACKEND_SPLIT_AXIS_0);
}
// everything else
return get_tensor_config_impl(GGML_BACKEND_SPLIT_AXIS_MIRRORED);
};
auto get_split_segments = [&](int axis, uint32_t il) -> std::vector<int64_t> {
if (ud->model->arch == LLM_ARCH_QWEN3NEXT || ud->model->arch == LLM_ARCH_QWEN35 || ud->model->arch == LLM_ARCH_QWEN35MOE) {
const int64_t head_k_dim = hparams.ssm_d_state;
const int64_t head_v_dim = hparams.ssm_d_state;
const int64_t n_k_heads = hparams.ssm_n_group;
const int64_t n_v_heads = hparams.ssm_dt_rank;
const int64_t key_dim = head_k_dim * n_k_heads;
const int64_t value_dim = head_v_dim * n_v_heads;
const int64_t head_ratio = n_v_heads / n_k_heads;
if (std::regex_match(tensor_name, pattern_qkv_weight) || std::regex_match(tensor_name, pattern_ssm_conv1d)) {
GGML_ASSERT(tensor->ne[axis] == 2*key_dim + value_dim);
return std::vector<int64_t>(2 + head_ratio, key_dim);
}
if (std::regex_match(tensor_name, pattern_attn_gate_weight) || std::regex_match(tensor_name, pattern_ssm_out_weight)) {
return std::vector<int64_t>(head_ratio, key_dim);
}
if (std::regex_match(tensor_name, pattern_ssm_dt) || std::regex_match(tensor_name, pattern_ssm_a) ||
std::regex_match(tensor_name, pattern_ssm_alpha) || std::regex_match(tensor_name, pattern_ssm_beta)) {
return std::vector<int64_t>(head_ratio, n_k_heads);
}
if (std::regex_match(tensor_name, pattern_r_cache)) {
return std::vector<int64_t>(2 + head_ratio, key_dim * (hparams.ssm_d_conv - 1));
}
if (std::regex_match(tensor_name, pattern_s_cache)) {
return std::vector<int64_t>(head_ratio, n_k_heads * head_v_dim * head_v_dim);
}
if (std::regex_match(tensor_name, pattern_ffn_gate_up_weight)) {
const int64_t n_ff_exp = hparams.n_ff_exp;
GGML_ASSERT(tensor->ne[axis] == 2*n_ff_exp);
return {n_ff_exp, n_ff_exp};
}
return {tensor->ne[axis]};
}
if (std::regex_match(tensor_name, pattern_qkv_weight) || std::regex_match(tensor_name, pattern_qkv_bias)) {
const int64_t n_embd = hparams.n_embd;
const int64_t n_embd_gqa = hparams.n_embd_v_gqa(il);
GGML_ASSERT(hparams.n_embd_k_gqa() == n_embd_gqa);
GGML_ASSERT(tensor->ne[axis] == n_embd + 2*n_embd_gqa);
return {n_embd, n_embd_gqa, n_embd_gqa};
}
if (std::regex_match(tensor_name, pattern_ffn_gate_up_weight)) {
const int64_t n_ff_exp = hparams.n_ff_exp;
GGML_ASSERT(tensor->ne[axis] == 2*n_ff_exp);
return {n_ff_exp, n_ff_exp};
}
return {tensor->ne[axis]};
};
auto get_split_granularity = [&](int64_t blck_size, uint32_t il, const std::vector<int64_t> & segments) -> std::vector<int64_t> {
if (hparams.is_recurrent(il)) {
// linear attention
const int64_t head_dim = hparams.ssm_d_state;
const int64_t granularity_qkv = std::lcm(blck_size, head_dim);
if (std::regex_match(tensor_name, pattern_qkv_weight) || std::regex_match(tensor_name, pattern_attn_gate_weight) ||
std::regex_match(tensor_name, pattern_ssm_conv1d) || std::regex_match(tensor_name, pattern_ssm_out_weight)) {
return std::vector<int64_t>(segments.size(), granularity_qkv);
}
if (std::regex_match(tensor_name, pattern_ssm_dt) || std::regex_match(tensor_name, pattern_ssm_a) ||
std::regex_match(tensor_name, pattern_ssm_alpha) || std::regex_match(tensor_name, pattern_ssm_beta)) {
return std::vector<int64_t>(segments.size(), granularity_qkv / head_dim);
}
if (std::regex_match(tensor_name, pattern_r_cache)) {
return std::vector<int64_t>(segments.size(), granularity_qkv * (hparams.ssm_d_conv - 1));
}
if (std::regex_match(tensor_name, pattern_s_cache)) {
return std::vector<int64_t>(segments.size(), granularity_qkv * head_dim);
}
} else {
// regular attention
const uint32_t n_gqa = hparams.n_gqa(il);
const uint32_t n_embd_q = n_gqa * hparams.n_embd_head_k(il);
if (std::regex_match(tensor_name, pattern_attn_sinks)) {
GGML_ASSERT(segments.size() == 1);
return {std::lcm(n_embd_q, blck_size)/n_embd_q * n_gqa};
}
const int64_t granularity_q = std::lcm(n_embd_q, blck_size);
if (std::regex_match(tensor_name, pattern_q_weight) || std::regex_match(tensor_name, pattern_q_bias)) {
GGML_ASSERT(segments.size() == 1);
// some models have Q gate tensors, for those cases the granularity needs to be doubled:
if (ud->model->arch == LLM_ARCH_QWEN3NEXT || ud->model->arch == LLM_ARCH_QWEN35 || ud->model->arch == LLM_ARCH_QWEN35MOE) {
return {std::lcm(2*n_embd_q, blck_size)};
}
return {granularity_q};
}
if (std::regex_match(tensor_name, pattern_attn_out_weight)) {
GGML_ASSERT(segments.size() == 1);
return {granularity_q};
}
const int64_t granularity_kv = granularity_q / n_gqa;
if (std::regex_match(tensor_name, pattern_kv_weight) ||
std::regex_match(tensor_name, pattern_kv_bias) ||
std::regex_match(tensor_name, pattern_kv_cache)) {
GGML_ASSERT(segments.size() == 1);
return {granularity_kv};
}
if (std::regex_match(tensor_name, pattern_qkv_weight) || std::regex_match(tensor_name, pattern_qkv_bias)) {
GGML_ASSERT(segments.size() == 3);
return {granularity_q, granularity_kv, granularity_kv};
}
}
// FFN
if (std::regex_match(tensor_name, pattern_ffn_up_gate_weight) || std::regex_match(tensor_name, pattern_ffn_up_gate_bias) ||
std::regex_match(tensor_name, pattern_ffn_gate_up_weight) || std::regex_match(tensor_name, pattern_ffn_down_weight)) {
GGML_ASSERT(segments.size() <= 2);
return std::vector<int64_t>(segments.size(), blck_size);
}
// everything else
GGML_ASSERT(segments.size() == 1);
return {1};
};
ggml_backend_meta_split_state split_state;
memset(&split_state, 0, sizeof(split_state));
tensor_config tc = get_tensor_config();
split_state.axis = tc.axis;
if (split_state.axis >= 0 && split_state.axis < GGML_MAX_DIMS) {
const int64_t ne_full = tensor->ne[split_state.axis];
const int64_t blck_size = ggml_blck_size(tc.tensor_axis_0->type);
const float * tensor_split = ud->model->tensor_split();
std::vector<float> tensor_split_scan;
tensor_split_scan.reserve(ud->n_devices);
for (size_t j = 0; j < ud->n_devices; j++) {
tensor_split_scan.push_back(tensor_split == nullptr ? 0.0f : tensor_split[(j + tc.rotation) % ud->n_devices]);
if (j > 0) {
tensor_split_scan[j] += tensor_split_scan[j - 1];
}
}
const std::vector<int64_t> segments = get_split_segments(split_state.axis, tc.il);
const std::vector<int64_t> granularity = get_split_granularity(blck_size, tc.il, segments);
for (size_t is = 0; is < segments.size(); is++) {
const int64_t ne_s = segments[is];
const int64_t g_s = granularity[is];
GGML_ASSERT(ne_full % g_s == 0);
int64_t low = 0;
size_t j = 0;
for (; j < ud->n_devices - 1; j++) {
int64_t high = tensor_split_scan.back() == 0.0f ?
ne_s * (j+1)/ud->n_devices : ne_s * tensor_split_scan[j]/tensor_split_scan.back();
if (high % g_s != 0) {
high -= high % g_s;
}
split_state.ne[is*ud->n_devices + (j + tc.rotation) % ud->n_devices] = high - low;
low = high;
}
split_state.ne[is*ud->n_devices + (j + tc.rotation) % ud->n_devices] = ne_s - low;
}
split_state.n_segments = segments.size();
} else {
memset(split_state.ne, 0, sizeof(split_state.ne));
split_state.n_segments = 1;
}
return split_state;
}
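
Editor's note: a standalone sketch of the boundary computation in the split loop above, ignoring the user-supplied tensor_split weights and the per-layer rotation. Each device gets an even share of a segment, rounded down to a multiple of the granularity, and the last device takes the remainder; the concrete numbers are illustrative only.

    #include <cstdint>
    #include <cstdio>
    #include <vector>

    int main() {
        const int64_t ne_s      = 768; // e.g. an FFN intermediate dimension
        const int64_t g_s       = 256; // granularity, e.g. a quantization block size
        const size_t  n_devices = 2;

        std::vector<int64_t> ne(n_devices, 0);
        int64_t low = 0;
        size_t  j   = 0;
        for (; j < n_devices - 1; j++) {
            int64_t high = ne_s*(int64_t)(j + 1)/(int64_t)n_devices; // even split target
            high -= high % g_s;                                      // snap down to the granularity
            ne[j] = high - low;
            low   = high;
        }
        ne[j] = ne_s - low; // the last device takes whatever is left

        for (size_t i = 0; i < n_devices; i++) {
            printf("device %zu: %lld\n", i, (long long) ne[i]); // prints 256 and 512 for these values
        }
        return 0;
    }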
const char * llm_type_name(llm_type type) {
switch (type) {
@ -181,7 +507,7 @@ static llama_rope_scaling_type llama_rope_scaling_type_from_string(const std::st
}
// CPU: ACCEL -> GPU host -> CPU extra -> CPU
static buft_list_t make_cpu_buft_list(const std::vector<ggml_backend_dev_t> & devices, bool use_extra_bufts, bool no_host) {
static buft_list_t make_cpu_buft_list(const std::vector<llama_device> & devices, bool use_extra_bufts, bool no_host) {
buft_list_t buft_list;
// add ACCEL buffer types
@ -203,10 +529,10 @@ static buft_list_t make_cpu_buft_list(const std::vector<ggml_backend_dev_t> & de
// a better approach would be to handle this on a weight-by-weight basis using the offload_op
// function of the device to determine if it would benefit from being stored in a host buffer
if (!no_host) {
for (auto * dev : devices) {
ggml_backend_buffer_type_t buft = ggml_backend_dev_host_buffer_type(dev);
for (const auto & dev : devices) {
ggml_backend_buffer_type_t buft = ggml_backend_dev_host_buffer_type(dev.dev);
if (buft) {
buft_list.emplace_back(dev, buft);
buft_list.emplace_back(dev.dev, buft);
break;
}
}
@ -273,14 +599,16 @@ static buft_list_t make_gpu_buft_list(ggml_backend_dev_t dev, llama_split_mode s
// add the device extra buffer type (if any)
ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev);
auto ggml_backend_dev_get_extra_bufts_fn = (ggml_backend_dev_get_extra_bufts_t)
ggml_backend_reg_get_proc_address(reg, "ggml_backend_dev_get_extra_bufts");
if (reg) {
auto ggml_backend_dev_get_extra_bufts_fn = (ggml_backend_dev_get_extra_bufts_t)
ggml_backend_reg_get_proc_address(reg, "ggml_backend_dev_get_extra_bufts");
if (ggml_backend_dev_get_extra_bufts_fn) {
ggml_backend_buffer_type_t * extra_bufts = ggml_backend_dev_get_extra_bufts_fn(dev);
while (extra_bufts && *extra_bufts) {
buft_list.emplace_back(dev, *extra_bufts);
++extra_bufts;
if (ggml_backend_dev_get_extra_bufts_fn) {
ggml_backend_buffer_type_t * extra_bufts = ggml_backend_dev_get_extra_bufts_fn(dev);
while (extra_bufts && *extra_bufts) {
buft_list.emplace_back(dev, *extra_bufts);
++extra_bufts;
}
}
}
@ -342,6 +670,9 @@ void llama_model::load_arch(llama_model_loader & ml) {
if (arch == LLM_ARCH_UNKNOWN) {
throw std::runtime_error("unknown model architecture: '" + ml.get_arch_name() + "'");
}
if (!devices.empty() && devices[0].is_meta && !llm_arch_supports_sm_tensor(arch)) {
throw std::runtime_error(std::string("LLAMA_SPLIT_MODE_TENSOR not implemented for architecture '") + llm_arch_name(arch) + "'");
}
}
void llama_model::load_hparams(llama_model_loader & ml) {
@ -2624,11 +2955,11 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
// build a list of buffer types for the CPU and GPU devices
pimpl->cpu_buft_list = make_cpu_buft_list(devices, params.use_extra_bufts, params.no_host);
for (auto * dev : devices) {
buft_list_t buft_list = make_gpu_buft_list(dev, split_mode, tensor_split);
for (const auto & dev : devices) {
buft_list_t buft_list = make_gpu_buft_list(dev.dev, split_mode, tensor_split);
// add CPU buffer types as a fallback
buft_list.insert(buft_list.end(), pimpl->cpu_buft_list.begin(), pimpl->cpu_buft_list.end());
pimpl->gpu_buft_list.emplace(dev, std::move(buft_list));
pimpl->gpu_buft_list.emplace(dev.dev, std::move(buft_list));
}
ggml_backend_dev_t cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
@ -2642,7 +2973,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
if (all_zero) {
// default split, by free memory
for (size_t i = 0; i < n_devices(); ++i) {
ggml_backend_dev_t dev = devices[i];
ggml_backend_dev_t dev = devices[i].dev;
size_t total;
size_t free;
ggml_backend_dev_memory(dev, &free, &total);
@ -2678,7 +3009,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
return {cpu_dev, &pimpl->cpu_buft_list};
}
const int layer_gpu = std::upper_bound(splits.begin(), splits.begin() + n_devices(), float(il - i_gpu_start)/act_gpu_layers) - splits.begin();
auto * dev = devices.at(layer_gpu);
auto * dev = devices.at(layer_gpu).dev;
LLAMA_LOG_DEBUG("load_tensors: layer %3d assigned to device %s, is_swa = %d\n", il, ggml_backend_dev_name(dev), is_swa);
return {dev, &pimpl->gpu_buft_list.at(dev)};
};
@ -7763,6 +8094,13 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
ml.done_getting_tensors();
// populate tensors_by_name
for (auto & [_, ctx_ptr] : ml.ctx_map) {
for (auto * cur = ggml_get_first_tensor(ctx_ptr.get()); cur != NULL; cur = ggml_get_next_tensor(ctx_ptr.get(), cur)) {
tensors_by_name.emplace_back(ggml_get_name(cur), cur);
}
}
ml.init_mappings(true, use_mlock ? &pimpl->mlock_mmaps : nullptr);
pimpl->mappings.reserve(ml.mappings.size());
@ -7881,13 +8219,6 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
}
}
// populate tensors_by_name
for (auto & [ctx, _] : pimpl->ctxs_bufs) {
for (auto * cur = ggml_get_first_tensor(ctx.get()); cur != NULL; cur = ggml_get_next_tensor(ctx.get(), cur)) {
tensors_by_name.emplace_back(ggml_get_name(cur), cur);
}
}
if (ml.no_alloc) {
return true;
}
@ -7932,6 +8263,10 @@ size_t llama_model::n_devices() const {
return devices.size();
}
const float * llama_model::tensor_split() const {
return params.tensor_split;
}
uint32_t llama_model::n_gpu_layers() const {
return params.n_gpu_layers >= 0 ? params.n_gpu_layers : hparams.n_layer + 1;
}

View File

@ -499,6 +499,19 @@ struct llama_layer {
struct llama_layer_nextn nextn;
};
struct llama_device {
bool is_meta;
ggml_backend_dev_t dev;
};
struct llama_meta_device_get_split_state_userdata {
size_t n_devices;
const struct llama_model * model;
};
struct ggml_backend_meta_split_state llama_meta_device_get_split_state(const struct ggml_tensor * tensor, void * userdata);
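
Editor's note: the devices vector now holds this small wrapper instead of raw ggml_backend_dev_t handles. A sketch of the resulting call-site pattern, mirroring the changes made throughout this diff:

    for (const llama_device & d : model.devices) {
        if (d.is_meta) {
            // a single Meta device standing in for several physical GPUs (tensor parallelism)
        }
        // unwrap to the raw handle for the ggml-backend API
        LLAMA_LOG_INFO("using device %s\n", ggml_backend_dev_name(d.dev));
    }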
struct llama_model {
llm_type type = LLM_TYPE_UNKNOWN;
llm_arch arch = LLM_ARCH_UNKNOWN;
@ -553,7 +566,7 @@ struct llama_model {
std::unordered_map<std::string, std::string> gguf_kv;
// list of devices used in this model
std::vector<ggml_backend_dev_t> devices;
std::vector<llama_device> devices;
// for quantize-stats only
std::vector<std::pair<std::string, struct ggml_tensor *>> tensors_by_name;
@ -561,6 +574,9 @@ struct llama_model {
// for keeping track of associated LoRA adapters
std::unordered_set<llama_adapter_lora *> loras;
// statically allocated context for assigning
struct llama_meta_device_get_split_state_userdata get_split_state_ud;
int64_t t_load_us = 0;
int64_t t_start_us = 0;
@ -581,6 +597,7 @@ struct llama_model {
size_t size() const; // file size
size_t n_tensors() const;
size_t n_devices() const;
const float * tensor_split() const;
uint32_t n_gpu_layers() const;
llama_split_mode split_mode() const;

View File

@ -1,6 +1,5 @@
#include "llama.h"
#include "ggml-cpp.h"
#include "llama-impl.h"
#include "llama-chat.h"
@ -12,9 +11,13 @@
#include "llama-model.h"
#include "ggml.h"
#include "ggml-cpp.h"
#include "ggml-backend.h"
#include "gguf.h"
// TODO: tmp until the ggml meta backend matures and becomes public
#include "../src/ggml-ext.h"
#include <algorithm>
#include <cassert>
#include <cinttypes>
@ -24,6 +27,7 @@
#include <cstring>
#include <ctime>
#include <stdexcept>
#include <vector>
#if defined(_MSC_VER)
#pragma warning(disable: 4244 4267) // possible loss of data
@ -53,7 +57,7 @@ struct llama_device_memory_data {
static std::vector<llama_device_memory_data> llama_get_device_memory_data(
const char * path_model, const llama_model_params * mparams, const llama_context_params * cparams,
std::vector<ggml_backend_dev_t> & devs, uint32_t & hp_ngl, uint32_t & hp_n_ctx_train, uint32_t & hp_n_expert,
std::vector<llama_device> & devs, uint32_t & hp_ngl, uint32_t & hp_n_ctx_train, uint32_t & hp_n_expert,
const ggml_log_level log_level) {
struct user_data_t {
struct {
@ -104,7 +108,7 @@ static std::vector<llama_device_memory_data> llama_get_device_memory_data(
continue;
}
for (size_t i = 0; i < ret.size(); i++) {
if (model->devices[i] == dev) {
if (model->devices[i].dev == dev) {
ret[i].mb.model += mb.model;
ret[i].mb.context += mb.context;
ret[i].mb.compute += mb.compute;
@ -115,7 +119,7 @@ static std::vector<llama_device_memory_data> llama_get_device_memory_data(
for (size_t i = 0; i < ret.size(); i++) {
size_t free;
size_t total;
ggml_backend_dev_memory(model->devices[i], &free, &total);
ggml_backend_dev_memory(model->devices[i].dev, &free, &total);
// devices can return 0 bytes for free and total memory if they do not
// have any to report. in this case, we will use the host memory as a fallback
@ -162,11 +166,14 @@ static void llama_params_fit_impl(
const char * path_model, struct llama_model_params * mparams, struct llama_context_params * cparams,
float * tensor_split, struct llama_model_tensor_buft_override * tensor_buft_overrides,
size_t * margins_s, uint32_t n_ctx_min, enum ggml_log_level log_level) {
if (mparams->split_mode == LLAMA_SPLIT_MODE_TENSOR) {
throw llama_params_fit_exception("llama_params_fit is not implemented for SPLIT_MODE_TENSOR, abort");
}
constexpr int64_t MiB = 1024*1024;
typedef std::vector<llama_device_memory_data> dmds_t;
const llama_model_params default_mparams = llama_model_default_params();
std::vector<ggml_backend_dev_t> devs;
std::vector<llama_device> devs;
uint32_t hp_ngl = 0; // hparams.n_gpu_layers
uint32_t hp_nct = 0; // hparams.n_ctx_train
uint32_t hp_nex = 0; // hparams.n_expert
@ -191,10 +198,10 @@ static void llama_params_fit_impl(
{
dev_names.reserve(nd);
size_t max_length = 0;
for (ggml_backend_dev_t dev : devs) {
std::string name = ggml_backend_dev_name(dev);
for (const llama_device & dev : devs) {
std::string name = ggml_backend_dev_name(dev.dev);
name += " (";
name += ggml_backend_dev_description(dev);
name += ggml_backend_dev_description(dev.dev);
name += ")";
dev_names.push_back(name);
max_length = std::max(max_length, name.length());
@ -685,7 +692,7 @@ static void llama_params_fit_impl(
ngl_per_device_test[id].overflow_type = LAYER_FRACTION_UP;
std::vector<ggml_backend_buffer_type_t> overflow_bufts_test = overflow_bufts;
if (id < nd - 1) {
overflow_bufts_test[id] = ggml_backend_dev_buffer_type(devs[id + 1]);
overflow_bufts_test[id] = ggml_backend_dev_buffer_type(devs[id + 1].dev);
}
LLAMA_LOG_DEBUG("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_UP\n", __func__);
std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts_test);
@ -935,58 +942,111 @@ static struct llama_model * llama_model_load_from_file_impl(
// create list of devices to use with this model
if (params.devices) {
for (ggml_backend_dev_t * dev = params.devices; *dev; ++dev) {
model->devices.push_back(*dev);
if (params.split_mode == LLAMA_SPLIT_MODE_TENSOR) {
size_t n_devs = 0;
while (params.devices[n_devs]) {
n_devs++;
}
if (n_devs == 0) {
LLAMA_LOG_ERROR("%s: LLAMA_SPLIT_MODE_TENSOR needs >= 1 devices\n", __func__);
return nullptr;
}
LLAMA_LOG_INFO("%s: creating a Meta device with %zu devices\n", __func__, n_devs);
for (size_t i = 0; i < n_devs; ++i) {
LLAMA_LOG_INFO("%s: - device %zu: %s\n", __func__, i, ggml_backend_dev_name(params.devices[i]));
}
model->get_split_state_ud.n_devices = n_devs;
model->get_split_state_ud.model = model;
model->devices.push_back({
true, ggml_backend_meta_device(
params.devices, n_devs, llama_meta_device_get_split_state, &model->get_split_state_ud)
});
} else {
for (ggml_backend_dev_t * dev = params.devices; *dev; ++dev) {
model->devices.push_back({false, *dev});
}
}
} else {
// default device selection
// build list of available devices
std::vector<ggml_backend_dev_t> gpus;
std::vector<ggml_backend_dev_t> igpus;
std::vector<ggml_backend_dev_t> rpc_servers;
std::vector<llama_device> gpus;
std::vector<llama_device> igpus;
std::vector<llama_device> rpc_servers;
for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
ggml_backend_dev_t dev = ggml_backend_dev_get(i);
switch (ggml_backend_dev_type(dev)) {
case GGML_BACKEND_DEVICE_TYPE_CPU:
case GGML_BACKEND_DEVICE_TYPE_ACCEL:
// skip CPU backends since they are handled separately
break;
case GGML_BACKEND_DEVICE_TYPE_GPU: {
ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev);
if (ggml_backend_reg_name(reg) == std::string("RPC")) {
rpc_servers.push_back(dev);
} else {
// check if there is already a GPU with the same device id
ggml_backend_dev_props props;
ggml_backend_dev_get_props(dev, &props);
auto it = std::find_if(gpus.begin(), gpus.end(), [&props](ggml_backend_dev_t d) {
ggml_backend_dev_props d_props;
ggml_backend_dev_get_props(d, &d_props);
if (props.device_id && d_props.device_id) {
return strcmp(props.device_id, d_props.device_id) == 0;
}
return false;
});
if (it != gpus.end()) {
LLAMA_LOG_INFO("%s: skipping device %s (%s) with id %s - already using device %s (%s) with the same id\n",
__func__,
ggml_backend_dev_name(dev), ggml_backend_dev_description(dev),
props.device_id ? props.device_id : "unknown id",
ggml_backend_dev_name(*it), ggml_backend_dev_description(*it));
} else {
gpus.push_back(dev);
}
}
break;
if (params.split_mode == LLAMA_SPLIT_MODE_TENSOR) {
std::vector<ggml_backend_dev_t> devs;
devs.reserve(ggml_backend_dev_count());
for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
auto * dev = ggml_backend_dev_get(i);
if (ggml_backend_dev_buffer_type(dev) == ggml_backend_cpu_buffer_type()) {
LLAMA_LOG_INFO("%s: skipping %s (%s) for tensor parallelism\n", __func__, ggml_backend_dev_name(dev), ggml_backend_dev_description(dev));
continue;
}
devs.push_back(dev);
}
if (devs.empty()) {
LLAMA_LOG_ERROR("%s: LLAMA_SPLIT_MODE_TENSOR needs >= 1 devices\n", __func__);
return nullptr;
}
case GGML_BACKEND_DEVICE_TYPE_IGPU:
igpus.push_back(dev);
break;
LLAMA_LOG_INFO("%s: creating a Meta device for tensor parallelism from %zu devices:\n", __func__, devs.size());
for (size_t i = 0; i < devs.size(); ++i) {
LLAMA_LOG_INFO("%s: - device %zu: %s (%s)\n", __func__, i, ggml_backend_dev_name(devs[i]), ggml_backend_dev_description(devs[i]));
}
GGML_ASSERT(!devs.empty());
model->get_split_state_ud.n_devices = devs.size();
model->get_split_state_ud.model = model;
gpus.push_back({
true, ggml_backend_meta_device(
devs.data(), devs.size(), llama_meta_device_get_split_state, &model->get_split_state_ud)
});
} else {
for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
ggml_backend_dev_t dev = ggml_backend_dev_get(i);
switch (ggml_backend_dev_type(dev)) {
case GGML_BACKEND_DEVICE_TYPE_CPU:
case GGML_BACKEND_DEVICE_TYPE_ACCEL:
// skip CPU backends since they are handled separately
break;
case GGML_BACKEND_DEVICE_TYPE_GPU: {
ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev);
if (ggml_backend_reg_name(reg) == std::string("RPC")) {
rpc_servers.push_back({false, dev});
} else {
// check if there is already a GPU with the same device id
ggml_backend_dev_props props;
ggml_backend_dev_get_props(dev, &props);
auto it = std::find_if(gpus.begin(), gpus.end(), [&props](const llama_device & d) {
ggml_backend_dev_props d_props;
ggml_backend_dev_get_props(d.dev, &d_props);
if (props.device_id && d_props.device_id) {
return strcmp(props.device_id, d_props.device_id) == 0;
}
return false;
});
if (it != gpus.end()) {
LLAMA_LOG_INFO("%s: skipping device %s (%s) with id %s - already using device %s (%s) with the same id\n",
__func__,
ggml_backend_dev_name(dev), ggml_backend_dev_description(dev),
props.device_id ? props.device_id : "unknown id",
ggml_backend_dev_name(it->dev), ggml_backend_dev_description(it->dev));
} else {
gpus.push_back({false, dev});
}
}
break;
}
case GGML_BACKEND_DEVICE_TYPE_IGPU:
igpus.push_back({false, dev});
break;
case GGML_BACKEND_DEVICE_TYPE_META:
GGML_ABORT("fatal error");
}
}
}
@ -1012,17 +1072,17 @@ static struct llama_model * llama_model_load_from_file_impl(
llama_model_free(model);
return nullptr;
}
ggml_backend_dev_t main_gpu = model->devices[params.main_gpu];
llama_device main_gpu = model->devices[params.main_gpu];
model->devices.clear();
model->devices.push_back(main_gpu);
}
}
for (auto * dev : model->devices) {
for (const auto & dev : model->devices) {
ggml_backend_dev_props props;
ggml_backend_dev_get_props(dev, &props);
ggml_backend_dev_get_props(dev.dev, &props);
LLAMA_LOG_INFO("%s: using device %s (%s) (%s) - %zu MiB free\n", __func__,
ggml_backend_dev_name(dev), ggml_backend_dev_description(dev),
ggml_backend_dev_name(dev.dev), ggml_backend_dev_description(dev.dev),
props.device_id ? props.device_id : "unknown id",
props.memory_free/1024/1024);
}

View File

@ -225,6 +225,7 @@ ggml_tensor * llm_build_qwen35::build_layer_attn_linear(
cb(beta, "beta", il);
beta = ggml_sigmoid(ctx0, beta);
cb(beta, "beta_sigmoid", il);
ggml_tensor * alpha = build_lora_mm(model.layers[il].ssm_alpha, cur, model.layers[il].ssm_alpha_s);
alpha = ggml_reshape_3d(ctx0, alpha, num_v_heads, n_seq_tokens, n_seqs);
@ -269,7 +270,7 @@ ggml_tensor * llm_build_qwen35::build_layer_attn_linear(
cb(last_conv_states, "last_conv_states", il);
ggml_tensor * state_update_target =
ggml_view_1d(ctx0, conv_states_all, (conv_kernel_size - 1) * conv_channels * n_seqs,
ggml_view_2d(ctx0, conv_states_all, (conv_kernel_size - 1) * conv_channels, n_seqs, conv_states_all->nb[1],
kv_head * (conv_kernel_size - 1) * conv_channels * ggml_element_size(conv_states_all));
cb(state_update_target, "state_update_target", il);
@ -345,7 +346,7 @@ ggml_tensor * llm_build_qwen35::build_layer_attn_linear(
// Update the recurrent states
ggml_build_forward_expand(gf,
ggml_cpy(ctx0, new_state,
ggml_view_1d(ctx0, ssm_states_all, hparams.n_embd_s() * n_seqs,
ggml_view_2d(ctx0, ssm_states_all, hparams.n_embd_s(), n_seqs, ssm_states_all->nb[1],
kv_head * hparams.n_embd_s() * ggml_element_size(ssm_states_all))));
// z: [head_dim, n_heads, n_tokens, n_seqs] -> [n_heads * n_tokens * n_seqs, head_dim]

View File

@ -225,6 +225,7 @@ ggml_tensor * llm_build_qwen35moe ::build_layer_attn_linear(
cb(beta, "beta", il);
beta = ggml_sigmoid(ctx0, beta);
cb(beta, "beta_sigmoid", il);
ggml_tensor * alpha = build_lora_mm(model.layers[il].ssm_alpha, cur, model.layers[il].ssm_alpha_s);
alpha = ggml_reshape_3d(ctx0, alpha, num_v_heads, n_seq_tokens, n_seqs);
@ -269,7 +270,7 @@ ggml_tensor * llm_build_qwen35moe ::build_layer_attn_linear(
cb(last_conv_states, "last_conv_states", il);
ggml_tensor * state_update_target =
ggml_view_1d(ctx0, conv_states_all, (conv_kernel_size - 1) * conv_channels * n_seqs,
ggml_view_2d(ctx0, conv_states_all, (conv_kernel_size - 1) * conv_channels, n_seqs, conv_states_all->nb[1],
kv_head * (conv_kernel_size - 1) * conv_channels * ggml_element_size(conv_states_all));
cb(state_update_target, "state_update_target", il);
@ -345,7 +346,7 @@ ggml_tensor * llm_build_qwen35moe ::build_layer_attn_linear(
// Update the recurrent states
ggml_build_forward_expand(gf,
ggml_cpy(ctx0, new_state,
ggml_view_1d(ctx0, ssm_states_all, hparams.n_embd_s() * n_seqs,
ggml_view_2d(ctx0, ssm_states_all, hparams.n_embd_s(), n_seqs, ssm_states_all->nb[1],
kv_head * hparams.n_embd_s() * ggml_element_size(ssm_states_all))));
// z: [head_dim, n_heads, n_tokens, n_seqs] -> [n_heads * n_tokens * n_seqs, head_dim]

View File

@ -414,19 +414,19 @@ ggml_tensor * llm_build_qwen3next::build_layer_attn_linear(
GGML_ASSERT(num_v_heads % num_k_heads == 0);
int64_t repeat_factor = num_v_heads / num_k_heads;
// repeat interleave: reshape to (repeat part, 1, remaining part), do repeat, then reshape back
ggml_tensor * q_reshaped = ggml_reshape_3d(ctx0, q_conv, head_k_dim, 1, num_k_heads * n_seq_tokens * n_seqs);
ggml_tensor * k_reshaped = ggml_reshape_3d(ctx0, k_conv, head_k_dim, 1, num_k_heads * n_seq_tokens * n_seqs);
// repeat interleave: reshape to (repeat part, 1, remaining part...), do repeat, then reshape back
ggml_tensor * q_reshaped = ggml_reshape_4d(ctx0, q_conv, head_k_dim, 1, num_k_heads, n_seq_tokens * n_seqs);
ggml_tensor * k_reshaped = ggml_reshape_4d(ctx0, k_conv, head_k_dim, 1, num_k_heads, n_seq_tokens * n_seqs);
// Repeat along the third dimension (the new dimension with size 1)
ggml_tensor * q_repeated =
ggml_repeat_4d(ctx0, q_reshaped, head_k_dim, repeat_factor, num_k_heads * n_seq_tokens * n_seqs, 1);
ggml_repeat_4d(ctx0, q_reshaped, head_k_dim, repeat_factor, num_k_heads, n_seq_tokens * n_seqs);
ggml_tensor * k_repeated =
ggml_repeat_4d(ctx0, k_reshaped, head_k_dim, repeat_factor, num_k_heads * n_seq_tokens * n_seqs, 1);
ggml_repeat_4d(ctx0, k_reshaped, head_k_dim, repeat_factor, num_k_heads, n_seq_tokens * n_seqs);
// Reshape back to merge the head and repeat dimensions
// From [head_dim, num_k_heads, repeat_factor, n_seq_tokens * n_seqs]
// Back to [head_dim, num_k_heads * repeat_factor, n_seq_tokens, n_seqs]
// From [head_dim, repeat_factor, num_k_heads, n_seq_tokens * n_seqs]
// Back to [head_dim, repeat_factor * num_k_heads, n_seq_tokens, n_seqs]
q_conv = ggml_reshape_4d(ctx0, q_repeated, head_k_dim, num_k_heads * repeat_factor, n_seq_tokens, n_seqs);
k_conv = ggml_reshape_4d(ctx0, k_repeated, head_k_dim, num_k_heads * repeat_factor, n_seq_tokens, n_seqs);
}

View File

@ -6,6 +6,8 @@
#include "ggml-cpp.h"
#include "llama.h"
#include "llama-cpp.h"
// TODO: replace with #include "llama-ext.h" in the future
#include "../src/llama-arch.h"
#include "../src/llama-model-saver.h"
@ -205,9 +207,9 @@ static gguf_context_ptr get_gguf_ctx(const llm_arch arch, const bool moe) {
ms.add_kv(LLM_KV_XIELU_ALPHA_P, 1.0f);
ms.add_kv(LLM_KV_XIELU_BETA, 1.0f);
ms.add_kv(LLM_KV_XIELU_EPS, 1.0e-7f);
ms.add_kv(LLM_KV_SSM_INNER_SIZE, arch == LLM_ARCH_QWEN3NEXT || arch == LLM_ARCH_QWEN35 || arch == LLM_ARCH_QWEN35MOE ? 64 : 2*n_embd);
ms.add_kv(LLM_KV_SSM_INNER_SIZE, arch == LLM_ARCH_QWEN3NEXT || arch == LLM_ARCH_QWEN35 || arch == LLM_ARCH_QWEN35MOE ? 256 : 2*n_embd);
ms.add_kv(LLM_KV_SSM_CONV_KERNEL, uint32_t(4));
ms.add_kv(LLM_KV_SSM_STATE_SIZE, uint32_t(32));
ms.add_kv(LLM_KV_SSM_STATE_SIZE, uint32_t(128));
ms.add_kv(LLM_KV_SSM_TIME_STEP_RANK, n_head);
ms.add_kv(LLM_KV_SSM_GROUP_COUNT, arch == LLM_ARCH_PLAMO2 ? 0 : uint32_t(2));
ms.add_kv(LLM_KV_KDA_HEAD_DIM, uint32_t(128));
@ -235,18 +237,23 @@ static bool silent_model_load_progress(float /*progress*/, void * /*user_data*/)
}
static std::pair<llama_model_ptr, llama_context_ptr> get_model_and_ctx(
struct gguf_context * gguf_ctx, FILE * file, const size_t seed, const std::vector<ggml_backend_dev_t> & devs) {
struct gguf_context * gguf_ctx, FILE * file, const size_t seed, const std::vector<ggml_backend_dev_t> & devs,
const llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER, bool encode = false) {
GGML_ASSERT((gguf_ctx == nullptr) != (file == nullptr));
llama_model_params model_params = llama_model_default_params();
model_params.progress_callback = silent_model_load_progress;
std::vector<ggml_backend_dev_t> devs_copy = devs;
devs_copy.push_back(nullptr);
model_params.devices = devs_copy.data();
model_params.split_mode = split_mode;
llama_context_params ctx_params = llama_context_default_params();
ctx_params.n_ctx = 0;
ctx_params.n_threads = 4;
ctx_params.n_threads_batch = 4;
if (!encode) {
ctx_params.n_ubatch = 64;
}
size_t tmp = seed;
llama_model_ptr model(gguf_ctx != nullptr ?
@ -357,6 +364,46 @@ static bool moe_implemented(const llm_arch arch) {
}
}
static bool arch_supported(const llm_arch arch) {
if (arch == LLM_ARCH_CLIP || arch == LLM_ARCH_GPTJ || arch == LLM_ARCH_UNKNOWN) {
return false; // These models don't have usable implementations.
}
if (arch == LLM_ARCH_CHAMELEON) {
return false; // Only half-implemented and to be removed in the future.
}
if (arch == LLM_ARCH_WAVTOKENIZER_DEC) {
return false; // FIXME CUDA backend crashes.
}
if (arch == LLM_ARCH_GEMMA4) {
return false; // FIXME @ngxson
}
if (arch == LLM_ARCH_LLAMA_EMBED || arch == LLM_ARCH_GEMMA_EMBEDDING || arch == LLM_ARCH_T5ENCODER) {
return false; // FIXME Embedding (?) models produce inconsistent results.
}
if (arch == LLM_ARCH_RWKV6 || arch == LLM_ARCH_RWKV6QWEN2 || arch == LLM_ARCH_RWKV7 || arch == LLM_ARCH_ARWKV7) {
return false; // FIXME RWKV models hang indefinitely.
}
if (arch == LLM_ARCH_BERT || arch == LLM_ARCH_MODERN_BERT || arch == LLM_ARCH_NOMIC_BERT || arch == LLM_ARCH_NOMIC_BERT_MOE ||
arch == LLM_ARCH_NEO_BERT || arch == LLM_ARCH_JINA_BERT_V2 || arch == LLM_ARCH_JINA_BERT_V3 || arch == LLM_ARCH_EUROBERT) {
return false; // TODO vocab
}
if (arch == LLM_ARCH_PLM) {
return false; // TODO tensor shapes
}
if (arch == LLM_ARCH_DEEPSEEK2OCR) {
return false;
}
// FIXME some models are segfaulting with WebGPU:
#ifdef GGML_USE_WEBGPU
if (arch == LLM_ARCH_QWEN3NEXT || arch == LLM_ARCH_QWEN35 || arch == LLM_ARCH_QWEN35MOE || arch == LLM_ARCH_KIMI_LINEAR) {
return false;
}
#endif // GGML_USE_WEBGPU
return true;
}
static int save_models(const llm_arch target_arch, const size_t seed, const ggml_log_level log_level, const std::string & dir) {
struct user_data_t {
struct {
@ -376,27 +423,11 @@ static int save_models(const llm_arch target_arch, const size_t seed, const ggml
}, &ud);
for (const llm_arch & arch : llm_arch_all()) {
if (target_arch != LLM_ARCH_UNKNOWN && arch != target_arch) {
if (arch == LLM_ARCH_UNKNOWN) {
continue;
}
if (arch == LLM_ARCH_CLIP || arch == LLM_ARCH_GPTJ || arch == LLM_ARCH_UNKNOWN) {
continue; // These models don't have usable implementations.
}
if (arch == LLM_ARCH_CHAMELEON) {
continue; // Only half-implemented and to be removed in the future.
}
if (arch == LLM_ARCH_GEMMA4) {
continue; // FIXME @ngxson
}
if (arch == LLM_ARCH_RWKV6 || arch == LLM_ARCH_RWKV6QWEN2 || arch == LLM_ARCH_RWKV7 || arch == LLM_ARCH_ARWKV7) {
continue; // FIXME
}
if (arch == LLM_ARCH_BERT || arch == LLM_ARCH_MODERN_BERT || arch == LLM_ARCH_NOMIC_BERT || arch == LLM_ARCH_NOMIC_BERT_MOE ||
arch == LLM_ARCH_NEO_BERT || arch == LLM_ARCH_JINA_BERT_V2 || arch == LLM_ARCH_JINA_BERT_V3 || arch == LLM_ARCH_EUROBERT) {
continue; // TODO vocab
}
if (arch == LLM_ARCH_PLM) {
continue; // TODO tensor shapes
if (target_arch != LLM_ARCH_UNKNOWN && arch != target_arch) {
continue;
}
for (bool moe : {false, true}) {
if (moe && !moe_implemented(arch)) {
@ -440,51 +471,47 @@ static int test_backends(const llm_arch target_arch, const size_t seed, const gg
const std::vector<llama_token> tokens = get_tokens(128, 128, seed);
struct device_config {
std::vector<ggml_backend_dev_t> devs;
std::string label;
llama_split_mode split_mode;
device_config(std::vector<ggml_backend_dev_t> devs, std::string name, llama_split_mode split_mode)
: devs(std::move(devs)), label(std::move(name)), split_mode(split_mode) {}
};
std::vector<device_config> dev_configs;
{
std::vector<ggml_backend_dev_t> devices_meta;
{
const size_t device_count = ggml_backend_dev_count();
for (size_t i = 0; i < device_count; i++) {
ggml_backend_dev_t dev = ggml_backend_dev_get(i);
dev_configs.emplace_back(std::vector<ggml_backend_dev_t>{dev}, ggml_backend_dev_description(dev), LLAMA_SPLIT_MODE_LAYER);
// cpu-based devices cannot be used in tensor split mode
if (ggml_backend_dev_buffer_type(dev) != ggml_backend_cpu_buffer_type()) {
devices_meta.push_back(dev);
}
}
}
dev_configs.emplace_back(devices_meta, "Meta", LLAMA_SPLIT_MODE_TENSOR);
}
bool all_ok = true;
common_log_flush(common_log_main());
printf("|%15s|%30s|%6s|%15s|%9s|\n", "Model arch.", "Device", "Config", "NMSE vs. CPU", "Roundtrip");
printf("|---------------|------------------------------|------|---------------|---------|\n");
printf("|%16s|%30s|%6s|%15s|%9s|\n", "Model arch.", "Device", "Config", "NMSE vs. CPU", "Roundtrip");
printf("|----------------|------------------------------|------|---------------|---------|\n");
for (const llm_arch & arch : llm_arch_all()) {
if (arch == LLM_ARCH_UNKNOWN) {
continue;
}
if (target_arch != LLM_ARCH_UNKNOWN && arch != target_arch) {
continue;
}
if (arch == LLM_ARCH_CLIP || arch == LLM_ARCH_GPTJ || arch == LLM_ARCH_UNKNOWN) {
continue; // These models don't have usable implementations.
}
if (arch == LLM_ARCH_CHAMELEON) {
continue; // Only half-implemented and to be removed in the future.
}
if (arch == LLM_ARCH_GEMMA4) {
continue; // FIXME @ngxson
}
if (arch == LLM_ARCH_WAVTOKENIZER_DEC) {
continue; // FIXME CUDA backend crashes.
}
if (arch == LLM_ARCH_LLAMA_EMBED || arch == LLM_ARCH_GEMMA_EMBEDDING || arch == LLM_ARCH_T5ENCODER) {
continue; // FIXME Embedding (?) models produce inconsistent results.
}
if (arch == LLM_ARCH_RWKV6 || arch == LLM_ARCH_RWKV6QWEN2 || arch == LLM_ARCH_RWKV7 || arch == LLM_ARCH_ARWKV7) {
continue; // FIXME RWKV models hang indefinitely.
}
if (arch == LLM_ARCH_BERT || arch == LLM_ARCH_MODERN_BERT || arch == LLM_ARCH_NOMIC_BERT || arch == LLM_ARCH_NOMIC_BERT_MOE ||
arch == LLM_ARCH_NEO_BERT || arch == LLM_ARCH_JINA_BERT_V2 || arch == LLM_ARCH_JINA_BERT_V3 || arch == LLM_ARCH_EUROBERT) {
continue; // TODO vocab
}
if (arch == LLM_ARCH_PLM) {
continue; // TODO tensor shapes
}
if (arch == LLM_ARCH_DEEPSEEK2OCR) {
continue; // TODO tensor shapes
}
// FIXME some models are segfaulting with WebGPU:
#ifdef GGML_USE_WEBGPU
if (arch == LLM_ARCH_QWEN3NEXT || arch == LLM_ARCH_QWEN35 || arch == LLM_ARCH_QWEN35MOE || arch == LLM_ARCH_KIMI_LINEAR) {
continue;
}
#endif // GGML_USE_WEBGPU
const bool encode = arch == LLM_ARCH_T5;
const bool encode = arch == LLM_ARCH_T5 || arch == LLM_ARCH_DREAM || arch == LLM_ARCH_LLADA || arch == LLM_ARCH_LLADA_MOE || arch == LLM_ARCH_RND1;
for (bool moe : {false, true}) {
if (moe && !moe_implemented(arch)) {
continue;
@ -492,50 +519,64 @@ static int test_backends(const llm_arch target_arch, const size_t seed, const gg
if (!moe && moe_mandatory(arch)) {
continue;
}
const std::string config_name = moe ? "MoE" : "Dense";
gguf_context_ptr gguf_ctx = get_gguf_ctx(arch, moe);
auto model_and_ctx_cpu = get_model_and_ctx(gguf_ctx.get(), nullptr, seed, {});
const std::vector<float> logits_cpu = get_logits(model_and_ctx_cpu.first.get(), model_and_ctx_cpu.second.get(), tokens, encode);
for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
ggml_backend_dev_t dev = ggml_backend_dev_get(i);
if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_CPU) {
continue;
}
auto model_and_ctx_dev = get_model_and_ctx(gguf_ctx.get(), nullptr, seed, {dev});
std::string config_name = moe ? "MoE" : "Dense";
const std::vector<float> logits_dev = get_logits(model_and_ctx_dev.first.get(), model_and_ctx_dev.second.get(), tokens, encode);
const double nmse_val = nmse(logits_cpu, logits_dev);
char nmse_str[10];
snprintf(nmse_str, sizeof(nmse_str), "%.2e", nmse_val);
std::string status_nmse = "\033[1;32mOK\033[0m";
if (nmse_val > 1e-4) {
all_ok = false;
status_nmse = "\033[1;31mFAIL\033[0m";
}
std::pair<llama_model_ptr, llama_context_ptr> model_and_ctx_cpu;
std::vector<float> logits_cpu;
for (device_config & dc : dev_configs) {
std::pair<llama_model_ptr, llama_context_ptr> model_and_ctx_dev;
std::vector<float> logits_dev;
std::string status_nmse = "\033[1;33mSKIP\033[0m";
std::string status_roundtrip = "\033[1;33mSKIP\033[0m";
FILE * file = tmpfile(); // Can be null on Windows without administrator privileges.
if (file != nullptr && llama_model_saver_supports_arch(arch)) {
llama_model_saver ms = llama_model_saver(model_and_ctx_dev.first.get());
ms.add_kv_from_model();
ms.add_tensors_from_model();
ms.save(file);
rewind(file);
auto model_and_ctx_roundtrip = get_model_and_ctx(nullptr, file, seed, {dev});
const std::vector<float> logits_roundtrip = get_logits(
model_and_ctx_roundtrip.first.get(), model_and_ctx_roundtrip.second.get(), tokens, encode);
status_roundtrip = "\033[1;32mOK\033[0m";
GGML_ASSERT(logits_roundtrip.size() == logits_dev.size());
for (size_t i = 0; i < logits_roundtrip.size(); i++) {
if (logits_roundtrip[i] != logits_dev[i]) {
char nmse_str[12] = {0};
bool skip = !arch_supported(arch) || (dc.split_mode == LLAMA_SPLIT_MODE_TENSOR && dc.devs.empty());
#if defined(GGML_USE_WEBGPU)
skip = true; // FIXME
#endif // GGML_USE_WEBGPU
if (!skip) {
if (logits_cpu.empty()) {
model_and_ctx_cpu = get_model_and_ctx(gguf_ctx.get(), nullptr, seed, {}, LLAMA_SPLIT_MODE_LAYER, encode);
logits_cpu = get_logits(model_and_ctx_cpu.first.get(), model_and_ctx_cpu.second.get(), tokens, encode);
}
if (dc.split_mode != LLAMA_SPLIT_MODE_TENSOR || llm_arch_supports_sm_tensor(arch)) {
model_and_ctx_dev = get_model_and_ctx(gguf_ctx.get(), nullptr, seed, dc.devs, dc.split_mode, encode);
logits_dev = get_logits(model_and_ctx_dev.first.get(), model_and_ctx_dev.second.get(), tokens, encode);
const double nmse_val = nmse(logits_cpu, logits_dev);
snprintf(nmse_str, sizeof(nmse_str), "(%.2e)", nmse_val);
status_nmse = "\033[1;32mOK\033[0m";
if (nmse_val > 1e-4) {
all_ok = false;
status_roundtrip = "\033[1;31mFAIL\033[0m";
break;
status_nmse = "\033[1;31mFAIL\033[0m";
}
}
FILE * file = tmpfile(); // Can be null on Windows without administrator privileges.
// FIXME: when adding a tensor to a gguf_context a copy is made; this changes the pointer which the meta backend
// in turn uses to map the tensors to their simple equivalents, so the two are fundamentally incompatible
if (file != nullptr && llama_model_saver_supports_arch(arch) && dc.split_mode != LLAMA_SPLIT_MODE_TENSOR) {
GGML_ASSERT(model_and_ctx_dev.first && model_and_ctx_dev.second);
llama_model_saver ms = llama_model_saver(model_and_ctx_dev.first.get());
ms.add_kv_from_model();
ms.add_tensors_from_model();
ms.save(file);
rewind(file);
auto model_and_ctx_roundtrip = get_model_and_ctx(nullptr, file, seed, dc.devs, dc.split_mode, encode);
const std::vector<float> logits_roundtrip = get_logits(
model_and_ctx_roundtrip.first.get(), model_and_ctx_roundtrip.second.get(), tokens, encode);
status_roundtrip = "\033[1;32mOK\033[0m";
GGML_ASSERT(logits_roundtrip.size() == logits_dev.size());
for (size_t i = 0; i < logits_roundtrip.size(); i++) {
if (logits_roundtrip[i] != logits_dev[i]) {
all_ok = false;
status_roundtrip = "\033[1;31mFAIL\033[0m";
break;
}
}
}
}
printf("|%15s|%30s|%6s|%15s (%8s)|%20s|\n", llm_arch_name(arch), ggml_backend_dev_description(dev),
printf("|%16s|%30s|%6s|%15s %10s|%20s|\n", llm_arch_name(arch), dc.label.c_str(),
config_name.c_str(), status_nmse.c_str(), nmse_str, status_roundtrip.c_str());
}
}


@@ -260,6 +260,8 @@ static const char * split_mode_str(llama_split_mode mode) {
return "layer";
case LLAMA_SPLIT_MODE_ROW:
return "row";
case LLAMA_SPLIT_MODE_TENSOR:
return "tensor";
default:
GGML_ABORT("invalid split mode");
}
@@ -444,7 +446,7 @@ static void print_usage(int /* argc */, char ** argv) {
printf(" --poll <0...100> (default: %s)\n", join(cmd_params_defaults.poll, ",").c_str());
printf(" -ngl, --n-gpu-layers <n> (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str());
printf(" -ncmoe, --n-cpu-moe <n> (default: %s)\n", join(cmd_params_defaults.n_cpu_moe, ",").c_str());
printf(" -sm, --split-mode <none|layer|row> (default: %s)\n", join(transform_to_str(cmd_params_defaults.split_mode, split_mode_str), ",").c_str());
printf(" -sm, --split-mode <none|layer|row|tensor> (default: %s)\n", join(transform_to_str(cmd_params_defaults.split_mode, split_mode_str), ",").c_str());
printf(" -mg, --main-gpu <i> (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str());
printf(" -nkvo, --no-kv-offload <0|1> (default: %s)\n", join(cmd_params_defaults.no_kv_offload, ",").c_str());
printf(" -fa, --flash-attn <0|1> (default: %s)\n", join(cmd_params_defaults.flash_attn, ",").c_str());
@@ -743,6 +745,8 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
mode = LLAMA_SPLIT_MODE_LAYER;
} else if (m == "row") {
mode = LLAMA_SPLIT_MODE_ROW;
} else if (m == "tensor") {
mode = LLAMA_SPLIT_MODE_TENSOR;
} else {
invalid_param = true;
break;
@@ -1768,7 +1772,7 @@ struct markdown_printer : public printer {
return 6;
}
if (field == "split_mode") {
return 5;
return 6;
}
if (field == "flash_attn") {
return 2;


@@ -2049,11 +2049,16 @@ int main(int argc, char ** argv) {
auto * model = llama_init->model();
auto * ctx = llama_init->context();
if (model == NULL) {
if (model == nullptr) {
LOG_ERR("%s: unable to load model\n", __func__);
return 1;
}
if (ctx == nullptr) {
LOG_ERR("%s: failed to create context\n", __func__);
return 1;
}
const int n_ctx_train = llama_model_n_ctx_train(model);
if (params.n_ctx > n_ctx_train) {