Merge 39b96f8fe1 into 06bf3796f4 (commit dceeeef926)
@@ -2331,19 +2331,21 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     ).set_env("LLAMA_ARG_N_GPU_LAYERS"));
     add_opt(common_arg(
-        {"-sm", "--split-mode"}, "{none,layer,row}",
+        {"-sm", "--split-mode"}, "{none,layer,row,tensor}",
         "how to split the model across multiple GPUs, one of:\n"
         "- none: use one GPU only\n"
-        "- layer (default): split layers and KV across GPUs\n"
-        "- row: split rows across GPUs",
+        "- layer (default): split layers and KV across GPUs (pipelined)\n"
+        "- row: split weight across GPUs by rows (parallelized)\n"
+        "- tensor: split weights and KV across GPUs (parallelized)",
        [](common_params & params, const std::string & value) {
-            std::string arg_next = value;
-            if (arg_next == "none") {
+            if (value == "none") {
                params.split_mode = LLAMA_SPLIT_MODE_NONE;
-            } else if (arg_next == "layer") {
+            } else if (value == "layer") {
                params.split_mode = LLAMA_SPLIT_MODE_LAYER;
-            } else if (arg_next == "row") {
+            } else if (value == "row") {
                params.split_mode = LLAMA_SPLIT_MODE_ROW;
+            } else if (value == "tensor") {
+                params.split_mode = LLAMA_SPLIT_MODE_TENSOR;
            } else {
                throw std::invalid_argument("invalid value");
            }
@@ -68,7 +68,7 @@ extern "C" {
     GGML_API void ggml_backend_buffer_reset (ggml_backend_buffer_t buffer);

     // tensor copy between different backends
-    GGML_API void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst);
+    GGML_API void ggml_backend_tensor_copy(const struct ggml_tensor * src, struct ggml_tensor * dst);

     //
     // Backend (stream)
@@ -109,7 +109,18 @@ extern "C" {
     // the copy is performed after all the currently queued operations in backend_src
     // backend_dst will wait for the copy to complete before performing other operations
     // automatic fallback to sync copy if async is not supported
-    GGML_API void ggml_backend_tensor_copy_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, struct ggml_tensor * src, struct ggml_tensor * dst);
+    GGML_API void ggml_backend_tensor_copy_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, const struct ggml_tensor * src, struct ggml_tensor * dst);
+
+    // asynchronous tensor shuffle
+    // - src1, dst1 belong to backend_1
+    // - src2, dst2 belong to backend_2
+    // - src1 is copied to dst2
+    // - src2 is copied to dst1
+    // - both backends wait until both copies have completed
+    GGML_API void ggml_backend_tensor_shfl_async(
+        ggml_backend_t backend_1, ggml_backend_t backend_2,
+        const struct ggml_tensor * src1, const struct ggml_tensor * src2,
+        struct ggml_tensor * dst1, struct ggml_tensor * dst2);

     GGML_API ggml_backend_dev_t ggml_backend_get_device(ggml_backend_t backend);
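The shuffle semantics above (src1 goes to dst2, src2 goes to dst1, both backends wait) can be exercised with a call such as the following minimal sketch; it is not part of the patch, and backend_a/backend_b and the four tensors are assumed to already exist and to be allocated on the corresponding backends:

    // r_a (on backend_a) receives the contents of t_b, r_b (on backend_b) receives t_a
    ggml_backend_tensor_shfl_async(backend_a, backend_b, /*src1 =*/ t_a, /*src2 =*/ t_b, /*dst1 =*/ r_a, /*dst2 =*/ r_b);

    // the exchange is asynchronous; synchronize both backends before reading the results on the host
    ggml_backend_synchronize(backend_a);
    ggml_backend_synchronize(backend_b);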
@@ -135,7 +146,9 @@ extern "C" {
         // integrated GPU device using host memory
         GGML_BACKEND_DEVICE_TYPE_IGPU,
         // accelerator devices intended to be used together with the CPU backend (e.g. BLAS or AMX)
-        GGML_BACKEND_DEVICE_TYPE_ACCEL
+        GGML_BACKEND_DEVICE_TYPE_ACCEL,
+        // "meta" device wrapping multiple other devices for tensor parallelism
+        GGML_BACKEND_DEVICE_TYPE_META,
     };

     // functionality supported by the device
@@ -211,6 +224,52 @@ extern "C" {
     };
     typedef struct ggml_backend_feature * (*ggml_backend_get_features_t)(ggml_backend_reg_t reg);

+    //
+    // Meta backend
+    //
+
+    enum ggml_backend_meta_split_state {
+        // tensor split by tensor dimensions:
+        GGML_BACKEND_SPLIT_STATE_BY_NE0 = 0,
+        GGML_BACKEND_SPLIT_STATE_BY_NE1 = 1,
+        GGML_BACKEND_SPLIT_STATE_BY_NE2 = 2,
+        GGML_BACKEND_SPLIT_STATE_BY_NE3 = 3,
+
+        GGML_BACKEND_SPLIT_STATE_MIRRORED = 10, // all values on all backends
+        GGML_BACKEND_SPLIT_STATE_PARTIAL = 11,  // each backend has a partial sum
+
+        // for internal bookkeeping only:
+        GGML_BACKEND_SPLIT_STATE_NONE = 98,
+        GGML_BACKEND_SPLIT_STATE_UNKNOWN = 99,
+    };
+
+    // function to assign split states for statically allocated tensors, compute tensor split states will be assigned to be compatible:
+    typedef enum ggml_backend_meta_split_state (*ggml_backend_meta_get_split_state_t)(const struct ggml_tensor * tensor, void * userdata);
+
+
+    GGML_API bool ggml_backend_dev_is_meta(ggml_backend_dev_t dev);
+    GGML_API size_t ggml_backend_meta_dev_n_devs(ggml_backend_dev_t meta_dev);
+    GGML_API ggml_backend_dev_t ggml_backend_meta_dev_simple_dev(ggml_backend_dev_t meta_dev, size_t index);
+
+    // create a new meta device from "simple" devices, meta buffer type/buffer/backend is then derived from this:
+    GGML_API ggml_backend_dev_t ggml_backend_meta_device(
+        ggml_backend_dev_t * devs, size_t n_devs, ggml_backend_meta_get_split_state_t get_split_state, void * get_split_state_ud);
+
+    GGML_API bool ggml_backend_buft_is_meta(ggml_backend_buffer_type_t buft);
+    GGML_API size_t ggml_backend_meta_buft_n_bufts(ggml_backend_buffer_type_t meta_buft);
+    GGML_API ggml_backend_buffer_type_t ggml_backend_meta_buft_simple_buft(ggml_backend_buffer_type_t meta_buft, size_t index);
+
+    GGML_API bool ggml_backend_buffer_is_meta(ggml_backend_buffer_t buf);
+    GGML_API size_t ggml_backend_meta_buffer_n_bufs(ggml_backend_buffer_t meta_buf);
+    GGML_API ggml_backend_buffer_t ggml_backend_meta_buffer_simple_buffer(ggml_backend_buffer_t meta_buf, size_t index);
+    GGML_API struct ggml_tensor * ggml_backend_meta_buffer_simple_tensor(const struct ggml_tensor * tensor, size_t index);
+
+    GGML_API bool ggml_backend_is_meta(ggml_backend_t backend);
+    GGML_API size_t ggml_backend_meta_n_backends(ggml_backend_t meta_backend);
+    GGML_API ggml_backend_t ggml_backend_meta_simple_backend(ggml_backend_t meta_backend, size_t index);
+
+    GGML_API enum ggml_backend_meta_split_state ggml_backend_meta_get_split_state(const struct ggml_tensor * tensor, bool assume_sync);
+
     //
     // Backend registry
     //
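A rough sketch of how these meta-device entry points fit together, assuming at least two registered devices; the callback and helper below are illustrative and not part of the patch:

    // illustrative split policy: split 2D weights along ne1, mirror everything else
    static enum ggml_backend_meta_split_state example_split_state(const struct ggml_tensor * tensor, void * userdata) {
        GGML_UNUSED(userdata);
        return tensor->ne[1] > 1 ? GGML_BACKEND_SPLIT_STATE_BY_NE1 : GGML_BACKEND_SPLIT_STATE_MIRRORED;
    }

    static ggml_backend_dev_t example_make_meta_device(void) {
        // wrap the first two registered devices in a single meta device
        ggml_backend_dev_t devs[2] = { ggml_backend_dev_get(0), ggml_backend_dev_get(1) };
        ggml_backend_dev_t meta = ggml_backend_meta_device(devs, 2, example_split_state, /*get_split_state_ud =*/ NULL);
        GGML_ASSERT(ggml_backend_dev_is_meta(meta) && ggml_backend_meta_dev_n_devs(meta) == 2);
        return meta;
    }

The meta buffer type, buffers and backend derived from such a device can then be inspected with the matching ggml_backend_meta_buft_* / ggml_backend_meta_buffer_* / ggml_backend_meta_* accessors declared above.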
@@ -200,6 +200,7 @@ add_library(ggml-base
            ggml.cpp
            ggml-alloc.c
            ggml-backend.cpp
+           ggml-backend-meta.cpp
            ggml-opt.cpp
            ggml-threading.cpp
            ggml-threading.h
@@ -1,5 +1,6 @@
 #include "ggml-alloc.h"
+#include "ggml-backend-impl.h"
 #include "ggml-backend.h"
 #include "ggml.h"
 #include "ggml-impl.h"
 #include <assert.h>
@@ -1240,6 +1241,9 @@ size_t ggml_backend_alloc_ctx_tensors_from_buft_size(struct ggml_context * ctx,
 }

 ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft) {
+    if (ggml_backend_buft_is_meta(buft)) {
+        return ggml_backend_meta_alloc_ctx_tensors_from_buft(ctx, buft);
+    }
     size_t nbytes_total = 0;
     return ggml_backend_alloc_ctx_tensors_from_buft_impl(ctx, buft, &nbytes_total, /*no_alloc =*/ false);
 }
@@ -2,7 +2,9 @@

 // ggml-backend internal header

+#include "ggml-alloc.h"
 #include "ggml-backend.h"
+#include "ggml.h"

 #ifdef __cplusplus
 extern "C" {
@@ -90,9 +92,16 @@ extern "C" {
         void (*free)(ggml_backend_t backend);

         // (optional) asynchronous tensor data access
-        void (*set_tensor_async)(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
-        void (*get_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
+        void (*set_tensor_async)   (ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
+        void (*get_tensor_async)   (ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
+        void (*set_tensor_2d_async)(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size, size_t n_copies, size_t stride_tensor, size_t stride_data);
+        void (*get_tensor_2d_async)(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size, size_t n_copies, size_t stride_tensor, size_t stride_data);
         bool (*cpy_tensor_async)(ggml_backend_t backend_src, ggml_backend_t backend_dst, const struct ggml_tensor * src, struct ggml_tensor * dst);
+        bool (*shfl_tensor_async)(ggml_backend_t backend_1, ggml_backend_t backend_2,
+            const struct ggml_tensor * src1, const struct ggml_tensor * src2, struct ggml_tensor * dst1, struct ggml_tensor * dst2);
+
+        // (optional) backend-specific AllReduce operation for meta backend
+        bool (*allreduce_tensor_async)(ggml_backend_t * backends, struct ggml_tensor ** tensors, size_t n_backends);

         // (optional) complete all pending operations (required if the backend supports async operations)
         void (*synchronize)(ggml_backend_t backend);
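The new set/get_tensor_2d_async hooks are not documented in this hunk; judging only by the parameter names they appear to describe n_copies strided copies of `size` bytes each. A hedged sketch of a host-side fallback under that assumed meaning (names and semantics assumed, not taken from the patch):

    // assumed semantics: n_copies chunks of `size` bytes, advancing by stride_tensor
    // inside the tensor and by stride_data inside the host buffer
    static void example_set_tensor_2d(ggml_backend_t backend, struct ggml_tensor * tensor,
            const void * data, size_t offset, size_t size,
            size_t n_copies, size_t stride_tensor, size_t stride_data) {
        for (size_t i = 0; i < n_copies; i++) {
            ggml_backend_tensor_set_async(backend, tensor, (const char *) data + i*stride_data, offset + i*stride_tensor, size);
        }
    }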
@@ -250,6 +259,9 @@ extern "C" {
 #    define GGML_BACKEND_DL_SCORE_IMPL(score_fn)
 #endif

+    // temporary workaround to statically allocate tensors from a context in a deduplicated way:
+    GGML_API struct ggml_backend_buffer * ggml_backend_meta_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft);
+
 #ifdef __cplusplus
 }
 #endif
(File diff suppressed because it is too large.)
@@ -123,7 +123,7 @@ size_t ggml_backend_buffer_get_size(ggml_backend_buffer_t buffer) {
 void * ggml_backend_buffer_get_base(ggml_backend_buffer_t buffer) {
     GGML_ASSERT(buffer);
     // get_base is optional if the buffer is zero-sized
-    if (buffer->size == 0) {
+    if (!ggml_backend_buffer_is_meta(buffer) && buffer->size == 0) {
         return NULL;
     }
@@ -388,7 +388,7 @@ ggml_backend_dev_t ggml_backend_get_device(ggml_backend_t backend) {

 // backend copy

-void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst) {
+void ggml_backend_tensor_copy(const struct ggml_tensor * src, struct ggml_tensor * dst) {
     GGML_ASSERT(ggml_are_same_layout(src, dst) && "cannot copy tensors with different layouts");

     if (src == dst) {
@@ -402,7 +402,7 @@ void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst
     } else if (!ggml_backend_buffer_copy_tensor(src, dst)) {
 #ifndef NDEBUG
         GGML_LOG_DEBUG("%s: warning: slow copy from %s to %s\n", __func__, ggml_backend_buffer_name(src->buffer), ggml_backend_buffer_name(dst->buffer));
-#endif
+#endif // NDEBUG
         size_t nbytes = ggml_nbytes(src);
         void * data = malloc(nbytes);
         ggml_backend_tensor_get(src, data, 0, nbytes);
@@ -411,7 +411,7 @@ void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst
     }
 }

-void ggml_backend_tensor_copy_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, struct ggml_tensor * src, struct ggml_tensor * dst) {
+void ggml_backend_tensor_copy_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, const struct ggml_tensor * src, struct ggml_tensor * dst) {
     GGML_ASSERT(ggml_are_same_layout(src, dst) && "cannot copy tensors with different layouts");

     if (src == dst) {
@@ -432,6 +432,20 @@ void ggml_backend_tensor_copy_async(ggml_backend_t backend_src, ggml_backend_t b
         ggml_backend_tensor_copy(src, dst);
     }

+void ggml_backend_tensor_shfl_async(
+        ggml_backend_t backend_1, ggml_backend_t backend_2,
+        const struct ggml_tensor * src1, const struct ggml_tensor * src2,
+        struct ggml_tensor * dst1, struct ggml_tensor * dst2) {
+    GGML_ASSERT(ggml_are_same_layout(src1, dst1) && "cannot shuffle tensors with different layouts");
+    GGML_ASSERT(ggml_are_same_layout(src2, dst2) && "cannot shuffle tensors with different layouts");
+    if (backend_1->iface.shfl_tensor_async != NULL) {
+        if (backend_1->iface.shfl_tensor_async(backend_1, backend_2, src1, src2, dst1, dst2)) {
+            return;
+        }
+    }
+    ggml_backend_tensor_copy_async(backend_1, backend_2, src1, dst2);
+    ggml_backend_tensor_copy_async(backend_2, backend_1, src2, dst1);
+}
 // events

 ggml_backend_event_t ggml_backend_event_new(ggml_backend_dev_t device) {
@@ -500,6 +514,7 @@ enum ggml_backend_dev_type ggml_backend_dev_type(ggml_backend_dev_t device) {
 }

 void ggml_backend_dev_get_props(ggml_backend_dev_t device, struct ggml_backend_dev_props * props) {
+    GGML_ASSERT(device);
     memset(props, 0, sizeof(*props));
     device->iface.get_props(device, props);
 }
@@ -1899,8 +1914,9 @@ enum ggml_status ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct
     GGML_ASSERT(tensor->data == NULL);
     GGML_ASSERT(tensor->view_src == NULL);
     GGML_ASSERT(addr >= ggml_backend_buffer_get_base(buffer));
-    GGML_ASSERT((char *)addr + ggml_backend_buffer_get_alloc_size(buffer, tensor) <=
-                (char *)ggml_backend_buffer_get_base(buffer) + ggml_backend_buffer_get_size(buffer));
+    GGML_ASSERT(ggml_backend_buffer_is_meta(buffer) ||
+        (char *) addr + ggml_backend_buffer_get_alloc_size(buffer, tensor) <=
+        (char *) ggml_backend_buffer_get_base(buffer) + ggml_backend_buffer_get_size(buffer));

     tensor->buffer = buffer;
     tensor->data = addr;
@@ -260,8 +260,12 @@ static struct ggml_backend_i blas_backend_i = {
     /* .get_name = */ ggml_backend_blas_get_name,
     /* .free = */ ggml_backend_blas_free,
     /* .set_tensor_async = */ NULL,
+    /* .get_tensor_2d_async = */ NULL,
+    /* .set_tensor_2d_async = */ NULL,
     /* .get_tensor_async = */ NULL,
     /* .cpy_tensor_async = */ NULL,
+    /* .shfl_tensor_async = */ NULL,
+    /* .allreduce_tensor_async = */ NULL,
     /* .synchronize = */ NULL,
     /* .graph_plan_create = */ NULL,
     /* .graph_plan_free = */ NULL,
@@ -2582,7 +2582,11 @@ static const ggml_backend_i ggml_backend_cann_interface = {
     /* .free = */ ggml_backend_cann_free,
     /* .set_tensor_async = */ ggml_backend_cann_set_tensor_async,
     /* .get_tensor_async = */ ggml_backend_cann_get_tensor_async,
+    /* .get_tensor_2d_async = */ NULL,
+    /* .set_tensor_2d_async = */ NULL,
     /* .cpy_tensor_async = */ ggml_backend_cann_cpy_tensor_async,
+    /* .shfl_tensor_async = */ NULL,
+    /* .allreduce_tensor_async = */ NULL,
     /* .synchronize = */ ggml_backend_cann_synchronize,
     /* .graph_plan_create = */ NULL,
     /* .graph_plan_free = */ NULL,
@@ -195,7 +195,11 @@ static const struct ggml_backend_i ggml_backend_cpu_i = {
     /* .free = */ ggml_backend_cpu_free,
     /* .set_tensor_async = */ NULL,
     /* .get_tensor_async = */ NULL,
+    /* .get_tensor_2d_async = */ NULL,
+    /* .set_tensor_2d_async = */ NULL,
     /* .cpy_tensor_async = */ NULL,
+    /* .shfl_tensor_async = */ NULL,
+    /* .allreduce_tensor_async = */ NULL,
     /* .synchronize = */ NULL,
     /* .graph_plan_create = */ ggml_backend_cpu_graph_plan_create,
     /* .graph_plan_free = */ ggml_backend_cpu_graph_plan_free,
@@ -2804,21 +2804,21 @@ static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_src, ggml_
         return false;
     }

-    if (!ggml_backend_buffer_is_cuda(src->buffer) || !ggml_backend_buffer_is_cuda(dst->buffer)) {
+    if (!ggml_backend_buffer_is_cuda(buf_src) || !ggml_backend_buffer_is_cuda(buf_dst)) {
         return false;
     }

     // device -> device copy
-    ggml_backend_cuda_context * cuda_ctx_src = (ggml_backend_cuda_context *)backend_src->context;
-    ggml_backend_cuda_context * cuda_ctx_dst = (ggml_backend_cuda_context *)backend_dst->context;
+    ggml_backend_cuda_context * cuda_ctx_src = (ggml_backend_cuda_context *) backend_src->context;
+    ggml_backend_cuda_context * cuda_ctx_dst = (ggml_backend_cuda_context *) backend_dst->context;

-    ggml_backend_cuda_buffer_context * buf_ctx_src = (ggml_backend_cuda_buffer_context *)buf_src->context;
-    ggml_backend_cuda_buffer_context * buf_ctx_dst = (ggml_backend_cuda_buffer_context *)buf_dst->context;
+    ggml_backend_cuda_buffer_context * buf_ctx_src = (ggml_backend_cuda_buffer_context *) buf_src->context;
+    ggml_backend_cuda_buffer_context * buf_ctx_dst = (ggml_backend_cuda_buffer_context *) buf_dst->context;

     if (cuda_ctx_src->device != buf_ctx_src->device || cuda_ctx_dst->device != buf_ctx_dst->device) {
 #ifndef NDEBUG
         GGML_LOG_DEBUG("%s: backend and buffer devices do not match\n", __func__);
-#endif
+#endif // NDEBUG
         return false;
     }
@@ -2831,7 +2831,7 @@ static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_src, ggml_
         return false;
 #else
         CUDA_CHECK(cudaMemcpyPeerAsync(dst->data, cuda_ctx_dst->device, src->data, cuda_ctx_src->device, ggml_nbytes(dst), cuda_ctx_src->stream()));
-#endif
+#endif // GGML_CUDA_NO_PEER_COPY
     }

     // record event on src stream after the copy
@@ -2851,6 +2851,77 @@ static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_src, ggml_
     return true;
 }

+static bool ggml_backend_cuda_shfl_tensor_async(
+        ggml_backend_t backend_1, ggml_backend_t backend_2, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst1, ggml_tensor * dst2) {
+    ggml_backend_buffer_t buf_src1 = src1->view_src ? src1->view_src->buffer : src1->buffer;
+    ggml_backend_buffer_t buf_src2 = src2->view_src ? src2->view_src->buffer : src2->buffer;
+    ggml_backend_buffer_t buf_dst1 = dst1->view_src ? dst1->view_src->buffer : dst1->buffer;
+    ggml_backend_buffer_t buf_dst2 = dst2->view_src ? dst2->view_src->buffer : dst2->buffer;
+
+    if (!ggml_backend_is_cuda(backend_1) || !ggml_backend_is_cuda(backend_2)) {
+        return false;
+    }
+
+    if (!ggml_backend_buffer_is_cuda(buf_src1) || !ggml_backend_buffer_is_cuda(buf_src2) ||
+        !ggml_backend_buffer_is_cuda(buf_dst1) || !ggml_backend_buffer_is_cuda(buf_dst2)) {
+        return false;
+    }
+
+    // device -> device copy
+    ggml_backend_cuda_context * cuda_ctx_1 = (ggml_backend_cuda_context *) backend_1->context;
+    ggml_backend_cuda_context * cuda_ctx_2 = (ggml_backend_cuda_context *) backend_2->context;
+
+    ggml_backend_cuda_buffer_context * buf_ctx_src1 = (ggml_backend_cuda_buffer_context *) buf_src1->context;
+    ggml_backend_cuda_buffer_context * buf_ctx_src2 = (ggml_backend_cuda_buffer_context *) buf_src2->context;
+    ggml_backend_cuda_buffer_context * buf_ctx_dst1 = (ggml_backend_cuda_buffer_context *) buf_dst1->context;
+    ggml_backend_cuda_buffer_context * buf_ctx_dst2 = (ggml_backend_cuda_buffer_context *) buf_dst2->context;
+
+    if (cuda_ctx_1->device != buf_ctx_src1->device || cuda_ctx_2->device != buf_ctx_src2->device ||
+        cuda_ctx_1->device != buf_ctx_dst1->device || cuda_ctx_2->device != buf_ctx_dst2->device) {
+#ifndef NDEBUG
+        GGML_LOG_DEBUG("%s: backend and buffer devices do not match\n", __func__);
+#endif // NDEBUG
+        return false;
+    }
+
+    if (backend_1 != backend_2) {
+        // Copies under control of src streams:
+        if (cuda_ctx_1->device == cuda_ctx_2->device) {
+            CUDA_CHECK(cudaMemcpyAsync(dst2->data, src1->data, ggml_nbytes(dst2), cudaMemcpyDeviceToDevice, cuda_ctx_1->stream()));
+            CUDA_CHECK(cudaMemcpyAsync(dst1->data, src2->data, ggml_nbytes(dst1), cudaMemcpyDeviceToDevice, cuda_ctx_2->stream()));
+        } else {
+#ifdef GGML_CUDA_NO_PEER_COPY
+            return false;
+#else
+            CUDA_CHECK(cudaMemcpyPeerAsync(dst2->data, cuda_ctx_2->device, src1->data, cuda_ctx_1->device, ggml_nbytes(dst2), cuda_ctx_1->stream()));
+            CUDA_CHECK(cudaMemcpyPeerAsync(dst1->data, cuda_ctx_1->device, src2->data, cuda_ctx_2->device, ggml_nbytes(dst1), cuda_ctx_2->stream()));
+#endif // GGML_CUDA_NO_PEER_COPY
+        }
+
+        // Record event on src streams after the copy:
+        if (!cuda_ctx_1->copy_event) {
+            ggml_cuda_set_device(cuda_ctx_1->device);
+            CUDA_CHECK(cudaEventCreateWithFlags(&cuda_ctx_1->copy_event, cudaEventDisableTiming));
+        }
+        if (!cuda_ctx_2->copy_event) {
+            ggml_cuda_set_device(cuda_ctx_2->device);
+            CUDA_CHECK(cudaEventCreateWithFlags(&cuda_ctx_2->copy_event, cudaEventDisableTiming));
+        }
+
+        CUDA_CHECK(cudaEventRecord(cuda_ctx_1->copy_event, cuda_ctx_1->stream()));
+        CUDA_CHECK(cudaEventRecord(cuda_ctx_2->copy_event, cuda_ctx_2->stream()));
+
+        // Wait on dst stream for the copies to complete:
+        CUDA_CHECK(cudaStreamWaitEvent(cuda_ctx_2->stream(), cuda_ctx_1->copy_event, 0));
+        CUDA_CHECK(cudaStreamWaitEvent(cuda_ctx_1->stream(), cuda_ctx_2->copy_event, 0));
+    } else {
+        // srcs and dsts are on the same backend:
+        CUDA_CHECK(cudaMemcpyAsync(dst2->data, src1->data, ggml_nbytes(dst2), cudaMemcpyDeviceToDevice, cuda_ctx_1->stream()));
+        CUDA_CHECK(cudaMemcpyAsync(dst1->data, src2->data, ggml_nbytes(dst1), cudaMemcpyDeviceToDevice, cuda_ctx_2->stream()));
+    }
+    return true;
+}
+
 static void ggml_backend_cuda_synchronize(ggml_backend_t backend) {
     ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
@@ -4250,7 +4321,11 @@ static const ggml_backend_i ggml_backend_cuda_interface = {
     /* .free = */ ggml_backend_cuda_free,
     /* .set_tensor_async = */ ggml_backend_cuda_set_tensor_async,
     /* .get_tensor_async = */ ggml_backend_cuda_get_tensor_async,
+    /* .get_tensor_2d_async = */ NULL,
+    /* .set_tensor_2d_async = */ NULL,
     /* .cpy_tensor_async = */ ggml_backend_cuda_cpy_tensor_async,
+    /* .shfl_tensor_async = */ ggml_backend_cuda_shfl_tensor_async,
+    /* .allreduce_tensor_async = */ NULL,
     /* .synchronize = */ ggml_backend_cuda_synchronize,
     /* .graph_plan_create = */ NULL,
     /* .graph_plan_free = */ NULL,
@@ -2756,7 +2756,11 @@ static struct ggml_backend_i hexagon_backend_i = {
     /* .free = */ ggml_backend_hexagon_free,
     /* .set_tensor_async = */ NULL,
     /* .get_tensor_async = */ NULL,
+    /* .get_tensor_2d_async = */ NULL,
+    /* .set_tensor_2d_async = */ NULL,
     /* .cpy_tensor_async = */ NULL,
+    /* .shfl_tensor_async = */ NULL,
+    /* .allreduce_tensor_async = */ NULL,
     /* .synchronize = */ ggml_backend_hexagon_synchronize,
     /* .graph_plan_create = */ NULL,
     /* .graph_plan_free = */ NULL,
@@ -563,7 +563,11 @@ static ggml_backend_i ggml_backend_metal_i = {
     /* .free = */ ggml_backend_metal_free,
     /* .set_tensor_async = */ ggml_backend_metal_set_tensor_async,
     /* .get_tensor_async = */ ggml_backend_metal_get_tensor_async,
+    /* .get_tensor_2d_async = */ NULL,
+    /* .set_tensor_2d_async = */ NULL,
     /* .cpy_tensor_async = */ ggml_backend_metal_cpy_tensor_async, // only needed for multi-GPU setups
+    /* .shfl_tensor_async = */ NULL,
+    /* .allreduce_tensor_async = */ NULL,
     /* .synchronize = */ ggml_backend_metal_synchronize,
     /* .graph_plan_create = */ NULL,
     /* .graph_plan_free = */ NULL,
@@ -3478,6 +3478,10 @@ static ggml_backend_i ggml_backend_opencl_i = {
     /* .set_tensor_async = */ NULL, /* ggml_backend_opencl_set_tensor_async */
     /* .get_tensor_async = */ NULL, /* ggml_backend_opencl_get_tensor_async */
     /* .cpy_tensor_async = */ NULL, /* ggml_backend_opencl_cpy_tensor_async */
+    /* .get_tensor_2d_async = */ NULL,
+    /* .set_tensor_2d_async = */ NULL,
+    /* .shfl_tensor_async = */ NULL,
+    /* .allreduce_tensor_async = */ NULL,
     /* .synchronize = */ ggml_backend_opencl_synchronize,
     /* .graph_plan_create = */ NULL,
     /* .graph_plan_free = */ NULL,
@@ -893,6 +893,10 @@ static ggml_backend_i ggml_backend_rpc_interface = {
     /* .set_tensor_async = */ NULL,
     /* .get_tensor_async = */ NULL,
     /* .cpy_tensor_async = */ NULL,
+    /* .get_tensor_2d_async = */ NULL,
+    /* .set_tensor_2d_async = */ NULL,
+    /* .shfl_tensor_async = */ NULL,
+    /* .allreduce_tensor_async = */ NULL,
     /* .synchronize = */ ggml_backend_rpc_synchronize,
     /* .graph_plan_create = */ NULL,
     /* .graph_plan_free = */ NULL,
@@ -4455,9 +4455,13 @@ static ggml_backend_i ggml_backend_sycl_interface = {
     /* .free = */ ggml_backend_sycl_free,
     /* .set_tensor_async = */ ggml_backend_sycl_set_tensor_async,
     /* .get_tensor_async = */ ggml_backend_sycl_get_tensor_async,
+    /* .get_tensor_2d_async = */ NULL,
+    /* .set_tensor_2d_async = */ NULL,
     /* .cpy_tensor_async = */ NULL, // ggml_backend_sycl_cpy_tensor_async,
                                     // // TODO: update for the new
                                     // interface
+    /* .shfl_tensor_async = */ NULL,
+    /* .allreduce_tensor_async = */ NULL,
     /* .synchronize = */ ggml_backend_sycl_synchronize,
     /* .graph_plan_create = */ NULL,
     /* .graph_plan_free = */ NULL,
@@ -34,7 +34,11 @@ static ggml_backend_i ggml_backend_remoting_interface = {
     /* .free = */ ggml_backend_remoting_free,
     /* .set_tensor_async = */ NULL, // ggml_backend_remoting_set_tensor_async,
     /* .get_tensor_async = */ NULL, // ggml_backend_remoting_get_tensor_async,
+    /* .get_tensor_2d_async = */ NULL,
+    /* .set_tensor_2d_async = */ NULL,
     /* .cpy_tensor_async = */ NULL, // ggml_backend_remoting_cpy_tensor_async,
+    /* .shfl_tensor_async = */ NULL,
+    /* .allreduce_tensor_async = */ NULL,
     /* .synchronize = */ NULL, // ggml_backend_remoting_synchronize,
     /* .graph_plan_create = */ NULL,
     /* .graph_plan_free = */ NULL,
@@ -14374,7 +14374,11 @@ static ggml_backend_i ggml_backend_vk_interface = {
     /* .free = */ ggml_backend_vk_free,
     /* .set_tensor_async = */ ggml_backend_vk_set_tensor_async,
     /* .get_tensor_async = */ ggml_backend_vk_get_tensor_async,
+    /* .get_tensor_2d_async = */ NULL,
+    /* .set_tensor_2d_async = */ NULL,
     /* .cpy_tensor_async = */ NULL, // ggml_backend_vk_cpy_tensor_async,
+    /* .shfl_tensor_async = */ NULL,
+    /* .allreduce_tensor_async = */ NULL,
     /* .synchronize = */ ggml_backend_vk_synchronize,
     /* .graph_plan_create = */ NULL,
     /* .graph_plan_free = */ NULL,
@@ -2140,7 +2140,11 @@ static ggml_backend_i ggml_backend_webgpu_i = {
     /* .free = */ ggml_backend_webgpu_free,
     /* .set_tensor_async = */ NULL,
     /* .get_tensor_async = */ NULL,
+    /* .get_tensor_2d_async = */ NULL,
+    /* .set_tensor_2d_async = */ NULL,
     /* .cpy_tensor_async = */ NULL,
+    /* .shfl_tensor_async = */ NULL,
+    /* .allreduce_tensor_async = */ NULL,
     /* .synchronize = */ NULL,
     /* .graph_plan_create = */ NULL,
     /* .graph_plan_free = */ NULL,
@@ -417,20 +417,24 @@ static enum ggml_status ggml_backend_zdnn_graph_compute(ggml_backend_t backend,
 }

 static ggml_backend_i ggml_backend_zdnn_i = {
-    /* .get_name = */ ggml_backend_zdnn_name,
-    /* .free = */ ggml_backend_zdnn_free,
-    /* .set_tensor_async = */ NULL,
-    /* .get_tensor_async = */ NULL,
-    /* .cpy_tensor_async = */ NULL,
-    /* .synchronize = */ NULL,
-    /* .graph_plan_create = */ NULL,
-    /* .graph_plan_free = */ NULL,
-    /* .graph_plan_update = */ NULL,
-    /* .graph_plan_compute = */ NULL,
-    /* .graph_compute = */ ggml_backend_zdnn_graph_compute,
-    /* .event_record = */ NULL,
-    /* .event_wait = */ NULL,
-    /* .graph_optimize = */ NULL,
+    /* .get_name = */ ggml_backend_zdnn_name,
+    /* .free = */ ggml_backend_zdnn_free,
+    /* .set_tensor_async = */ NULL,
+    /* .get_tensor_async = */ NULL,
+    /* .get_tensor_2d_async = */ NULL,
+    /* .set_tensor_2d_async = */ NULL,
+    /* .cpy_tensor_async = */ NULL,
+    /* .shfl_tensor_async = */ NULL,
+    /* .allreduce_tensor_async = */ NULL,
+    /* .synchronize = */ NULL,
+    /* .graph_plan_create = */ NULL,
+    /* .graph_plan_free = */ NULL,
+    /* .graph_plan_update = */ NULL,
+    /* .graph_plan_compute = */ NULL,
+    /* .graph_compute = */ ggml_backend_zdnn_graph_compute,
+    /* .event_record = */ NULL,
+    /* .event_wait = */ NULL,
+    /* .graph_optimize = */ NULL,
 };

 static ggml_guid_t ggml_backend_zdnn_guid(void) {
@@ -240,7 +240,11 @@ static struct ggml_backend_i ggml_backend_zendnn_i = {
     /* .free = */ ggml_backend_zendnn_free,
     /* .set_tensor_async = */ NULL,
     /* .get_tensor_async = */ NULL,
+    /* .get_tensor_2d_async = */ NULL,
+    /* .set_tensor_2d_async = */ NULL,
     /* .cpy_tensor_async = */ NULL,
+    /* .shfl_tensor_async = */ NULL,
+    /* .allreduce_tensor_async = */ NULL,
     /* .synchronize = */ NULL,
     /* .graph_plan_create = */ NULL,
     /* .graph_plan_free = */ NULL,
@@ -189,9 +189,10 @@ extern "C" {
     LLAMA_API const char * llama_flash_attn_type_name(enum llama_flash_attn_type flash_attn_type);

     enum llama_split_mode {
-        LLAMA_SPLIT_MODE_NONE  = 0, // single GPU
-        LLAMA_SPLIT_MODE_LAYER = 1, // split layers and KV across GPUs
-        LLAMA_SPLIT_MODE_ROW   = 2, // split layers and KV across GPUs, use tensor parallelism if supported
+        LLAMA_SPLIT_MODE_NONE   = 0, // single GPU
+        LLAMA_SPLIT_MODE_LAYER  = 1, // split layers and KV across GPUs
+        LLAMA_SPLIT_MODE_ROW    = 2, // split layers and KV across GPUs, use tensor parallelism if supported
+        LLAMA_SPLIT_MODE_TENSOR = 3,
     };

     // TODO: simplify (https://github.com/ggml-org/llama.cpp/pull/9294#pullrequestreview-2286561979)
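With the new enum value, tensor parallelism can be requested through the existing model parameters; a minimal sketch (model path and layer count are placeholders, not from the patch):

    llama_model_params mparams = llama_model_default_params();
    mparams.split_mode   = LLAMA_SPLIT_MODE_TENSOR; // split weights and KV across GPUs
    mparams.n_gpu_layers = 99;
    llama_model * model = llama_model_load_from_file("model.gguf", mparams);

This is the same setting that the -sm tensor option added to the common argument parser above writes into common_params.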
@@ -972,9 +972,11 @@ void llama_context::set_abort_callback(bool (*abort_callback)(void * data), void

     for (auto & backend : backends) {
         auto * reg = ggml_backend_dev_backend_reg(ggml_backend_get_device(backend.get()));
-        auto * set_abort_callback_fn = (ggml_backend_set_abort_callback_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_abort_callback");
-        if (set_abort_callback_fn) {
-            set_abort_callback_fn(backend.get(), this->abort_callback, this->abort_callback_data);
+        if (reg) {
+            auto * set_abort_callback_fn = (ggml_backend_set_abort_callback_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_abort_callback");
+            if (set_abort_callback_fn) {
+                set_abort_callback_fn(backend.get(), this->abort_callback, this->abort_callback_data);
+            }
         }
     }
 }
@@ -417,14 +417,16 @@ static buft_list_t make_gpu_buft_list(ggml_backend_dev_t dev, llama_split_mode s

     // add the device extra buffer type (if any)
     ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev);
-    auto ggml_backend_dev_get_extra_bufts_fn = (ggml_backend_dev_get_extra_bufts_t)
-        ggml_backend_reg_get_proc_address(reg, "ggml_backend_dev_get_extra_bufts");
+    if (reg) {
+        auto ggml_backend_dev_get_extra_bufts_fn = (ggml_backend_dev_get_extra_bufts_t)
+            ggml_backend_reg_get_proc_address(reg, "ggml_backend_dev_get_extra_bufts");

-    if (ggml_backend_dev_get_extra_bufts_fn) {
-        ggml_backend_buffer_type_t * extra_bufts = ggml_backend_dev_get_extra_bufts_fn(dev);
-        while (extra_bufts && *extra_bufts) {
-            buft_list.emplace_back(dev, *extra_bufts);
-            ++extra_bufts;
+        if (ggml_backend_dev_get_extra_bufts_fn) {
+            ggml_backend_buffer_type_t * extra_bufts = ggml_backend_dev_get_extra_bufts_fn(dev);
+            while (extra_bufts && *extra_bufts) {
+                buft_list.emplace_back(dev, *extra_bufts);
+                ++extra_bufts;
+            }
         }
     }
src/llama.cpp (143 changed lines)
@@ -21,7 +21,9 @@
 #include <cstdio>
 #include <cstring>
 #include <ctime>
+#include <regex>
+#include <stdexcept>
 #include <vector>

 #if defined(_MSC_VER)
 #pragma warning(disable: 4244 4267) // possible loss of data
@@ -160,6 +162,9 @@ static void llama_params_fit_impl(
         const char * path_model, struct llama_model_params * mparams, struct llama_context_params * cparams,
         float * tensor_split, struct llama_model_tensor_buft_override * tensor_buft_overrides,
         size_t * margins_s, uint32_t n_ctx_min, enum ggml_log_level log_level) {
+    if (mparams->split_mode == LLAMA_SPLIT_MODE_TENSOR) {
+        throw llama_params_fit_exception("llama_params_fit is not implemented for SPLIT_MODE_TENSOR, abort");
+    }
     constexpr int64_t MiB = 1024*1024;
     typedef std::vector<llama_device_memory_data> dmds_t;
     const llama_model_params default_mparams = llama_model_default_params();
@@ -879,6 +884,42 @@ static int llama_model_load(const std::string & fname, std::vector<std::string>
     return 0;
 }

+static enum ggml_backend_meta_split_state llama_meta_device_get_tensor_split(const struct ggml_tensor * tensor, void * userdata) {
+    // attention
+    const std::regex pattern_qkv_weight("blk\\.\\d*\\.attn_(q|k|v).*");
+    if (std::regex_match(tensor->name, pattern_qkv_weight)) {
+        return GGML_BACKEND_SPLIT_STATE_BY_NE1;
+    }
+    const std::regex pattern_kv_cache("cache_(k|v)_l\\d*");
+    if (std::regex_match(tensor->name, pattern_kv_cache)) {
+        return GGML_BACKEND_SPLIT_STATE_BY_NE0;
+    }
+    const std::regex pattern_attn_out("blk\\.\\d*\\.attn_output.*");
+    if (std::regex_match(tensor->name, pattern_attn_out)) {
+        return GGML_BACKEND_SPLIT_STATE_BY_NE0;
+    }
+
+    // FFN
+    const std::regex pattern_ffn_up_gate("blk\\.\\d*\\.ffn_(up|gate).*");
+    if (std::regex_match(tensor->name, pattern_ffn_up_gate)) {
+        return GGML_BACKEND_SPLIT_STATE_BY_NE1;
+    }
+    const std::regex pattern_ffn_down("blk\\.\\d*\\.ffn_down.*");
+    if (std::regex_match(tensor->name, pattern_ffn_down)) {
+        return GGML_BACKEND_SPLIT_STATE_BY_NE0;
+    }
+
+    // output
+    const std::regex pattern_output("output");
+    if (std::regex_match(tensor->name, pattern_output)) {
+        return GGML_BACKEND_SPLIT_STATE_BY_NE1;
+    }
+
+    // everything else
+    return GGML_BACKEND_SPLIT_STATE_MIRRORED;
+    GGML_UNUSED(userdata);
+}
+
 static struct llama_model * llama_model_load_from_file_impl(
         const std::string & path_model,
         std::vector<std::string> & splits,
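For example, under these patterns a tensor named blk.0.attn_q.weight or blk.0.ffn_up.weight is split along ne1, cache_k_l0 and blk.0.ffn_down.weight are split along ne0, and any tensor that matches none of the patterns (e.g. token_embd.weight) is mirrored on every device.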
@@ -911,8 +952,16 @@ static struct llama_model * llama_model_load_from_file_impl(

     // create list of devices to use with this model
     if (params.devices) {
-        for (ggml_backend_dev_t * dev = params.devices; *dev; ++dev) {
-            model->devices.push_back(*dev);
+        if (params.split_mode == LLAMA_SPLIT_MODE_TENSOR) {
+            size_t n_devs = 0;
+            while (params.devices[n_devs]) {
+                n_devs++;
+            }
+            model->devices.push_back(ggml_backend_meta_device(params.devices, n_devs, llama_meta_device_get_tensor_split, nullptr));
+        } else {
+            for (ggml_backend_dev_t * dev = params.devices; *dev; ++dev) {
+                model->devices.push_back(*dev);
+            }
         }
     } else {
         // default device selection
@@ -922,47 +971,61 @@ static struct llama_model * llama_model_load_from_file_impl(
         std::vector<ggml_backend_dev_t> igpus;
         std::vector<ggml_backend_dev_t> rpc_servers;

-        for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
-            ggml_backend_dev_t dev = ggml_backend_dev_get(i);
-            switch (ggml_backend_dev_type(dev)) {
-                case GGML_BACKEND_DEVICE_TYPE_CPU:
-                case GGML_BACKEND_DEVICE_TYPE_ACCEL:
-                    // skip CPU backends since they are handled separately
-                    break;
+        if (params.split_mode == LLAMA_SPLIT_MODE_TENSOR) {
+            std::vector<ggml_backend_dev_t> devs;
+            devs.reserve(ggml_backend_dev_count());
+            for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
+                devs.push_back(ggml_backend_dev_get(i));
+            }
+            GGML_ASSERT(devs.size() >= 2);
+            GGML_ASSERT(ggml_backend_dev_buffer_type(devs.back()) == ggml_backend_cpu_buffer_type());
+            gpus.push_back(ggml_backend_meta_device(devs.data(), devs.size() - 1, llama_meta_device_get_tensor_split, nullptr));
+        } else {
+            for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
+                ggml_backend_dev_t dev = ggml_backend_dev_get(i);
+                switch (ggml_backend_dev_type(dev)) {
+                    case GGML_BACKEND_DEVICE_TYPE_CPU:
+                    case GGML_BACKEND_DEVICE_TYPE_ACCEL:
+                        // skip CPU backends since they are handled separately
+                        break;

-                case GGML_BACKEND_DEVICE_TYPE_GPU: {
-                    ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev);
-                    if (ggml_backend_reg_name(reg) == std::string("RPC")) {
-                        rpc_servers.push_back(dev);
-                    } else {
-                        // check if there is already a GPU with the same device id
-                        ggml_backend_dev_props props;
-                        ggml_backend_dev_get_props(dev, &props);
-                        auto it = std::find_if(gpus.begin(), gpus.end(), [&props](ggml_backend_dev_t d) {
-                            ggml_backend_dev_props d_props;
-                            ggml_backend_dev_get_props(d, &d_props);
-                            if (props.device_id && d_props.device_id) {
-                                return strcmp(props.device_id, d_props.device_id) == 0;
-                            }
-                            return false;
-                        });
+                    case GGML_BACKEND_DEVICE_TYPE_GPU: {
+                        ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev);
+                        if (ggml_backend_reg_name(reg) == std::string("RPC")) {
+                            rpc_servers.push_back(dev);
+                        } else {
+                            // check if there is already a GPU with the same device id
+                            ggml_backend_dev_props props;
+                            ggml_backend_dev_get_props(dev, &props);
+                            auto it = std::find_if(gpus.begin(), gpus.end(), [&props](ggml_backend_dev_t d) {
+                                ggml_backend_dev_props d_props;
+                                ggml_backend_dev_get_props(d, &d_props);
+                                if (props.device_id && d_props.device_id) {
+                                    return strcmp(props.device_id, d_props.device_id) == 0;
+                                }
+                                return false;
+                            });

-                        if (it != gpus.end()) {
-                            LLAMA_LOG_INFO("%s: skipping device %s (%s) with id %s - already using device %s (%s) with the same id\n",
-                                           __func__,
-                                           ggml_backend_dev_name(dev), ggml_backend_dev_description(dev),
-                                           props.device_id ? props.device_id : "unknown id",
-                                           ggml_backend_dev_name(*it), ggml_backend_dev_description(*it));
-                        } else {
-                            gpus.push_back(dev);
-                        }
-                    }
-                    break;
-                }
+                            if (it != gpus.end()) {
+                                LLAMA_LOG_INFO("%s: skipping device %s (%s) with id %s - already using device %s (%s) with the same id\n",
+                                               __func__,
+                                               ggml_backend_dev_name(dev), ggml_backend_dev_description(dev),
+                                               props.device_id ? props.device_id : "unknown id",
+                                               ggml_backend_dev_name(*it), ggml_backend_dev_description(*it));
+                            } else {
+                                gpus.push_back(dev);
+                            }
+                        }
+                        break;
+                    }

-                case GGML_BACKEND_DEVICE_TYPE_IGPU:
-                    igpus.push_back(dev);
-                    break;
-            }
-        }
+                    case GGML_BACKEND_DEVICE_TYPE_IGPU:
+                        igpus.push_back(dev);
+                        break;
+                    case GGML_BACKEND_DEVICE_TYPE_META:
+                        GGML_ABORT("fatal error");
+                        break;
+                }
+            }
+        }
@@ -259,6 +259,8 @@ static const char * split_mode_str(llama_split_mode mode) {
             return "layer";
         case LLAMA_SPLIT_MODE_ROW:
             return "row";
+        case LLAMA_SPLIT_MODE_TENSOR:
+            return "tensor";
         default:
             GGML_ABORT("invalid split mode");
     }
@@ -440,7 +442,7 @@ static void print_usage(int /* argc */, char ** argv) {
            join(cmd_params_defaults.n_gpu_layers, ",").c_str());
     printf(" -ncmoe, --n-cpu-moe <n> (default: %s)\n",
            join(cmd_params_defaults.n_cpu_moe, ",").c_str());
-    printf(" -sm, --split-mode <none|layer|row> (default: %s)\n",
+    printf(" -sm, --split-mode <none|layer|row|tensor> (default: %s)\n",
            join(transform_to_str(cmd_params_defaults.split_mode, split_mode_str), ",").c_str());
     printf(" -mg, --main-gpu <i> (default: %s)\n",
            join(cmd_params_defaults.main_gpu, ",").c_str());
@@ -723,6 +725,8 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
                     mode = LLAMA_SPLIT_MODE_LAYER;
                 } else if (m == "row") {
                     mode = LLAMA_SPLIT_MODE_ROW;
+                } else if (m == "tensor") {
+                    mode = LLAMA_SPLIT_MODE_TENSOR;
                 } else {
                     invalid_param = true;
                     break;
@@ -1685,7 +1689,7 @@ struct markdown_printer : public printer {
             return 6;
         }
         if (field == "split_mode") {
-            return 5;
+            return 6;
         }
         if (field == "flash_attn") {
             return 2;