Merge 7a838e78fc into 18ddaea2ae
commit 5771f23030
@@ -2497,27 +2497,6 @@ static bool ggml_backend_buft_is_cann(ggml_backend_buffer_type_t buft) {
     return buft->iface.get_name == ggml_backend_cann_buffer_type_name;
 }
 
-/**
- * @brief Determines if a tensor operation should be offloaded to the CANN
- * backend.
- *
- * This function checks if a given tensor operation should be offloaded to the
- * CANN backend based on the operation type and the size of the tensor. It
- * returns true if the second dimension (ne[1]) of the tensor is greater than or
- * equal to the minimum batch size and the operation is not GGML_OP_GET_ROWS.
- *
- * @param backend Pointer to the CANN backend.
- * @param op Pointer to the tensor operation to check.
- * @return bool Returns true if the operation should be offloaded, otherwise
- * false.
- */
-static bool ggml_backend_cann_offload_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
-    const int min_batch_size = 32;
-    GGML_UNUSED(dev);
-
-    return op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS;
-}
-
 /**
  * @brief Records an event on the CANN backend stream.
  *
@@ -2593,6 +2572,7 @@ struct ggml_backend_cann_device_context {
     int device;
     std::string name;
     std::string description;
+    int op_offload_min_batch_size;
 };
 
 static const char * ggml_backend_cann_device_get_name(ggml_backend_dev_t dev) {
@@ -2669,6 +2649,26 @@ static ggml_backend_buffer_type_t ggml_backend_cann_device_get_host_buffer_type(
     return ggml_backend_cann_host_buffer_type();
 }
 
+/**
+ * @brief Determines if a tensor operation should be offloaded to the CANN
+ * backend.
+ *
+ * This function checks if a given tensor operation should be offloaded to the
+ * CANN backend based on the operation type and the size of the tensor. It
+ * returns true if the second dimension (ne[1]) of the tensor is greater than or
+ * equal to the minimum batch size and the operation is not GGML_OP_GET_ROWS.
+ *
+ * @param backend Pointer to the CANN backend.
+ * @param op Pointer to the tensor operation to check.
+ * @return bool Returns true if the operation should be offloaded, otherwise
+ * false.
+ */
+static bool ggml_backend_cann_offload_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
+    ggml_backend_cann_device_context * dev_ctx = (ggml_backend_cann_device_context *)dev->context;
+
+    return op->ne[1] >= dev_ctx->op_offload_min_batch_size && op->op != GGML_OP_GET_ROWS;
+}
+
 /**
  * @brief Creates a new event for the CANN backend device.
  *
@@ -2785,12 +2785,14 @@ ggml_backend_reg_t ggml_backend_cann_reg() {
     if (!initialized) {
         aclInit(nullptr);
         ggml_backend_cann_reg_context * ctx = new ggml_backend_cann_reg_context;
+        const int min_batch_size = getenv("GGML_OP_OFFLOAD_MIN_BATCH") ? atoi(getenv("GGML_OP_OFFLOAD_MIN_BATCH")) : 32;
 
         for (int i = 0; i < ggml_cann_info().device_count; i++) {
             ggml_backend_cann_device_context * dev_ctx = new ggml_backend_cann_device_context();
             dev_ctx->description = aclrtGetSocName();
             dev_ctx->device = i;
             dev_ctx->name = GGML_CANN_NAME + std::to_string(i);
+            dev_ctx->op_offload_min_batch_size = min_batch_size;
             ggml_cann_set_device(i);
             ggml_backend_dev_t dev = new ggml_backend_device{ /* .iface = */ ggml_backend_cann_device_interface,
                                                               /* .reg = */ &reg,
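Each backend registration in this change repeats the same pattern: read GGML_OP_OFFLOAD_MIN_BATCH once when the backend registry is created, fall back to the previous hardcoded default of 32 when the variable is unset, and copy the result into every device context. Below is a minimal standalone sketch of that pattern; the helper name get_offload_min_batch_size is illustrative only and does not appear in the patch, which inlines the expression directly.

// Illustrative sketch only (not part of this patch): the env-var pattern the
// registration functions above inline at each call site.
#include <cstdlib>

static int get_offload_min_batch_size(void) {
    // GGML_OP_OFFLOAD_MIN_BATCH overrides the historical default of 32.
    const char * env = std::getenv("GGML_OP_OFFLOAD_MIN_BATCH");
    return env ? std::atoi(env) : 32;
}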
@@ -4107,6 +4107,7 @@ struct ggml_backend_cuda_device_context {
     std::string name;
     std::string description;
     std::string pci_bus_id;
+    int op_offload_min_batch_size;
 };
 
 static const char * ggml_backend_cuda_device_get_name(ggml_backend_dev_t dev) {
@@ -4660,11 +4661,9 @@ static int64_t get_op_batch_size(const ggml_tensor * op) {
 }
 
 static bool ggml_backend_cuda_device_offload_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
-    const int min_batch_size = 32;
+    ggml_backend_cuda_device_context * dev_ctx = (ggml_backend_cuda_device_context *)dev->context;
 
-    return get_op_batch_size(op) >= min_batch_size;
-
-    GGML_UNUSED(dev);
+    return get_op_batch_size(op) >= dev_ctx->op_offload_min_batch_size;
 }
 
 static ggml_backend_event_t ggml_backend_cuda_device_event_new(ggml_backend_dev_t dev) {
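With this change the offload decision reads the threshold from the per-device context instead of a local constant, so whatever value is in the environment at registration time applies to every later scheduling decision. A hedged usage sketch follows, assuming a POSIX environment (setenv) and that ggml-cuda.h declares ggml_backend_cuda_reg(); the key point is that the variable must be set before the first registration call, because it is read only once under the `initialized` guard.

// Usage sketch only (assumptions: POSIX setenv(); ggml_backend_cuda_reg() from ggml-cuda.h).
#include <cstdlib>
#include "ggml-cuda.h"

int main() {
    // Must happen before the first *_reg() call: the variable is read once and
    // copied into every device context created by the registry.
    setenv("GGML_OP_OFFLOAD_MIN_BATCH", "64", /*overwrite=*/1);

    ggml_backend_reg_t reg = ggml_backend_cuda_reg();
    (void) reg;
    return 0;
}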
@@ -4832,6 +4831,7 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
     std::lock_guard<std::mutex> lock(mutex);
     if (!initialized) {
         ggml_backend_cuda_reg_context * ctx = new ggml_backend_cuda_reg_context;
+        const int min_batch_size = getenv("GGML_OP_OFFLOAD_MIN_BATCH") ? atoi(getenv("GGML_OP_OFFLOAD_MIN_BATCH")) : 32;
 
         for (int i = 0; i < ggml_cuda_info().device_count; i++) {
             ggml_backend_cuda_device_context * dev_ctx = new ggml_backend_cuda_device_context;
@@ -4845,6 +4845,7 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
             char pci_bus_id[16] = {};
             snprintf(pci_bus_id, sizeof(pci_bus_id), "%04x:%02x:%02x.0", prop.pciDomainID, prop.pciBusID, prop.pciDeviceID);
             dev_ctx->pci_bus_id = pci_bus_id;
+            dev_ctx->op_offload_min_batch_size = min_batch_size;
 
             ggml_backend_dev_t dev = new ggml_backend_device {
                 /* .iface = */ ggml_backend_cuda_device_interface,
@@ -219,6 +219,8 @@ struct ggml_metal_device_props {
     bool use_shared_buffers;
 
     bool supports_gpu_family_apple7;
+
+    int op_offload_min_batch_size;
 };
 
 ggml_metal_device_t ggml_metal_device_init(void);
@@ -782,6 +782,8 @@ ggml_metal_device_t ggml_metal_device_init(void) {
 
     dev->props.supports_gpu_family_apple7 = [dev->mtl_device supportsFamily:MTLGPUFamilyApple7];
 
+    dev->props.op_offload_min_batch_size = getenv("GGML_OP_OFFLOAD_MIN_BATCH") ? atoi(getenv("GGML_OP_OFFLOAD_MIN_BATCH")) : 32;
+
     dev->props.max_buffer_size = dev->mtl_device.maxBufferLength;
     dev->props.max_working_set_size = dev->mtl_device.recommendedMaxWorkingSetSize;
     dev->props.max_theadgroup_memory_size = dev->mtl_device.maxThreadgroupMemoryLength;
@@ -625,14 +625,11 @@ static int64_t get_op_batch_size(const ggml_tensor * op) {
 }
 
 static bool ggml_backend_metal_device_offload_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
-    const int min_batch_size = 32;
+    ggml_metal_device_t ctx_dev = (ggml_metal_device_t)dev->context;
 
     return (op->op == GGML_OP_MUL_MAT ||
             op->op == GGML_OP_MUL_MAT_ID) &&
-            get_op_batch_size(op) >= min_batch_size;
-
-    GGML_UNUSED(dev);
-    GGML_UNUSED(op);
+            get_op_batch_size(op) >= ggml_metal_device_get_props(ctx_dev)->op_offload_min_batch_size;
 }
 
 static ggml_backend_device_i ggml_backend_metal_device_i = {
@@ -4286,6 +4286,7 @@ struct ggml_backend_sycl_device_context {
    int device;
    std::string name;
    std::string description;
+   int op_offload_min_batch_size;
 };
 
 static const char * ggml_backend_sycl_device_get_name(ggml_backend_dev_t dev) {
@@ -4674,9 +4675,8 @@ static int64_t get_op_batch_size(const ggml_tensor * op) {
 }
 
 static bool ggml_backend_sycl_device_offload_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
-    const int min_batch_size = 32;
-    return get_op_batch_size(op) >= min_batch_size;
-    GGML_UNUSED(dev);
+    ggml_backend_sycl_device_context * sycl_ctx = (ggml_backend_sycl_device_context *)dev->context;
+    return get_op_batch_size(op) >= sycl_ctx->op_offload_min_batch_size;
 }
 
 static ggml_backend_event_t
@@ -4799,6 +4799,7 @@ ggml_backend_reg_t ggml_backend_sycl_reg() {
     std::lock_guard<std::mutex> lock(mutex);
     if (!initialized) {
         ggml_backend_sycl_reg_context * ctx = new ggml_backend_sycl_reg_context;
+        const int min_batch_size = getenv("GGML_OP_OFFLOAD_MIN_BATCH") ? atoi(getenv("GGML_OP_OFFLOAD_MIN_BATCH")) : 32;
 
         for (int i = 0; i < ggml_sycl_info().device_count; i++) {
             ggml_backend_sycl_device_context * dev_ctx = new ggml_backend_sycl_device_context;
@@ -4812,6 +4813,7 @@ ggml_backend_reg_t ggml_backend_sycl_reg() {
                 prop, dpct::dev_mgr::instance().get_device(i))));
 
             dev_ctx->description = prop.get_name();
+            dev_ctx->op_offload_min_batch_size = min_batch_size;
 
             ggml_backend_dev_t dev = new ggml_backend_device {
                 /* .iface = */ ggml_backend_sycl_device_interface,
@@ -14143,6 +14143,7 @@ struct ggml_backend_vk_device_context {
    std::string description;
    bool is_integrated_gpu;
    std::string pci_bus_id;
+   int op_offload_min_batch_size;
 };
 
 static const char * ggml_backend_vk_device_get_name(ggml_backend_dev_t dev) {
@@ -14717,12 +14718,10 @@ static bool ggml_backend_vk_device_supports_buft(ggml_backend_dev_t dev, ggml_ba
 }
 
 static bool ggml_backend_vk_device_offload_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
-    const int min_batch_size = 32;
+    ggml_backend_vk_device_context * dev_ctx = (ggml_backend_vk_device_context *)dev->context;
 
-    return (op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS) ||
-           (op->ne[2] >= min_batch_size && op->op == GGML_OP_MUL_MAT_ID);
-
-    UNUSED(dev);
+    return (op->ne[1] >= dev_ctx->op_offload_min_batch_size && op->op != GGML_OP_GET_ROWS) ||
+           (op->ne[2] >= dev_ctx->op_offload_min_batch_size && op->op == GGML_OP_MUL_MAT_ID);
 }
 
 static ggml_backend_event_t ggml_backend_vk_device_event_new(ggml_backend_dev_t dev) {
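For reference, the Vulkan rule checks ne[1] for most operations but ne[2] for GGML_OP_MUL_MAT_ID, because that op carries its batch count in ne[2] (the same distinction the other backends' get_op_batch_size() helper encodes). A small sketch of the predicate with the threshold passed explicitly is shown below; the function name vk_should_offload is illustrative only, the patch keeps the check inline and reads dev_ctx->op_offload_min_batch_size.

// Illustrative sketch only: the Vulkan offload rule factored into a helper.
#include "ggml.h"

static bool vk_should_offload(const ggml_tensor * op, int min_batch) {
    // Most ops use ne[1] as the batch dimension; MUL_MAT_ID's batch count is in ne[2].
    return (op->ne[1] >= min_batch && op->op != GGML_OP_GET_ROWS) ||
           (op->ne[2] >= min_batch && op->op == GGML_OP_MUL_MAT_ID);
}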
@@ -14803,6 +14802,7 @@ static ggml_backend_dev_t ggml_backend_vk_reg_get_device(ggml_backend_reg_t reg,
     static std::mutex mutex;
     std::lock_guard<std::mutex> lock(mutex);
     if (!initialized) {
+        const int min_batch_size = getenv("GGML_OP_OFFLOAD_MIN_BATCH") ? atoi(getenv("GGML_OP_OFFLOAD_MIN_BATCH")) : 32;
         for (int i = 0; i < ggml_backend_vk_get_device_count(); i++) {
             ggml_backend_vk_device_context * ctx = new ggml_backend_vk_device_context;
             char desc[256];
@@ -14812,6 +14812,7 @@ static ggml_backend_dev_t ggml_backend_vk_reg_get_device(ggml_backend_reg_t reg,
             ctx->description = desc;
             ctx->is_integrated_gpu = ggml_backend_vk_get_device_type(i) == vk::PhysicalDeviceType::eIntegratedGpu;
             ctx->pci_bus_id = ggml_backend_vk_get_device_pci_id(i);
+            ctx->op_offload_min_batch_size = min_batch_size;
             devices.push_back(new ggml_backend_device {
                 /* .iface = */ ggml_backend_vk_device_i,
                 /* .reg = */ reg,