ggml: add env var GGML_OP_OFFLOAD_MIN_BATCH (#18535)
* ggml: add env var GGML_OP_OFFLOAD_MIN_BATCH
* makes the min_batch_size for triggering op offload configurable via env var, defaulting to the prior hardcoded value of 32
* ggml: read GGML_OP_OFFLOAD_MIN_BATCH once and store to dev ctx
* cann: forward declaration of device context struct
* cann: move offload op check after device context declaration
* cuda: fix whitespace

Co-authored-by: Aman Gupta <amangupta052@gmail.com>
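For context, every backend below follows the same pattern: parse the variable once at registration time with getenv()/atoi(), fall back to the previous hardcoded 32, and cache the result in the per-device context that the offload_op hook consults. A minimal self-contained sketch of that read-once step (the helper name is hypothetical; the patch inlines the expression directly):

    #include <cstdlib>

    // Hypothetical helper mirroring the expression each backend inlines:
    // parse GGML_OP_OFFLOAD_MIN_BATCH once, defaulting to the prior
    // hardcoded value of 32 when the variable is unset.
    static int op_offload_min_batch_from_env(void) {
        const char * s = std::getenv("GGML_OP_OFFLOAD_MIN_BATCH");
        return s ? std::atoi(s) : 32;
    }

For example, running with GGML_OP_OFFLOAD_MIN_BATCH=64 in the environment raises the threshold so that only ops whose batch dimension reaches 64 are considered for offload.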
parent 9c142e3a2a
commit 9a5724dee2
@@ -2541,27 +2541,6 @@ static bool ggml_backend_buft_is_cann(ggml_backend_buffer_type_t buft) {
     return buft->iface.get_name == ggml_backend_cann_buffer_type_name;
 }
 
-/**
- * @brief Determines if a tensor operation should be offloaded to the CANN
- * backend.
- *
- * This function checks if a given tensor operation should be offloaded to the
- * CANN backend based on the operation type and the size of the tensor. It
- * returns true if the second dimension (ne[1]) of the tensor is greater than or
- * equal to the minimum batch size and the operation is not GGML_OP_GET_ROWS.
- *
- * @param backend Pointer to the CANN backend.
- * @param op Pointer to the tensor operation to check.
- * @return bool Returns true if the operation should be offloaded, otherwise
- * false.
- */
-static bool ggml_backend_cann_offload_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
-    const int min_batch_size = 32;
-    GGML_UNUSED(dev);
-
-    return op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS;
-}
-
 /**
  * @brief Records an event on the CANN backend stream.
  *
@@ -2637,6 +2616,7 @@ struct ggml_backend_cann_device_context {
     int device;
     std::string name;
     std::string description;
+    int op_offload_min_batch_size;
 };
 
 static const char * ggml_backend_cann_device_get_name(ggml_backend_dev_t dev) {
@@ -2713,6 +2693,26 @@ static ggml_backend_buffer_type_t ggml_backend_cann_device_get_host_buffer_type(
     return ggml_backend_cann_host_buffer_type();
 }
 
+/**
+ * @brief Determines if a tensor operation should be offloaded to the CANN
+ * backend.
+ *
+ * This function checks if a given tensor operation should be offloaded to the
+ * CANN backend based on the operation type and the size of the tensor. It
+ * returns true if the second dimension (ne[1]) of the tensor is greater than or
+ * equal to the minimum batch size and the operation is not GGML_OP_GET_ROWS.
+ *
+ * @param backend Pointer to the CANN backend.
+ * @param op Pointer to the tensor operation to check.
+ * @return bool Returns true if the operation should be offloaded, otherwise
+ * false.
+ */
+static bool ggml_backend_cann_offload_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
+    ggml_backend_cann_device_context * dev_ctx = (ggml_backend_cann_device_context *)dev->context;
+
+    return op->ne[1] >= dev_ctx->op_offload_min_batch_size && op->op != GGML_OP_GET_ROWS;
+}
+
 /**
  * @brief Creates a new event for the CANN backend device.
  *
@@ -2829,12 +2829,14 @@ ggml_backend_reg_t ggml_backend_cann_reg() {
     if (!initialized) {
         aclInit(nullptr);
         ggml_backend_cann_reg_context * ctx = new ggml_backend_cann_reg_context;
+        const int min_batch_size = getenv("GGML_OP_OFFLOAD_MIN_BATCH") ? atoi(getenv("GGML_OP_OFFLOAD_MIN_BATCH")) : 32;
 
         for (int i = 0; i < ggml_cann_info().device_count; i++) {
             ggml_backend_cann_device_context * dev_ctx = new ggml_backend_cann_device_context();
             dev_ctx->description = aclrtGetSocName();
             dev_ctx->device = i;
             dev_ctx->name = GGML_CANN_NAME + std::to_string(i);
+            dev_ctx->op_offload_min_batch_size = min_batch_size;
             ggml_cann_set_device(i);
             ggml_backend_dev_t dev = new ggml_backend_device{ /* .iface = */ ggml_backend_cann_device_interface,
                                                               /* .reg = */ &reg,
@@ -4122,6 +4122,7 @@ struct ggml_backend_cuda_device_context {
     std::string name;
     std::string description;
     std::string pci_bus_id;
+    int op_offload_min_batch_size;
 };
 
 static const char * ggml_backend_cuda_device_get_name(ggml_backend_dev_t dev) {
@@ -4676,11 +4677,9 @@ static int64_t get_op_batch_size(const ggml_tensor * op) {
 }
 
 static bool ggml_backend_cuda_device_offload_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
-    const int min_batch_size = 32;
+    ggml_backend_cuda_device_context * dev_ctx = (ggml_backend_cuda_device_context *) dev->context;
 
-    return get_op_batch_size(op) >= min_batch_size;
-
-    GGML_UNUSED(dev);
+    return get_op_batch_size(op) >= dev_ctx->op_offload_min_batch_size;
 }
 
 static ggml_backend_event_t ggml_backend_cuda_device_event_new(ggml_backend_dev_t dev) {
@@ -4848,6 +4847,7 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
         std::lock_guard<std::mutex> lock(mutex);
         if (!initialized) {
            ggml_backend_cuda_reg_context * ctx = new ggml_backend_cuda_reg_context;
+           const int min_batch_size = getenv("GGML_OP_OFFLOAD_MIN_BATCH") ? atoi(getenv("GGML_OP_OFFLOAD_MIN_BATCH")) : 32;
 
            for (int i = 0; i < ggml_cuda_info().device_count; i++) {
                ggml_backend_cuda_device_context * dev_ctx = new ggml_backend_cuda_device_context;
@@ -4861,6 +4861,7 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
                char pci_bus_id[16] = {};
                snprintf(pci_bus_id, sizeof(pci_bus_id), "%04x:%02x:%02x.0", prop.pciDomainID, prop.pciBusID, prop.pciDeviceID);
                dev_ctx->pci_bus_id = pci_bus_id;
+               dev_ctx->op_offload_min_batch_size = min_batch_size;
 
                ggml_backend_dev_t dev = new ggml_backend_device {
                    /* .iface = */ ggml_backend_cuda_device_interface,
@@ -219,6 +219,8 @@ struct ggml_metal_device_props {
     bool use_shared_buffers;
 
     bool supports_gpu_family_apple7;
+
+    int op_offload_min_batch_size;
 };
 
 ggml_metal_device_t ggml_metal_device_init(void);
@@ -782,6 +782,8 @@ ggml_metal_device_t ggml_metal_device_init(void) {
 
     dev->props.supports_gpu_family_apple7 = [dev->mtl_device supportsFamily:MTLGPUFamilyApple7];
 
+    dev->props.op_offload_min_batch_size = getenv("GGML_OP_OFFLOAD_MIN_BATCH") ? atoi(getenv("GGML_OP_OFFLOAD_MIN_BATCH")) : 32;
+
     dev->props.max_buffer_size = dev->mtl_device.maxBufferLength;
     dev->props.max_working_set_size = dev->mtl_device.recommendedMaxWorkingSetSize;
     dev->props.max_theadgroup_memory_size = dev->mtl_device.maxThreadgroupMemoryLength;
@@ -625,14 +625,11 @@ static int64_t get_op_batch_size(const ggml_tensor * op) {
 }
 
 static bool ggml_backend_metal_device_offload_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
-    const int min_batch_size = 32;
+    ggml_metal_device_t ctx_dev = (ggml_metal_device_t)dev->context;
 
     return (op->op == GGML_OP_MUL_MAT ||
             op->op == GGML_OP_MUL_MAT_ID) &&
-            get_op_batch_size(op) >= min_batch_size;
-
-    GGML_UNUSED(dev);
-    GGML_UNUSED(op);
+            get_op_batch_size(op) >= ggml_metal_device_get_props(ctx_dev)->op_offload_min_batch_size;
 }
 
 static ggml_backend_device_i ggml_backend_metal_device_i = {
@@ -4286,6 +4286,7 @@ struct ggml_backend_sycl_device_context {
     int device;
     std::string name;
     std::string description;
+    int op_offload_min_batch_size;
 };
 
 static const char * ggml_backend_sycl_device_get_name(ggml_backend_dev_t dev) {
@@ -4674,9 +4675,8 @@ static int64_t get_op_batch_size(const ggml_tensor * op) {
 }
 
 static bool ggml_backend_sycl_device_offload_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
-    const int min_batch_size = 32;
-    return get_op_batch_size(op) >= min_batch_size;
-    GGML_UNUSED(dev);
+    ggml_backend_sycl_device_context * sycl_ctx = (ggml_backend_sycl_device_context *)dev->context;
+    return get_op_batch_size(op) >= sycl_ctx->op_offload_min_batch_size;
 }
 
 static ggml_backend_event_t
@@ -4799,6 +4799,7 @@ ggml_backend_reg_t ggml_backend_sycl_reg() {
         std::lock_guard<std::mutex> lock(mutex);
         if (!initialized) {
             ggml_backend_sycl_reg_context * ctx = new ggml_backend_sycl_reg_context;
+            const int min_batch_size = getenv("GGML_OP_OFFLOAD_MIN_BATCH") ? atoi(getenv("GGML_OP_OFFLOAD_MIN_BATCH")) : 32;
 
             for (int i = 0; i < ggml_sycl_info().device_count; i++) {
                 ggml_backend_sycl_device_context * dev_ctx = new ggml_backend_sycl_device_context;
@@ -4812,6 +4813,7 @@ ggml_backend_reg_t ggml_backend_sycl_reg() {
                     prop, dpct::dev_mgr::instance().get_device(i))));
 
                 dev_ctx->description = prop.get_name();
+                dev_ctx->op_offload_min_batch_size = min_batch_size;
 
                 ggml_backend_dev_t dev = new ggml_backend_device {
                     /* .iface = */ ggml_backend_sycl_device_interface,
@@ -14249,6 +14249,7 @@ struct ggml_backend_vk_device_context {
     std::string description;
     bool is_integrated_gpu;
     std::string pci_bus_id;
+    int op_offload_min_batch_size;
 };
 
 static const char * ggml_backend_vk_device_get_name(ggml_backend_dev_t dev) {
@@ -14820,12 +14821,10 @@ static bool ggml_backend_vk_device_supports_buft(ggml_backend_dev_t dev, ggml_ba
 }
 
 static bool ggml_backend_vk_device_offload_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
-    const int min_batch_size = 32;
+    ggml_backend_vk_device_context * dev_ctx = (ggml_backend_vk_device_context *)dev->context;
 
-    return (op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS) ||
-           (op->ne[2] >= min_batch_size && op->op == GGML_OP_MUL_MAT_ID);
-
-    UNUSED(dev);
+    return (op->ne[1] >= dev_ctx->op_offload_min_batch_size && op->op != GGML_OP_GET_ROWS) ||
+           (op->ne[2] >= dev_ctx->op_offload_min_batch_size && op->op == GGML_OP_MUL_MAT_ID);
 }
 
 static ggml_backend_event_t ggml_backend_vk_device_event_new(ggml_backend_dev_t dev) {
@@ -14951,6 +14950,7 @@ static ggml_backend_dev_t ggml_backend_vk_reg_get_device(ggml_backend_reg_t reg,
     static std::mutex mutex;
     std::lock_guard<std::mutex> lock(mutex);
     if (!initialized) {
+        const int min_batch_size = getenv("GGML_OP_OFFLOAD_MIN_BATCH") ? atoi(getenv("GGML_OP_OFFLOAD_MIN_BATCH")) : 32;
        for (int i = 0; i < ggml_backend_vk_get_device_count(); i++) {
            ggml_backend_vk_device_context * ctx = new ggml_backend_vk_device_context;
            char desc[256];
@@ -14960,6 +14960,7 @@ static ggml_backend_dev_t ggml_backend_vk_reg_get_device(ggml_backend_reg_t reg,
            ctx->description = desc;
            ctx->is_integrated_gpu = ggml_backend_vk_get_device_type(i) == vk::PhysicalDeviceType::eIntegratedGpu;
            ctx->pci_bus_id = ggml_backend_vk_get_device_pci_id(i);
+           ctx->op_offload_min_batch_size = min_batch_size;
            devices.push_back(new ggml_backend_device {
                /* .iface = */ ggml_backend_vk_device_i,
                /* .reg = */ reg,
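Because each backend reads the variable only once, at registration (or at device init for Metal), a host program that wants to set the threshold programmatically must export it before the first backend is registered. A hedged sketch, assuming a POSIX environment for setenv():

    #include <stdlib.h> // setenv() is POSIX

    int main() {
        // Must run before ggml backend registration, which caches the value;
        // changing the environment afterwards has no effect.
        setenv("GGML_OP_OFFLOAD_MIN_BATCH", "64", 1 /* overwrite */);
        // ... initialize backends / load the model as usual ...
        return 0;
    }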