ggml: add env var GGML_OP_OFFLOAD_MIN_BATCH (#18535)

* ggml: add env var GGML_OP_OFFLOAD_MIN_BATCH
* makes the minimum batch size for triggering op offload configurable via an env var, defaulting to the previously hardcoded value of 32 (see the sketch below)

* ggml: read GGML_OP_OFFLOAD_MIN_BATCH once and store to dev ctx

* cann: forward declaration of device context struct

* cann: move offload op check after device context declaration

* cuda: fix whitespace

Co-authored-by: Aman Gupta <amangupta052@gmail.com>

---------

Co-authored-by: Aman Gupta <amangupta052@gmail.com>
Doctor Shotgun 2026-01-08 01:03:21 -08:00 committed by GitHub
parent 9c142e3a2a
commit 9a5724dee2
7 changed files with 45 additions and 38 deletions
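
The pattern applied to every backend below is the same: parse the environment variable once at backend registration, cache the result in the per-device context, and have the per-op offload check compare against the cached value instead of calling getenv() on the hot path. What follows is a minimal self-contained sketch of that pattern; device_context, read_min_batch_from_env, and should_offload are illustrative names, not the actual ggml symbols. Only GGML_OP_OFFLOAD_MIN_BATCH and the default of 32 come from this commit.

    // Sketch only: illustrative names, not the actual ggml symbols.
    #include <cstdlib>

    struct device_context {
        int op_offload_min_batch_size; // threshold cached at registration
    };

    // Parse GGML_OP_OFFLOAD_MIN_BATCH once, at backend registration time.
    static int read_min_batch_from_env() {
        const char * s = std::getenv("GGML_OP_OFFLOAD_MIN_BATCH");
        return s ? std::atoi(s) : 32; // 32 was the previously hardcoded value
    }

    // Hot-path check: no getenv() here, just a comparison against the cache.
    static bool should_offload(const device_context & ctx, long long batch_size) {
        return batch_size >= ctx.op_offload_min_batch_size;
    }

    int main() {
        device_context ctx = { read_min_batch_from_env() };
        return should_offload(ctx, 48) ? 0 : 1;
    }

Setting GGML_OP_OFFLOAD_MIN_BATCH=64 before launching raises the threshold so fewer ops are offloaded; GGML_OP_OFFLOAD_MIN_BATCH=1 offloads nearly every eligible op.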


@@ -2541,27 +2541,6 @@ static bool ggml_backend_buft_is_cann(ggml_backend_buffer_type_t buft) {
     return buft->iface.get_name == ggml_backend_cann_buffer_type_name;
 }
 
-/**
- * @brief Determines if a tensor operation should be offloaded to the CANN
- * backend.
- *
- * This function checks if a given tensor operation should be offloaded to the
- * CANN backend based on the operation type and the size of the tensor. It
- * returns true if the second dimension (ne[1]) of the tensor is greater than or
- * equal to the minimum batch size and the operation is not GGML_OP_GET_ROWS.
- *
- * @param backend Pointer to the CANN backend.
- * @param op Pointer to the tensor operation to check.
- * @return bool Returns true if the operation should be offloaded, otherwise
- * false.
- */
-static bool ggml_backend_cann_offload_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
-    const int min_batch_size = 32;
-
-    GGML_UNUSED(dev);
-    return op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS;
-}
-
 /**
  * @brief Records an event on the CANN backend stream.
  *
@@ -2637,6 +2616,7 @@ struct ggml_backend_cann_device_context {
     int device;
     std::string name;
     std::string description;
+    int op_offload_min_batch_size;
 };
 
 static const char * ggml_backend_cann_device_get_name(ggml_backend_dev_t dev) {
@@ -2713,6 +2693,26 @@ static ggml_backend_buffer_type_t ggml_backend_cann_device_get_host_buffer_type(
     return ggml_backend_cann_host_buffer_type();
 }
 
+/**
+ * @brief Determines if a tensor operation should be offloaded to the CANN
+ * backend.
+ *
+ * This function checks if a given tensor operation should be offloaded to the
+ * CANN backend based on the operation type and the size of the tensor. It
+ * returns true if the second dimension (ne[1]) of the tensor is greater than or
+ * equal to the minimum batch size and the operation is not GGML_OP_GET_ROWS.
+ *
+ * @param backend Pointer to the CANN backend.
+ * @param op Pointer to the tensor operation to check.
+ * @return bool Returns true if the operation should be offloaded, otherwise
+ * false.
+ */
+static bool ggml_backend_cann_offload_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
+    ggml_backend_cann_device_context * dev_ctx = (ggml_backend_cann_device_context *)dev->context;
+
+    return op->ne[1] >= dev_ctx->op_offload_min_batch_size && op->op != GGML_OP_GET_ROWS;
+}
+
 /**
  * @brief Creates a new event for the CANN backend device.
  *
@@ -2829,12 +2829,14 @@ ggml_backend_reg_t ggml_backend_cann_reg() {
     if (!initialized) {
         aclInit(nullptr);
         ggml_backend_cann_reg_context * ctx = new ggml_backend_cann_reg_context;
+        const int min_batch_size = getenv("GGML_OP_OFFLOAD_MIN_BATCH") ? atoi(getenv("GGML_OP_OFFLOAD_MIN_BATCH")) : 32;
 
         for (int i = 0; i < ggml_cann_info().device_count; i++) {
             ggml_backend_cann_device_context * dev_ctx = new ggml_backend_cann_device_context();
             dev_ctx->description = aclrtGetSocName();
             dev_ctx->device = i;
             dev_ctx->name = GGML_CANN_NAME + std::to_string(i);
+            dev_ctx->op_offload_min_batch_size = min_batch_size;
             ggml_cann_set_device(i);
             ggml_backend_dev_t dev = new ggml_backend_device{ /* .iface = */ ggml_backend_cann_device_interface,
                                                               /* .reg = */ &reg,


@@ -4122,6 +4122,7 @@ struct ggml_backend_cuda_device_context {
     std::string name;
     std::string description;
    std::string pci_bus_id;
+    int op_offload_min_batch_size;
 };
 
 static const char * ggml_backend_cuda_device_get_name(ggml_backend_dev_t dev) {
@@ -4676,11 +4677,9 @@ static int64_t get_op_batch_size(const ggml_tensor * op) {
 }
 
 static bool ggml_backend_cuda_device_offload_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
-    const int min_batch_size = 32;
+    ggml_backend_cuda_device_context * dev_ctx = (ggml_backend_cuda_device_context *) dev->context;
 
-    return get_op_batch_size(op) >= min_batch_size;
-
-    GGML_UNUSED(dev);
+    return get_op_batch_size(op) >= dev_ctx->op_offload_min_batch_size;
 }
 
 static ggml_backend_event_t ggml_backend_cuda_device_event_new(ggml_backend_dev_t dev) {
@@ -4848,6 +4847,7 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
         std::lock_guard<std::mutex> lock(mutex);
         if (!initialized) {
             ggml_backend_cuda_reg_context * ctx = new ggml_backend_cuda_reg_context;
+            const int min_batch_size = getenv("GGML_OP_OFFLOAD_MIN_BATCH") ? atoi(getenv("GGML_OP_OFFLOAD_MIN_BATCH")) : 32;
 
             for (int i = 0; i < ggml_cuda_info().device_count; i++) {
                 ggml_backend_cuda_device_context * dev_ctx = new ggml_backend_cuda_device_context;
@@ -4861,6 +4861,7 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
                 char pci_bus_id[16] = {};
                 snprintf(pci_bus_id, sizeof(pci_bus_id), "%04x:%02x:%02x.0", prop.pciDomainID, prop.pciBusID, prop.pciDeviceID);
                 dev_ctx->pci_bus_id = pci_bus_id;
+                dev_ctx->op_offload_min_batch_size = min_batch_size;
 
                 ggml_backend_dev_t dev = new ggml_backend_device {
                     /* .iface = */ ggml_backend_cuda_device_interface,


@@ -219,6 +219,8 @@ struct ggml_metal_device_props {
     bool use_shared_buffers;
 
     bool supports_gpu_family_apple7;
+
+    int op_offload_min_batch_size;
 };
 
 ggml_metal_device_t ggml_metal_device_init(void);


@@ -782,6 +782,8 @@ ggml_metal_device_t ggml_metal_device_init(void) {
         dev->props.supports_gpu_family_apple7 = [dev->mtl_device supportsFamily:MTLGPUFamilyApple7];
 
+        dev->props.op_offload_min_batch_size = getenv("GGML_OP_OFFLOAD_MIN_BATCH") ? atoi(getenv("GGML_OP_OFFLOAD_MIN_BATCH")) : 32;
+
         dev->props.max_buffer_size            = dev->mtl_device.maxBufferLength;
         dev->props.max_working_set_size       = dev->mtl_device.recommendedMaxWorkingSetSize;
         dev->props.max_theadgroup_memory_size = dev->mtl_device.maxThreadgroupMemoryLength;


@@ -625,14 +625,11 @@ static int64_t get_op_batch_size(const ggml_tensor * op) {
 }
 
 static bool ggml_backend_metal_device_offload_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
-    const int min_batch_size = 32;
+    ggml_metal_device_t ctx_dev = (ggml_metal_device_t)dev->context;
 
     return (op->op == GGML_OP_MUL_MAT ||
             op->op == GGML_OP_MUL_MAT_ID) &&
-           get_op_batch_size(op) >= min_batch_size;
-
-    GGML_UNUSED(dev);
-    GGML_UNUSED(op);
+           get_op_batch_size(op) >= ggml_metal_device_get_props(ctx_dev)->op_offload_min_batch_size;
 }
 
 static ggml_backend_device_i ggml_backend_metal_device_i = {


@@ -4286,6 +4286,7 @@ struct ggml_backend_sycl_device_context {
     int device;
     std::string name;
     std::string description;
+    int op_offload_min_batch_size;
 };
 
 static const char * ggml_backend_sycl_device_get_name(ggml_backend_dev_t dev) {
@@ -4674,9 +4675,8 @@ static int64_t get_op_batch_size(const ggml_tensor * op) {
 }
 
 static bool ggml_backend_sycl_device_offload_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
-    const int min_batch_size = 32;
-    return get_op_batch_size(op) >= min_batch_size;
-    GGML_UNUSED(dev);
+    ggml_backend_sycl_device_context * sycl_ctx = (ggml_backend_sycl_device_context *)dev->context;
+    return get_op_batch_size(op) >= sycl_ctx->op_offload_min_batch_size;
 }
 
 static ggml_backend_event_t
@@ -4799,6 +4799,7 @@ ggml_backend_reg_t ggml_backend_sycl_reg() {
         std::lock_guard<std::mutex> lock(mutex);
         if (!initialized) {
             ggml_backend_sycl_reg_context * ctx = new ggml_backend_sycl_reg_context;
+            const int min_batch_size = getenv("GGML_OP_OFFLOAD_MIN_BATCH") ? atoi(getenv("GGML_OP_OFFLOAD_MIN_BATCH")) : 32;
 
             for (int i = 0; i < ggml_sycl_info().device_count; i++) {
                 ggml_backend_sycl_device_context * dev_ctx = new ggml_backend_sycl_device_context;
@@ -4812,6 +4813,7 @@ ggml_backend_reg_t ggml_backend_sycl_reg() {
                     prop, dpct::dev_mgr::instance().get_device(i))));
 
                 dev_ctx->description = prop.get_name();
+                dev_ctx->op_offload_min_batch_size = min_batch_size;
 
                 ggml_backend_dev_t dev = new ggml_backend_device {
                     /* .iface = */ ggml_backend_sycl_device_interface,


@@ -14249,6 +14249,7 @@ struct ggml_backend_vk_device_context {
     std::string description;
     bool is_integrated_gpu;
     std::string pci_bus_id;
+    int op_offload_min_batch_size;
 };
 
 static const char * ggml_backend_vk_device_get_name(ggml_backend_dev_t dev) {
@@ -14820,12 +14821,10 @@ static bool ggml_backend_vk_device_supports_buft(ggml_backend_dev_t dev, ggml_ba
 }
 
 static bool ggml_backend_vk_device_offload_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
-    const int min_batch_size = 32;
+    ggml_backend_vk_device_context * dev_ctx = (ggml_backend_vk_device_context *)dev->context;
 
-    return (op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS) ||
-           (op->ne[2] >= min_batch_size && op->op == GGML_OP_MUL_MAT_ID);
-
-    UNUSED(dev);
+    return (op->ne[1] >= dev_ctx->op_offload_min_batch_size && op->op != GGML_OP_GET_ROWS) ||
+           (op->ne[2] >= dev_ctx->op_offload_min_batch_size && op->op == GGML_OP_MUL_MAT_ID);
 }
 
 static ggml_backend_event_t ggml_backend_vk_device_event_new(ggml_backend_dev_t dev) {
@@ -14951,6 +14950,7 @@ static ggml_backend_dev_t ggml_backend_vk_reg_get_device(ggml_backend_reg_t reg,
     static std::mutex mutex;
     std::lock_guard<std::mutex> lock(mutex);
     if (!initialized) {
+        const int min_batch_size = getenv("GGML_OP_OFFLOAD_MIN_BATCH") ? atoi(getenv("GGML_OP_OFFLOAD_MIN_BATCH")) : 32;
         for (int i = 0; i < ggml_backend_vk_get_device_count(); i++) {
             ggml_backend_vk_device_context * ctx = new ggml_backend_vk_device_context;
             char desc[256];
@@ -14960,6 +14960,7 @@ static ggml_backend_dev_t ggml_backend_vk_reg_get_device(ggml_backend_reg_t reg,
             ctx->description = desc;
             ctx->is_integrated_gpu = ggml_backend_vk_get_device_type(i) == vk::PhysicalDeviceType::eIntegratedGpu;
             ctx->pci_bus_id = ggml_backend_vk_get_device_pci_id(i);
+            ctx->op_offload_min_batch_size = min_batch_size;
             devices.push_back(new ggml_backend_device {
                 /* .iface = */ ggml_backend_vk_device_i,
                 /* .reg = */ reg,