From 3c1bcf26069be6c4683d0bf42794e85d5c520aab Mon Sep 17 00:00:00 2001
From: DocShotgun <126566557+DocShotgun@users.noreply.github.com>
Date: Thu, 1 Jan 2026 22:09:18 -0800
Subject: [PATCH 1/4] ggml: add env var GGML_OP_OFFLOAD_MIN_BATCH

* makes the min_batch_size for triggering op offload configurable via
  env var, defaulting to the prior hardcoded value of 32
---
 ggml/src/ggml-cann/ggml-cann.cpp     | 2 +-
 ggml/src/ggml-cuda/ggml-cuda.cu      | 2 +-
 ggml/src/ggml-metal/ggml-metal.cpp   | 2 +-
 ggml/src/ggml-sycl/ggml-sycl.cpp     | 2 +-
 ggml/src/ggml-vulkan/ggml-vulkan.cpp | 2 +-
 5 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/ggml/src/ggml-cann/ggml-cann.cpp b/ggml/src/ggml-cann/ggml-cann.cpp
index ef23ec78da..5b48e02606 100644
--- a/ggml/src/ggml-cann/ggml-cann.cpp
+++ b/ggml/src/ggml-cann/ggml-cann.cpp
@@ -2512,7 +2512,7 @@ static bool ggml_backend_buft_is_cann(ggml_backend_buffer_type_t buft) {
  * false.
  */
 static bool ggml_backend_cann_offload_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
-    const int min_batch_size = 32;
+    const int min_batch_size = getenv("GGML_OP_OFFLOAD_MIN_BATCH") ? atoi(getenv("GGML_OP_OFFLOAD_MIN_BATCH")) : 32;
     GGML_UNUSED(dev);
 
     return op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS;
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index 84eccea3f7..1cb879ed5a 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -4660,7 +4660,7 @@ static int64_t get_op_batch_size(const ggml_tensor * op) {
 }
 
 static bool ggml_backend_cuda_device_offload_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
-    const int min_batch_size = 32;
+    const int min_batch_size = getenv("GGML_OP_OFFLOAD_MIN_BATCH") ? atoi(getenv("GGML_OP_OFFLOAD_MIN_BATCH")) : 32;
 
     return get_op_batch_size(op) >= min_batch_size;
diff --git a/ggml/src/ggml-metal/ggml-metal.cpp b/ggml/src/ggml-metal/ggml-metal.cpp
index 70bf6f3d98..60da5baf30 100644
--- a/ggml/src/ggml-metal/ggml-metal.cpp
+++ b/ggml/src/ggml-metal/ggml-metal.cpp
@@ -625,7 +625,7 @@ static int64_t get_op_batch_size(const ggml_tensor * op) {
 }
 
 static bool ggml_backend_metal_device_offload_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
-    const int min_batch_size = 32;
+    const int min_batch_size = getenv("GGML_OP_OFFLOAD_MIN_BATCH") ? atoi(getenv("GGML_OP_OFFLOAD_MIN_BATCH")) : 32;
 
     return (op->op == GGML_OP_MUL_MAT ||
             op->op == GGML_OP_MUL_MAT_ID) &&
diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp
index e996d98be8..8259c76052 100644
--- a/ggml/src/ggml-sycl/ggml-sycl.cpp
+++ b/ggml/src/ggml-sycl/ggml-sycl.cpp
@@ -4674,7 +4674,7 @@ static int64_t get_op_batch_size(const ggml_tensor * op) {
 }
 
 static bool ggml_backend_sycl_device_offload_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
-    const int min_batch_size = 32;
+    const int min_batch_size = getenv("GGML_OP_OFFLOAD_MIN_BATCH") ? atoi(getenv("GGML_OP_OFFLOAD_MIN_BATCH")) : 32;
     return get_op_batch_size(op) >= min_batch_size;
     GGML_UNUSED(dev);
 }
diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
index 541e4a50b7..bd76b16cff 100644
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -14651,7 +14651,7 @@ static bool ggml_backend_vk_device_supports_buft(ggml_backend_dev_t dev, ggml_ba
 }
 
 static bool ggml_backend_vk_device_offload_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
-    const int min_batch_size = 32;
+    const int min_batch_size = getenv("GGML_OP_OFFLOAD_MIN_BATCH") ? atoi(getenv("GGML_OP_OFFLOAD_MIN_BATCH")) : 32;
 
     return (op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS) ||
            (op->ne[2] >= min_batch_size && op->op == GGML_OP_MUL_MAT_ID);

From fa46774005e870a88bdb800e4c92fd6c96753aee Mon Sep 17 00:00:00 2001
From: DocShotgun <126566557+DocShotgun@users.noreply.github.com>
Date: Fri, 2 Jan 2026 13:26:12 -0800
Subject: [PATCH 2/4] ggml: read GGML_OP_OFFLOAD_MIN_BATCH once and store to
 dev ctx

---
 ggml/src/ggml-cann/ggml-cann.cpp        |  8 +++++---
 ggml/src/ggml-cuda/ggml-cuda.cu         |  9 +++++----
 ggml/src/ggml-metal/ggml-metal-device.h |  2 ++
 ggml/src/ggml-metal/ggml-metal-device.m |  2 ++
 ggml/src/ggml-metal/ggml-metal.cpp      |  7 ++-----
 ggml/src/ggml-sycl/ggml-sycl.cpp        |  8 +++++---
 ggml/src/ggml-vulkan/ggml-vulkan.cpp    | 11 ++++++-----
 7 files changed, 27 insertions(+), 20 deletions(-)

diff --git a/ggml/src/ggml-cann/ggml-cann.cpp b/ggml/src/ggml-cann/ggml-cann.cpp
index 5b48e02606..c65d84b01f 100644
--- a/ggml/src/ggml-cann/ggml-cann.cpp
+++ b/ggml/src/ggml-cann/ggml-cann.cpp
@@ -2512,10 +2512,9 @@ static bool ggml_backend_buft_is_cann(ggml_backend_buffer_type_t buft) {
  * false.
  */
 static bool ggml_backend_cann_offload_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
-    const int min_batch_size = getenv("GGML_OP_OFFLOAD_MIN_BATCH") ? atoi(getenv("GGML_OP_OFFLOAD_MIN_BATCH")) : 32;
-    GGML_UNUSED(dev);
+    ggml_backend_cann_device_context * dev_ctx = (ggml_backend_cann_device_context *)dev->context;
 
-    return op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS;
+    return op->ne[1] >= dev_ctx->op_offload_min_batch_size && op->op != GGML_OP_GET_ROWS;
 }
 
 /**
@@ -2593,6 +2592,7 @@ struct ggml_backend_cann_device_context {
     int device;
     std::string name;
     std::string description;
+    int op_offload_min_batch_size;
 };
 
 static const char * ggml_backend_cann_device_get_name(ggml_backend_dev_t dev) {
@@ -2785,12 +2785,14 @@ ggml_backend_reg_t ggml_backend_cann_reg() {
     if (!initialized) {
         aclInit(nullptr);
         ggml_backend_cann_reg_context * ctx = new ggml_backend_cann_reg_context;
+        const int min_batch_size = getenv("GGML_OP_OFFLOAD_MIN_BATCH") ? atoi(getenv("GGML_OP_OFFLOAD_MIN_BATCH")) : 32;
         for (int i = 0; i < ggml_cann_info().device_count; i++) {
             ggml_backend_cann_device_context * dev_ctx = new ggml_backend_cann_device_context();
             dev_ctx->description = aclrtGetSocName();
             dev_ctx->device = i;
             dev_ctx->name = GGML_CANN_NAME + std::to_string(i);
+            dev_ctx->op_offload_min_batch_size = min_batch_size;
             ggml_cann_set_device(i);
             ggml_backend_dev_t dev = new ggml_backend_device{
                 /* .iface = */ ggml_backend_cann_device_interface,
                 /* .reg = */ &reg,
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index 1cb879ed5a..783dbc9bda 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -4107,6 +4107,7 @@ struct ggml_backend_cuda_device_context {
     std::string name;
     std::string description;
     std::string pci_bus_id;
+    int op_offload_min_batch_size;
 };
 
 static const char * ggml_backend_cuda_device_get_name(ggml_backend_dev_t dev) {
@@ -4660,11 +4661,9 @@ static int64_t get_op_batch_size(const ggml_tensor * op) {
 }
 
 static bool ggml_backend_cuda_device_offload_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
-    const int min_batch_size = getenv("GGML_OP_OFFLOAD_MIN_BATCH") ? atoi(getenv("GGML_OP_OFFLOAD_MIN_BATCH")) : 32;
+    ggml_backend_cuda_device_context * dev_ctx = (ggml_backend_cuda_device_context *)dev->context;
 
-    return get_op_batch_size(op) >= min_batch_size;
-
-    GGML_UNUSED(dev);
+    return get_op_batch_size(op) >= dev_ctx->op_offload_min_batch_size;
 }
 
 static ggml_backend_event_t ggml_backend_cuda_device_event_new(ggml_backend_dev_t dev) {
@@ -4832,6 +4831,7 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
     std::lock_guard<std::mutex> lock(mutex);
     if (!initialized) {
         ggml_backend_cuda_reg_context * ctx = new ggml_backend_cuda_reg_context;
+        const int min_batch_size = getenv("GGML_OP_OFFLOAD_MIN_BATCH") ? atoi(getenv("GGML_OP_OFFLOAD_MIN_BATCH")) : 32;
 
         for (int i = 0; i < ggml_cuda_info().device_count; i++) {
             ggml_backend_cuda_device_context * dev_ctx = new ggml_backend_cuda_device_context;
@@ -4845,6 +4845,7 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
             char pci_bus_id[16] = {};
             snprintf(pci_bus_id, sizeof(pci_bus_id), "%04x:%02x:%02x.0", prop.pciDomainID, prop.pciBusID, prop.pciDeviceID);
             dev_ctx->pci_bus_id = pci_bus_id;
+            dev_ctx->op_offload_min_batch_size = min_batch_size;
 
             ggml_backend_dev_t dev = new ggml_backend_device {
                 /* .iface = */ ggml_backend_cuda_device_interface,
diff --git a/ggml/src/ggml-metal/ggml-metal-device.h b/ggml/src/ggml-metal/ggml-metal-device.h
index d983b666ca..9c3b001487 100644
--- a/ggml/src/ggml-metal/ggml-metal-device.h
+++ b/ggml/src/ggml-metal/ggml-metal-device.h
@@ -219,6 +219,8 @@ struct ggml_metal_device_props {
     bool use_shared_buffers;
 
     bool supports_gpu_family_apple7;
+
+    int op_offload_min_batch_size;
 };
 
 ggml_metal_device_t ggml_metal_device_init(void);
diff --git a/ggml/src/ggml-metal/ggml-metal-device.m b/ggml/src/ggml-metal/ggml-metal-device.m
index 59badd0043..ff899a8170 100644
--- a/ggml/src/ggml-metal/ggml-metal-device.m
+++ b/ggml/src/ggml-metal/ggml-metal-device.m
@@ -782,6 +782,8 @@ ggml_metal_device_t ggml_metal_device_init(void) {
 
     dev->props.supports_gpu_family_apple7 = [dev->mtl_device supportsFamily:MTLGPUFamilyApple7];
 
+    dev->props.op_offload_min_batch_size = getenv("GGML_OP_OFFLOAD_MIN_BATCH") ? atoi(getenv("GGML_OP_OFFLOAD_MIN_BATCH")) : 32;
+
     dev->props.max_buffer_size = dev->mtl_device.maxBufferLength;
     dev->props.max_working_set_size = dev->mtl_device.recommendedMaxWorkingSetSize;
     dev->props.max_theadgroup_memory_size = dev->mtl_device.maxThreadgroupMemoryLength;
diff --git a/ggml/src/ggml-metal/ggml-metal.cpp b/ggml/src/ggml-metal/ggml-metal.cpp
index 60da5baf30..56b59f0afd 100644
--- a/ggml/src/ggml-metal/ggml-metal.cpp
+++ b/ggml/src/ggml-metal/ggml-metal.cpp
@@ -625,14 +625,11 @@ static int64_t get_op_batch_size(const ggml_tensor * op) {
 }
 
 static bool ggml_backend_metal_device_offload_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
-    const int min_batch_size = getenv("GGML_OP_OFFLOAD_MIN_BATCH") ? atoi(getenv("GGML_OP_OFFLOAD_MIN_BATCH")) : 32;
+    ggml_metal_device_t ctx_dev = (ggml_metal_device_t)dev->context;
 
     return (op->op == GGML_OP_MUL_MAT ||
             op->op == GGML_OP_MUL_MAT_ID) &&
-            get_op_batch_size(op) >= min_batch_size;
-
-    GGML_UNUSED(dev);
-    GGML_UNUSED(op);
+            get_op_batch_size(op) >= ggml_metal_device_get_props(ctx_dev)->op_offload_min_batch_size;
 }
 
 static ggml_backend_device_i ggml_backend_metal_device_i = {
diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp
index 8259c76052..8f8176b678 100644
--- a/ggml/src/ggml-sycl/ggml-sycl.cpp
+++ b/ggml/src/ggml-sycl/ggml-sycl.cpp
@@ -4286,6 +4286,7 @@ struct ggml_backend_sycl_device_context {
     int device;
     std::string name;
     std::string description;
+    int op_offload_min_batch_size;
 };
 
 static const char * ggml_backend_sycl_device_get_name(ggml_backend_dev_t dev) {
@@ -4674,9 +4675,8 @@ static int64_t get_op_batch_size(const ggml_tensor * op) {
 }
 
 static bool ggml_backend_sycl_device_offload_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
-    const int min_batch_size = getenv("GGML_OP_OFFLOAD_MIN_BATCH") ? atoi(getenv("GGML_OP_OFFLOAD_MIN_BATCH")) : 32;
-    return get_op_batch_size(op) >= min_batch_size;
-    GGML_UNUSED(dev);
+    ggml_backend_sycl_device_context * sycl_ctx = (ggml_backend_sycl_device_context *)dev->context;
+    return get_op_batch_size(op) >= sycl_ctx->op_offload_min_batch_size;
 }
 
 static ggml_backend_event_t
@@ -4799,6 +4799,7 @@ ggml_backend_reg_t ggml_backend_sycl_reg() {
     std::lock_guard<std::mutex> lock(mutex);
     if (!initialized) {
         ggml_backend_sycl_reg_context * ctx = new ggml_backend_sycl_reg_context;
+        const int min_batch_size = getenv("GGML_OP_OFFLOAD_MIN_BATCH") ? atoi(getenv("GGML_OP_OFFLOAD_MIN_BATCH")) : 32;
 
         for (int i = 0; i < ggml_sycl_info().device_count; i++) {
             ggml_backend_sycl_device_context * dev_ctx = new ggml_backend_sycl_device_context;
@@ -4812,6 +4813,7 @@ ggml_backend_reg_t ggml_backend_sycl_reg() {
                 prop, dpct::dev_mgr::instance().get_device(i))));
 
             dev_ctx->description = prop.get_name();
+            dev_ctx->op_offload_min_batch_size = min_batch_size;
 
             ggml_backend_dev_t dev = new ggml_backend_device {
                 /* .iface = */ ggml_backend_sycl_device_interface,
diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
index bd76b16cff..c23f79c893 100644
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -14077,6 +14077,7 @@ struct ggml_backend_vk_device_context {
     std::string description;
     bool is_integrated_gpu;
     std::string pci_bus_id;
+    int op_offload_min_batch_size;
 };
 
 static const char * ggml_backend_vk_device_get_name(ggml_backend_dev_t dev) {
@@ -14651,12 +14652,10 @@ static bool ggml_backend_vk_device_supports_buft(ggml_backend_dev_t dev, ggml_ba
 }
 
 static bool ggml_backend_vk_device_offload_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
-    const int min_batch_size = getenv("GGML_OP_OFFLOAD_MIN_BATCH") ? atoi(getenv("GGML_OP_OFFLOAD_MIN_BATCH")) : 32;
+    ggml_backend_vk_device_context * dev_ctx = (ggml_backend_vk_device_context *)dev->context;
 
-    return (op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS) ||
-           (op->ne[2] >= min_batch_size && op->op == GGML_OP_MUL_MAT_ID);
-
-    UNUSED(dev);
+    return (op->ne[1] >= dev_ctx->op_offload_min_batch_size && op->op != GGML_OP_GET_ROWS) ||
+           (op->ne[2] >= dev_ctx->op_offload_min_batch_size && op->op == GGML_OP_MUL_MAT_ID);
 }
 
 static ggml_backend_event_t ggml_backend_vk_device_event_new(ggml_backend_dev_t dev) {
@@ -14737,6 +14736,7 @@ static ggml_backend_dev_t ggml_backend_vk_reg_get_device(ggml_backend_reg_t reg,
     static std::mutex mutex;
     std::lock_guard<std::mutex> lock(mutex);
     if (!initialized) {
+        const int min_batch_size = getenv("GGML_OP_OFFLOAD_MIN_BATCH") ? atoi(getenv("GGML_OP_OFFLOAD_MIN_BATCH")) : 32;
         for (int i = 0; i < ggml_backend_vk_get_device_count(); i++) {
             ggml_backend_vk_device_context * ctx = new ggml_backend_vk_device_context;
             char desc[256];
@@ -14746,6 +14746,7 @@ static ggml_backend_dev_t ggml_backend_vk_reg_get_device(ggml_backend_reg_t reg,
             ctx->description = desc;
             ctx->is_integrated_gpu = ggml_backend_vk_get_device_type(i) == vk::PhysicalDeviceType::eIntegratedGpu;
             ctx->pci_bus_id = ggml_backend_vk_get_device_pci_id(i);
+            ctx->op_offload_min_batch_size = min_batch_size;
             devices.push_back(new ggml_backend_device {
                 /* .iface = */ ggml_backend_vk_device_i,
                 /* .reg = */ reg,

From a449358f8390da90372675611be5b87c15a86df1 Mon Sep 17 00:00:00 2001
From: DocShotgun <126566557+DocShotgun@users.noreply.github.com>
Date: Fri, 2 Jan 2026 13:49:24 -0800
Subject: [PATCH 3/4] cann: forward declaration of device context struct

---
 ggml/src/ggml-cann/ggml-cann.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/ggml/src/ggml-cann/ggml-cann.cpp b/ggml/src/ggml-cann/ggml-cann.cpp
index c65d84b01f..275e2bd370 100644
--- a/ggml/src/ggml-cann/ggml-cann.cpp
+++ b/ggml/src/ggml-cann/ggml-cann.cpp
@@ -2511,6 +2511,7 @@ static bool ggml_backend_buft_is_cann(ggml_backend_buffer_type_t buft) {
  * @return bool Returns true if the operation should be offloaded, otherwise
  * false.
  */
+struct ggml_backend_cann_device_context;
 static bool ggml_backend_cann_offload_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
     ggml_backend_cann_device_context * dev_ctx = (ggml_backend_cann_device_context *)dev->context;
 

From 7a838e78fc80c9e29338ffd06aadb7f7fae85930 Mon Sep 17 00:00:00 2001
From: DocShotgun <126566557+DocShotgun@users.noreply.github.com>
Date: Fri, 2 Jan 2026 14:02:06 -0800
Subject: [PATCH 4/4] cann: move offload op check after device context
 declaration

---
 ggml/src/ggml-cann/ggml-cann.cpp | 41 ++++++++++++++++----------------
 1 file changed, 20 insertions(+), 21 deletions(-)

diff --git a/ggml/src/ggml-cann/ggml-cann.cpp b/ggml/src/ggml-cann/ggml-cann.cpp
index 275e2bd370..3d2e4e831a 100644
--- a/ggml/src/ggml-cann/ggml-cann.cpp
+++ b/ggml/src/ggml-cann/ggml-cann.cpp
@@ -2497,27 +2497,6 @@ static bool ggml_backend_buft_is_cann(ggml_backend_buffer_type_t buft) {
     return buft->iface.get_name == ggml_backend_cann_buffer_type_name;
 }
 
-/**
- * @brief Determines if a tensor operation should be offloaded to the CANN
- * backend.
- *
- * This function checks if a given tensor operation should be offloaded to the
- * CANN backend based on the operation type and the size of the tensor. It
- * returns true if the second dimension (ne[1]) of the tensor is greater than or
- * equal to the minimum batch size and the operation is not GGML_OP_GET_ROWS.
- *
- * @param backend Pointer to the CANN backend.
- * @param op Pointer to the tensor operation to check.
- * @return bool Returns true if the operation should be offloaded, otherwise
- * false.
- */
-struct ggml_backend_cann_device_context;
-static bool ggml_backend_cann_offload_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
-    ggml_backend_cann_device_context * dev_ctx = (ggml_backend_cann_device_context *)dev->context;
-
-    return op->ne[1] >= dev_ctx->op_offload_min_batch_size && op->op != GGML_OP_GET_ROWS;
-}
-
 /**
  * @brief Records an event on the CANN backend stream.
  *
@@ -2670,6 +2649,26 @@ static ggml_backend_buffer_type_t ggml_backend_cann_device_get_host_buffer_type(
     return ggml_backend_cann_host_buffer_type();
 }
 
+/**
+ * @brief Determines if a tensor operation should be offloaded to the CANN
+ * backend.
+ *
+ * This function checks if a given tensor operation should be offloaded to the
+ * CANN backend based on the operation type and the size of the tensor. It
+ * returns true if the second dimension (ne[1]) of the tensor is greater than or
+ * equal to the minimum batch size and the operation is not GGML_OP_GET_ROWS.
+ *
+ * @param backend Pointer to the CANN backend.
+ * @param op Pointer to the tensor operation to check.
+ * @return bool Returns true if the operation should be offloaded, otherwise
+ * false.
+ */
+static bool ggml_backend_cann_offload_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
+    ggml_backend_cann_device_context * dev_ctx = (ggml_backend_cann_device_context *)dev->context;
+
+    return op->ne[1] >= dev_ctx->op_offload_min_batch_size && op->op != GGML_OP_GET_ROWS;
+}
+
 /**
  * @brief Creates a new event for the CANN backend device.
  *
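
Note on the env var semantics: every backend in this series reads GGML_OP_OFFLOAD_MIN_BATCH with the same ternary expression. A minimal standalone sketch of that rule (the helper function below is hypothetical; the patches inline the expression in each backend rather than sharing a helper):

    #include <cstdlib>

    // Same semantics as the expression inlined in each backend:
    //   unset       -> 32 (the prior hardcoded threshold)
    //   "16"        -> 16 (ops with batch dimension >= 16 are offloaded)
    //   non-numeric -> 0 via atoi(), which makes every op eligible for offload
    static int op_offload_min_batch(void) {
        const char * env = std::getenv("GGML_OP_OFFLOAD_MIN_BATCH");
        return env ? std::atoi(env) : 32;
    }

After PATCH 2/4 this expression is evaluated once at backend registration and the result is cached in each device context, so the per-op offload hooks no longer call getenv() on a hot path.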
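
PATCH 3/4 and PATCH 4/4 together resolve a declaration-order issue in the CANN backend: the forward declaration makes the pointer cast well-formed, but reading dev_ctx->op_offload_min_batch_size still requires the complete type, so PATCH 4/4 drops the forward declaration and moves the function below the struct definition. A minimal reproduction of the constraint, with illustrative names:

    struct dev_ctx_t;                                   // forward declaration: incomplete type

    // int peek(dev_ctx_t * c) { return c->min_batch; }
    // would not compile here: member access into an incomplete type

    struct dev_ctx_t {                                  // definition completes the type
        int min_batch;
    };

    int peek(dev_ctx_t * c) { return c->min_batch; }    // fine once the definition is in scope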