From ded446b34c0cd803a0122446b848619adbb458cf Mon Sep 17 00:00:00 2001 From: lhez Date: Thu, 26 Mar 2026 08:52:21 -0700 Subject: [PATCH] opencl: allow large buffer for adreno (#20997) --- ggml/src/ggml-opencl/ggml-opencl.cpp | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/ggml/src/ggml-opencl/ggml-opencl.cpp b/ggml/src/ggml-opencl/ggml-opencl.cpp index 4dddcd82cf..c40e1f2d39 100644 --- a/ggml/src/ggml-opencl/ggml-opencl.cpp +++ b/ggml/src/ggml-opencl/ggml-opencl.cpp @@ -394,6 +394,9 @@ struct ggml_backend_opencl_context { bool fp16_support; bool has_vector_subgroup_broadcast; bool disable_fusion; + + bool adreno_has_large_buffer; + bool adreno_use_large_buffer; ggml_cl_compiler_version adreno_cl_compiler_version; int adreno_wave_size; @@ -787,6 +790,10 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve " -cl-mad-enable -cl-unsafe-math-optimizations" " -cl-finite-math-only -cl-fast-relaxed-math"; + if (backend_ctx->adreno_use_large_buffer) { + compile_opts += " -qcom-enable-large-buffer "; + } + GGML_LOG_INFO("ggml_opencl: loading OpenCL kernels"); // add @@ -3020,6 +3027,8 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) { // Check if ext_buffer contains cl_khr_fp16 backend_ctx->fp16_support = strstr(ext_buffer, "cl_khr_fp16") != NULL; GGML_LOG_INFO("ggml_opencl: device FP16 support: %s\n", backend_ctx->fp16_support ? "true" : "false"); + // check Adreno large buffer support + backend_ctx->adreno_has_large_buffer = strstr(ext_buffer, "cl_qcom_large_buffer") != NULL; // fp16 is required if (!backend_ctx->fp16_support) { @@ -3086,6 +3095,18 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) { GGML_LOG_INFO("ggml_opencl: using kernels optimized for Adreno (GGML_OPENCL_USE_ADRENO_KERNELS)\n"); #endif // GGML_OPENCL_USE_ADRENO_KERNELS + // determine whether to use large buffer for Adreno + backend_ctx->adreno_use_large_buffer = getenv("GGML_OPENCL_ADRENO_USE_LARGE_BUFFER") != nullptr && + backend_ctx->gpu_family == GPU_FAMILY::ADRENO; + if (backend_ctx->adreno_use_large_buffer) { + if (!backend_ctx->adreno_has_large_buffer) { + GGML_LOG_INFO("ggml_opencl: Adreno large buffer requested but not supported by driver, will use regular buffer\n"); + backend_ctx->adreno_use_large_buffer = false; + } else { + GGML_LOG_INFO("ggml_opencl: Adreno large buffer enabled\n"); + } + } + cl_int err; // A local ref of cl_context for convenience @@ -5660,6 +5681,11 @@ static ggml_backend_buffer_t ggml_backend_opencl_buffer_type_alloc_buffer(ggml_b cl_int err; cl_mem mem = clCreateBuffer(backend_ctx->context, CL_MEM_READ_WRITE, size, NULL, &err); + if (err != CL_SUCCESS && backend_ctx->adreno_use_large_buffer) { + cl_mem_properties props[] = { 0x41A6 /* CL_LARGE_BUFFER_QCOM */, 1, 0 }; + mem = clCreateBufferWithProperties(backend_ctx->context, props, CL_MEM_READ_WRITE, size, NULL, &err); + } + if (err != CL_SUCCESS) { GGML_LOG_INFO("%s: failed to allocate %.2f MiB\n", __func__, size / 1024.0 / 1024.0); return nullptr;