diff --git a/ggml/src/ggml-opencl/ggml-opencl.cpp b/ggml/src/ggml-opencl/ggml-opencl.cpp index 639715537b..353f6a4b46 100644 --- a/ggml/src/ggml-opencl/ggml-opencl.cpp +++ b/ggml/src/ggml-opencl/ggml-opencl.cpp @@ -263,6 +263,32 @@ static ggml_cl_compiler_version get_adreno_cl_compiler_version(const char *drive return { type, major, minor, patch }; } +// cl buffer wrapper +struct ggml_cl_buffer { + cl_mem buffer; + size_t size; + + ggml_cl_buffer() + : buffer(nullptr), size(0) {} + + ~ggml_cl_buffer() { + if (buffer) { + CL_CHECK(clReleaseMemObject(buffer)); + } + } + + void allocate(cl_context context, size_t new_size) { + if (new_size > size) { + size = new_size; + if (buffer) { + CL_CHECK(clReleaseMemObject(buffer)); + } + cl_int err; + CL_CHECK((buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, size, NULL, &err), err)); + } + } +}; + // Profiling struct ProfilingInfo { std::string op_name; @@ -376,6 +402,11 @@ struct ggml_backend_opencl_context { cl_context context; cl_command_queue queue; + // prealloc buffers for transposing weights and activations + ggml_cl_buffer prealloc_quant_trans; + ggml_cl_buffer prealloc_scales_trans; + ggml_cl_buffer prealloc_act_trans; + cl_program program_add; cl_program program_add_id; cl_program program_clamp; @@ -638,10 +669,6 @@ struct ggml_backend_opencl_context { cl_kernel kernel_transpose_16_buf; cl_kernel kernel_transpose_16_4x1; - cl_mem A_s_d_max; // max scale buffer size for transpose - cl_mem A_q_d_max; // max weight buffer size for transpose - cl_mem B_d_max; // max activation buffer size for transpose - // Gemm and Gemv related programs, kernels, etc cl_program program_CL_gemm; cl_program program_CL_gemv_general; @@ -2600,9 +2627,9 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) { required_B_d_bytes, max_B_d_bytes); } - CL_CHECK((backend_ctx->A_q_d_max = clCreateBuffer(context, 0, max_A_q_d_bytes, NULL, &err), err)); - CL_CHECK((backend_ctx->A_s_d_max = clCreateBuffer(context, 0, 
max_A_s_d_bytes, NULL, &err), err)); - CL_CHECK((backend_ctx->B_d_max = clCreateBuffer(context, 0, max_B_d_bytes, NULL, &err), err)); + backend_ctx->prealloc_quant_trans.allocate(context, max_A_q_d_bytes); + backend_ctx->prealloc_scales_trans.allocate(context, max_A_s_d_bytes); + backend_ctx->prealloc_act_trans.allocate(context, max_B_d_bytes); #endif // GGML_OPENCL_USE_ADRENO_KERNELS backend_ctx->disable_fusion = getenv("GGML_OPENCL_DISABLE_FUSION") != nullptr; @@ -3607,32 +3634,35 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer, // use sub_buffer of max buffer size instead size_t q_size_bytes = K * M / 8 * sizeof(float); + backend_ctx->prealloc_quant_trans.allocate(context, q_size_bytes); + cl_buffer_region region; region.origin = 0; region.size = q_size_bytes; cl_mem qT_d = clCreateSubBuffer( - backend_ctx->A_q_d_max, + backend_ctx->prealloc_quant_trans.buffer, 0, CL_BUFFER_CREATE_TYPE_REGION, &region, &err); - // cl_mem qT_d = clCreateBuffer(context, CL_MEM_READ_WRITE, q_size_bytes, NULL, &err); CL_CHECK(err); bool K_tile_trans = true; if ((K / 32) % 4 != 0){ K_tile_trans =false; } + size_t d_size_bytes = M * (K / 32) * 2; + backend_ctx->prealloc_scales_trans.allocate(context, d_size_bytes); + region.origin = 0; region.size = d_size_bytes; cl_mem dT_d = clCreateSubBuffer( - backend_ctx->A_s_d_max, + backend_ctx->prealloc_scales_trans.buffer, 0, CL_BUFFER_CREATE_TYPE_REGION, &region, &err); - // cl_mem dT_d = clCreateBuffer(context, CL_MEM_READ_WRITE, d_size_bytes, NULL, &err); CL_CHECK(err); // <----------------------------------------------------------------------------------> // @@ -7395,8 +7425,10 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co region.origin = 0; // Specify the size of the sub-buffer (divide by 2 for FP16) region.size = K * (N + padding) * sizeof(float)/2; + backend_ctx->prealloc_act_trans.allocate(context, region.size); + B_d = clCreateSubBuffer( - backend_ctx->B_d_max, + 
backend_ctx->prealloc_act_trans.buffer, 0, CL_BUFFER_CREATE_TYPE_REGION, &region,