diff --git a/ggml/src/ggml-opencl/CMakeLists.txt b/ggml/src/ggml-opencl/CMakeLists.txt index fa5fadd112..f9c92ef05a 100644 --- a/ggml/src/ggml-opencl/CMakeLists.txt +++ b/ggml/src/ggml-opencl/CMakeLists.txt @@ -22,6 +22,8 @@ if (GGML_OPENCL_USE_ADRENO_KERNELS) add_compile_definitions(GGML_OPENCL_USE_ADRENO_KERNELS) endif () +target_include_directories(${TARGET_NAME} PRIVATE "${CMAKE_CURRENT_LIST_DIR}/kernels") + if (GGML_OPENCL_EMBED_KERNELS) add_compile_definitions(GGML_OPENCL_EMBED_KERNELS) diff --git a/ggml/src/ggml-opencl/ggml-opencl.cpp b/ggml/src/ggml-opencl/ggml-opencl.cpp index 4850c11d14..28682357e4 100644 --- a/ggml/src/ggml-opencl/ggml-opencl.cpp +++ b/ggml/src/ggml-opencl/ggml-opencl.cpp @@ -29,6 +29,10 @@ #include #include +namespace ocl_kernel_prototypes { + #include "div.h" +} + #undef MIN #undef MAX #define MIN(a, b) ((a) < (b) ? (a) : (b)) @@ -2861,6 +2865,132 @@ struct ggml_tensor_extra_cl { } }; +namespace /* anonymous */ { + +template struct cl_kernel_arg_setter {}; + +template <> struct cl_kernel_arg_setter { + typedef void func_t(int); + + static size_t set_arg(cl_kernel kernel, size_t index, int arg) { + CL_CHECK(clSetKernelArg(kernel, index, sizeof(arg), &arg)); + return index + 1; + } +}; + +template <> struct cl_kernel_arg_setter { + typedef void func_t(cl_ulong); + + static size_t set_arg(cl_kernel kernel, size_t index, cl_ulong arg) { + CL_CHECK(clSetKernelArg(kernel, index, sizeof(arg), &arg)); + return index + 1; + } +}; + +template <> struct cl_kernel_arg_setter { + typedef void func_t(float); + + static size_t set_arg(cl_kernel kernel, size_t index, float arg) { + CL_CHECK(clSetKernelArg(kernel, index, sizeof(arg), &arg)); + return index + 1; + } +}; + +template <> struct cl_kernel_arg_setter { + typedef void func_t(char *, cl_ulong); + + static size_t set_arg(cl_kernel kernel, size_t index, const ggml_tensor * t) { + ggml_tensor_extra_cl * extra = (ggml_tensor_extra_cl *) t->extra; + static_assert(std::is_same_vdata_device), cl_mem>, "data_device type mismatch"); + + cl_ulong offset = extra->offset + t->view_offs; + CL_CHECK(clSetKernelArg(kernel, index, sizeof(cl_mem), &extra->data_device)); + CL_CHECK(clSetKernelArg(kernel, index + 1, sizeof(cl_ulong), &offset)); + return index + 2; + } +}; + +template <> struct cl_kernel_arg_setter { + typedef void func_t(int, int, int, int); + + static size_t set_arg(cl_kernel kernel, size_t index, const int64_t (&ne)[GGML_MAX_DIMS]) { + static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS changed, update cl_kernel_arg_setter accordingly"); + + const int ne0 = (int) ne[0]; + const int ne1 = (int) ne[1]; + const int ne2 = (int) ne[2]; + const int ne3 = (int) ne[3]; + CL_CHECK(clSetKernelArg(kernel, index, sizeof(int), &ne0)); + CL_CHECK(clSetKernelArg(kernel, index + 1, sizeof(int), &ne1)); + CL_CHECK(clSetKernelArg(kernel, index + 2, sizeof(int), &ne2)); + CL_CHECK(clSetKernelArg(kernel, index + 3, sizeof(int), &ne3)); + return index + 4; + } +}; + +template <> struct cl_kernel_arg_setter { + typedef void func_t(cl_ulong, cl_ulong, cl_ulong, cl_ulong); + + static size_t set_arg(cl_kernel kernel, size_t index, const size_t (&nb)[GGML_MAX_DIMS]) { + static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS changed, update cl_kernel_arg_setter accordingly"); + + const cl_ulong nb0 = nb[0]; + const cl_ulong nb1 = nb[1]; + const cl_ulong nb2 = nb[2]; + const cl_ulong nb3 = nb[3]; + CL_CHECK(clSetKernelArg(kernel, index, sizeof(cl_ulong), &nb0)); + CL_CHECK(clSetKernelArg(kernel, index + 1, sizeof(cl_ulong), &nb1)); + CL_CHECK(clSetKernelArg(kernel, index + 2, sizeof(cl_ulong), &nb2)); + CL_CHECK(clSetKernelArg(kernel, index + 3, sizeof(cl_ulong), &nb3)); + return index + 4; + } +}; + +template static inline size_t cl_set_kernel_args(cl_kernel kernel, _TArgs &&... args) { + size_t index = 0; + ( + [&] { + index = cl_kernel_arg_setter< + std::remove_const_t>>>::set_arg(kernel, index, + args); + }(), + ...); + return index; +} + +template struct cl_func_args_concatenator {}; + +template +struct cl_func_args_concatenator { + using func_t = typename cl_func_args_concatenator::func_t; +}; + +template struct cl_func_args_concatenator { + using func_t = void(_TInnerArgs...); +}; + +template struct cl_kernel_signature_builder { + using args_t = std::remove_const_t>>; + using first_func_t = typename cl_kernel_arg_setter::func_t; + using func_t = + typename cl_func_args_concatenator::func_t>::func_t; +}; + +template struct cl_kernel_signature_builder<_TFinalArg> { + using args_t = std::remove_const_t>>; + using func_t = typename cl_kernel_arg_setter::func_t; +}; + +template +static inline size_t cl_set_kernel_args_safe(cl_kernel kernel, _TArgs &&... args) { + static_assert(std::is_same_v<_TFunc, typename cl_kernel_signature_builder<_TArgs...>::func_t>, + "Kernel argument type mismatch between prototype and called arguments"); + return cl_set_kernel_args(kernel, args...); +} + +} // namespace + // Additional tensor extra structs for quantized tensors. // These tensors are loaded from files and should not be allocated in scratch -- // they should always be allocated from the pool. Hence, they do not have an @@ -5417,41 +5547,13 @@ static void ggml_cl_add(ggml_backend_t backend, const ggml_tensor * src0, const const int ne02 = src0->ne[2]; const int ne03 = src0->ne[3]; - const cl_ulong nb00 = src0->nb[0]; - const cl_ulong nb01 = src0->nb[1]; - const cl_ulong nb02 = src0->nb[2]; - const cl_ulong nb03 = src0->nb[3]; - const int ne10 = src1->ne[0]; const int ne11 = src1->ne[1]; - const int ne12 = src1->ne[2]; - const int ne13 = src1->ne[3]; - - const cl_ulong nb10 = src1->nb[0]; - const cl_ulong nb11 = src1->nb[1]; - const cl_ulong nb12 = src1->nb[2]; - const cl_ulong nb13 = src1->nb[3]; const int ne0 = dst->ne[0]; - const int ne1 = dst->ne[1]; - const int ne2 = dst->ne[2]; - const int ne3 = dst->ne[3]; - - const cl_ulong nb0 = dst->nb[0]; - const cl_ulong nb1 = dst->nb[1]; - const cl_ulong nb2 = dst->nb[2]; - const cl_ulong nb3 = dst->nb[3]; ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; - ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra; - ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra; - ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra; - - cl_ulong offset0 = extra0->offset + src0->view_offs; - cl_ulong offset1 = extra1->offset + src1->view_offs; - cl_ulong offsetd = extrad->offset + dst->view_offs; - cl_kernel kernel; const bool bcast_row = ggml_nelements(src1) == ne10 && ggml_is_contiguous(src1) && ne00 % 4 == 0 && ne10 % 4 == 0; @@ -5466,45 +5568,20 @@ static void ggml_cl_add(ggml_backend_t backend, const ggml_tensor * src0, const if (bcast_row) { kernel = backend_ctx->kernel_add_row; const int ne = ne00 / 4; - CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device)); - CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0)); - CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device)); - CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1)); - CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device)); - CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd)); - CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne)); + cl_set_kernel_args(kernel, src0, src1, dst, ne); } else { kernel = backend_ctx->kernel_add; - CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device)); - CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0)); - CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device)); - CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1)); - CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device)); - CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd)); - CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne00)); - CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne01)); - CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne02)); - CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne03)); - CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb00)); - CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb01)); - CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb02)); - CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong), &nb03)); - CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &ne10)); - CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &ne11)); - CL_CHECK(clSetKernelArg(kernel, 16, sizeof(int), &ne12)); - CL_CHECK(clSetKernelArg(kernel, 17, sizeof(int), &ne13)); - CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_ulong), &nb10)); - CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong), &nb11)); - CL_CHECK(clSetKernelArg(kernel, 20, sizeof(cl_ulong), &nb12)); - CL_CHECK(clSetKernelArg(kernel, 21, sizeof(cl_ulong), &nb13)); - CL_CHECK(clSetKernelArg(kernel, 22, sizeof(int), &ne0)); - CL_CHECK(clSetKernelArg(kernel, 23, sizeof(int), &ne1)); - CL_CHECK(clSetKernelArg(kernel, 24, sizeof(int), &ne2)); - CL_CHECK(clSetKernelArg(kernel, 25, sizeof(int), &ne3)); - CL_CHECK(clSetKernelArg(kernel, 26, sizeof(cl_ulong), &nb0)); - CL_CHECK(clSetKernelArg(kernel, 27, sizeof(cl_ulong), &nb1)); - CL_CHECK(clSetKernelArg(kernel, 28, sizeof(cl_ulong), &nb2)); - CL_CHECK(clSetKernelArg(kernel, 29, sizeof(cl_ulong), &nb3)); + cl_set_kernel_args(kernel, + src0, + src1, + dst, + src0->ne, + src0->nb, + src1->ne, + src1->nb, + dst->ne, + dst->nb + ); } } else if (dst->type == GGML_TYPE_F16) { GGML_ASSERT(src0->type == GGML_TYPE_F16 || src0->type == GGML_TYPE_F32); @@ -5514,49 +5591,22 @@ static void ggml_cl_add(ggml_backend_t backend, const ggml_tensor * src0, const if (bcast_row) { kernel = backend_ctx->kernel_add_row_f16; const int ne = ne00 / 4; - CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device)); - CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0)); - CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device)); - CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1)); - CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device)); - CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd)); - CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne)); - CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &type_src0)); - CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &type_src1)); + cl_set_kernel_args(kernel, src0, src1, dst, ne, type_src0, type_src1); } else { kernel = backend_ctx->kernel_add_f16; - CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device)); - CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0)); - CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device)); - CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1)); - CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device)); - CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd)); - CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne00)); - CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne01)); - CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne02)); - CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne03)); - CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb00)); - CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb01)); - CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb02)); - CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong), &nb03)); - CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &ne10)); - CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &ne11)); - CL_CHECK(clSetKernelArg(kernel, 16, sizeof(int), &ne12)); - CL_CHECK(clSetKernelArg(kernel, 17, sizeof(int), &ne13)); - CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_ulong), &nb10)); - CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong), &nb11)); - CL_CHECK(clSetKernelArg(kernel, 20, sizeof(cl_ulong), &nb12)); - CL_CHECK(clSetKernelArg(kernel, 21, sizeof(cl_ulong), &nb13)); - CL_CHECK(clSetKernelArg(kernel, 22, sizeof(int), &ne0)); - CL_CHECK(clSetKernelArg(kernel, 23, sizeof(int), &ne1)); - CL_CHECK(clSetKernelArg(kernel, 24, sizeof(int), &ne2)); - CL_CHECK(clSetKernelArg(kernel, 25, sizeof(int), &ne3)); - CL_CHECK(clSetKernelArg(kernel, 26, sizeof(cl_ulong), &nb0)); - CL_CHECK(clSetKernelArg(kernel, 27, sizeof(cl_ulong), &nb1)); - CL_CHECK(clSetKernelArg(kernel, 28, sizeof(cl_ulong), &nb2)); - CL_CHECK(clSetKernelArg(kernel, 29, sizeof(cl_ulong), &nb3)); - CL_CHECK(clSetKernelArg(kernel, 30, sizeof(int), &type_src0)); - CL_CHECK(clSetKernelArg(kernel, 31, sizeof(int), &type_src1)); + cl_set_kernel_args(kernel, + src0, + src1, + dst, + src0->ne, + src0->nb, + src1->ne, + src1->nb, + dst->ne, + dst->nb, + type_src0, + type_src1 + ); } } else { GGML_ASSERT(false && "unsupported data types for add"); @@ -5617,32 +5667,9 @@ static void ggml_cl_add_id(ggml_backend_t backend, const ggml_tensor * src0, con ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; - ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra; - ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra; - ggml_tensor_extra_cl * extra2 = (ggml_tensor_extra_cl *)src2->extra; - ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra; - - cl_ulong offset0 = extra0->offset + src0->view_offs; - cl_ulong offset1 = extra1->offset + src1->view_offs; - cl_ulong offset2 = extra2->offset + src2->view_offs; - cl_ulong offsetd = extrad->offset + dst->view_offs; - cl_kernel kernel = backend_ctx->kernel_add_id; - CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device)); - CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0)); - CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device)); - CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1)); - CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extra2->data_device)); - CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offset2)); - CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_mem), &extrad->data_device)); - CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &offsetd)); - CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb01)); - CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb02)); - CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb11)); - CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb21)); - CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne0)); - CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne1)); + cl_set_kernel_args(kernel, src0, src1, src2, dst, nb01, nb02, nb11, nb21, ne0, ne1); int nth = MIN(ne00, (int) backend_ctx->get_kernel_workgroup_size(kernel)); size_t global_work_size[] = { (size_t)ne01*nth, (size_t)ne02, 1 }; @@ -5668,41 +5695,13 @@ static void ggml_cl_mul(ggml_backend_t backend, const ggml_tensor * src0, const const int ne02 = src0->ne[2]; const int ne03 = src0->ne[3]; - const cl_ulong nb00 = src0->nb[0]; - const cl_ulong nb01 = src0->nb[1]; - const cl_ulong nb02 = src0->nb[2]; - const cl_ulong nb03 = src0->nb[3]; - const int ne10 = src1->ne[0]; const int ne11 = src1->ne[1]; - const int ne12 = src1->ne[2]; - const int ne13 = src1->ne[3]; UNUSED(ne13); - - const cl_ulong nb10 = src1->nb[0]; - const cl_ulong nb11 = src1->nb[1]; - const cl_ulong nb12 = src1->nb[2]; - const cl_ulong nb13 = src1->nb[3]; UNUSED(nb13); const int ne0 = dst->ne[0]; - const int ne1 = dst->ne[1]; - const int ne2 = dst->ne[2]; - const int ne3 = dst->ne[3]; - - const cl_ulong nb0 = dst->nb[0]; - const cl_ulong nb1 = dst->nb[1]; - const cl_ulong nb2 = dst->nb[2]; - const cl_ulong nb3 = dst->nb[3]; ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; - ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra; - ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra; - ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra; - - cl_ulong offset0 = extra0->offset + src0->view_offs; - cl_ulong offset1 = extra1->offset + src1->view_offs; - cl_ulong offsetd = extrad->offset + dst->view_offs; - bool bcast_row = false; cl_kernel kernel; @@ -5721,13 +5720,7 @@ static void ggml_cl_mul(ggml_backend_t backend, const ggml_tensor * src0, const kernel = backend_ctx->kernel_mul_row_f16; } - CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device)); - CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0)); - CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device)); - CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1)); - CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device)); - CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd)); - CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne)); + cl_set_kernel_args(kernel, src0, src1, dst, ne); } else { if (src0->type == GGML_TYPE_F32) { kernel = backend_ctx->kernel_mul; @@ -5735,36 +5728,18 @@ static void ggml_cl_mul(ggml_backend_t backend, const ggml_tensor * src0, const kernel = backend_ctx->kernel_mul_f16; } - CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device)); - CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0)); - CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device)); - CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1)); - CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device)); - CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd)); - CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne00)); - CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne01)); - CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne02)); - CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne03)); - CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb00)); - CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb01)); - CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb02)); - CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong), &nb03)); - CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &ne10)); - CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &ne11)); - CL_CHECK(clSetKernelArg(kernel, 16, sizeof(int), &ne12)); - CL_CHECK(clSetKernelArg(kernel, 17, sizeof(int), &ne13)); - CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_ulong), &nb10)); - CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong), &nb11)); - CL_CHECK(clSetKernelArg(kernel, 20, sizeof(cl_ulong), &nb12)); - CL_CHECK(clSetKernelArg(kernel, 21, sizeof(cl_ulong), &nb13)); - CL_CHECK(clSetKernelArg(kernel, 22, sizeof(int), &ne0)); - CL_CHECK(clSetKernelArg(kernel, 23, sizeof(int), &ne1)); - CL_CHECK(clSetKernelArg(kernel, 24, sizeof(int), &ne2)); - CL_CHECK(clSetKernelArg(kernel, 25, sizeof(int), &ne3)); - CL_CHECK(clSetKernelArg(kernel, 26, sizeof(cl_ulong), &nb0)); - CL_CHECK(clSetKernelArg(kernel, 27, sizeof(cl_ulong), &nb1)); - CL_CHECK(clSetKernelArg(kernel, 28, sizeof(cl_ulong), &nb2)); - CL_CHECK(clSetKernelArg(kernel, 29, sizeof(cl_ulong), &nb3)); + cl_set_kernel_args( + kernel, + src0, + src1, + dst, + src0->ne, + src0->nb, + src1->ne, + src1->nb, + dst->ne, + dst->nb + ); } if (bcast_row) { @@ -5804,38 +5779,13 @@ static void ggml_cl_div(ggml_backend_t backend, const ggml_tensor * src0, const const int ne02 = src0->ne[2]; const int ne03 = src0->ne[3]; - const cl_ulong nb00 = src0->nb[0]; - const cl_ulong nb01 = src0->nb[1]; - const cl_ulong nb02 = src0->nb[2]; - const cl_ulong nb03 = src0->nb[3]; - const int ne10 = src1->ne[0]; const int ne11 = src1->ne[1]; - const int ne12 = src1->ne[2]; - const int ne13 = src1->ne[3]; - - const cl_ulong nb10 = src1->nb[0]; - const cl_ulong nb11 = src1->nb[1]; - const cl_ulong nb12 = src1->nb[2]; - const cl_ulong nb13 = src1->nb[3]; const int ne0 = dst->ne[0]; - const cl_ulong nb0 = dst->nb[0]; - const cl_ulong nb1 = dst->nb[1]; - const cl_ulong nb2 = dst->nb[2]; - const cl_ulong nb3 = dst->nb[3]; - ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; - ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra; - ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra; - ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra; - - cl_ulong offset0 = extra0->offset + src0->view_offs; - cl_ulong offset1 = extra1->offset + src1->view_offs; - cl_ulong offsetd = extrad->offset + dst->view_offs; - bool bcast_row = false; cl_kernel kernel; @@ -5854,43 +5804,35 @@ static void ggml_cl_div(ggml_backend_t backend, const ggml_tensor * src0, const kernel = backend_ctx->kernel_div_row_f16; } - CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device)); - CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0)); - CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device)); - CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1)); - CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device)); - CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd)); - CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne)); + cl_set_kernel_args(kernel, src0, src1, dst, ne); } else { if (src0->type == GGML_TYPE_F32) { kernel = backend_ctx->kernel_div; + cl_set_kernel_args_safe( + kernel, + src0, + src1, + dst, + src0->nb, + src1->ne, + src1->nb, + ne0, + dst->nb + ); } else { kernel = backend_ctx->kernel_div_f16; + cl_set_kernel_args( + kernel, + src0, + src1, + dst, + src0->nb, + src1->ne, + src1->nb, + ne0, + dst->nb + ); } - - CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device)); - CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0)); - CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device)); - CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1)); - CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device)); - CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd)); - CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_ulong), &nb00)); - CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &nb01)); - CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb02)); - CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb03)); - CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne10)); - CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne11)); - CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne12)); - CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne13)); - CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_ulong), &nb10)); - CL_CHECK(clSetKernelArg(kernel, 15, sizeof(cl_ulong), &nb11)); - CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &nb12)); - CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong), &nb13)); - CL_CHECK(clSetKernelArg(kernel, 18, sizeof(int), &ne0)); - CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong), &nb0)); - CL_CHECK(clSetKernelArg(kernel, 20, sizeof(cl_ulong), &nb1)); - CL_CHECK(clSetKernelArg(kernel, 21, sizeof(cl_ulong), &nb2)); - CL_CHECK(clSetKernelArg(kernel, 22, sizeof(cl_ulong), &nb3)); } if (bcast_row) { @@ -7721,25 +7663,13 @@ static void ggml_cl_timestep_embedding(ggml_backend_t backend, const ggml_tensor return; } - ggml_tensor_extra_cl * extra_src0 = (ggml_tensor_extra_cl *)src0->extra; - ggml_tensor_extra_cl * extra_dst = (ggml_tensor_extra_cl *)dst->extra; - - cl_ulong off_src0 = extra_src0->offset + src0->view_offs; - cl_ulong off_dst = extra_dst->offset + dst->view_offs; - const int logical_dim = dst->op_params[0]; const int max_period = dst->op_params[1]; const int dst_nb1_bytes = dst->nb[1]; cl_kernel kernel = backend_ctx->kernel_timestep_embedding; - CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra_src0->data_device)); - CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &off_src0)); - CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra_dst->data_device)); - CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &off_dst)); - CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &dst_nb1_bytes)); - CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &logical_dim)); - CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &max_period)); + cl_set_kernel_args(kernel, src0, dst, dst_nb1_bytes, logical_dim, max_period); size_t gws0 = (size_t)(((logical_dim + 1) / 2) + 1); @@ -9907,20 +9837,9 @@ static void ggml_cl_scale(ggml_backend_t backend, const ggml_tensor * src0, cons memcpy(&scale, ((int32_t *) dst->op_params) + 0, sizeof(float)); memcpy(&bias, ((int32_t *) dst->op_params) + 1, sizeof(float)); - ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra; - ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra; - - cl_ulong offset0 = extra0->offset + src0->view_offs; - cl_ulong offsetd = extrad->offset + dst->view_offs; - cl_kernel kernel = backend_ctx->kernel_scale; - CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device)); - CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0)); - CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device)); - CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd)); - CL_CHECK(clSetKernelArg(kernel, 4, sizeof(float), &scale)); - CL_CHECK(clSetKernelArg(kernel, 5, sizeof(float), &bias)); + cl_set_kernel_args(kernel, src0, dst, scale, bias); int n = ggml_nelements(dst)/4; @@ -10057,24 +9976,12 @@ static void ggml_cl_diag_mask_inf(ggml_backend_t backend, const ggml_tensor * sr ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; - ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra; - ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra; - - cl_ulong offset0 = extra0->offset + src0->view_offs; - cl_ulong offsetd = extrad->offset + dst->view_offs; - cl_kernel kernel; if (ne00%8 == 0) { kernel = backend_ctx->kernel_diag_mask_inf_8; - CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device)); - CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0)); - CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device)); - CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd)); - CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne00)); - CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &ne01)); - CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &n_past)); + cl_set_kernel_args(kernel, src0, dst, ne00, ne01, n_past); size_t global_work_size[] = {(size_t)ne00*ne01*ne02/8, 1, 1}; size_t local_work_size[] = {64, 1, 1}; @@ -10083,13 +9990,7 @@ static void ggml_cl_diag_mask_inf(ggml_backend_t backend, const ggml_tensor * sr } else { kernel = backend_ctx->kernel_diag_mask_inf; - CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device)); - CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0)); - CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device)); - CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd)); - CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne00)); - CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &ne01)); - CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &n_past)); + cl_set_kernel_args(kernel, src0, dst, ne00, ne01, n_past); size_t global_work_size[] = {(size_t)ne00, (size_t)ne01, (size_t)ne02}; size_t local_work_size[] = {64, 1, 1}; diff --git a/ggml/src/ggml-opencl/kernels/div.cl b/ggml/src/ggml-opencl/kernels/div.cl index 6d9b4ade9f..eb1d2da002 100644 --- a/ggml/src/ggml-opencl/kernels/div.cl +++ b/ggml/src/ggml-opencl/kernels/div.cl @@ -1,5 +1,7 @@ #pragma OPENCL EXTENSION cl_khr_fp16 : enable +#include "div.h" + //------------------------------------------------------------------------------ // div //------------------------------------------------------------------------------ diff --git a/ggml/src/ggml-opencl/kernels/div.h b/ggml/src/ggml-opencl/kernels/div.h new file mode 100644 index 0000000000..fd65b5706f --- /dev/null +++ b/ggml/src/ggml-opencl/kernels/div.h @@ -0,0 +1,31 @@ + +#ifndef __KERNELS_DIV_H__ +#define __KERNELS_DIV_H__ + +#include "ocl_defs.h" + +OCL_KERNEL void kernel_div(OCL_GLOBAL char * src0, + ulong offset0, + OCL_GLOBAL char * src1, + ulong offset1, + OCL_GLOBAL char * dst, + ulong offsetd, + ulong nb00, + ulong nb01, + ulong nb02, + ulong nb03, + int ne10, + int ne11, + int ne12, + int ne13, + ulong nb10, + ulong nb11, + ulong nb12, + ulong nb13, + int ne0, + ulong nb0, + ulong nb1, + ulong nb2, + ulong nb3); + +#endif // __KERNELS_DIV_H__ diff --git a/ggml/src/ggml-opencl/kernels/embed_kernel.py b/ggml/src/ggml-opencl/kernels/embed_kernel.py index b5d1d7242b..867c0d4961 100644 --- a/ggml/src/ggml-opencl/kernels/embed_kernel.py +++ b/ggml/src/ggml-opencl/kernels/embed_kernel.py @@ -2,8 +2,24 @@ import sys import logging +import re +import os + logger = logging.getLogger("opencl-embed-kernel") +INCLUDE_PATTERN = re.compile(r'#include\s+"(.*)".*') + + +def parse_file_line(ifile, ofile, base_path: str): + for i in ifile: + if m := INCLUDE_PATTERN.match(i): + include_file = os.path.join(base_path, m.group(1)) + logger.info(f"Embedding file: {include_file}") + with open(include_file, "r") as incf: + parse_file_line(incf, ofile, base_path) + else: + ofile.write('R"({})"\n'.format(i)) + def main(): logging.basicConfig(level=logging.INFO) @@ -12,14 +28,9 @@ def main(): logger.info("Usage: python embed_kernel.py ") sys.exit(1) - ifile = open(sys.argv[1], "r") - ofile = open(sys.argv[2], "w") - - for i in ifile: - ofile.write('R"({})"\n'.format(i)) - - ifile.close() - ofile.close() + ipath = os.path.dirname(sys.argv[1]) + with open(sys.argv[1], "r") as ifile, open(sys.argv[2], "w") as ofile: + parse_file_line(ifile, ofile, ipath) if __name__ == "__main__": diff --git a/ggml/src/ggml-opencl/kernels/ocl_defs.h b/ggml/src/ggml-opencl/kernels/ocl_defs.h new file mode 100644 index 0000000000..edf32ad0a7 --- /dev/null +++ b/ggml/src/ggml-opencl/kernels/ocl_defs.h @@ -0,0 +1,18 @@ + +#ifndef __OCL_DEFS_H__ +#define __OCL_DEFS_H__ + +#ifdef __OPENCL_C_VERSION__ +// Device (OpenCL) Definitions +# define OCL_KERNEL kernel +# define OCL_GLOBAL global +#else +// Host (C++) Definitions +# define OCL_KERNEL +# define OCL_GLOBAL +# define __kernel +# define __global +# define ulong cl_ulong +#endif + +#endif // __OCL_DEFS_H__