This commit is contained in:
nullname 2026-02-01 20:26:43 +08:00 committed by GitHub
commit d0285c5940
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
6 changed files with 269 additions and 304 deletions

View File

@ -22,6 +22,8 @@ if (GGML_OPENCL_USE_ADRENO_KERNELS)
add_compile_definitions(GGML_OPENCL_USE_ADRENO_KERNELS)
endif ()
target_include_directories(${TARGET_NAME} PRIVATE "${CMAKE_CURRENT_LIST_DIR}/kernels")
if (GGML_OPENCL_EMBED_KERNELS)
add_compile_definitions(GGML_OPENCL_EMBED_KERNELS)

View File

@ -29,6 +29,10 @@
#include <charconv>
#include <mutex>
namespace ocl_kernel_prototypes {
#include "div.h"
}
#undef MIN
#undef MAX
#define MIN(a, b) ((a) < (b) ? (a) : (b))
@ -2861,6 +2865,132 @@ struct ggml_tensor_extra_cl {
}
};
namespace /* anonymous */ {

// Type-driven helpers for setting OpenCL kernel arguments from host values.
//
// Each cl_kernel_arg_setter<T> specialization knows how many consecutive
// kernel-argument slots a value of type T occupies and exposes:
//   * func_t  — the signature fragment this value contributes on the device
//               side; cl_kernel_signature_builder concatenates these so the
//               host call can be checked against the real kernel prototype.
//   * set_arg — sets the slot(s) starting at `index` and returns the index
//               of the next free slot.
//
// NOTE: template parameter names deliberately avoid the `_T...` spelling —
// identifiers starting with an underscore followed by an uppercase letter
// are reserved to the implementation in C++ ([lex.name]).
template <typename TData> struct cl_kernel_arg_setter {};

// A plain int occupies one slot.
template <> struct cl_kernel_arg_setter<int> {
    typedef void func_t(int);

    static size_t set_arg(cl_kernel kernel, size_t index, int arg) {
        CL_CHECK(clSetKernelArg(kernel, index, sizeof(arg), &arg));
        return index + 1;
    }
};

// A cl_ulong (typically a byte offset or stride) occupies one slot.
template <> struct cl_kernel_arg_setter<cl_ulong> {
    typedef void func_t(cl_ulong);

    static size_t set_arg(cl_kernel kernel, size_t index, cl_ulong arg) {
        CL_CHECK(clSetKernelArg(kernel, index, sizeof(arg), &arg));
        return index + 1;
    }
};

// A float occupies one slot.
template <> struct cl_kernel_arg_setter<float> {
    typedef void func_t(float);

    static size_t set_arg(cl_kernel kernel, size_t index, float arg) {
        CL_CHECK(clSetKernelArg(kernel, index, sizeof(arg), &arg));
        return index + 1;
    }
};

// A tensor expands to two slots: its device buffer (cl_mem) followed by the
// byte offset into that buffer (base offset plus the view offset).
template <> struct cl_kernel_arg_setter<ggml_tensor> {
    typedef void func_t(char *, cl_ulong);

    static size_t set_arg(cl_kernel kernel, size_t index, const ggml_tensor * t) {
        ggml_tensor_extra_cl * extra = (ggml_tensor_extra_cl *) t->extra;
        static_assert(std::is_same_v<decltype(extra->data_device), cl_mem>, "data_device type mismatch");

        cl_ulong offset = extra->offset + t->view_offs;

        CL_CHECK(clSetKernelArg(kernel, index, sizeof(cl_mem), &extra->data_device));
        CL_CHECK(clSetKernelArg(kernel, index + 1, sizeof(cl_ulong), &offset));
        return index + 2;
    }
};

// A tensor's `ne` array expands to four int slots (one per dimension),
// narrowing each int64_t extent to int as the kernels expect.
template <> struct cl_kernel_arg_setter<int64_t[GGML_MAX_DIMS]> {
    typedef void func_t(int, int, int, int);

    static size_t set_arg(cl_kernel kernel, size_t index, const int64_t (&ne)[GGML_MAX_DIMS]) {
        static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS changed, update cl_kernel_arg_setter accordingly");

        const int ne0 = (int) ne[0];
        const int ne1 = (int) ne[1];
        const int ne2 = (int) ne[2];
        const int ne3 = (int) ne[3];

        CL_CHECK(clSetKernelArg(kernel, index, sizeof(int), &ne0));
        CL_CHECK(clSetKernelArg(kernel, index + 1, sizeof(int), &ne1));
        CL_CHECK(clSetKernelArg(kernel, index + 2, sizeof(int), &ne2));
        CL_CHECK(clSetKernelArg(kernel, index + 3, sizeof(int), &ne3));
        return index + 4;
    }
};

// A tensor's `nb` array expands to four cl_ulong slots (byte strides).
template <> struct cl_kernel_arg_setter<size_t[GGML_MAX_DIMS]> {
    typedef void func_t(cl_ulong, cl_ulong, cl_ulong, cl_ulong);

    static size_t set_arg(cl_kernel kernel, size_t index, const size_t (&nb)[GGML_MAX_DIMS]) {
        static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS changed, update cl_kernel_arg_setter accordingly");

        const cl_ulong nb0 = nb[0];
        const cl_ulong nb1 = nb[1];
        const cl_ulong nb2 = nb[2];
        const cl_ulong nb3 = nb[3];

        CL_CHECK(clSetKernelArg(kernel, index, sizeof(cl_ulong), &nb0));
        CL_CHECK(clSetKernelArg(kernel, index + 1, sizeof(cl_ulong), &nb1));
        CL_CHECK(clSetKernelArg(kernel, index + 2, sizeof(cl_ulong), &nb2));
        CL_CHECK(clSetKernelArg(kernel, index + 3, sizeof(cl_ulong), &nb3));
        return index + 4;
    }
};

// Sets all kernel arguments in order and returns the total number of slots
// consumed.  The fold over the comma operator guarantees left-to-right
// evaluation, so `index` advances by each setter's slot count in turn.
template <typename... TArgs> static inline size_t cl_set_kernel_args(cl_kernel kernel, TArgs &&... args) {
    size_t index = 0;
    ((index = cl_kernel_arg_setter<
          std::remove_const_t<std::remove_pointer_t<std::remove_reference_t<TArgs>>>>::set_arg(kernel, index,
                                                                                               args)),
     ...);
    return index;
}

// Concatenates a list of void(...) function types into a single void(...)
// type whose parameter list is the concatenation of all input lists.
template <typename... TArgs> struct cl_func_args_concatenator {};

template <typename... TArgs, typename... TInnerArgs1, typename... TInnerArgs2>
struct cl_func_args_concatenator<void(TInnerArgs1...), void(TInnerArgs2...), TArgs...> {
    using func_t = typename cl_func_args_concatenator<void(TInnerArgs1..., TInnerArgs2...), TArgs...>::func_t;
};

template <typename... TInnerArgs> struct cl_func_args_concatenator<void(TInnerArgs...)> {
    using func_t = void(TInnerArgs...);
};

// Builds the device-side kernel signature that a given host argument list
// would produce, by concatenating each argument's func_t fragment.
template <typename TFirstArg, typename... TRestArgs> struct cl_kernel_signature_builder {
    using args_t       = std::remove_const_t<std::remove_pointer_t<std::remove_reference_t<TFirstArg>>>;
    using first_func_t = typename cl_kernel_arg_setter<args_t>::func_t;
    using func_t =
        typename cl_func_args_concatenator<first_func_t,
                                           typename cl_kernel_signature_builder<TRestArgs...>::func_t>::func_t;
};

template <typename TFinalArg> struct cl_kernel_signature_builder<TFinalArg> {
    using args_t = std::remove_const_t<std::remove_pointer_t<std::remove_reference_t<TFinalArg>>>;
    using func_t = typename cl_kernel_arg_setter<args_t>::func_t;
};

// Like cl_set_kernel_args, but additionally checks at compile time that the
// argument list matches the kernel prototype TFunc (as declared in the
// shared host/device kernel headers).
template <typename TFunc, typename... TArgs>
static inline size_t cl_set_kernel_args_safe(cl_kernel kernel, TArgs &&... args) {
    static_assert(std::is_same_v<TFunc, typename cl_kernel_signature_builder<TArgs...>::func_t>,
                  "Kernel argument type mismatch between prototype and called arguments");
    return cl_set_kernel_args(kernel, args...);
}

}  // namespace
// Additional tensor extra structs for quantized tensors.
// These tensors are loaded from files and should not be allocated in scratch --
// they should always be allocated from the pool. Hence, they do not have an
@ -5417,41 +5547,13 @@ static void ggml_cl_add(ggml_backend_t backend, const ggml_tensor * src0, const
const int ne02 = src0->ne[2];
const int ne03 = src0->ne[3];
const cl_ulong nb00 = src0->nb[0];
const cl_ulong nb01 = src0->nb[1];
const cl_ulong nb02 = src0->nb[2];
const cl_ulong nb03 = src0->nb[3];
const int ne10 = src1->ne[0];
const int ne11 = src1->ne[1];
const int ne12 = src1->ne[2];
const int ne13 = src1->ne[3];
const cl_ulong nb10 = src1->nb[0];
const cl_ulong nb11 = src1->nb[1];
const cl_ulong nb12 = src1->nb[2];
const cl_ulong nb13 = src1->nb[3];
const int ne0 = dst->ne[0];
const int ne1 = dst->ne[1];
const int ne2 = dst->ne[2];
const int ne3 = dst->ne[3];
const cl_ulong nb0 = dst->nb[0];
const cl_ulong nb1 = dst->nb[1];
const cl_ulong nb2 = dst->nb[2];
const cl_ulong nb3 = dst->nb[3];
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
cl_ulong offset0 = extra0->offset + src0->view_offs;
cl_ulong offset1 = extra1->offset + src1->view_offs;
cl_ulong offsetd = extrad->offset + dst->view_offs;
cl_kernel kernel;
const bool bcast_row = ggml_nelements(src1) == ne10 && ggml_is_contiguous(src1) && ne00 % 4 == 0 && ne10 % 4 == 0;
@ -5466,45 +5568,20 @@ static void ggml_cl_add(ggml_backend_t backend, const ggml_tensor * src0, const
if (bcast_row) {
kernel = backend_ctx->kernel_add_row;
const int ne = ne00 / 4;
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne));
cl_set_kernel_args(kernel, src0, src1, dst, ne);
} else {
kernel = backend_ctx->kernel_add;
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne00));
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne01));
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne02));
CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne03));
CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb00));
CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb01));
CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb02));
CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong), &nb03));
CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &ne10));
CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &ne11));
CL_CHECK(clSetKernelArg(kernel, 16, sizeof(int), &ne12));
CL_CHECK(clSetKernelArg(kernel, 17, sizeof(int), &ne13));
CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_ulong), &nb10));
CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong), &nb11));
CL_CHECK(clSetKernelArg(kernel, 20, sizeof(cl_ulong), &nb12));
CL_CHECK(clSetKernelArg(kernel, 21, sizeof(cl_ulong), &nb13));
CL_CHECK(clSetKernelArg(kernel, 22, sizeof(int), &ne0));
CL_CHECK(clSetKernelArg(kernel, 23, sizeof(int), &ne1));
CL_CHECK(clSetKernelArg(kernel, 24, sizeof(int), &ne2));
CL_CHECK(clSetKernelArg(kernel, 25, sizeof(int), &ne3));
CL_CHECK(clSetKernelArg(kernel, 26, sizeof(cl_ulong), &nb0));
CL_CHECK(clSetKernelArg(kernel, 27, sizeof(cl_ulong), &nb1));
CL_CHECK(clSetKernelArg(kernel, 28, sizeof(cl_ulong), &nb2));
CL_CHECK(clSetKernelArg(kernel, 29, sizeof(cl_ulong), &nb3));
cl_set_kernel_args(kernel,
src0,
src1,
dst,
src0->ne,
src0->nb,
src1->ne,
src1->nb,
dst->ne,
dst->nb
);
}
} else if (dst->type == GGML_TYPE_F16) {
GGML_ASSERT(src0->type == GGML_TYPE_F16 || src0->type == GGML_TYPE_F32);
@ -5514,49 +5591,22 @@ static void ggml_cl_add(ggml_backend_t backend, const ggml_tensor * src0, const
if (bcast_row) {
kernel = backend_ctx->kernel_add_row_f16;
const int ne = ne00 / 4;
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne));
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &type_src0));
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &type_src1));
cl_set_kernel_args(kernel, src0, src1, dst, ne, type_src0, type_src1);
} else {
kernel = backend_ctx->kernel_add_f16;
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne00));
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne01));
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne02));
CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne03));
CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb00));
CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb01));
CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb02));
CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong), &nb03));
CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &ne10));
CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &ne11));
CL_CHECK(clSetKernelArg(kernel, 16, sizeof(int), &ne12));
CL_CHECK(clSetKernelArg(kernel, 17, sizeof(int), &ne13));
CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_ulong), &nb10));
CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong), &nb11));
CL_CHECK(clSetKernelArg(kernel, 20, sizeof(cl_ulong), &nb12));
CL_CHECK(clSetKernelArg(kernel, 21, sizeof(cl_ulong), &nb13));
CL_CHECK(clSetKernelArg(kernel, 22, sizeof(int), &ne0));
CL_CHECK(clSetKernelArg(kernel, 23, sizeof(int), &ne1));
CL_CHECK(clSetKernelArg(kernel, 24, sizeof(int), &ne2));
CL_CHECK(clSetKernelArg(kernel, 25, sizeof(int), &ne3));
CL_CHECK(clSetKernelArg(kernel, 26, sizeof(cl_ulong), &nb0));
CL_CHECK(clSetKernelArg(kernel, 27, sizeof(cl_ulong), &nb1));
CL_CHECK(clSetKernelArg(kernel, 28, sizeof(cl_ulong), &nb2));
CL_CHECK(clSetKernelArg(kernel, 29, sizeof(cl_ulong), &nb3));
CL_CHECK(clSetKernelArg(kernel, 30, sizeof(int), &type_src0));
CL_CHECK(clSetKernelArg(kernel, 31, sizeof(int), &type_src1));
cl_set_kernel_args(kernel,
src0,
src1,
dst,
src0->ne,
src0->nb,
src1->ne,
src1->nb,
dst->ne,
dst->nb,
type_src0,
type_src1
);
}
} else {
GGML_ASSERT(false && "unsupported data types for add");
@ -5617,32 +5667,9 @@ static void ggml_cl_add_id(ggml_backend_t backend, const ggml_tensor * src0, con
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
ggml_tensor_extra_cl * extra2 = (ggml_tensor_extra_cl *)src2->extra;
ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
cl_ulong offset0 = extra0->offset + src0->view_offs;
cl_ulong offset1 = extra1->offset + src1->view_offs;
cl_ulong offset2 = extra2->offset + src2->view_offs;
cl_ulong offsetd = extrad->offset + dst->view_offs;
cl_kernel kernel = backend_ctx->kernel_add_id;
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extra2->data_device));
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offset2));
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_mem), &extrad->data_device));
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &offsetd));
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb01));
CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb02));
CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb11));
CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb21));
CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne0));
CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne1));
cl_set_kernel_args(kernel, src0, src1, src2, dst, nb01, nb02, nb11, nb21, ne0, ne1);
int nth = MIN(ne00, (int) backend_ctx->get_kernel_workgroup_size(kernel));
size_t global_work_size[] = { (size_t)ne01*nth, (size_t)ne02, 1 };
@ -5668,41 +5695,13 @@ static void ggml_cl_mul(ggml_backend_t backend, const ggml_tensor * src0, const
const int ne02 = src0->ne[2];
const int ne03 = src0->ne[3];
const cl_ulong nb00 = src0->nb[0];
const cl_ulong nb01 = src0->nb[1];
const cl_ulong nb02 = src0->nb[2];
const cl_ulong nb03 = src0->nb[3];
const int ne10 = src1->ne[0];
const int ne11 = src1->ne[1];
const int ne12 = src1->ne[2];
const int ne13 = src1->ne[3]; UNUSED(ne13);
const cl_ulong nb10 = src1->nb[0];
const cl_ulong nb11 = src1->nb[1];
const cl_ulong nb12 = src1->nb[2];
const cl_ulong nb13 = src1->nb[3]; UNUSED(nb13);
const int ne0 = dst->ne[0];
const int ne1 = dst->ne[1];
const int ne2 = dst->ne[2];
const int ne3 = dst->ne[3];
const cl_ulong nb0 = dst->nb[0];
const cl_ulong nb1 = dst->nb[1];
const cl_ulong nb2 = dst->nb[2];
const cl_ulong nb3 = dst->nb[3];
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
cl_ulong offset0 = extra0->offset + src0->view_offs;
cl_ulong offset1 = extra1->offset + src1->view_offs;
cl_ulong offsetd = extrad->offset + dst->view_offs;
bool bcast_row = false;
cl_kernel kernel;
@ -5721,13 +5720,7 @@ static void ggml_cl_mul(ggml_backend_t backend, const ggml_tensor * src0, const
kernel = backend_ctx->kernel_mul_row_f16;
}
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne));
cl_set_kernel_args(kernel, src0, src1, dst, ne);
} else {
if (src0->type == GGML_TYPE_F32) {
kernel = backend_ctx->kernel_mul;
@ -5735,36 +5728,18 @@ static void ggml_cl_mul(ggml_backend_t backend, const ggml_tensor * src0, const
kernel = backend_ctx->kernel_mul_f16;
}
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne00));
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne01));
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne02));
CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne03));
CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb00));
CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb01));
CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb02));
CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong), &nb03));
CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &ne10));
CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &ne11));
CL_CHECK(clSetKernelArg(kernel, 16, sizeof(int), &ne12));
CL_CHECK(clSetKernelArg(kernel, 17, sizeof(int), &ne13));
CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_ulong), &nb10));
CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong), &nb11));
CL_CHECK(clSetKernelArg(kernel, 20, sizeof(cl_ulong), &nb12));
CL_CHECK(clSetKernelArg(kernel, 21, sizeof(cl_ulong), &nb13));
CL_CHECK(clSetKernelArg(kernel, 22, sizeof(int), &ne0));
CL_CHECK(clSetKernelArg(kernel, 23, sizeof(int), &ne1));
CL_CHECK(clSetKernelArg(kernel, 24, sizeof(int), &ne2));
CL_CHECK(clSetKernelArg(kernel, 25, sizeof(int), &ne3));
CL_CHECK(clSetKernelArg(kernel, 26, sizeof(cl_ulong), &nb0));
CL_CHECK(clSetKernelArg(kernel, 27, sizeof(cl_ulong), &nb1));
CL_CHECK(clSetKernelArg(kernel, 28, sizeof(cl_ulong), &nb2));
CL_CHECK(clSetKernelArg(kernel, 29, sizeof(cl_ulong), &nb3));
cl_set_kernel_args(
kernel,
src0,
src1,
dst,
src0->ne,
src0->nb,
src1->ne,
src1->nb,
dst->ne,
dst->nb
);
}
if (bcast_row) {
@ -5804,38 +5779,13 @@ static void ggml_cl_div(ggml_backend_t backend, const ggml_tensor * src0, const
const int ne02 = src0->ne[2];
const int ne03 = src0->ne[3];
const cl_ulong nb00 = src0->nb[0];
const cl_ulong nb01 = src0->nb[1];
const cl_ulong nb02 = src0->nb[2];
const cl_ulong nb03 = src0->nb[3];
const int ne10 = src1->ne[0];
const int ne11 = src1->ne[1];
const int ne12 = src1->ne[2];
const int ne13 = src1->ne[3];
const cl_ulong nb10 = src1->nb[0];
const cl_ulong nb11 = src1->nb[1];
const cl_ulong nb12 = src1->nb[2];
const cl_ulong nb13 = src1->nb[3];
const int ne0 = dst->ne[0];
const cl_ulong nb0 = dst->nb[0];
const cl_ulong nb1 = dst->nb[1];
const cl_ulong nb2 = dst->nb[2];
const cl_ulong nb3 = dst->nb[3];
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
cl_ulong offset0 = extra0->offset + src0->view_offs;
cl_ulong offset1 = extra1->offset + src1->view_offs;
cl_ulong offsetd = extrad->offset + dst->view_offs;
bool bcast_row = false;
cl_kernel kernel;
@ -5854,43 +5804,35 @@ static void ggml_cl_div(ggml_backend_t backend, const ggml_tensor * src0, const
kernel = backend_ctx->kernel_div_row_f16;
}
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne));
cl_set_kernel_args(kernel, src0, src1, dst, ne);
} else {
if (src0->type == GGML_TYPE_F32) {
kernel = backend_ctx->kernel_div;
cl_set_kernel_args_safe<decltype(ocl_kernel_prototypes::kernel_div)>(
kernel,
src0,
src1,
dst,
src0->nb,
src1->ne,
src1->nb,
ne0,
dst->nb
);
} else {
kernel = backend_ctx->kernel_div_f16;
cl_set_kernel_args(
kernel,
src0,
src1,
dst,
src0->nb,
src1->ne,
src1->nb,
ne0,
dst->nb
);
}
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_ulong), &nb00));
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &nb01));
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb02));
CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb03));
CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne10));
CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne11));
CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne12));
CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne13));
CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_ulong), &nb10));
CL_CHECK(clSetKernelArg(kernel, 15, sizeof(cl_ulong), &nb11));
CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &nb12));
CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong), &nb13));
CL_CHECK(clSetKernelArg(kernel, 18, sizeof(int), &ne0));
CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong), &nb0));
CL_CHECK(clSetKernelArg(kernel, 20, sizeof(cl_ulong), &nb1));
CL_CHECK(clSetKernelArg(kernel, 21, sizeof(cl_ulong), &nb2));
CL_CHECK(clSetKernelArg(kernel, 22, sizeof(cl_ulong), &nb3));
}
if (bcast_row) {
@ -7721,25 +7663,13 @@ static void ggml_cl_timestep_embedding(ggml_backend_t backend, const ggml_tensor
return;
}
ggml_tensor_extra_cl * extra_src0 = (ggml_tensor_extra_cl *)src0->extra;
ggml_tensor_extra_cl * extra_dst = (ggml_tensor_extra_cl *)dst->extra;
cl_ulong off_src0 = extra_src0->offset + src0->view_offs;
cl_ulong off_dst = extra_dst->offset + dst->view_offs;
const int logical_dim = dst->op_params[0];
const int max_period = dst->op_params[1];
const int dst_nb1_bytes = dst->nb[1];
cl_kernel kernel = backend_ctx->kernel_timestep_embedding;
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra_src0->data_device));
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &off_src0));
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra_dst->data_device));
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &off_dst));
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &dst_nb1_bytes));
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &logical_dim));
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &max_period));
cl_set_kernel_args(kernel, src0, dst, dst_nb1_bytes, logical_dim, max_period);
size_t gws0 = (size_t)(((logical_dim + 1) / 2) + 1);
@ -9907,20 +9837,9 @@ static void ggml_cl_scale(ggml_backend_t backend, const ggml_tensor * src0, cons
memcpy(&scale, ((int32_t *) dst->op_params) + 0, sizeof(float));
memcpy(&bias, ((int32_t *) dst->op_params) + 1, sizeof(float));
ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
cl_ulong offset0 = extra0->offset + src0->view_offs;
cl_ulong offsetd = extrad->offset + dst->view_offs;
cl_kernel kernel = backend_ctx->kernel_scale;
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(float), &scale));
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(float), &bias));
cl_set_kernel_args(kernel, src0, dst, scale, bias);
int n = ggml_nelements(dst)/4;
@ -10057,24 +9976,12 @@ static void ggml_cl_diag_mask_inf(ggml_backend_t backend, const ggml_tensor * sr
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
cl_ulong offset0 = extra0->offset + src0->view_offs;
cl_ulong offsetd = extrad->offset + dst->view_offs;
cl_kernel kernel;
if (ne00%8 == 0) {
kernel = backend_ctx->kernel_diag_mask_inf_8;
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne00));
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &ne01));
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &n_past));
cl_set_kernel_args(kernel, src0, dst, ne00, ne01, n_past);
size_t global_work_size[] = {(size_t)ne00*ne01*ne02/8, 1, 1};
size_t local_work_size[] = {64, 1, 1};
@ -10083,13 +9990,7 @@ static void ggml_cl_diag_mask_inf(ggml_backend_t backend, const ggml_tensor * sr
} else {
kernel = backend_ctx->kernel_diag_mask_inf;
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne00));
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &ne01));
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &n_past));
cl_set_kernel_args(kernel, src0, dst, ne00, ne01, n_past);
size_t global_work_size[] = {(size_t)ne00, (size_t)ne01, (size_t)ne02};
size_t local_work_size[] = {64, 1, 1};

View File

@ -1,5 +1,7 @@
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#include "div.h"
//------------------------------------------------------------------------------
// div
//------------------------------------------------------------------------------

View File

@ -0,0 +1,31 @@
// Shared host/device prototype for the `div` OpenCL kernel.
//
// Included both by the OpenCL kernel source and by the C++ host code (there
// wrapped in namespace ocl_kernel_prototypes) so the host can statically
// check the argument list it passes against the real kernel signature.
// ocl_defs.h maps OCL_KERNEL / OCL_GLOBAL / ulong to the right spelling for
// each side.
//
// NOTE: the include guard avoids double-underscore names such as
// __KERNELS_DIV_H__ — identifiers containing "__" are reserved to the
// implementation in C and C++.
#ifndef GGML_OPENCL_KERNELS_DIV_H
#define GGML_OPENCL_KERNELS_DIV_H

#include "ocl_defs.h"

OCL_KERNEL void kernel_div(OCL_GLOBAL char * src0,
                           ulong offset0,
                           OCL_GLOBAL char * src1,
                           ulong offset1,
                           OCL_GLOBAL char * dst,
                           ulong offsetd,
                           ulong nb00,
                           ulong nb01,
                           ulong nb02,
                           ulong nb03,
                           int ne10,
                           int ne11,
                           int ne12,
                           int ne13,
                           ulong nb10,
                           ulong nb11,
                           ulong nb12,
                           ulong nb13,
                           int ne0,
                           ulong nb0,
                           ulong nb1,
                           ulong nb2,
                           ulong nb3);

#endif // GGML_OPENCL_KERNELS_DIV_H

View File

@ -2,8 +2,24 @@
import sys
import logging
import re
import os
logger = logging.getLogger("opencl-embed-kernel")

# Matches an `#include "..."` line and captures the quoted path.
INCLUDE_PATTERN = re.compile(r'#include\s+"(.*)".*')


def parse_file_line(ifile, ofile, base_path: str):
    """Copy `ifile` into `ofile` as C++ raw-string literals, inlining includes.

    Each source line is emitted as ``R"(<line>)"``.  A line matching
    ``#include "file"`` is replaced by the recursively embedded contents of
    the referenced file instead of being emitted itself.

    Include paths in `ifile` are resolved against `base_path`; nested
    includes are resolved against the directory of the file that contains
    them, so headers in subdirectories can include their own siblings.

    NOTE: there is no cycle detection — mutually-including headers would
    recurse until the recursion limit is hit.
    """
    for line in ifile:
        if m := INCLUDE_PATTERN.match(line):
            include_file = os.path.join(base_path, m.group(1))
            logger.info(f"Embedding file: {include_file}")
            with open(include_file, "r") as incf:
                # Resolve the included file's own includes relative to it.
                parse_file_line(incf, ofile, os.path.dirname(include_file))
        else:
            ofile.write('R"({})"\n'.format(line))
def main():
logging.basicConfig(level=logging.INFO)
@ -12,14 +28,9 @@ def main():
logger.info("Usage: python embed_kernel.py <input_file> <output_file>")
sys.exit(1)
ifile = open(sys.argv[1], "r")
ofile = open(sys.argv[2], "w")
for i in ifile:
ofile.write('R"({})"\n'.format(i))
ifile.close()
ofile.close()
ipath = os.path.dirname(sys.argv[1])
with open(sys.argv[1], "r") as ifile, open(sys.argv[2], "w") as ofile:
parse_file_line(ifile, ofile, ipath)
if __name__ == "__main__":

View File

@ -0,0 +1,18 @@
// Spelling shims that let OpenCL kernel headers be parsed both by the
// OpenCL C device compiler and by the host C++ compiler (which uses the
// prototypes only for compile-time argument checking).
//
// NOTE: the include guard avoids double-underscore names such as
// __OCL_DEFS_H__ — identifiers containing "__" are reserved to the
// implementation in C and C++.
#ifndef GGML_OPENCL_OCL_DEFS_H
#define GGML_OPENCL_OCL_DEFS_H

#ifdef __OPENCL_C_VERSION__
// Device (OpenCL) definitions.
# define OCL_KERNEL kernel
# define OCL_GLOBAL global
#else
// Host (C++) definitions: kernel/address-space qualifiers vanish and
// OpenCL's `ulong` maps to cl_ulong.  Defining __kernel/__global (reserved
// names) is deliberate here so raw kernel sources also parse on the host;
// the CL headers providing cl_ulong must be included before this header.
# define OCL_KERNEL
# define OCL_GLOBAL
# define __kernel
# define __global
# define ulong cl_ulong
#endif

#endif // GGML_OPENCL_OCL_DEFS_H