From 522ef487a1ccf492d6c3570abf0a92326855e746 Mon Sep 17 00:00:00 2001 From: chraac Date: Thu, 1 Jan 2026 13:32:43 +0800 Subject: [PATCH 01/17] refactor: simplify kernel argument setting with variadic template function --- ggml/src/ggml-opencl/ggml-opencl.cpp | 89 ++++++++++++++++------------ 1 file changed, 52 insertions(+), 37 deletions(-) diff --git a/ggml/src/ggml-opencl/ggml-opencl.cpp b/ggml/src/ggml-opencl/ggml-opencl.cpp index 353f6a4b46..c7b29a301e 100644 --- a/ggml/src/ggml-opencl/ggml-opencl.cpp +++ b/ggml/src/ggml-opencl/ggml-opencl.cpp @@ -53,6 +53,15 @@ bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor); +template static inline void set_kernel_args(cl_kernel kernel, _TArgs... args) { + size_t index = 0; + ( + [&] { + CL_CHECK(clSetKernelArg(kernel, index++, sizeof(args), &args)); + }(), + ...); +} + // See https://gmplib.org/~tege/divcnst-pldi94.pdf figure 4.1. // Precompute mp (m' in the paper) and L such that division // can be computed using a multiply (high 32b of 64b result) @@ -5098,13 +5107,16 @@ static void ggml_cl_mul(ggml_backend_t backend, const ggml_tensor * src0, const kernel = backend_ctx->kernel_mul_row_f16; } - CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device)); - CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0)); - CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device)); - CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1)); - CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device)); - CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd)); - CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne)); + set_kernel_args( + kernel, + extra0->data_device, + offset0, + extra1->data_device, + offset1, + extrad->data_device, + offsetd, + ne + ); } else { if (src0->type == GGML_TYPE_F32) { kernel = backend_ctx->kernel_mul; @@ -5112,36 +5124,39 @@ static void ggml_cl_mul(ggml_backend_t backend, const ggml_tensor * src0, const kernel = backend_ctx->kernel_mul_f16; } - CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device)); - CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0)); - CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device)); - CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1)); - CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device)); - CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd)); - CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne00)); - CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne01)); - CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne02)); - CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne03)); - CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb00)); - CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb01)); - CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb02)); - CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong), &nb03)); - CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &ne10)); - CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &ne11)); - CL_CHECK(clSetKernelArg(kernel, 16, sizeof(int), &ne12)); - CL_CHECK(clSetKernelArg(kernel, 17, sizeof(int), &ne13)); - CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_ulong), &nb10)); - CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong), &nb11)); - CL_CHECK(clSetKernelArg(kernel, 20, sizeof(cl_ulong), &nb12)); - CL_CHECK(clSetKernelArg(kernel, 21, sizeof(cl_ulong), &nb13)); - CL_CHECK(clSetKernelArg(kernel, 22, sizeof(int), &ne0)); - CL_CHECK(clSetKernelArg(kernel, 23, sizeof(int), &ne1)); - CL_CHECK(clSetKernelArg(kernel, 24, sizeof(int), &ne2)); - CL_CHECK(clSetKernelArg(kernel, 25, sizeof(int), &ne3)); - CL_CHECK(clSetKernelArg(kernel, 26, sizeof(cl_ulong), &nb0)); - CL_CHECK(clSetKernelArg(kernel, 27, sizeof(cl_ulong), &nb1)); - CL_CHECK(clSetKernelArg(kernel, 28, sizeof(cl_ulong), &nb2)); - CL_CHECK(clSetKernelArg(kernel, 29, sizeof(cl_ulong), &nb3)); + set_kernel_args( + kernel, + extra0->data_device, + offset0, + extra1->data_device, + offset1, + extrad->data_device, + offsetd, + ne00, + ne01, + ne02, + ne03, + nb00, + nb01, + nb02, + nb03, + ne10, + ne11, + ne12, + ne13, + nb10, + nb11, + nb12, + nb13, + ne0, + ne1, + ne2, + ne3, + nb0, + nb1, + nb2, + nb3 + ); } if (bcast_row) { From 1fae16787e43841f9e87be0f875945d701290900 Mon Sep 17 00:00:00 2001 From: chraac Date: Thu, 1 Jan 2026 23:08:21 +0800 Subject: [PATCH 02/17] refactor: enhance kernel argument setting with specialized templates --- ggml/src/ggml-opencl/ggml-opencl.cpp | 80 ++++++++++++++++------------ 1 file changed, 47 insertions(+), 33 deletions(-) diff --git a/ggml/src/ggml-opencl/ggml-opencl.cpp b/ggml/src/ggml-opencl/ggml-opencl.cpp index c7b29a301e..968fe1db2d 100644 --- a/ggml/src/ggml-opencl/ggml-opencl.cpp +++ b/ggml/src/ggml-opencl/ggml-opencl.cpp @@ -53,15 +53,6 @@ bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor); -template static inline void set_kernel_args(cl_kernel kernel, _TArgs... args) { - size_t index = 0; - ( - [&] { - CL_CHECK(clSetKernelArg(kernel, index++, sizeof(args), &args)); - }(), - ...); -} - // See https://gmplib.org/~tege/divcnst-pldi94.pdf figure 4.1. // Precompute mp (m' in the paper) and L such that division // can be computed using a multiply (high 32b of 64b result) @@ -2687,6 +2678,49 @@ struct ggml_tensor_extra_cl { } }; +template struct ocl_kernel_arg_setter {}; + +template <> struct ocl_kernel_arg_setter { + static size_t set_arg(cl_kernel kernel, size_t index, int arg) { + CL_CHECK(clSetKernelArg(kernel, index, sizeof(arg), &arg)); + return index + 1; + } +}; + +template <> struct ocl_kernel_arg_setter { + static size_t set_arg(cl_kernel kernel, size_t index, cl_ulong arg) { + CL_CHECK(clSetKernelArg(kernel, index, sizeof(arg), &arg)); + return index + 1; + } +}; + +template <> struct ocl_kernel_arg_setter { + static size_t set_arg(cl_kernel kernel, size_t index, const ggml_tensor * t) { + ggml_tensor_extra_cl * extra = (ggml_tensor_extra_cl *) t->extra; + static_assert(std::is_same_vdata_device), cl_mem>, "data_device type mismatch"); + + cl_ulong offset = extra->offset + t->view_offs; + CL_CHECK(clSetKernelArg(kernel, index, sizeof(cl_mem), &extra->data_device)); + CL_CHECK(clSetKernelArg(kernel, index + 1, sizeof(cl_ulong), &offset)); + return index + 2; + } +}; + +template <> struct ocl_kernel_arg_setter { + static size_t set_arg(cl_kernel kernel, size_t index, const ggml_tensor * t) { + return ocl_kernel_arg_setter::set_arg(kernel, index, t); + } +}; + +template static inline void set_kernel_args(cl_kernel kernel, _TArgs... args) { + size_t index = 0; + ( + [&] { + index = ocl_kernel_arg_setter::set_arg(kernel, index, args); + }(), + ...); +} + // Additional tensor extra structs for quantized tensors. // These tensors are loaded from files and should not be allocated in scratch -- // they should always be allocated from the pool. Hence, they do not have an @@ -5081,14 +5115,6 @@ static void ggml_cl_mul(ggml_backend_t backend, const ggml_tensor * src0, const ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; - ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra; - ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra; - ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra; - - cl_ulong offset0 = extra0->offset + src0->view_offs; - cl_ulong offset1 = extra1->offset + src1->view_offs; - cl_ulong offsetd = extrad->offset + dst->view_offs; - bool bcast_row = false; cl_kernel kernel; @@ -5107,16 +5133,7 @@ static void ggml_cl_mul(ggml_backend_t backend, const ggml_tensor * src0, const kernel = backend_ctx->kernel_mul_row_f16; } - set_kernel_args( - kernel, - extra0->data_device, - offset0, - extra1->data_device, - offset1, - extrad->data_device, - offsetd, - ne - ); + set_kernel_args(kernel, src0, src1, dst, ne); } else { if (src0->type == GGML_TYPE_F32) { kernel = backend_ctx->kernel_mul; @@ -5126,12 +5143,9 @@ static void ggml_cl_mul(ggml_backend_t backend, const ggml_tensor * src0, const set_kernel_args( kernel, - extra0->data_device, - offset0, - extra1->data_device, - offset1, - extrad->data_device, - offsetd, + src0, + src1, + dst, ne00, ne01, ne02, From b1b8fd9abf0b9a95cea4251b47e0e425e59d5f58 Mon Sep 17 00:00:00 2001 From: chraac Date: Thu, 1 Jan 2026 23:39:09 +0800 Subject: [PATCH 03/17] refactor: add specialized kernel argument setter for float and simplify kernel argument setting --- ggml/src/ggml-opencl/ggml-opencl.cpp | 81 +++++----------------------- 1 file changed, 12 insertions(+), 69 deletions(-) diff --git a/ggml/src/ggml-opencl/ggml-opencl.cpp b/ggml/src/ggml-opencl/ggml-opencl.cpp index 968fe1db2d..ae7f78f639 100644 --- a/ggml/src/ggml-opencl/ggml-opencl.cpp +++ b/ggml/src/ggml-opencl/ggml-opencl.cpp @@ -2694,6 +2694,13 @@ template <> struct ocl_kernel_arg_setter { } }; +template <> struct ocl_kernel_arg_setter { + static size_t set_arg(cl_kernel kernel, size_t index, float arg) { + CL_CHECK(clSetKernelArg(kernel, index, sizeof(arg), &arg)); + return index + 1; + } +}; + template <> struct ocl_kernel_arg_setter { static size_t set_arg(cl_kernel kernel, size_t index, const ggml_tensor * t) { ggml_tensor_extra_cl * extra = (ggml_tensor_extra_cl *) t->extra; @@ -5037,32 +5044,9 @@ static void ggml_cl_add_id(ggml_backend_t backend, const ggml_tensor * src0, con ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; - ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra; - ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra; - ggml_tensor_extra_cl * extra2 = (ggml_tensor_extra_cl *)src2->extra; - ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra; - - cl_ulong offset0 = extra0->offset + src0->view_offs; - cl_ulong offset1 = extra1->offset + src1->view_offs; - cl_ulong offset2 = extra2->offset + src2->view_offs; - cl_ulong offsetd = extrad->offset + dst->view_offs; - cl_kernel kernel = backend_ctx->kernel_add_id; - CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device)); - CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0)); - CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device)); - CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1)); - CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extra2->data_device)); - CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offset2)); - CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_mem), &extrad->data_device)); - CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &offsetd)); - CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb01)); - CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb02)); - CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb11)); - CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb21)); - CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne0)); - CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne1)); + set_kernel_args(kernel, src0, src1, src2, dst, nb01, nb02, nb11, nb21, ne0, ne1); int nth = MIN(ne00, (int) backend_ctx->get_kernel_workgroup_size(kernel)); size_t global_work_size[] = { (size_t)ne01*nth, (size_t)ne02, 1 }; @@ -6855,25 +6839,13 @@ static void ggml_cl_timestep_embedding(ggml_backend_t backend, const ggml_tensor return; } - ggml_tensor_extra_cl * extra_src0 = (ggml_tensor_extra_cl *)src0->extra; - ggml_tensor_extra_cl * extra_dst = (ggml_tensor_extra_cl *)dst->extra; - - cl_ulong off_src0 = extra_src0->offset + src0->view_offs; - cl_ulong off_dst = extra_dst->offset + dst->view_offs; - const int logical_dim = dst->op_params[0]; const int max_period = dst->op_params[1]; const int dst_nb1_bytes = dst->nb[1]; cl_kernel kernel = backend_ctx->kernel_timestep_embedding; - CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra_src0->data_device)); - CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &off_src0)); - CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra_dst->data_device)); - CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &off_dst)); - CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &dst_nb1_bytes)); - CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &logical_dim)); - CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &max_period)); + set_kernel_args(kernel, src0, dst, dst_nb1_bytes, logical_dim, max_period); size_t gws0 = (size_t)(((logical_dim + 1) / 2) + 1); @@ -8681,20 +8653,9 @@ static void ggml_cl_scale(ggml_backend_t backend, const ggml_tensor * src0, cons memcpy(&scale, ((int32_t *) dst->op_params) + 0, sizeof(float)); memcpy(&bias, ((int32_t *) dst->op_params) + 1, sizeof(float)); - ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra; - ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra; - - cl_ulong offset0 = extra0->offset + src0->view_offs; - cl_ulong offsetd = extrad->offset + dst->view_offs; - cl_kernel kernel = backend_ctx->kernel_scale; - CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device)); - CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0)); - CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device)); - CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd)); - CL_CHECK(clSetKernelArg(kernel, 4, sizeof(float), &scale)); - CL_CHECK(clSetKernelArg(kernel, 5, sizeof(float), &bias)); + set_kernel_args(kernel, src0, dst, scale, bias); int n = ggml_nelements(dst)/4; @@ -8831,24 +8792,12 @@ static void ggml_cl_diag_mask_inf(ggml_backend_t backend, const ggml_tensor * sr ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; - ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra; - ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra; - - cl_ulong offset0 = extra0->offset + src0->view_offs; - cl_ulong offsetd = extrad->offset + dst->view_offs; - cl_kernel kernel; if (ne00%8 == 0) { kernel = backend_ctx->kernel_diag_mask_inf_8; - CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device)); - CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0)); - CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device)); - CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd)); - CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne00)); - CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &ne01)); - CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &n_past)); + set_kernel_args(kernel, src0, dst, ne00, ne01, n_past); size_t global_work_size[] = {(size_t)ne00*ne01*ne02/8, 1, 1}; size_t local_work_size[] = {64, 1, 1}; @@ -8857,13 +8806,7 @@ static void ggml_cl_diag_mask_inf(ggml_backend_t backend, const ggml_tensor * sr } else { kernel = backend_ctx->kernel_diag_mask_inf; - CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device)); - CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0)); - CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device)); - CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd)); - CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne00)); - CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &ne01)); - CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &n_past)); + set_kernel_args(kernel, src0, dst, ne00, ne01, n_past); size_t global_work_size[] = {(size_t)ne00, (size_t)ne01, (size_t)ne02}; size_t local_work_size[] = {64, 1, 1}; From ada3e50321d2d8a4a3dace10b4b2358fd6fdef37 Mon Sep 17 00:00:00 2001 From: chraac Date: Fri, 2 Jan 2026 23:53:03 +0800 Subject: [PATCH 04/17] wip --- ggml/src/ggml-opencl/ggml-opencl.cpp | 33 ++++++++++++++-------------- 1 file changed, 17 insertions(+), 16 deletions(-) diff --git a/ggml/src/ggml-opencl/ggml-opencl.cpp b/ggml/src/ggml-opencl/ggml-opencl.cpp index ae7f78f639..7289fcc37c 100644 --- a/ggml/src/ggml-opencl/ggml-opencl.cpp +++ b/ggml/src/ggml-opencl/ggml-opencl.cpp @@ -2678,30 +2678,30 @@ struct ggml_tensor_extra_cl { } }; -template struct ocl_kernel_arg_setter {}; +template struct cl_kernel_arg_setter {}; -template <> struct ocl_kernel_arg_setter { +template <> struct cl_kernel_arg_setter { static size_t set_arg(cl_kernel kernel, size_t index, int arg) { CL_CHECK(clSetKernelArg(kernel, index, sizeof(arg), &arg)); return index + 1; } }; -template <> struct ocl_kernel_arg_setter { +template <> struct cl_kernel_arg_setter { static size_t set_arg(cl_kernel kernel, size_t index, cl_ulong arg) { CL_CHECK(clSetKernelArg(kernel, index, sizeof(arg), &arg)); return index + 1; } }; -template <> struct ocl_kernel_arg_setter { +template <> struct cl_kernel_arg_setter { static size_t set_arg(cl_kernel kernel, size_t index, float arg) { CL_CHECK(clSetKernelArg(kernel, index, sizeof(arg), &arg)); return index + 1; } }; -template <> struct ocl_kernel_arg_setter { +template <> struct cl_kernel_arg_setter { static size_t set_arg(cl_kernel kernel, size_t index, const ggml_tensor * t) { ggml_tensor_extra_cl * extra = (ggml_tensor_extra_cl *) t->extra; static_assert(std::is_same_vdata_device), cl_mem>, "data_device type mismatch"); @@ -2713,19 +2713,20 @@ template <> struct ocl_kernel_arg_setter { } }; -template <> struct ocl_kernel_arg_setter { +template <> struct cl_kernel_arg_setter { static size_t set_arg(cl_kernel kernel, size_t index, const ggml_tensor * t) { - return ocl_kernel_arg_setter::set_arg(kernel, index, t); + return cl_kernel_arg_setter::set_arg(kernel, index, t); } }; -template static inline void set_kernel_args(cl_kernel kernel, _TArgs... args) { +template static inline size_t cl_set_kernel_args(cl_kernel kernel, _TArgs... args) { size_t index = 0; ( [&] { - index = ocl_kernel_arg_setter::set_arg(kernel, index, args); + index = cl_kernel_arg_setter::set_arg(kernel, index, args); }(), ...); + return index; } // Additional tensor extra structs for quantized tensors. @@ -5046,7 +5047,7 @@ static void ggml_cl_add_id(ggml_backend_t backend, const ggml_tensor * src0, con cl_kernel kernel = backend_ctx->kernel_add_id; - set_kernel_args(kernel, src0, src1, src2, dst, nb01, nb02, nb11, nb21, ne0, ne1); + cl_set_kernel_args(kernel, src0, src1, src2, dst, nb01, nb02, nb11, nb21, ne0, ne1); int nth = MIN(ne00, (int) backend_ctx->get_kernel_workgroup_size(kernel)); size_t global_work_size[] = { (size_t)ne01*nth, (size_t)ne02, 1 }; @@ -5117,7 +5118,7 @@ static void ggml_cl_mul(ggml_backend_t backend, const ggml_tensor * src0, const kernel = backend_ctx->kernel_mul_row_f16; } - set_kernel_args(kernel, src0, src1, dst, ne); + cl_set_kernel_args(kernel, src0, src1, dst, ne); } else { if (src0->type == GGML_TYPE_F32) { kernel = backend_ctx->kernel_mul; @@ -5125,7 +5126,7 @@ static void ggml_cl_mul(ggml_backend_t backend, const ggml_tensor * src0, const kernel = backend_ctx->kernel_mul_f16; } - set_kernel_args( + cl_set_kernel_args( kernel, src0, src1, @@ -6845,7 +6846,7 @@ static void ggml_cl_timestep_embedding(ggml_backend_t backend, const ggml_tensor cl_kernel kernel = backend_ctx->kernel_timestep_embedding; - set_kernel_args(kernel, src0, dst, dst_nb1_bytes, logical_dim, max_period); + cl_set_kernel_args(kernel, src0, dst, dst_nb1_bytes, logical_dim, max_period); size_t gws0 = (size_t)(((logical_dim + 1) / 2) + 1); @@ -8655,7 +8656,7 @@ static void ggml_cl_scale(ggml_backend_t backend, const ggml_tensor * src0, cons cl_kernel kernel = backend_ctx->kernel_scale; - set_kernel_args(kernel, src0, dst, scale, bias); + cl_set_kernel_args(kernel, src0, dst, scale, bias); int n = ggml_nelements(dst)/4; @@ -8797,7 +8798,7 @@ static void ggml_cl_diag_mask_inf(ggml_backend_t backend, const ggml_tensor * sr if (ne00%8 == 0) { kernel = backend_ctx->kernel_diag_mask_inf_8; - set_kernel_args(kernel, src0, dst, ne00, ne01, n_past); + cl_set_kernel_args(kernel, src0, dst, ne00, ne01, n_past); size_t global_work_size[] = {(size_t)ne00*ne01*ne02/8, 1, 1}; size_t local_work_size[] = {64, 1, 1}; @@ -8806,7 +8807,7 @@ static void ggml_cl_diag_mask_inf(ggml_backend_t backend, const ggml_tensor * sr } else { kernel = backend_ctx->kernel_diag_mask_inf; - set_kernel_args(kernel, src0, dst, ne00, ne01, n_past); + cl_set_kernel_args(kernel, src0, dst, ne00, ne01, n_past); size_t global_work_size[] = {(size_t)ne00, (size_t)ne01, (size_t)ne02}; size_t local_work_size[] = {64, 1, 1}; From 220a33afe87204bf419543ea8d3deb89dd72a33f Mon Sep 17 00:00:00 2001 From: chraac Date: Sat, 3 Jan 2026 00:28:21 +0800 Subject: [PATCH 05/17] refactor: update kernel argument setter for ggml_tensor and simplify argument handling --- ggml/src/ggml-opencl/ggml-opencl.cpp | 88 ++++++++++++---------------- 1 file changed, 38 insertions(+), 50 deletions(-) diff --git a/ggml/src/ggml-opencl/ggml-opencl.cpp b/ggml/src/ggml-opencl/ggml-opencl.cpp index 7289fcc37c..e5fe616dba 100644 --- a/ggml/src/ggml-opencl/ggml-opencl.cpp +++ b/ggml/src/ggml-opencl/ggml-opencl.cpp @@ -2701,7 +2701,7 @@ template <> struct cl_kernel_arg_setter { } }; -template <> struct cl_kernel_arg_setter { +template <> struct cl_kernel_arg_setter { static size_t set_arg(cl_kernel kernel, size_t index, const ggml_tensor * t) { ggml_tensor_extra_cl * extra = (ggml_tensor_extra_cl *) t->extra; static_assert(std::is_same_vdata_device), cl_mem>, "data_device type mismatch"); @@ -2713,17 +2713,43 @@ template <> struct cl_kernel_arg_setter { } }; -template <> struct cl_kernel_arg_setter { - static size_t set_arg(cl_kernel kernel, size_t index, const ggml_tensor * t) { - return cl_kernel_arg_setter::set_arg(kernel, index, t); +template <> struct cl_kernel_arg_setter { + static size_t set_arg(cl_kernel kernel, size_t index, const int64_t (&ne)[GGML_MAX_DIMS]) { + static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS changed, update cl_kernel_arg_setter accordingly"); + + const int ne0 = (int)ne[0]; + const int ne1 = (int)ne[1]; + const int ne2 = (int)ne[2]; + const int ne3 = (int)ne[3]; + CL_CHECK(clSetKernelArg(kernel, index, sizeof(int), &ne0)); + CL_CHECK(clSetKernelArg(kernel, index + 1, sizeof(int), &ne1)); + CL_CHECK(clSetKernelArg(kernel, index + 2, sizeof(int), &ne2)); + CL_CHECK(clSetKernelArg(kernel, index + 3, sizeof(int), &ne3)); + return index + 4; } }; -template static inline size_t cl_set_kernel_args(cl_kernel kernel, _TArgs... args) { +template <> struct cl_kernel_arg_setter { + static size_t set_arg(cl_kernel kernel, size_t index, const size_t (&nb)[GGML_MAX_DIMS]) { + static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS changed, update cl_kernel_arg_setter accordingly"); + + const cl_ulong nb0 = nb[0]; + const cl_ulong nb1 = nb[1]; + const cl_ulong nb2 = nb[2]; + const cl_ulong nb3 = nb[3]; + CL_CHECK(clSetKernelArg(kernel, index, sizeof(cl_ulong), &nb0)); + CL_CHECK(clSetKernelArg(kernel, index + 1, sizeof(cl_ulong), &nb1)); + CL_CHECK(clSetKernelArg(kernel, index + 2, sizeof(cl_ulong), &nb2)); + CL_CHECK(clSetKernelArg(kernel, index + 3, sizeof(cl_ulong), &nb3)); + return index + 4; + } +}; + +template static inline size_t cl_set_kernel_args(cl_kernel kernel, _TArgs&&... args) { size_t index = 0; ( [&] { - index = cl_kernel_arg_setter::set_arg(kernel, index, args); + index = cl_kernel_arg_setter>>>::set_arg(kernel, index, args); }(), ...); return index; @@ -5073,30 +5099,10 @@ static void ggml_cl_mul(ggml_backend_t backend, const ggml_tensor * src0, const const int ne02 = src0->ne[2]; const int ne03 = src0->ne[3]; - const cl_ulong nb00 = src0->nb[0]; - const cl_ulong nb01 = src0->nb[1]; - const cl_ulong nb02 = src0->nb[2]; - const cl_ulong nb03 = src0->nb[3]; - const int ne10 = src1->ne[0]; const int ne11 = src1->ne[1]; - const int ne12 = src1->ne[2]; - const int ne13 = src1->ne[3]; UNUSED(ne13); - - const cl_ulong nb10 = src1->nb[0]; - const cl_ulong nb11 = src1->nb[1]; - const cl_ulong nb12 = src1->nb[2]; - const cl_ulong nb13 = src1->nb[3]; UNUSED(nb13); const int ne0 = dst->ne[0]; - const int ne1 = dst->ne[1]; - const int ne2 = dst->ne[2]; - const int ne3 = dst->ne[3]; - - const cl_ulong nb0 = dst->nb[0]; - const cl_ulong nb1 = dst->nb[1]; - const cl_ulong nb2 = dst->nb[2]; - const cl_ulong nb3 = dst->nb[3]; ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; @@ -5131,30 +5137,12 @@ static void ggml_cl_mul(ggml_backend_t backend, const ggml_tensor * src0, const src0, src1, dst, - ne00, - ne01, - ne02, - ne03, - nb00, - nb01, - nb02, - nb03, - ne10, - ne11, - ne12, - ne13, - nb10, - nb11, - nb12, - nb13, - ne0, - ne1, - ne2, - ne3, - nb0, - nb1, - nb2, - nb3 + src0->ne, + src0->nb, + src1->ne, + src1->nb, + dst->ne, + dst->nb ); } From e1ac6411d36b5e6dfc906b291769537473168ffc Mon Sep 17 00:00:00 2001 From: chraac Date: Mon, 5 Jan 2026 11:46:06 +0800 Subject: [PATCH 06/17] refactor: simplify kernel argument setting in ggml_cl_div function --- ggml/src/ggml-opencl/ggml-opencl.cpp | 67 +++++----------------------- 1 file changed, 12 insertions(+), 55 deletions(-) diff --git a/ggml/src/ggml-opencl/ggml-opencl.cpp b/ggml/src/ggml-opencl/ggml-opencl.cpp index e5fe616dba..a85fa872aa 100644 --- a/ggml/src/ggml-opencl/ggml-opencl.cpp +++ b/ggml/src/ggml-opencl/ggml-opencl.cpp @@ -5183,38 +5183,13 @@ static void ggml_cl_div(ggml_backend_t backend, const ggml_tensor * src0, const const int ne02 = src0->ne[2]; const int ne03 = src0->ne[3]; - const cl_ulong nb00 = src0->nb[0]; - const cl_ulong nb01 = src0->nb[1]; - const cl_ulong nb02 = src0->nb[2]; - const cl_ulong nb03 = src0->nb[3]; - const int ne10 = src1->ne[0]; const int ne11 = src1->ne[1]; - const int ne12 = src1->ne[2]; - const int ne13 = src1->ne[3]; - - const cl_ulong nb10 = src1->nb[0]; - const cl_ulong nb11 = src1->nb[1]; - const cl_ulong nb12 = src1->nb[2]; - const cl_ulong nb13 = src1->nb[3]; const int ne0 = dst->ne[0]; - const cl_ulong nb0 = dst->nb[0]; - const cl_ulong nb1 = dst->nb[1]; - const cl_ulong nb2 = dst->nb[2]; - const cl_ulong nb3 = dst->nb[3]; - ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; - ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra; - ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra; - ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra; - - cl_ulong offset0 = extra0->offset + src0->view_offs; - cl_ulong offset1 = extra1->offset + src1->view_offs; - cl_ulong offsetd = extrad->offset + dst->view_offs; - bool bcast_row = false; cl_kernel kernel; @@ -5233,13 +5208,7 @@ static void ggml_cl_div(ggml_backend_t backend, const ggml_tensor * src0, const kernel = backend_ctx->kernel_div_row_f16; } - CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device)); - CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0)); - CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device)); - CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1)); - CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device)); - CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd)); - CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne)); + cl_set_kernel_args(kernel, src0, src1, dst, ne); } else { if (src0->type == GGML_TYPE_F32) { kernel = backend_ctx->kernel_div; @@ -5247,29 +5216,17 @@ static void ggml_cl_div(ggml_backend_t backend, const ggml_tensor * src0, const kernel = backend_ctx->kernel_div_f16; } - CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device)); - CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0)); - CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device)); - CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1)); - CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device)); - CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd)); - CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_ulong), &nb00)); - CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &nb01)); - CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb02)); - CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb03)); - CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne10)); - CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne11)); - CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne12)); - CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne13)); - CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_ulong), &nb10)); - CL_CHECK(clSetKernelArg(kernel, 15, sizeof(cl_ulong), &nb11)); - CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &nb12)); - CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong), &nb13)); - CL_CHECK(clSetKernelArg(kernel, 18, sizeof(int), &ne0)); - CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong), &nb0)); - CL_CHECK(clSetKernelArg(kernel, 20, sizeof(cl_ulong), &nb1)); - CL_CHECK(clSetKernelArg(kernel, 21, sizeof(cl_ulong), &nb2)); - CL_CHECK(clSetKernelArg(kernel, 22, sizeof(cl_ulong), &nb3)); + cl_set_kernel_args( + kernel, + src0, + src1, + dst, + src0->nb, + src1->ne, + src1->nb, + ne0, + dst->nb + ); } if (bcast_row) { From 4b447c99cfe454a043f7ef9ed400a382cb6351a6 Mon Sep 17 00:00:00 2001 From: chraac Date: Mon, 5 Jan 2026 12:26:06 +0800 Subject: [PATCH 07/17] refactor: simplify kernel argument setting in ggml_cl_add function --- ggml/src/ggml-opencl/ggml-opencl.cpp | 132 ++++++--------------------- 1 file changed, 26 insertions(+), 106 deletions(-) diff --git a/ggml/src/ggml-opencl/ggml-opencl.cpp b/ggml/src/ggml-opencl/ggml-opencl.cpp index a85fa872aa..89c5ed9082 100644 --- a/ggml/src/ggml-opencl/ggml-opencl.cpp +++ b/ggml/src/ggml-opencl/ggml-opencl.cpp @@ -4871,41 +4871,13 @@ static void ggml_cl_add(ggml_backend_t backend, const ggml_tensor * src0, const const int ne02 = src0->ne[2]; const int ne03 = src0->ne[3]; - const cl_ulong nb00 = src0->nb[0]; - const cl_ulong nb01 = src0->nb[1]; - const cl_ulong nb02 = src0->nb[2]; - const cl_ulong nb03 = src0->nb[3]; - const int ne10 = src1->ne[0]; const int ne11 = src1->ne[1]; - const int ne12 = src1->ne[2]; - const int ne13 = src1->ne[3]; - - const cl_ulong nb10 = src1->nb[0]; - const cl_ulong nb11 = src1->nb[1]; - const cl_ulong nb12 = src1->nb[2]; - const cl_ulong nb13 = src1->nb[3]; const int ne0 = dst->ne[0]; - const int ne1 = dst->ne[1]; - const int ne2 = dst->ne[2]; - const int ne3 = dst->ne[3]; - - const cl_ulong nb0 = dst->nb[0]; - const cl_ulong nb1 = dst->nb[1]; - const cl_ulong nb2 = dst->nb[2]; - const cl_ulong nb3 = dst->nb[3]; ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; - ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra; - ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra; - ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra; - - cl_ulong offset0 = extra0->offset + src0->view_offs; - cl_ulong offset1 = extra1->offset + src1->view_offs; - cl_ulong offsetd = extrad->offset + dst->view_offs; - cl_kernel kernel; const bool bcast_row = ggml_nelements(src1) == ne10 && ggml_is_contiguous(src1) && ne00 % 4 == 0 && ne10 % 4 == 0; @@ -4920,45 +4892,20 @@ static void ggml_cl_add(ggml_backend_t backend, const ggml_tensor * src0, const if (bcast_row) { kernel = backend_ctx->kernel_add_row; const int ne = ne00 / 4; - CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device)); - CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0)); - CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device)); - CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1)); - CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device)); - CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd)); - CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne)); + cl_set_kernel_args(kernel, src0, src1, dst, ne); } else { kernel = backend_ctx->kernel_add; - CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device)); - CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0)); - CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device)); - CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1)); - CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device)); - CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd)); - CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne00)); - CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne01)); - CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne02)); - CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne03)); - CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb00)); - CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb01)); - CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb02)); - CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong), &nb03)); - CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &ne10)); - CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &ne11)); - CL_CHECK(clSetKernelArg(kernel, 16, sizeof(int), &ne12)); - CL_CHECK(clSetKernelArg(kernel, 17, sizeof(int), &ne13)); - CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_ulong), &nb10)); - CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong), &nb11)); - CL_CHECK(clSetKernelArg(kernel, 20, sizeof(cl_ulong), &nb12)); - CL_CHECK(clSetKernelArg(kernel, 21, sizeof(cl_ulong), &nb13)); - CL_CHECK(clSetKernelArg(kernel, 22, sizeof(int), &ne0)); - CL_CHECK(clSetKernelArg(kernel, 23, sizeof(int), &ne1)); - CL_CHECK(clSetKernelArg(kernel, 24, sizeof(int), &ne2)); - CL_CHECK(clSetKernelArg(kernel, 25, sizeof(int), &ne3)); - CL_CHECK(clSetKernelArg(kernel, 26, sizeof(cl_ulong), &nb0)); - CL_CHECK(clSetKernelArg(kernel, 27, sizeof(cl_ulong), &nb1)); - CL_CHECK(clSetKernelArg(kernel, 28, sizeof(cl_ulong), &nb2)); - CL_CHECK(clSetKernelArg(kernel, 29, sizeof(cl_ulong), &nb3)); + cl_set_kernel_args(kernel, + src0, + src1, + dst, + src0->ne, + src0->nb, + src1->ne, + src1->nb, + dst->ne, + dst->nb + ); } } else if (dst->type == GGML_TYPE_F16) { GGML_ASSERT(src0->type == GGML_TYPE_F16 || src0->type == GGML_TYPE_F32); @@ -4968,49 +4915,22 @@ static void ggml_cl_add(ggml_backend_t backend, const ggml_tensor * src0, const if (bcast_row) { kernel = backend_ctx->kernel_add_row_f16; const int ne = ne00 / 4; - CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device)); - CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0)); - CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device)); - CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1)); - CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device)); - CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd)); - CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne)); - CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &type_src0)); - CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &type_src1)); + cl_set_kernel_args(kernel, src0, src1, dst, ne, type_src0, type_src1); } else { kernel = backend_ctx->kernel_add_f16; - CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device)); - CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0)); - CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device)); - CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1)); - CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device)); - CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd)); - CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne00)); - CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne01)); - CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne02)); - CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne03)); - CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb00)); - CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb01)); - CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb02)); - CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong), &nb03)); - CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &ne10)); - CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &ne11)); - CL_CHECK(clSetKernelArg(kernel, 16, sizeof(int), &ne12)); - CL_CHECK(clSetKernelArg(kernel, 17, sizeof(int), &ne13)); - CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_ulong), &nb10)); - CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong), &nb11)); - CL_CHECK(clSetKernelArg(kernel, 20, sizeof(cl_ulong), &nb12)); - CL_CHECK(clSetKernelArg(kernel, 21, sizeof(cl_ulong), &nb13)); - CL_CHECK(clSetKernelArg(kernel, 22, sizeof(int), &ne0)); - CL_CHECK(clSetKernelArg(kernel, 23, sizeof(int), &ne1)); - CL_CHECK(clSetKernelArg(kernel, 24, sizeof(int), &ne2)); - CL_CHECK(clSetKernelArg(kernel, 25, sizeof(int), &ne3)); - CL_CHECK(clSetKernelArg(kernel, 26, sizeof(cl_ulong), &nb0)); - CL_CHECK(clSetKernelArg(kernel, 27, sizeof(cl_ulong), &nb1)); - CL_CHECK(clSetKernelArg(kernel, 28, sizeof(cl_ulong), &nb2)); - CL_CHECK(clSetKernelArg(kernel, 29, sizeof(cl_ulong), &nb3)); - CL_CHECK(clSetKernelArg(kernel, 30, sizeof(int), &type_src0)); - CL_CHECK(clSetKernelArg(kernel, 31, sizeof(int), &type_src1)); + cl_set_kernel_args(kernel, + src0, + src1, + dst, + src0->ne, + src0->nb, + src1->ne, + src1->nb, + dst->ne, + dst->nb, + type_src0, + type_src1 + ); } } else { GGML_ASSERT(false && "unsupported data types for add"); From 06d0f91e42a33e739b9aba72e14748f2a5c94ac0 Mon Sep 17 00:00:00 2001 From: chraac Date: Thu, 8 Jan 2026 10:16:50 +0800 Subject: [PATCH 08/17] wip --- ggml/src/ggml-opencl/ggml-opencl.cpp | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/ggml/src/ggml-opencl/ggml-opencl.cpp b/ggml/src/ggml-opencl/ggml-opencl.cpp index 89c5ed9082..982fa59e43 100644 --- a/ggml/src/ggml-opencl/ggml-opencl.cpp +++ b/ggml/src/ggml-opencl/ggml-opencl.cpp @@ -2678,6 +2678,8 @@ struct ggml_tensor_extra_cl { } }; +namespace /* anonymous */ { + template struct cl_kernel_arg_setter {}; template <> struct cl_kernel_arg_setter { @@ -2713,14 +2715,14 @@ template <> struct cl_kernel_arg_setter { } }; -template <> struct cl_kernel_arg_setter { +template <> struct cl_kernel_arg_setter { static size_t set_arg(cl_kernel kernel, size_t index, const int64_t (&ne)[GGML_MAX_DIMS]) { static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS changed, update cl_kernel_arg_setter accordingly"); - const int ne0 = (int)ne[0]; - const int ne1 = (int)ne[1]; - const int ne2 = (int)ne[2]; - const int ne3 = (int)ne[3]; + const int ne0 = (int) ne[0]; + const int ne1 = (int) ne[1]; + const int ne2 = (int) ne[2]; + const int ne3 = (int) ne[3]; CL_CHECK(clSetKernelArg(kernel, index, sizeof(int), &ne0)); CL_CHECK(clSetKernelArg(kernel, index + 1, sizeof(int), &ne1)); CL_CHECK(clSetKernelArg(kernel, index + 2, sizeof(int), &ne2)); @@ -2729,7 +2731,7 @@ template <> struct cl_kernel_arg_setter { } }; -template <> struct cl_kernel_arg_setter { +template <> struct cl_kernel_arg_setter { static size_t set_arg(cl_kernel kernel, size_t index, const size_t (&nb)[GGML_MAX_DIMS]) { static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS changed, update cl_kernel_arg_setter accordingly"); @@ -2745,16 +2747,20 @@ template <> struct cl_kernel_arg_setter { } }; -template static inline size_t cl_set_kernel_args(cl_kernel kernel, _TArgs&&... args) { +template static inline size_t cl_set_kernel_args(cl_kernel kernel, _TArgs &&... args) { size_t index = 0; ( [&] { - index = cl_kernel_arg_setter>>>::set_arg(kernel, index, args); + index = cl_kernel_arg_setter< + std::remove_const_t>>>::set_arg(kernel, index, + args); }(), ...); return index; } +} // namespace + // Additional tensor extra structs for quantized tensors. // These tensors are loaded from files and should not be allocated in scratch -- // they should always be allocated from the pool. Hence, they do not have an From 463dc3faef5cdbcf29526f1be6e1f534018061d1 Mon Sep 17 00:00:00 2001 From: chraac Date: Wed, 14 Jan 2026 00:44:17 +0800 Subject: [PATCH 09/17] refactor: enhance file parsing in embed_kernel.py to support nested includes --- ggml/src/ggml-opencl/kernels/embed_kernel.py | 28 ++++++++++++++------ 1 file changed, 20 insertions(+), 8 deletions(-) diff --git a/ggml/src/ggml-opencl/kernels/embed_kernel.py b/ggml/src/ggml-opencl/kernels/embed_kernel.py index b5d1d7242b..81d9868a4b 100644 --- a/ggml/src/ggml-opencl/kernels/embed_kernel.py +++ b/ggml/src/ggml-opencl/kernels/embed_kernel.py @@ -2,8 +2,25 @@ import sys import logging +import re +import os + logger = logging.getLogger("opencl-embed-kernel") +INCLUDE_PATTERN = re.compile(r'#include\s+"(.*)"') + + +def parse_file_line(ifile, ofile, base_path: str): + for i in ifile: + i = i.rstrip() + if m := INCLUDE_PATTERN.match(i): + include_file = os.path.join(base_path, m.group(1)) + logger.info(f"Embedding file: {include_file}") + with open(include_file, "r") as incf: + parse_file_line(incf, ofile, base_path) + else: + ofile.write('R"({})"\n'.format(i)) + def main(): logging.basicConfig(level=logging.INFO) @@ -12,14 +29,9 @@ def main(): logger.info("Usage: python embed_kernel.py ") sys.exit(1) - ifile = open(sys.argv[1], "r") - ofile = open(sys.argv[2], "w") - - for i in ifile: - ofile.write('R"({})"\n'.format(i)) - - ifile.close() - ofile.close() + ipath = os.path.dirname(sys.argv[1]) + with open(sys.argv[1], "r") as ifile, open(sys.argv[2], "w") as ofile: + parse_file_line(ifile, ofile, ipath) if __name__ == "__main__": From e6db6313a93d8b5ace22682473e46a553321b575 Mon Sep 17 00:00:00 2001 From: chraac Date: Wed, 14 Jan 2026 13:20:08 +0800 Subject: [PATCH 10/17] feat: add OpenCL kernel definitions and include files for division operations --- ggml/src/ggml-opencl/kernels/div.cl | 2 ++ ggml/src/ggml-opencl/kernels/div.h | 31 ++++++++++++++++++++ ggml/src/ggml-opencl/kernels/embed_kernel.py | 3 +- ggml/src/ggml-opencl/kernels/ocl_defs.h | 16 ++++++++++ 4 files changed, 50 insertions(+), 2 deletions(-) create mode 100644 ggml/src/ggml-opencl/kernels/div.h create mode 100644 ggml/src/ggml-opencl/kernels/ocl_defs.h diff --git a/ggml/src/ggml-opencl/kernels/div.cl b/ggml/src/ggml-opencl/kernels/div.cl index 6d9b4ade9f..eb1d2da002 100644 --- a/ggml/src/ggml-opencl/kernels/div.cl +++ b/ggml/src/ggml-opencl/kernels/div.cl @@ -1,5 +1,7 @@ #pragma OPENCL EXTENSION cl_khr_fp16 : enable +#include "div.h" + //------------------------------------------------------------------------------ // div //------------------------------------------------------------------------------ diff --git a/ggml/src/ggml-opencl/kernels/div.h b/ggml/src/ggml-opencl/kernels/div.h new file mode 100644 index 0000000000..75f66a0a2a --- /dev/null +++ b/ggml/src/ggml-opencl/kernels/div.h @@ -0,0 +1,31 @@ + +#ifndef __KERNELS_DIV_H__ +#define __KERNELS_DIV_H__ + +#include "ocl_defs.h" + +kernel void kernel_div(global char * src0, + ulong offset0, + global char * src1, + ulong offset1, + global char * dst, + ulong offsetd, + ulong nb00, + ulong nb01, + ulong nb02, + ulong nb03, + int ne10, + int ne11, + int ne12, + int ne13, + ulong nb10, + ulong nb11, + ulong nb12, + ulong nb13, + int ne0, + ulong nb0, + ulong nb1, + ulong nb2, + ulong nb3); + +#endif // __KERNELS_DIV_H__ diff --git a/ggml/src/ggml-opencl/kernels/embed_kernel.py b/ggml/src/ggml-opencl/kernels/embed_kernel.py index 81d9868a4b..867c0d4961 100644 --- a/ggml/src/ggml-opencl/kernels/embed_kernel.py +++ b/ggml/src/ggml-opencl/kernels/embed_kernel.py @@ -7,12 +7,11 @@ import os logger = logging.getLogger("opencl-embed-kernel") -INCLUDE_PATTERN = re.compile(r'#include\s+"(.*)"') +INCLUDE_PATTERN = re.compile(r'#include\s+"(.*)".*') def parse_file_line(ifile, ofile, base_path: str): for i in ifile: - i = i.rstrip() if m := INCLUDE_PATTERN.match(i): include_file = os.path.join(base_path, m.group(1)) logger.info(f"Embedding file: {include_file}") diff --git a/ggml/src/ggml-opencl/kernels/ocl_defs.h b/ggml/src/ggml-opencl/kernels/ocl_defs.h new file mode 100644 index 0000000000..a0b8feedb1 --- /dev/null +++ b/ggml/src/ggml-opencl/kernels/ocl_defs.h @@ -0,0 +1,16 @@ + +#ifndef __OCL_DEFS_H__ +#define __OCL_DEFS_H__ + +#ifdef __OPENCL_C_VERSION__ +// Device (OpenCL) Definitions +#else +// Host (C++) Definitions +# define kernel +# define global +# define __kernel +# define __global +# define ulong cl_ulong +#endif + +#endif // __OCL_DEFS_H__ From 5fbff1aa3a37761c0013af4e8c398398973bf791 Mon Sep 17 00:00:00 2001 From: chraac Date: Thu, 15 Jan 2026 11:57:37 +0800 Subject: [PATCH 11/17] feat: add OpenCL kernel support for division operations and update includes --- ggml/src/ggml-opencl/CMakeLists.txt | 2 ++ ggml/src/ggml-opencl/ggml-opencl.cpp | 4 +++ ggml/src/ggml-opencl/kernels/div.h | 46 ++++++++++++------------- ggml/src/ggml-opencl/kernels/ocl_defs.h | 6 ++-- 4 files changed, 33 insertions(+), 25 deletions(-) diff --git a/ggml/src/ggml-opencl/CMakeLists.txt b/ggml/src/ggml-opencl/CMakeLists.txt index 2a4b79eb6a..977b945d16 100644 --- a/ggml/src/ggml-opencl/CMakeLists.txt +++ b/ggml/src/ggml-opencl/CMakeLists.txt @@ -22,6 +22,8 @@ if (GGML_OPENCL_USE_ADRENO_KERNELS) add_compile_definitions(GGML_OPENCL_USE_ADRENO_KERNELS) endif () +target_include_directories(${TARGET_NAME} PRIVATE "${CMAKE_CURRENT_LIST_DIR}/kernels") + if (GGML_OPENCL_EMBED_KERNELS) add_compile_definitions(GGML_OPENCL_EMBED_KERNELS) diff --git a/ggml/src/ggml-opencl/ggml-opencl.cpp b/ggml/src/ggml-opencl/ggml-opencl.cpp index 982fa59e43..38d72cba6b 100644 --- a/ggml/src/ggml-opencl/ggml-opencl.cpp +++ b/ggml/src/ggml-opencl/ggml-opencl.cpp @@ -29,6 +29,10 @@ #include #include +namespace ocl_kernel_prototypes { + #include "div.h" +} + #undef MIN #undef MAX #define MIN(a, b) ((a) < (b) ? (a) : (b)) diff --git a/ggml/src/ggml-opencl/kernels/div.h b/ggml/src/ggml-opencl/kernels/div.h index 75f66a0a2a..fd65b5706f 100644 --- a/ggml/src/ggml-opencl/kernels/div.h +++ b/ggml/src/ggml-opencl/kernels/div.h @@ -4,28 +4,28 @@ #include "ocl_defs.h" -kernel void kernel_div(global char * src0, - ulong offset0, - global char * src1, - ulong offset1, - global char * dst, - ulong offsetd, - ulong nb00, - ulong nb01, - ulong nb02, - ulong nb03, - int ne10, - int ne11, - int ne12, - int ne13, - ulong nb10, - ulong nb11, - ulong nb12, - ulong nb13, - int ne0, - ulong nb0, - ulong nb1, - ulong nb2, - ulong nb3); +OCL_KERNEL void kernel_div(OCL_GLOBAL char * src0, + ulong offset0, + OCL_GLOBAL char * src1, + ulong offset1, + OCL_GLOBAL char * dst, + ulong offsetd, + ulong nb00, + ulong nb01, + ulong nb02, + ulong nb03, + int ne10, + int ne11, + int ne12, + int ne13, + ulong nb10, + ulong nb11, + ulong nb12, + ulong nb13, + int ne0, + ulong nb0, + ulong nb1, + ulong nb2, + ulong nb3); #endif // __KERNELS_DIV_H__ diff --git a/ggml/src/ggml-opencl/kernels/ocl_defs.h b/ggml/src/ggml-opencl/kernels/ocl_defs.h index a0b8feedb1..3faf1ecf56 100644 --- a/ggml/src/ggml-opencl/kernels/ocl_defs.h +++ b/ggml/src/ggml-opencl/kernels/ocl_defs.h @@ -4,10 +4,12 @@ #ifdef __OPENCL_C_VERSION__ // Device (OpenCL) Definitions +# define OCL_KERNEL kernel +# define OCL_GLOBAL global #else // Host (C++) Definitions -# define kernel -# define global +# define OCL_KERNEL +# define OCL_GLOBAL # define __kernel # define __global # define ulong cl_ulong From b2a283d5e9c3e92619134e277f30e5a941aef58f Mon Sep 17 00:00:00 2001 From: chraac Date: Thu, 15 Jan 2026 22:29:08 +0800 Subject: [PATCH 12/17] feat: enhance OpenCL kernel division operations with new argument setters and invoker --- ggml/src/ggml-opencl/ggml-opencl.cpp | 100 +++++++++++++++++++++--- ggml/src/ggml-opencl/kernels/div.h | 46 +++++------ ggml/src/ggml-opencl/kernels/ocl_defs.h | 6 +- 3 files changed, 115 insertions(+), 37 deletions(-) diff --git a/ggml/src/ggml-opencl/ggml-opencl.cpp b/ggml/src/ggml-opencl/ggml-opencl.cpp index 38d72cba6b..7284bcdfe2 100644 --- a/ggml/src/ggml-opencl/ggml-opencl.cpp +++ b/ggml/src/ggml-opencl/ggml-opencl.cpp @@ -2693,6 +2693,14 @@ template <> struct cl_kernel_arg_setter { } }; +template <> struct cl_kernel_arg_setter>> { + static size_t set_arg(cl_kernel kernel, size_t index, cl_mem arg) { + CL_CHECK(clSetKernelArg(kernel, index, sizeof(arg), &arg)); + return index + 1; + } +}; + + template <> struct cl_kernel_arg_setter { static size_t set_arg(cl_kernel kernel, size_t index, cl_ulong arg) { CL_CHECK(clSetKernelArg(kernel, index, sizeof(arg), &arg)); @@ -2763,6 +2771,23 @@ template static inline size_t cl_set_kernel_args(cl_kernel return index; } +template +struct cl_kernel_invoker {}; + +template +struct cl_kernel_invoker { + static void invoke(cl_kernel kernel, _TArgs... args) { + size_t index = 0; + ( + [&] { + index = cl_kernel_arg_setter< + std::remove_const_t>>>::set_arg(kernel, index, + args); + }(), + ...); + } +}; + } // namespace // Additional tensor extra structs for quantized tensors. @@ -5142,21 +5167,72 @@ static void ggml_cl_div(ggml_backend_t backend, const ggml_tensor * src0, const } else { if (src0->type == GGML_TYPE_F32) { kernel = backend_ctx->kernel_div; + ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra; + ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra; + ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra; + + cl_ulong offset0 = extra0->offset + src0->view_offs; + cl_ulong offset1 = extra1->offset + src1->view_offs; + cl_ulong offsetd = extrad->offset + dst->view_offs; + + const cl_ulong nb00 = src0->nb[0]; + const cl_ulong nb01 = src0->nb[1]; + const cl_ulong nb02 = src0->nb[2]; + const cl_ulong nb03 = src0->nb[3]; + + const int ne12 = src1->ne[2]; + const int ne13 = src1->ne[3]; + + const cl_ulong nb10 = src1->nb[0]; + const cl_ulong nb11 = src1->nb[1]; + const cl_ulong nb12 = src1->nb[2]; + const cl_ulong nb13 = src1->nb[3]; + + const cl_ulong nb0 = dst->nb[0]; + const cl_ulong nb1 = dst->nb[1]; + const cl_ulong nb2 = dst->nb[2]; + const cl_ulong nb3 = dst->nb[3]; + + cl_kernel_invoker::invoke( + kernel, + extra0->data_device, + offset0, + extra1->data_device, + offset1, + extrad->data_device, + offsetd, + nb00, + nb01, + nb02, + nb03, + ne10, + ne11, + ne12, + ne13, + nb10, + nb11, + nb12, + nb13, + ne0, + nb0, + nb1, + nb2, + nb3 + ); } else { kernel = backend_ctx->kernel_div_f16; + cl_set_kernel_args( + kernel, + src0, + src1, + dst, + src0->nb, + src1->ne, + src1->nb, + ne0, + dst->nb + ); } - - cl_set_kernel_args( - kernel, - src0, - src1, - dst, - src0->nb, - src1->ne, - src1->nb, - ne0, - dst->nb - ); } if (bcast_row) { diff --git a/ggml/src/ggml-opencl/kernels/div.h b/ggml/src/ggml-opencl/kernels/div.h index fd65b5706f..f09d88cc19 100644 --- a/ggml/src/ggml-opencl/kernels/div.h +++ b/ggml/src/ggml-opencl/kernels/div.h @@ -4,28 +4,28 @@ #include "ocl_defs.h" -OCL_KERNEL void kernel_div(OCL_GLOBAL char * src0, - ulong offset0, - OCL_GLOBAL char * src1, - ulong offset1, - OCL_GLOBAL char * dst, - ulong offsetd, - ulong nb00, - ulong nb01, - ulong nb02, - ulong nb03, - int ne10, - int ne11, - int ne12, - int ne13, - ulong nb10, - ulong nb11, - ulong nb12, - ulong nb13, - int ne0, - ulong nb0, - ulong nb1, - ulong nb2, - ulong nb3); +OCL_KERNEL void kernel_div(ocl_global_char_ptr src0, + ulong offset0, + ocl_global_char_ptr src1, + ulong offset1, + ocl_global_char_ptr dst, + ulong offsetd, + ulong nb00, + ulong nb01, + ulong nb02, + ulong nb03, + int ne10, + int ne11, + int ne12, + int ne13, + ulong nb10, + ulong nb11, + ulong nb12, + ulong nb13, + int ne0, + ulong nb0, + ulong nb1, + ulong nb2, + ulong nb3); #endif // __KERNELS_DIV_H__ diff --git a/ggml/src/ggml-opencl/kernels/ocl_defs.h b/ggml/src/ggml-opencl/kernels/ocl_defs.h index 3faf1ecf56..b6743890b3 100644 --- a/ggml/src/ggml-opencl/kernels/ocl_defs.h +++ b/ggml/src/ggml-opencl/kernels/ocl_defs.h @@ -4,12 +4,14 @@ #ifdef __OPENCL_C_VERSION__ // Device (OpenCL) Definitions -# define OCL_KERNEL kernel -# define OCL_GLOBAL global +# define OCL_KERNEL kernel +# define OCL_GLOBAL global +# define ocl_global_char_ptr global char * #else // Host (C++) Definitions # define OCL_KERNEL # define OCL_GLOBAL +# define ocl_global_char_ptr cl_mem # define __kernel # define __global # define ulong cl_ulong From 208f8454cd332c5f4001798cbf61f3c2e209e5a9 Mon Sep 17 00:00:00 2001 From: chraac Date: Fri, 16 Jan 2026 22:44:32 +0800 Subject: [PATCH 13/17] feat: add function type definitions for OpenCL kernel argument setters --- ggml/src/ggml-opencl/ggml-opencl.cpp | 106 +++++++++++++-------------- 1 file changed, 51 insertions(+), 55 deletions(-) diff --git a/ggml/src/ggml-opencl/ggml-opencl.cpp b/ggml/src/ggml-opencl/ggml-opencl.cpp index 7284bcdfe2..61386c3918 100644 --- a/ggml/src/ggml-opencl/ggml-opencl.cpp +++ b/ggml/src/ggml-opencl/ggml-opencl.cpp @@ -2687,6 +2687,8 @@ namespace /* anonymous */ { template struct cl_kernel_arg_setter {}; template <> struct cl_kernel_arg_setter { + typedef void func_t(int); + static size_t set_arg(cl_kernel kernel, size_t index, int arg) { CL_CHECK(clSetKernelArg(kernel, index, sizeof(arg), &arg)); return index + 1; @@ -2694,6 +2696,8 @@ template <> struct cl_kernel_arg_setter { }; template <> struct cl_kernel_arg_setter>> { + typedef void func_t(cl_mem); + static size_t set_arg(cl_kernel kernel, size_t index, cl_mem arg) { CL_CHECK(clSetKernelArg(kernel, index, sizeof(arg), &arg)); return index + 1; @@ -2702,6 +2706,8 @@ template <> struct cl_kernel_arg_setter struct cl_kernel_arg_setter { + typedef void func_t(cl_ulong); + static size_t set_arg(cl_kernel kernel, size_t index, cl_ulong arg) { CL_CHECK(clSetKernelArg(kernel, index, sizeof(arg), &arg)); return index + 1; @@ -2709,6 +2715,8 @@ template <> struct cl_kernel_arg_setter { }; template <> struct cl_kernel_arg_setter { + typedef void func_t(float); + static size_t set_arg(cl_kernel kernel, size_t index, float arg) { CL_CHECK(clSetKernelArg(kernel, index, sizeof(arg), &arg)); return index + 1; @@ -2716,6 +2724,8 @@ template <> struct cl_kernel_arg_setter { }; template <> struct cl_kernel_arg_setter { + typedef void func_t(cl_mem, cl_ulong); + static size_t set_arg(cl_kernel kernel, size_t index, const ggml_tensor * t) { ggml_tensor_extra_cl * extra = (ggml_tensor_extra_cl *) t->extra; static_assert(std::is_same_vdata_device), cl_mem>, "data_device type mismatch"); @@ -2728,6 +2738,8 @@ template <> struct cl_kernel_arg_setter { }; template <> struct cl_kernel_arg_setter { + typedef void func_t(int, int, int, int); + static size_t set_arg(cl_kernel kernel, size_t index, const int64_t (&ne)[GGML_MAX_DIMS]) { static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS changed, update cl_kernel_arg_setter accordingly"); @@ -2744,6 +2756,8 @@ template <> struct cl_kernel_arg_setter { }; template <> struct cl_kernel_arg_setter { + typedef void func_t(cl_ulong, cl_ulong, cl_ulong, cl_ulong); + static size_t set_arg(cl_kernel kernel, size_t index, const size_t (&nb)[GGML_MAX_DIMS]) { static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS changed, update cl_kernel_arg_setter accordingly"); @@ -2771,18 +2785,41 @@ template static inline size_t cl_set_kernel_args(cl_kernel return index; } -template -struct cl_kernel_invoker {}; +template struct type_merger {}; + +template +struct type_merger { + using func_t = typename type_merger::func_t; +}; + +template struct type_merger { + using func_t = void(_TInnerArgs...); +}; + +template struct cl_param_type_extractor { + using args_t = std::remove_const_t>>; + using first_func_t = + typename cl_kernel_arg_setter::func_t; + using func_t = typename type_merger::func_t>::func_t; +}; + +template struct cl_param_type_extractor<_TFinalArg> { + using args_t = std::remove_const_t>>; + using func_t = typename cl_kernel_arg_setter::func_t; +}; + +template struct cl_kernel_invoker { + template static void invoke(cl_kernel kernel, _TCalledArgs &&... args) { + static_assert(std::is_same_v<_TFunc, typename cl_param_type_extractor<_TCalledArgs...>::func_t>, + "Kernel argument type mismatch between prototype and called arguments"); -template -struct cl_kernel_invoker { - static void invoke(cl_kernel kernel, _TArgs... args) { size_t index = 0; ( [&] { index = cl_kernel_arg_setter< - std::remove_const_t>>>::set_arg(kernel, index, - args); + std::remove_const_t>>>::set_arg(kernel, + index, + args); }(), ...); } @@ -5167,57 +5204,16 @@ static void ggml_cl_div(ggml_backend_t backend, const ggml_tensor * src0, const } else { if (src0->type == GGML_TYPE_F32) { kernel = backend_ctx->kernel_div; - ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra; - ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra; - ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra; - - cl_ulong offset0 = extra0->offset + src0->view_offs; - cl_ulong offset1 = extra1->offset + src1->view_offs; - cl_ulong offsetd = extrad->offset + dst->view_offs; - - const cl_ulong nb00 = src0->nb[0]; - const cl_ulong nb01 = src0->nb[1]; - const cl_ulong nb02 = src0->nb[2]; - const cl_ulong nb03 = src0->nb[3]; - - const int ne12 = src1->ne[2]; - const int ne13 = src1->ne[3]; - - const cl_ulong nb10 = src1->nb[0]; - const cl_ulong nb11 = src1->nb[1]; - const cl_ulong nb12 = src1->nb[2]; - const cl_ulong nb13 = src1->nb[3]; - - const cl_ulong nb0 = dst->nb[0]; - const cl_ulong nb1 = dst->nb[1]; - const cl_ulong nb2 = dst->nb[2]; - const cl_ulong nb3 = dst->nb[3]; - cl_kernel_invoker::invoke( kernel, - extra0->data_device, - offset0, - extra1->data_device, - offset1, - extrad->data_device, - offsetd, - nb00, - nb01, - nb02, - nb03, - ne10, - ne11, - ne12, - ne13, - nb10, - nb11, - nb12, - nb13, + src0, + src1, + dst, + src0->nb, + src1->ne, + src1->nb, ne0, - nb0, - nb1, - nb2, - nb3 + dst->nb ); } else { kernel = backend_ctx->kernel_div_f16; From 61093a4159f90ec58b52d6445e413f0f2caf9604 Mon Sep 17 00:00:00 2001 From: chraac Date: Fri, 16 Jan 2026 23:02:08 +0800 Subject: [PATCH 14/17] wip --- ggml/src/ggml-opencl/ggml-opencl.cpp | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/ggml/src/ggml-opencl/ggml-opencl.cpp b/ggml/src/ggml-opencl/ggml-opencl.cpp index 61386c3918..0ccdb4c328 100644 --- a/ggml/src/ggml-opencl/ggml-opencl.cpp +++ b/ggml/src/ggml-opencl/ggml-opencl.cpp @@ -2695,16 +2695,6 @@ template <> struct cl_kernel_arg_setter { } }; -template <> struct cl_kernel_arg_setter>> { - typedef void func_t(cl_mem); - - static size_t set_arg(cl_kernel kernel, size_t index, cl_mem arg) { - CL_CHECK(clSetKernelArg(kernel, index, sizeof(arg), &arg)); - return index + 1; - } -}; - - template <> struct cl_kernel_arg_setter { typedef void func_t(cl_ulong); From 08dbd97356fd12d2c25deb80e238ad3a53d3c548 Mon Sep 17 00:00:00 2001 From: chraac Date: Fri, 16 Jan 2026 23:11:16 +0800 Subject: [PATCH 15/17] feat: update OpenCL kernel argument types and invoker for division operations --- ggml/src/ggml-opencl/ggml-opencl.cpp | 21 +++++------ ggml/src/ggml-opencl/kernels/div.h | 46 ++++++++++++------------- ggml/src/ggml-opencl/kernels/ocl_defs.h | 2 -- 3 files changed, 31 insertions(+), 38 deletions(-) diff --git a/ggml/src/ggml-opencl/ggml-opencl.cpp b/ggml/src/ggml-opencl/ggml-opencl.cpp index 0ccdb4c328..6ffcc4bef1 100644 --- a/ggml/src/ggml-opencl/ggml-opencl.cpp +++ b/ggml/src/ggml-opencl/ggml-opencl.cpp @@ -2714,7 +2714,7 @@ template <> struct cl_kernel_arg_setter { }; template <> struct cl_kernel_arg_setter { - typedef void func_t(cl_mem, cl_ulong); + typedef void func_t(char *, cl_ulong); static size_t set_arg(cl_kernel kernel, size_t index, const ggml_tensor * t) { ggml_tensor_extra_cl * extra = (ggml_tensor_extra_cl *) t->extra; @@ -2799,22 +2799,17 @@ template struct cl_param_type_extractor<_TFinalArg> { }; template struct cl_kernel_invoker { - template static void invoke(cl_kernel kernel, _TCalledArgs &&... args) { + template static size_t invoke(cl_kernel kernel, _TCalledArgs &&... args) { static_assert(std::is_same_v<_TFunc, typename cl_param_type_extractor<_TCalledArgs...>::func_t>, "Kernel argument type mismatch between prototype and called arguments"); - - size_t index = 0; - ( - [&] { - index = cl_kernel_arg_setter< - std::remove_const_t>>>::set_arg(kernel, - index, - args); - }(), - ...); + return cl_set_kernel_args(kernel, args...); } }; +template static inline size_t cl_set_kernel_args_safe(cl_kernel kernel, _TArgs &&... args) { + return cl_kernel_invoker<_TFunc>::invoke(kernel, args...); +} + } // namespace // Additional tensor extra structs for quantized tensors. @@ -5194,7 +5189,7 @@ static void ggml_cl_div(ggml_backend_t backend, const ggml_tensor * src0, const } else { if (src0->type == GGML_TYPE_F32) { kernel = backend_ctx->kernel_div; - cl_kernel_invoker::invoke( + cl_set_kernel_args_safe( kernel, src0, src1, diff --git a/ggml/src/ggml-opencl/kernels/div.h b/ggml/src/ggml-opencl/kernels/div.h index f09d88cc19..fd65b5706f 100644 --- a/ggml/src/ggml-opencl/kernels/div.h +++ b/ggml/src/ggml-opencl/kernels/div.h @@ -4,28 +4,28 @@ #include "ocl_defs.h" -OCL_KERNEL void kernel_div(ocl_global_char_ptr src0, - ulong offset0, - ocl_global_char_ptr src1, - ulong offset1, - ocl_global_char_ptr dst, - ulong offsetd, - ulong nb00, - ulong nb01, - ulong nb02, - ulong nb03, - int ne10, - int ne11, - int ne12, - int ne13, - ulong nb10, - ulong nb11, - ulong nb12, - ulong nb13, - int ne0, - ulong nb0, - ulong nb1, - ulong nb2, - ulong nb3); +OCL_KERNEL void kernel_div(OCL_GLOBAL char * src0, + ulong offset0, + OCL_GLOBAL char * src1, + ulong offset1, + OCL_GLOBAL char * dst, + ulong offsetd, + ulong nb00, + ulong nb01, + ulong nb02, + ulong nb03, + int ne10, + int ne11, + int ne12, + int ne13, + ulong nb10, + ulong nb11, + ulong nb12, + ulong nb13, + int ne0, + ulong nb0, + ulong nb1, + ulong nb2, + ulong nb3); #endif // __KERNELS_DIV_H__ diff --git a/ggml/src/ggml-opencl/kernels/ocl_defs.h b/ggml/src/ggml-opencl/kernels/ocl_defs.h index b6743890b3..edf32ad0a7 100644 --- a/ggml/src/ggml-opencl/kernels/ocl_defs.h +++ b/ggml/src/ggml-opencl/kernels/ocl_defs.h @@ -6,12 +6,10 @@ // Device (OpenCL) Definitions # define OCL_KERNEL kernel # define OCL_GLOBAL global -# define ocl_global_char_ptr global char * #else // Host (C++) Definitions # define OCL_KERNEL # define OCL_GLOBAL -# define ocl_global_char_ptr cl_mem # define __kernel # define __global # define ulong cl_ulong From e1308c80173595b4cf8bda84ed6db14e852aeae1 Mon Sep 17 00:00:00 2001 From: chraac Date: Sat, 17 Jan 2026 01:38:55 +0800 Subject: [PATCH 16/17] refactor: rename type merger to cl_func_args_concatenator for clarity and update related structures --- ggml/src/ggml-opencl/ggml-opencl.cpp | 31 ++++++++++++---------------- 1 file changed, 13 insertions(+), 18 deletions(-) diff --git a/ggml/src/ggml-opencl/ggml-opencl.cpp b/ggml/src/ggml-opencl/ggml-opencl.cpp index 6ffcc4bef1..f8034fa175 100644 --- a/ggml/src/ggml-opencl/ggml-opencl.cpp +++ b/ggml/src/ggml-opencl/ggml-opencl.cpp @@ -2775,22 +2775,22 @@ template static inline size_t cl_set_kernel_args(cl_kernel return index; } -template struct type_merger {}; +template struct cl_func_args_concatenator {}; template -struct type_merger { - using func_t = typename type_merger::func_t; +struct cl_func_args_concatenator { + using func_t = typename cl_func_args_concatenator::func_t; }; -template struct type_merger { +template struct cl_func_args_concatenator { using func_t = void(_TInnerArgs...); }; template struct cl_param_type_extractor { - using args_t = std::remove_const_t>>; - using first_func_t = - typename cl_kernel_arg_setter::func_t; - using func_t = typename type_merger::func_t>::func_t; + using args_t = std::remove_const_t>>; + using first_func_t = typename cl_kernel_arg_setter::func_t; + using func_t = typename cl_func_args_concatenator::func_t>::func_t; }; template struct cl_param_type_extractor<_TFinalArg> { @@ -2798,16 +2798,11 @@ template struct cl_param_type_extractor<_TFinalArg> { using func_t = typename cl_kernel_arg_setter::func_t; }; -template struct cl_kernel_invoker { - template static size_t invoke(cl_kernel kernel, _TCalledArgs &&... args) { - static_assert(std::is_same_v<_TFunc, typename cl_param_type_extractor<_TCalledArgs...>::func_t>, - "Kernel argument type mismatch between prototype and called arguments"); - return cl_set_kernel_args(kernel, args...); - } -}; - -template static inline size_t cl_set_kernel_args_safe(cl_kernel kernel, _TArgs &&... args) { - return cl_kernel_invoker<_TFunc>::invoke(kernel, args...); +template +static inline size_t cl_set_kernel_args_safe(cl_kernel kernel, _TArgs &&... args) { + static_assert(std::is_same_v<_TFunc, typename cl_param_type_extractor<_TArgs...>::func_t>, + "Kernel argument type mismatch between prototype and called arguments"); + return cl_set_kernel_args(kernel, args...); } } // namespace From 020c2af1fe2736eaebd74819b4fb2e340f0d620c Mon Sep 17 00:00:00 2001 From: chraac Date: Sat, 17 Jan 2026 01:45:44 +0800 Subject: [PATCH 17/17] wip --- ggml/src/ggml-opencl/ggml-opencl.cpp | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/ggml/src/ggml-opencl/ggml-opencl.cpp b/ggml/src/ggml-opencl/ggml-opencl.cpp index f8034fa175..2d8f4ea304 100644 --- a/ggml/src/ggml-opencl/ggml-opencl.cpp +++ b/ggml/src/ggml-opencl/ggml-opencl.cpp @@ -2786,21 +2786,22 @@ template struct cl_func_args_concatenator struct cl_param_type_extractor { +template struct cl_kernel_signature_builder { using args_t = std::remove_const_t>>; using first_func_t = typename cl_kernel_arg_setter::func_t; - using func_t = typename cl_func_args_concatenator::func_t>::func_t; + using func_t = + typename cl_func_args_concatenator::func_t>::func_t; }; -template struct cl_param_type_extractor<_TFinalArg> { +template struct cl_kernel_signature_builder<_TFinalArg> { using args_t = std::remove_const_t>>; using func_t = typename cl_kernel_arg_setter::func_t; }; template static inline size_t cl_set_kernel_args_safe(cl_kernel kernel, _TArgs &&... args) { - static_assert(std::is_same_v<_TFunc, typename cl_param_type_extractor<_TArgs...>::func_t>, + static_assert(std::is_same_v<_TFunc, typename cl_kernel_signature_builder<_TArgs...>::func_t>, "Kernel argument type mismatch between prototype and called arguments"); return cl_set_kernel_args(kernel, args...); }