opencl: add `SET`, support i32 for `CPY`, minor refactor for cpy (#20101)

This commit is contained in:
lhez 2026-03-04 21:32:26 -08:00 committed by GitHub
parent 1a29907d2e
commit 69fd345335
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 168 additions and 28 deletions

View File

@ -416,7 +416,6 @@ struct ggml_backend_opencl_context {
cl_program program_add;
cl_program program_add_id;
cl_program program_clamp;
cl_program program_cpy;
cl_program program_cvt;
cl_program program_diag_mask_inf;
cl_program program_gelu;
@ -514,7 +513,7 @@ struct ggml_backend_opencl_context {
cl_kernel kernel_set_rows_f32_i64, kernel_set_rows_f32_i32, kernel_set_rows_f16_i64, kernel_set_rows_f16_i32;
cl_kernel kernel_rope_norm_f32, kernel_rope_norm_f16, kernel_rope_neox_f32, kernel_rope_neox_f16;
cl_kernel kernel_rope_multi_f32, kernel_rope_multi_f16, kernel_rope_vision_f32, kernel_rope_vision_f16;
cl_kernel kernel_cpy_f16_f16, kernel_cpy_f16_f32, kernel_cpy_f32_f16, kernel_cpy_f32_f32;
cl_kernel kernel_cpy_f16_f16, kernel_cpy_f16_f32, kernel_cpy_f32_f16, kernel_cpy_f32_f32, kernel_cpy_i32_i32;
cl_kernel kernel_mul_mat_f32_f32;
cl_kernel kernel_mul_mat_f16_f16;
cl_kernel kernel_mul_mat_f16_f32_1row;
@ -873,13 +872,14 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
#else
const std::string kernel_src = read_file("cpy.cl");
#endif
backend_ctx->program_cpy =
cl_program prog =
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
CL_CHECK((backend_ctx->kernel_cpy_f16_f16 = clCreateKernel(backend_ctx->program_cpy, "kernel_cpy_f16_f16", &err), err));
CL_CHECK((backend_ctx->kernel_cpy_f16_f32 = clCreateKernel(backend_ctx->program_cpy, "kernel_cpy_f16_f32", &err), err));
CL_CHECK((backend_ctx->kernel_cpy_f32_f16 = clCreateKernel(backend_ctx->program_cpy, "kernel_cpy_f32_f16", &err), err));
CL_CHECK((backend_ctx->kernel_cpy_f32_f32 = clCreateKernel(backend_ctx->program_cpy, "kernel_cpy_f32_f32", &err), err));
CL_CHECK((backend_ctx->kernel_cpy_f16_f16 = clCreateKernel(prog, "kernel_cpy_f16_f16", &err), err));
CL_CHECK((backend_ctx->kernel_cpy_f16_f32 = clCreateKernel(prog, "kernel_cpy_f16_f32", &err), err));
CL_CHECK((backend_ctx->kernel_cpy_f32_f16 = clCreateKernel(prog, "kernel_cpy_f32_f16", &err), err));
CL_CHECK((backend_ctx->kernel_cpy_f32_f32 = clCreateKernel(prog, "kernel_cpy_f32_f32", &err), err));
CL_CHECK((backend_ctx->kernel_cpy_i32_i32 = clCreateKernel(prog, "kernel_cpy_i32_i32", &err), err));
GGML_LOG_CONT(".");
}
@ -3544,9 +3544,21 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
default:
return false;
}
case GGML_TYPE_I32:
switch (op->type) {
case GGML_TYPE_I32:
return true;
default:
return false;
}
default:
return false;
}
case GGML_OP_SET: {
return (op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_I32) &&
op->type == op->src[0]->type &&
op->type == op->src[1]->type;
}
case GGML_OP_SCALE:
return op->src[0]->type == GGML_TYPE_F32 && ggml_is_contiguous(op->src[0]);
case GGML_OP_ADD:
@ -10782,28 +10794,13 @@ static void ggml_cl_cpy(ggml_backend_t backend, const ggml_tensor * src0, const
// GGML_OP_DUP and GGML_OP_CONT happen between src0 and dst.
UNUSED(dst);
const int ne00 = src0 ? src0->ne[0] : 0;
const int ne01 = src0 ? src0->ne[1] : 0;
const int ne02 = src0 ? src0->ne[2] : 0;
const int ne03 = src0 ? src0->ne[3] : 0;
GGML_TENSOR_LOCALS(int, ne0, src0, ne);
GGML_TENSOR_LOCALS(cl_ulong, nb0, src0, nb);
GGML_TENSOR_LOCALS(int, ne1, src1, ne);
GGML_TENSOR_LOCALS(cl_ulong, nb1, src1, nb);
const cl_ulong nb00 = src0 ? src0->nb[0] : 0;
const cl_ulong nb01 = src0 ? src0->nb[1] : 0;
const cl_ulong nb02 = src0 ? src0->nb[2] : 0;
const cl_ulong nb03 = src0 ? src0->nb[3] : 0;
const int ne10 = src1 ? src1->ne[0] : 0;
const int ne11 = src1 ? src1->ne[1] : 0;
const int ne12 = src1 ? src1->ne[2] : 0;
const int ne13 = src1 ? src1->ne[3] : 0;
const cl_ulong nb10 = src1 ? src1->nb[0] : 0;
const cl_ulong nb11 = src1 ? src1->nb[1] : 0;
const cl_ulong nb12 = src1 ? src1->nb[2] : 0;
const cl_ulong nb13 = src1 ? src1->nb[3] : 0;
const enum ggml_type src0t = src0 ? src0->type : GGML_TYPE_COUNT;
const enum ggml_type src1t = src1 ? src1->type : GGML_TYPE_COUNT;
const enum ggml_type src0t = src0->type;
const enum ggml_type src1t = src1->type;
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
@ -10840,6 +10837,15 @@ static void ggml_cl_cpy(ggml_backend_t backend, const ggml_tensor * src0, const
GGML_ASSERT(false && "not implemented");
}
break;
case GGML_TYPE_I32:
switch (src1t) {
case GGML_TYPE_I32:
kernel = backend_ctx->kernel_cpy_i32_i32;
break;
default:
GGML_ASSERT(false && "not implemented");
}
break;
default:
GGML_ASSERT(false && "not implemented");
}
@ -10878,6 +10884,89 @@ static void ggml_cl_dup(ggml_backend_t backend, const ggml_tensor * src0, const
UNUSED(src1);
}
// GGML_OP_SET: write src1 into a strided view of dst described by
// dst->op_params, optionally after first copying src0 into dst.
// Supported element types: f32 and i32 (all three tensors must agree).
static void ggml_cl_set(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    GGML_ASSERT(src0);
    GGML_ASSERT(src0->extra);
    GGML_ASSERT(src1);
    GGML_ASSERT(src1->extra);
    GGML_ASSERT(dst);
    GGML_ASSERT(dst->extra);
    // Only f32 and i32 are implemented, and src0/src1/dst must share one type.
    GGML_ASSERT((src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_I32) &&
                src1->type == src0->type && dst->type == src0->type);

    GGML_TENSOR_LOCALS(int,      ne0, src0, ne);   // ne00..ne03
    GGML_TENSOR_LOCALS(cl_ulong, nb0, src0, nb);   // nb00..nb03
    GGML_TENSOR_LOCALS(int,      ne1, src1, ne);   // ne10..ne13
    GGML_TENSOR_LOCALS(cl_ulong, nb1, src1, nb);   // nb10..nb13
    GGML_TENSOR_LOCALS(int,      ne,  dst,  ne);   // ne0..ne3
    GGML_TENSOR_LOCALS(cl_ulong, nb,  dst,  nb);   // nb0..nb3

    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;

    ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
    ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;

    cl_ulong offset1 = extra1->offset + src1->view_offs;
    cl_ulong offsetd = extrad->offset + dst->view_offs;

    // GGML_OP_SET op_params layout: [0..2] = view strides nb1/nb2/nb3 (bytes),
    // [3] = byte offset of the view into dst, [4] = inplace flag.
    const cl_ulong pnb1 = ((const int32_t *)dst->op_params)[0];
    const cl_ulong pnb2 = ((const int32_t *)dst->op_params)[1];
    const cl_ulong pnb3 = ((const int32_t *)dst->op_params)[2];
    const cl_ulong offs = ((const int32_t *)dst->op_params)[3];
    const bool inplace  = (bool)((const int32_t *)dst->op_params)[4];

    cl_kernel kernel = nullptr;

    // for inplace case, dst is a view of src0 and is updated on top of it
    // so for non-inplace case, copy src0 to dst first
    if (!inplace) {
        // ggml_cl_cpy copies its second tensor arg into its third; the last
        // (dst) parameter is unused by that function.
        ggml_cl_cpy(backend, src0, dst, nullptr);
    }

    // then copy src1 to dst with specified offset
    // (reuse the cpy kernels; src = src1, dst = strided view of dst)
    if (src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
        kernel = backend_ctx->kernel_cpy_f32_f32;
    } else if (src1->type == GGML_TYPE_I32 && dst->type == GGML_TYPE_I32) {
        kernel = backend_ctx->kernel_cpy_i32_i32;
    } else {
        GGML_ASSERT(false && "not implemented");
    }

    // shift the destination base pointer to the start of the view
    offsetd += offs;

    // nb0 of the view is just the element size (contiguous innermost dim)
    cl_ulong nb = ggml_element_size(dst);

    // cpy kernel arg layout: src, src offset, dst, dst offset,
    // src extents ne00..ne03, src strides nb00..nb03,
    // dst extents ne0..ne3, dst strides nb0..nb3.
    // The view has the same logical extents as src1, so ne10..ne13 are passed
    // for both; the view's strides come from op_params (pnb1..pnb3).
    CL_CHECK(clSetKernelArg(kernel,  0, sizeof(cl_mem),   &extra1->data_device));
    CL_CHECK(clSetKernelArg(kernel,  1, sizeof(cl_ulong), &offset1));
    CL_CHECK(clSetKernelArg(kernel,  2, sizeof(cl_mem),   &extrad->data_device));
    CL_CHECK(clSetKernelArg(kernel,  3, sizeof(cl_ulong), &offsetd));
    CL_CHECK(clSetKernelArg(kernel,  4, sizeof(int),      &ne10));
    CL_CHECK(clSetKernelArg(kernel,  5, sizeof(int),      &ne11));
    CL_CHECK(clSetKernelArg(kernel,  6, sizeof(int),      &ne12));
    CL_CHECK(clSetKernelArg(kernel,  7, sizeof(int),      &ne13));
    CL_CHECK(clSetKernelArg(kernel,  8, sizeof(cl_ulong), &nb10));
    CL_CHECK(clSetKernelArg(kernel,  9, sizeof(cl_ulong), &nb11));
    CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb12));
    CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb13));
    CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int),      &ne10));
    CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int),      &ne11));
    CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int),      &ne12));
    CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int),      &ne13));
    CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &nb));
    CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong), &pnb1));
    CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_ulong), &pnb2));
    CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong), &pnb3));

    int max_local_size = backend_ctx->get_kernel_workgroup_size(kernel);
    // one workgroup per row; nth threads cooperate along the innermost dim
    const int nth = MIN(max_local_size, ne00);

    size_t global_work_size[] = {(size_t)ne11*nth, (size_t)ne12, (size_t)ne13};
    size_t local_work_size[]  = {(size_t)nth, 1, 1};

    backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
}
static void ggml_cl_diag_mask_inf(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
GGML_ASSERT(src0);
GGML_ASSERT(src0->extra);
@ -11651,6 +11740,12 @@ bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor
}
func = ggml_cl_cpy;
break;
case GGML_OP_SET:
if (!any_on_device) {
return false;
}
func = ggml_cl_set;
break;
case GGML_OP_DUP:
case GGML_OP_CONT:
if (!any_on_device) {

View File

@ -182,3 +182,48 @@ kernel void kernel_cpy_f32_f32(
dst_data[i00] = src[0];
}
}
//------------------------------------------------------------------------------
// cpy: i32 -> i32
//
// One workgroup copies one source row (i01, i02, i03); the work-items of the
// group stride across the ne00 elements of that row. The row's flat element
// index is re-decomposed into destination coordinates, so src and dst may
// have different shapes as long as the total element count matches.
//------------------------------------------------------------------------------
kernel void kernel_cpy_i32_i32(
        global int * src0,
        ulong offset0,
        global int * dst,
        ulong offsetd,
        int ne00,
        int ne01,
        int ne02,
        int ne03,
        ulong nb00,
        ulong nb01,
        ulong nb02,
        ulong nb03,
        int ne0,
        int ne1,
        int ne2,
        int ne3,
        ulong nb0,
        ulong nb1,
        ulong nb2,
        ulong nb3
) {
    src0 = (global int*)((global char*)src0 + offset0);
    dst  = (global int*)((global char*)dst  + offsetd);

    const int i01 = get_group_id(0);
    const int i02 = get_group_id(1);
    const int i03 = get_group_id(2);

    // flat index of the first element of this source row
    const int flat = ((i03*ne02 + i02)*ne01 + i01)*ne00;

    // decompose the flat index into destination coordinates
    const int i3  = flat / (ne2*ne1*ne0);
    int       rem = flat % (ne2*ne1*ne0);
    const int i2  = rem / (ne1*ne0);
    rem           = rem % (ne1*ne0);
    const int i1  = rem / ne0;
    const int i0  = rem % ne0;

    global int * dst_row = (global int *)((global char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);

    for (int i00 = get_local_id(0); i00 < ne00; i00 += get_local_size(0)) {
        global const int * src = (global int *)((global char *) src0 + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00);
        dst_row[i00] = src[0];
    }
}