opencl: unpack q4_0 for adreno in get_tensor (#18278)
This commit is contained in:
parent
e3b35ddf1c
commit
eb492bf43f
|
|
@ -494,6 +494,7 @@ struct ggml_backend_opencl_context {
|
|||
cl_kernel kernel_convert_block_q8_0, kernel_restore_block_q8_0;
|
||||
cl_kernel kernel_mul_mat_q4_0_f32_8x_flat;
|
||||
cl_kernel kernel_convert_block_q4_0_noshuffle;
|
||||
cl_kernel kernel_restore_block_q4_0_noshuffle;
|
||||
cl_kernel kernel_mul_mat_q4_0_f32_1d_8x_flat, kernel_mul_mat_q4_0_f32_1d_16x_flat;
|
||||
cl_kernel kernel_mul_mv_q6_K_f32;
|
||||
cl_kernel kernel_mul_mv_mxfp4_f32, kernel_mul_mv_mxfp4_f32_flat;
|
||||
|
|
@ -634,6 +635,7 @@ struct ggml_backend_opencl_context {
|
|||
cl_kernel kernel_transpose_32;
|
||||
cl_kernel kernel_transpose_32_16;
|
||||
cl_kernel kernel_transpose_16;
|
||||
cl_kernel kernel_transpose_16_buf;
|
||||
cl_kernel kernel_transpose_16_4x1;
|
||||
|
||||
cl_mem A_s_d_max; // max scale buffer size for transpose
|
||||
|
|
@ -806,6 +808,7 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
|
|||
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
|
||||
|
||||
CL_CHECK((backend_ctx->kernel_convert_block_q4_0_noshuffle = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q4_0_noshuffle", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_restore_block_q4_0_noshuffle = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q4_0_noshuffle", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_convert_block_q4_0 = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q4_0", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_restore_block_q4_0 = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q4_0", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_convert_block_mxfp4 = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_mxfp4", &err), err));
|
||||
|
|
@ -2004,7 +2007,8 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
|
|||
CL_CHECK((backend_ctx->kernel_transpose_32_16 = clCreateKernel(backend_ctx->program_transpose, "kernel_transpose_32_16", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_transpose_32 = clCreateKernel(backend_ctx->program_transpose, "kernel_transpose_32", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_transpose_16 = clCreateKernel(backend_ctx->program_transpose, "kernel_transpose_16", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_transpose_16_4x1 = clCreateKernel(backend_ctx->program_transpose, "kernel_transpose_16_4x1", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_transpose_16_buf = clCreateKernel(backend_ctx->program_transpose, "kernel_transpose_16_buf", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_transpose_16_4x1 = clCreateKernel(backend_ctx->program_transpose, "kernel_transpose_16_4x1", &err), err));
|
||||
GGML_LOG_CONT(".");
|
||||
}
|
||||
|
||||
|
|
@ -3933,6 +3937,91 @@ static void ggml_backend_opencl_buffer_get_tensor(ggml_backend_buffer_t buffer,
|
|||
if (tensor->type == GGML_TYPE_Q4_0) {
|
||||
ggml_tensor_extra_cl_q4_0 * extra = (ggml_tensor_extra_cl_q4_0 *)tensor->extra;
|
||||
|
||||
#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
|
||||
if (use_adreno_kernels(backend_ctx, tensor)) {
|
||||
cl_int err;
|
||||
cl_kernel kernel;
|
||||
|
||||
cl_int M = tensor->ne[1]; // ne01
|
||||
cl_int K = tensor->ne[0]; // ne00
|
||||
|
||||
GGML_ASSERT(K % 32 == 0);
|
||||
GGML_ASSERT(M % 4 == 0);
|
||||
|
||||
size_t size_q = (ggml_nelements(tensor)/ggml_blck_size(tensor->type))*ggml_blck_size(tensor->type)/2;
|
||||
size_t size_d = (ggml_nelements(tensor)/ggml_blck_size(tensor->type))*sizeof(ggml_fp16_t);
|
||||
GGML_ASSERT(size_d + size_q == ggml_nbytes(tensor) && "Incorrect tensor size");
|
||||
|
||||
cl_mem buf_trans_q;
|
||||
cl_mem buf_trans_d;
|
||||
|
||||
CL_CHECK((buf_trans_q = clCreateBuffer(context, CL_MEM_READ_WRITE,
|
||||
size_q, NULL, &err), err));
|
||||
CL_CHECK((buf_trans_d = clCreateBuffer(context, CL_MEM_READ_WRITE,
|
||||
size_d, NULL, &err), err));
|
||||
|
||||
kernel = backend_ctx->kernel_transpose_16_buf;
|
||||
|
||||
// transpose q back
|
||||
cl_int stride_k_q = K/4;
|
||||
size_t local_size_q[3] = {64, 1, 1};
|
||||
size_t global_size_q[3] = {(size_t)M, (size_t)stride_k_q, 1};
|
||||
|
||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra->q));
|
||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &buf_trans_q));
|
||||
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_int), &M));
|
||||
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_int), &stride_k_q));
|
||||
|
||||
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL,
|
||||
global_size_q, local_size_q, 0, NULL, NULL));
|
||||
|
||||
// transpose scales back
|
||||
cl_int stride_k_d = K/32;
|
||||
size_t local_size_d[3] = {64, 1, 1};
|
||||
size_t global_size_d[3] = {(size_t)M, (size_t)stride_k_d, 1};
|
||||
|
||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra->d));
|
||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &buf_trans_d));
|
||||
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_int), &M));
|
||||
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_int), &stride_k_d));
|
||||
|
||||
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL,
|
||||
global_size_d, local_size_d, 0, NULL, NULL));
|
||||
|
||||
// unpack
|
||||
cl_mem data_device = clCreateBuffer(context, CL_MEM_READ_WRITE,
|
||||
ggml_nbytes(tensor), NULL, &err);
|
||||
CL_CHECK(err);
|
||||
|
||||
cl_uchar mask_0F = 0x0F;
|
||||
cl_uchar mask_F0 = 0xF0;
|
||||
|
||||
size_t global_work_size[] = {(size_t)ggml_nelements(tensor)/ggml_blck_size(tensor->type), 1, 1};
|
||||
size_t local_work_size[] = {1, 1, 1};
|
||||
|
||||
kernel = backend_ctx->kernel_restore_block_q4_0_noshuffle;
|
||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &buf_trans_q));
|
||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &buf_trans_d));
|
||||
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_uchar), &mask_0F));
|
||||
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_uchar), &mask_F0));
|
||||
|
||||
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL,
|
||||
global_work_size, local_work_size, 0, NULL, NULL));
|
||||
|
||||
// read back to host
|
||||
CL_CHECK(clEnqueueReadBuffer(
|
||||
queue, data_device, CL_TRUE, offset,
|
||||
size, data, 0, NULL, NULL));
|
||||
|
||||
CL_CHECK(clReleaseMemObject(data_device));
|
||||
CL_CHECK(clReleaseMemObject(buf_trans_q));
|
||||
CL_CHECK(clReleaseMemObject(buf_trans_d));
|
||||
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
|
||||
cl_int err;
|
||||
cl_mem data_device = clCreateBuffer(context, CL_MEM_READ_WRITE,
|
||||
ggml_nbytes(tensor), NULL, &err);
|
||||
|
|
|
|||
|
|
@ -117,6 +117,27 @@ kernel void kernel_convert_block_q4_0_noshuffle(
|
|||
}
|
||||
}
|
||||
|
||||
kernel void kernel_restore_block_q4_0_noshuffle(
|
||||
global uchar * src_q,
|
||||
global half * src_d,
|
||||
global struct block_q4_0 * dst,
|
||||
uchar mask_0F,
|
||||
uchar mask_F0
|
||||
) {
|
||||
global struct block_q4_0 * b = (global struct block_q4_0 *) dst + get_global_id(0);
|
||||
global uchar * q = (global uchar *) src_q + QK4_0/2*get_global_id(0);
|
||||
global half * d = (global half *) src_d + get_global_id(0);
|
||||
|
||||
b->d = *d;
|
||||
for (int i = 0; i < QK4_0/4; ++i) {
|
||||
uchar x0 = q[i + 0 ] ;
|
||||
uchar x1 = q[i + QK4_0/4];
|
||||
|
||||
b->qs[2*i + 0] = convert_uchar((x0 & mask_0F) | ((x1 & mask_0F) << 4));
|
||||
b->qs[2*i + 1] = convert_uchar(((x0 & mask_F0) >> 4) | (x1 & mask_F0));
|
||||
}
|
||||
}
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// block_mxfp4
|
||||
//------------------------------------------------------------------------------
|
||||
|
|
|
|||
|
|
@ -44,6 +44,19 @@ kernel void kernel_transpose_16_4x1(
|
|||
write_imageh(output, i * rows + j, (half4)(temp0, temp1, temp2, temp3));
|
||||
}
|
||||
|
||||
// Transpose treating each element as 16-bit using buffer
|
||||
kernel void kernel_transpose_16_buf(
|
||||
global const ushort * input,
|
||||
global ushort * output,
|
||||
const int ldi,
|
||||
const int ldo
|
||||
) {
|
||||
const int x = get_global_id(0);
|
||||
const int y = get_global_id(1);
|
||||
|
||||
output[x*ldo + y] = input[y*ldi + x];
|
||||
}
|
||||
|
||||
// 32-bit transpose, loading/storing a 4x4 tile of elements
|
||||
kernel void kernel_transpose_32(
|
||||
__read_only image1d_buffer_t input,
|
||||
|
|
|
|||
Loading…
Reference in New Issue