Merge 7794488b32 into dc6592431b
This commit is contained in:
commit
c37a428032
|
|
@ -89,6 +89,7 @@ set(GGML_OPENCL_KERNELS
|
|||
mul_mv_q4_1_f32
|
||||
mul_mv_q4_1_f32_flat
|
||||
mul_mv_q4_k_f32
|
||||
mul_mv_q4_k_f32_flat
|
||||
mul_mv_q6_k_f32
|
||||
mul_mv_q6_k_f32_flat
|
||||
mul_mv_q8_0_f32
|
||||
|
|
@ -107,6 +108,7 @@ set(GGML_OPENCL_KERNELS
|
|||
mul_mm_q4_0_f32_l4_lm
|
||||
mul_mm_q4_1_f32_l4_lm
|
||||
mul_mm_q8_0_f32_l4_lm
|
||||
mul_mm_q4_k_f32_l4_lm
|
||||
mul_mm_q6_k_f32_l4_lm
|
||||
mul_mm_q8_0_f32_8x4
|
||||
gemv_noshuffle_q4_1_f32
|
||||
|
|
|
|||
|
|
@ -534,11 +534,13 @@ struct ggml_backend_opencl_context {
|
|||
cl_kernel kernel_restore_block_q4_0_noshuffle;
|
||||
cl_kernel kernel_convert_block_q4_1_noshuffle;
|
||||
cl_kernel kernel_restore_block_q4_1_noshuffle;
|
||||
cl_kernel kernel_convert_block_q4_K, kernel_restore_block_q4_K;
|
||||
cl_kernel kernel_convert_block_q6_K, kernel_restore_block_q6_K;
|
||||
cl_kernel kernel_mul_mat_q4_0_f32_1d_8x_flat, kernel_mul_mat_q4_0_f32_1d_16x_flat;
|
||||
cl_kernel kernel_mul_mv_q4_1_f32;
|
||||
cl_kernel kernel_mul_mv_q4_1_f32_flat;
|
||||
cl_kernel kernel_mul_mv_q4_K_f32;
|
||||
cl_kernel kernel_mul_mv_q4_K_f32_flat;
|
||||
cl_kernel kernel_mul_mv_q6_K_f32;
|
||||
cl_kernel kernel_mul_mv_q6_K_f32_flat;
|
||||
cl_kernel kernel_mul_mv_mxfp4_f32, kernel_mul_mv_mxfp4_f32_flat;
|
||||
|
|
@ -578,6 +580,7 @@ struct ggml_backend_opencl_context {
|
|||
cl_kernel kernel_mul_mm_q4_0_f32_l4_lm;
|
||||
cl_kernel kernel_mul_mm_q4_1_f32_l4_lm;
|
||||
cl_kernel kernel_mul_mm_q8_0_f32_l4_lm;
|
||||
cl_kernel kernel_mul_mm_q4_k_f32_l4_lm;
|
||||
cl_kernel kernel_mul_mm_q6_k_f32_l4_lm;
|
||||
|
||||
std::vector<ProfilingInfo> profiling_info;
|
||||
|
|
@ -917,6 +920,8 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
|
|||
CL_CHECK((backend_ctx->kernel_convert_block_q8_0 = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q8_0", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_restore_block_q8_0 = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q8_0", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_restore_block_q8_0_trans = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q8_0_trans", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_convert_block_q4_K = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q4_K", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_restore_block_q4_K = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q4_K", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_convert_block_q6_K = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q6_K", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_restore_block_q6_K = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q6_K", &err), err));
|
||||
GGML_LOG_CONT(".");
|
||||
|
|
@ -1209,6 +1214,23 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
|
|||
GGML_LOG_CONT(".");
|
||||
}
|
||||
|
||||
// mul_mv_q4_k_f32_flat
|
||||
{
|
||||
#ifdef GGML_OPENCL_EMBED_KERNELS
|
||||
const std::string kernel_src {
|
||||
#include "mul_mv_q4_k_f32_flat.cl.h"
|
||||
};
|
||||
#else
|
||||
const std::string kernel_src = read_file("mul_mv_q4_k_f32_flat.cl");
|
||||
#endif
|
||||
cl_program prog =
|
||||
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
|
||||
|
||||
CL_CHECK((backend_ctx->kernel_mul_mv_q4_K_f32_flat = clCreateKernel(prog, "kernel_mul_mv_q4_K_f32_flat", &err), err));
|
||||
CL_CHECK(clReleaseProgram(prog));
|
||||
GGML_LOG_CONT(".");
|
||||
}
|
||||
|
||||
// mul_mv_q6_k_f32
|
||||
{
|
||||
#ifdef GGML_OPENCL_EMBED_KERNELS
|
||||
|
|
@ -1482,6 +1504,23 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
|
|||
GGML_LOG_CONT(".");
|
||||
}
|
||||
|
||||
// mul_mm_q4_k_f32_l4_lm
|
||||
{
|
||||
#ifdef GGML_OPENCL_EMBED_KERNELS
|
||||
const std::string kernel_src {
|
||||
#include "mul_mm_q4_k_f32_l4_lm.cl.h"
|
||||
};
|
||||
#else
|
||||
const std::string kernel_src = read_file("mul_mm_q4_k_f32_l4_lm.cl");
|
||||
#endif
|
||||
cl_program prog =
|
||||
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
|
||||
|
||||
CL_CHECK((backend_ctx->kernel_mul_mm_q4_k_f32_l4_lm = clCreateKernel(prog, "kernel_mul_mm_q4_k_f32_l4_lm", &err), err));
|
||||
CL_CHECK(clReleaseProgram(prog));
|
||||
GGML_LOG_CONT(".");
|
||||
}
|
||||
|
||||
// mul_mm_q6_k_f32_l4_lm
|
||||
{
|
||||
#ifdef GGML_OPENCL_EMBED_KERNELS
|
||||
|
|
@ -3347,6 +3386,40 @@ struct ggml_tensor_extra_cl_q8_0 {
|
|||
}
|
||||
};
|
||||
|
||||
struct ggml_tensor_extra_cl_q4_K {
|
||||
// Quantized values
|
||||
cl_mem q = nullptr;
|
||||
// Scales for each super block.
|
||||
cl_mem s = nullptr;
|
||||
// Scales
|
||||
cl_mem d = nullptr;
|
||||
// Min
|
||||
cl_mem dm = nullptr;
|
||||
|
||||
~ggml_tensor_extra_cl_q4_K() {
|
||||
reset();
|
||||
}
|
||||
|
||||
void reset() {
|
||||
if (q != nullptr) {
|
||||
CL_CHECK(clReleaseMemObject(q));
|
||||
q = nullptr;
|
||||
}
|
||||
if (s != nullptr) {
|
||||
CL_CHECK(clReleaseMemObject(s));
|
||||
s = nullptr;
|
||||
}
|
||||
if (d != nullptr) {
|
||||
CL_CHECK(clReleaseMemObject(d));
|
||||
d = nullptr;
|
||||
}
|
||||
if (dm != nullptr) {
|
||||
CL_CHECK(clReleaseMemObject(dm));
|
||||
dm = nullptr;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
struct ggml_tensor_extra_cl_q6_K {
|
||||
// Lower 4 bits of quantized weights.
|
||||
cl_mem ql = nullptr;
|
||||
|
|
@ -3956,6 +4029,12 @@ struct ggml_backend_opencl_buffer_context {
|
|||
for (ggml_tensor_extra_cl_q8_0 * e : temp_tensor_extras_q8_0_in_use) {
|
||||
delete e;
|
||||
}
|
||||
for (ggml_tensor_extra_cl_q4_K * e : temp_tensor_extras_q4_K) {
|
||||
delete e;
|
||||
}
|
||||
for (ggml_tensor_extra_cl_q4_K * e : temp_tensor_extras_q4_K_in_use) {
|
||||
delete e;
|
||||
}
|
||||
for (ggml_tensor_extra_cl_q6_K * e : temp_tensor_extras_q6_K) {
|
||||
delete e;
|
||||
}
|
||||
|
|
@ -4039,6 +4118,21 @@ struct ggml_backend_opencl_buffer_context {
|
|||
return extra;
|
||||
}
|
||||
|
||||
ggml_tensor_extra_cl_q4_K * ggml_opencl_alloc_temp_tensor_extra_q4_K() {
|
||||
ggml_tensor_extra_cl_q4_K * extra;
|
||||
if (temp_tensor_extras_q4_K.empty()) {
|
||||
extra = new ggml_tensor_extra_cl_q4_K();
|
||||
} else {
|
||||
extra = temp_tensor_extras_q4_K.back();
|
||||
temp_tensor_extras_q4_K.pop_back();
|
||||
}
|
||||
|
||||
temp_tensor_extras_q4_K_in_use.push_back(extra);
|
||||
|
||||
extra->reset();
|
||||
return extra;
|
||||
}
|
||||
|
||||
ggml_tensor_extra_cl_q6_K * ggml_opencl_alloc_temp_tensor_extra_q6_K() {
|
||||
ggml_tensor_extra_cl_q6_K * extra;
|
||||
if (temp_tensor_extras_q6_K.empty()) {
|
||||
|
|
@ -4080,6 +4174,11 @@ struct ggml_backend_opencl_buffer_context {
|
|||
}
|
||||
temp_tensor_extras_q8_0_in_use.clear();
|
||||
|
||||
for (ggml_tensor_extra_cl_q4_K * e : temp_tensor_extras_q4_K_in_use) {
|
||||
temp_tensor_extras_q4_K.push_back(e);
|
||||
}
|
||||
temp_tensor_extras_q4_K_in_use.clear();
|
||||
|
||||
for (ggml_tensor_extra_cl_q6_K * e : temp_tensor_extras_q6_K_in_use) {
|
||||
temp_tensor_extras_q6_K.push_back(e);
|
||||
}
|
||||
|
|
@ -4101,6 +4200,8 @@ struct ggml_backend_opencl_buffer_context {
|
|||
std::vector<ggml_tensor_extra_cl_mxfp4 *> temp_tensor_extras_mxfp4_in_use;
|
||||
std::vector<ggml_tensor_extra_cl_q8_0 *> temp_tensor_extras_q8_0;
|
||||
std::vector<ggml_tensor_extra_cl_q8_0 *> temp_tensor_extras_q8_0_in_use;
|
||||
std::vector<ggml_tensor_extra_cl_q4_K *> temp_tensor_extras_q4_K;
|
||||
std::vector<ggml_tensor_extra_cl_q4_K *> temp_tensor_extras_q4_K_in_use;
|
||||
std::vector<ggml_tensor_extra_cl_q6_K *> temp_tensor_extras_q6_K;
|
||||
std::vector<ggml_tensor_extra_cl_q6_K *> temp_tensor_extras_q6_K_in_use;
|
||||
|
||||
|
|
@ -4835,6 +4936,83 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
|
|||
|
||||
return;
|
||||
}
|
||||
if (tensor->type == GGML_TYPE_Q4_K) {
|
||||
ggml_tensor_extra_cl * extra_orig = (ggml_tensor_extra_cl *)tensor->extra;
|
||||
GGML_ASSERT(extra_orig && "Tesnors in OpenCL backend should have been allocated and initialized");
|
||||
|
||||
// Allocate the new extra and create aliases from the original.
|
||||
ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context;
|
||||
ggml_tensor_extra_cl_q4_K * extra = ctx->ggml_opencl_alloc_temp_tensor_extra_q4_K();
|
||||
|
||||
size_t size_d = ggml_nelements(tensor)/ggml_blck_size(tensor->type)*sizeof(ggml_fp16_t);
|
||||
size_t size_dm = ggml_nelements(tensor)/ggml_blck_size(tensor->type)*sizeof(ggml_fp16_t);
|
||||
size_t size_s = ggml_nelements(tensor)/ggml_blck_size(tensor->type)*(3 * ggml_blck_size(tensor->type) / 64);
|
||||
size_t size_q = ggml_nelements(tensor)/ggml_blck_size(tensor->type)*ggml_blck_size(tensor->type)/2;
|
||||
GGML_ASSERT(size_d + size_dm + size_s + size_q == ggml_nbytes(tensor) && "Incorrect tensor size");
|
||||
|
||||
cl_int err;
|
||||
cl_mem data_device = clCreateBuffer(context, CL_MEM_READ_WRITE,
|
||||
ggml_nbytes(tensor), NULL, &err);
|
||||
CL_CHECK(err);
|
||||
CL_CHECK(clEnqueueWriteBuffer(
|
||||
queue, data_device, CL_TRUE, 0,
|
||||
ggml_nbytes(tensor), data, 0, NULL, NULL));
|
||||
|
||||
cl_buffer_region region;
|
||||
|
||||
// Create subbuffer for d.
|
||||
region.origin = align_to(extra_orig->offset + tensor->view_offs + offset, backend_ctx->alignment);
|
||||
region.size = size_d;
|
||||
extra->d = clCreateSubBuffer(
|
||||
extra_orig->data_device, CL_MEM_READ_WRITE,
|
||||
CL_BUFFER_CREATE_TYPE_REGION, ®ion, &err);
|
||||
CL_CHECK(err);
|
||||
auto previous_origin = region.origin;
|
||||
|
||||
// Create subbuffer for mins.
|
||||
region.origin = align_to(previous_origin + size_d, backend_ctx->alignment);
|
||||
region.size = size_dm;
|
||||
extra->dm = clCreateSubBuffer(
|
||||
extra_orig->data_device, CL_MEM_READ_WRITE,
|
||||
CL_BUFFER_CREATE_TYPE_REGION, ®ion, &err);
|
||||
CL_CHECK(err);
|
||||
previous_origin = region.origin;
|
||||
|
||||
// Create subbuffer for s.
|
||||
region.origin = align_to(previous_origin + size_dm, backend_ctx->alignment);
|
||||
region.size = size_s;
|
||||
extra->s = clCreateSubBuffer(
|
||||
extra_orig->data_device, CL_MEM_READ_WRITE,
|
||||
CL_BUFFER_CREATE_TYPE_REGION, ®ion, &err);
|
||||
CL_CHECK(err);
|
||||
previous_origin = region.origin;
|
||||
|
||||
// Create subbuffer for quants.
|
||||
region.origin = align_to(previous_origin + size_s, backend_ctx->alignment);
|
||||
region.size = size_q;
|
||||
extra->q = clCreateSubBuffer(
|
||||
extra_orig->data_device, CL_MEM_READ_WRITE,
|
||||
CL_BUFFER_CREATE_TYPE_REGION, ®ion, &err);
|
||||
CL_CHECK(err);
|
||||
|
||||
cl_kernel kernel = backend_ctx->kernel_convert_block_q4_K;
|
||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->q));
|
||||
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra->s));
|
||||
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &extra->d));
|
||||
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extra->dm));
|
||||
|
||||
size_t global_work_size[] = {(size_t)ggml_nelements(tensor)/ggml_blck_size(tensor->type), 1, 1};
|
||||
size_t local_work_size[] = {64, 1, 1};
|
||||
|
||||
cl_event evt;
|
||||
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
|
||||
CL_CHECK(clWaitForEvents(1, &evt));
|
||||
CL_CHECK(clReleaseMemObject(data_device));
|
||||
|
||||
tensor->extra = extra;
|
||||
return;
|
||||
}
|
||||
if (tensor->type == GGML_TYPE_Q6_K) {
|
||||
ggml_tensor_extra_cl * extra_orig = (ggml_tensor_extra_cl *)tensor->extra;
|
||||
GGML_ASSERT(extra_orig && "Tesnors in OpenCL backend should have been allocated and initialized");
|
||||
|
|
@ -5245,6 +5423,34 @@ static void ggml_backend_opencl_buffer_get_tensor(ggml_backend_buffer_t buffer,
|
|||
CL_CHECK(clReleaseMemObject(data_device));
|
||||
return;
|
||||
}
|
||||
if (tensor->type == GGML_TYPE_Q4_K) {
|
||||
ggml_tensor_extra_cl_q4_K * extra = (ggml_tensor_extra_cl_q4_K *)tensor->extra;
|
||||
|
||||
cl_int err;
|
||||
cl_mem data_device = clCreateBuffer(context, CL_MEM_READ_WRITE,
|
||||
ggml_nbytes(tensor), NULL, &err);
|
||||
CL_CHECK(err);
|
||||
|
||||
cl_kernel kernel = backend_ctx->kernel_restore_block_q4_K;
|
||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra->q));
|
||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->s));
|
||||
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra->d));
|
||||
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &extra->dm));
|
||||
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &data_device));
|
||||
|
||||
size_t global_work_size[] = {(size_t)ggml_nelements(tensor)/ggml_blck_size(tensor->type), 1, 1};
|
||||
size_t local_work_size[] = {1, 1, 1};
|
||||
|
||||
cl_event evt;
|
||||
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL,
|
||||
global_work_size, local_work_size, 0, NULL, &evt));
|
||||
CL_CHECK(clWaitForEvents(1, &evt));
|
||||
CL_CHECK(clEnqueueReadBuffer(
|
||||
queue, data_device, CL_TRUE, offset,
|
||||
size, data, 0, NULL, NULL));
|
||||
CL_CHECK(clReleaseMemObject(data_device));
|
||||
return;
|
||||
}
|
||||
if (tensor->type == GGML_TYPE_Q6_K) {
|
||||
ggml_tensor_extra_cl_q6_K * extra = (ggml_tensor_extra_cl_q6_K *)tensor->extra;
|
||||
|
||||
|
|
@ -9357,6 +9563,7 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
|
|||
ggml_tensor_extra_cl_q4_1 * extra0_q4_1 = (ggml_tensor_extra_cl_q4_1 *)src0->extra;
|
||||
ggml_tensor_extra_cl_mxfp4 * extra0_mxfp4 = (ggml_tensor_extra_cl_mxfp4 *)src0->extra;
|
||||
ggml_tensor_extra_cl_q8_0 * extra0_q8_0 = (ggml_tensor_extra_cl_q8_0 *)src0->extra;
|
||||
ggml_tensor_extra_cl_q4_K * extra0_q4_K = (ggml_tensor_extra_cl_q4_K *)src0->extra;
|
||||
ggml_tensor_extra_cl_q6_K * extra0_q6_K = (ggml_tensor_extra_cl_q6_K *)src0->extra;
|
||||
#endif
|
||||
|
||||
|
|
@ -10005,6 +10212,50 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
|
|||
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
|
||||
return;
|
||||
}
|
||||
case GGML_TYPE_Q4_K: {
|
||||
if (ne11 < 32) {
|
||||
break;
|
||||
}
|
||||
if (!ggml_is_contiguous(src0) || !ggml_is_contiguous(src1)) {
|
||||
break;
|
||||
}
|
||||
|
||||
kernel = backend_ctx->kernel_mul_mm_q4_k_f32_l4_lm;
|
||||
nth0 = 128; // calculated as (BM*BN)/(TM*TN)
|
||||
|
||||
int batch_stride_a = ne00*ne01;
|
||||
int batch_stride_b = ne10*ne11;
|
||||
int batch_stride_d = ne0*ne1;
|
||||
|
||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0_q4_K->q));
|
||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra0_q4_K->s));
|
||||
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra0_q4_K->d));
|
||||
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &extra0_q4_K->dm));
|
||||
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extra1->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offset1));
|
||||
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_mem), &extrad->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &offsetd));
|
||||
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne00));
|
||||
CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne01));
|
||||
CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne02));
|
||||
CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne11));
|
||||
CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne12));
|
||||
CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne10)); // stride_a
|
||||
CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &ne10)); // stride_b
|
||||
CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &ne01)); // stride_d
|
||||
CL_CHECK(clSetKernelArg(kernel, 16, sizeof(int), &batch_stride_a));
|
||||
CL_CHECK(clSetKernelArg(kernel, 17, sizeof(int), &batch_stride_b));
|
||||
CL_CHECK(clSetKernelArg(kernel, 18, sizeof(int), &batch_stride_d));
|
||||
CL_CHECK(clSetKernelArg(kernel, 19, sizeof(int), &r2));
|
||||
CL_CHECK(clSetKernelArg(kernel, 20, sizeof(int), &r3));
|
||||
|
||||
// 64 is block tile size BM and BN - change here when BM and BN in the kernel are changed.
|
||||
size_t global_work_size[] = {(size_t)(CEIL_DIV(ne01, 64)*nth0), (size_t)(CEIL_DIV(ne11, 64)), (size_t)ne12*ne13};
|
||||
size_t local_work_size[] = {(size_t)nth0, 1, 1};
|
||||
|
||||
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
|
||||
return;
|
||||
}
|
||||
case GGML_TYPE_Q6_K: {
|
||||
if (ne11 < 32) {
|
||||
break;
|
||||
|
|
@ -10449,6 +10700,43 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
|
|||
case GGML_TYPE_Q2_K:
|
||||
case GGML_TYPE_Q3_K:
|
||||
case GGML_TYPE_Q4_K: {
|
||||
#ifdef GGML_OPENCL_SOA_Q
|
||||
kernel = backend_ctx->kernel_mul_mv_q4_K_f32_flat;
|
||||
|
||||
if (backend_ctx->gpu_family == INTEL) {
|
||||
nth0 = 16;
|
||||
nth1 = 1;
|
||||
ndst = 4;
|
||||
} else if (backend_ctx->gpu_family == ADRENO) {
|
||||
nth0 = 64;
|
||||
nth1 = 2;
|
||||
ndst = 16;
|
||||
} else {
|
||||
GGML_ASSERT(false && "TODO: Unknown GPU");
|
||||
}
|
||||
|
||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0_q4_K->q));
|
||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra0_q4_K->s));
|
||||
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra0_q4_K->d));
|
||||
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &extra0_q4_K->dm));
|
||||
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extra1->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &offset1));
|
||||
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_mem), &extrad->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &offsetd));
|
||||
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne00));
|
||||
CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne01));
|
||||
CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb01));
|
||||
CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb02));
|
||||
CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb03));
|
||||
CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne12));
|
||||
CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_ulong), &nb11));
|
||||
CL_CHECK(clSetKernelArg(kernel, 15, sizeof(cl_ulong), &nb12));
|
||||
CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &nb13));
|
||||
CL_CHECK(clSetKernelArg(kernel, 17, sizeof(int), &ne0));
|
||||
CL_CHECK(clSetKernelArg(kernel, 18, sizeof(int), &ne1));
|
||||
CL_CHECK(clSetKernelArg(kernel, 19, sizeof(int), &r2));
|
||||
CL_CHECK(clSetKernelArg(kernel, 20, sizeof(int), &r3));
|
||||
#else
|
||||
kernel = backend_ctx->kernel_mul_mv_q4_K_f32;
|
||||
|
||||
if (backend_ctx->gpu_family == INTEL) {
|
||||
|
|
@ -10482,6 +10770,7 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
|
|||
CL_CHECK(clSetKernelArg(kernel, 16, sizeof(int), &ne1));
|
||||
CL_CHECK(clSetKernelArg(kernel, 17, sizeof(int), &r2));
|
||||
CL_CHECK(clSetKernelArg(kernel, 18, sizeof(int), &r3));
|
||||
#endif // GGML_OPENCL_SOA_Q
|
||||
break;
|
||||
}
|
||||
case GGML_TYPE_Q5_K:
|
||||
|
|
|
|||
|
|
@ -28,6 +28,7 @@
|
|||
#define QK8_0 32
|
||||
#define QR8_0 1
|
||||
#define QK_K 256
|
||||
#define K_SCALE_SIZE (3 * QK_K / 64)
|
||||
#define K_QUANTS_PER_ITERATION 2
|
||||
|
||||
typedef char int8_t;
|
||||
|
|
@ -55,6 +56,16 @@ struct block_q4_1 {
|
|||
uchar qs[QK4_1 / 2]; // nibbles / quants
|
||||
};
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// block_q4_k
|
||||
//------------------------------------------------------------------------------
|
||||
struct block_q4_K {
|
||||
half d; // delta
|
||||
half dm; // min
|
||||
uchar s[K_SCALE_SIZE];
|
||||
uchar q[QK_K / 2]; // nibbles / quants
|
||||
};
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// block_q6_K
|
||||
//------------------------------------------------------------------------------
|
||||
|
|
@ -408,6 +419,62 @@ kernel void kernel_restore_block_q8_0_trans(
|
|||
}
|
||||
}
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// kernel_convert_block_q4_K
|
||||
// Convert the block_q4_K format to 4 separate arrays (AOS -> SOA).
|
||||
// This kernel does not deshuffle the bits.
|
||||
// Each thread processes a super block.
|
||||
//------------------------------------------------------------------------------
|
||||
kernel void kernel_convert_block_q4_K(
|
||||
global struct block_q4_K * src0,
|
||||
global uchar * dst_q,
|
||||
global uchar * dst_s,
|
||||
global half * dst_d,
|
||||
global half * dst_dm
|
||||
) {
|
||||
global struct block_q4_K * b = (global struct block_q4_K *) src0 + get_global_id(0);
|
||||
global uchar * q = (global uchar *) dst_q + QK_K/2*get_global_id(0);
|
||||
global uchar * s = (global uchar *) dst_s + K_SCALE_SIZE*get_global_id(0);
|
||||
global half * d = (global half *) dst_d + get_global_id(0);
|
||||
global half * dm = (global half *) dst_dm + get_global_id(0);
|
||||
|
||||
*d = b->d;
|
||||
*dm = b->dm;
|
||||
|
||||
for (int i = 0; i < QK_K/2; ++i) {
|
||||
q[i] = b->q[i];
|
||||
}
|
||||
for (int i = 0; i < K_SCALE_SIZE; ++i) {
|
||||
s[i] = b->s[i];
|
||||
}
|
||||
}
|
||||
|
||||
// Restore block_q4_K from flattened arrays.
|
||||
// Each thread processes a super block.
|
||||
kernel void kernel_restore_block_q4_K(
|
||||
global uchar * src_q,
|
||||
global uchar * src_s,
|
||||
global half * src_d,
|
||||
global half * src_dm,
|
||||
global struct block_q4_K * dst
|
||||
) {
|
||||
global struct block_q4_K * b = (global struct block_q4_K *) dst + get_global_id(0);
|
||||
global uchar * q = (global uchar *) src_q + QK_K/2*get_global_id(0);
|
||||
global uchar * s = (global uchar *) src_s + K_SCALE_SIZE*get_global_id(0);
|
||||
global half * d = (global half *) src_d + get_global_id(0);
|
||||
global half * dm = (global half *) src_dm + get_global_id(0);
|
||||
|
||||
b->d = *d;
|
||||
b->dm = *dm;
|
||||
|
||||
for (int i = 0; i < QK_K/2; ++i) {
|
||||
b->q[i] = q[i];
|
||||
}
|
||||
for (int i = 0; i < K_SCALE_SIZE; ++i) {
|
||||
b->s[i] = s[i];
|
||||
}
|
||||
}
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// kernel_convert_block_q6_K
|
||||
// Convert the block_q6_K format to 3 separate arrays (AOS -> SOA).
|
||||
|
|
|
|||
|
|
@ -0,0 +1,179 @@
|
|||
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
||||
|
||||
#define LOAD_VEC_A 4
|
||||
#define LOAD_VEC_B 4
|
||||
|
||||
#define BM 64
|
||||
#define BN 64
|
||||
#define BK 32
|
||||
#define TM 4
|
||||
#define TN 8
|
||||
|
||||
kernel void kernel_mul_mm_q4_k_f32_l4_lm(
|
||||
global uchar4 * src0_q,
|
||||
global uchar * src0_s,
|
||||
global half * src0_d,
|
||||
global half * src0_dm,
|
||||
global float4 * src1,
|
||||
ulong offset1,
|
||||
global float * dst,
|
||||
ulong offsetd,
|
||||
|
||||
int ne00,
|
||||
int ne01,
|
||||
int ne02,
|
||||
int ne11,
|
||||
int ne12,
|
||||
|
||||
int stride_a,
|
||||
int stride_b,
|
||||
int stride_d,
|
||||
|
||||
int batch_stride_a,
|
||||
int batch_stride_b,
|
||||
int batch_stride_d,
|
||||
|
||||
int r2,
|
||||
int r3
|
||||
) {
|
||||
src1 = (global float4*)((global char*)src1 + offset1);
|
||||
dst = (global float *)((global char*)dst + offsetd);
|
||||
|
||||
local float buf_a[BM * BK];
|
||||
local float buf_b[BN * BK];
|
||||
|
||||
const int batch_idx = get_global_id(2);
|
||||
|
||||
const int i13 = batch_idx / ne12;
|
||||
const int i12 = batch_idx % ne12;
|
||||
|
||||
const int i03 = i13 / r3;
|
||||
const int i02 = i12 / r2;
|
||||
|
||||
const int batch_idx_a = i03 * ne02 + i02;
|
||||
|
||||
const int ir = get_group_id(0);
|
||||
const int ic = get_group_id(1);
|
||||
|
||||
const int tid = get_local_id(0);
|
||||
const int th_r = tid % (BM / TM);
|
||||
const int th_c = tid / (BM / TM);
|
||||
|
||||
const int loadr_a = get_local_id(0) % (BK / LOAD_VEC_A);
|
||||
const int loadc_a = get_local_id(0) / (BK / LOAD_VEC_A);
|
||||
const int loadr_b = get_local_id(0) % (BK / LOAD_VEC_B);
|
||||
const int loadc_b = get_local_id(0) / (BK / LOAD_VEC_B);
|
||||
|
||||
const int loadstride_a = get_local_size(0) * LOAD_VEC_A / BK;
|
||||
const int loadstride_b = get_local_size(0) * LOAD_VEC_B / BK;
|
||||
|
||||
int pos_a = (batch_idx_a * batch_stride_a + ir * BM * stride_a) / LOAD_VEC_A;
|
||||
int pos_b = (batch_idx * batch_stride_b + ic * BN * stride_b) / LOAD_VEC_B;
|
||||
|
||||
float sums[TM * TN];
|
||||
float cache_a[TM];
|
||||
float cache_b[TN];
|
||||
|
||||
for (int i = 0; i < TM * TN; i++) {
|
||||
sums[i] = 0.0f;
|
||||
}
|
||||
|
||||
for (int block = 0; block < ne00; block += BK) {
|
||||
for (int l = 0; l < BM; l += loadstride_a) {
|
||||
if (ir*BM + loadc_a + l < ne01) {
|
||||
int idx = pos_a + (loadc_a + l) * stride_a / LOAD_VEC_A + loadr_a;
|
||||
int ib = idx / 64;
|
||||
int iqs = (idx % 64) * 2;
|
||||
|
||||
int n = iqs / 32;
|
||||
int b = (iqs % 32) / 16;
|
||||
int is = 2 * n + b;
|
||||
int qsi = n * 32 + (iqs % 16) * 2;
|
||||
|
||||
char * scales = src0_s + ib * 12;
|
||||
|
||||
int scidx0 = (is < 4) ? is : (is + 4);
|
||||
int scidx1 = (is < 4) ? is : (is - 4);
|
||||
int scidxmask1 = (is < 4) ? 0x30 : 0xC0;
|
||||
int scidxshift1 = (is < 4) ? 0 : 2;
|
||||
int mbidx0 = is + 4;
|
||||
int mbidx1 = (is < 4) ? is + 4 : is;
|
||||
int mbidxmask0 = (is < 4) ? 0xF : 0xF0;
|
||||
int mbidxshift0 = (is < 4) ? 0 : 4;
|
||||
int mbidxmask1 = (is < 4) ? 0x30 : 0xC0;
|
||||
int mbidxshift1 = (is < 4) ? 0 : 2;
|
||||
|
||||
uchar sc = (scales[scidx0] & 0xF) | ((scales[scidx1] & scidxmask1) >> scidxshift1);
|
||||
uchar mbyte = ((scales[mbidx0] & mbidxmask0) >> mbidxshift0) | ((scales[mbidx1] & mbidxmask1) >> mbidxshift1);
|
||||
|
||||
float d = (float)src0_d[ib] * (float)sc;
|
||||
float m = -(float)src0_dm[ib] * (float)mbyte;
|
||||
|
||||
global uchar4 * qs = src0_q + ib*32 + (qsi >> 2);
|
||||
uchar4 q = *qs;
|
||||
float4 v1 = (convert_float4((uchar4)((q.s0 >> (b * 4))&0x0F, (q.s1 >> (b * 4))&0x0F, (q.s2 >> (b * 4))&0x0F, (q.s3 >> (b * 4))&0x0F)))*d + m;
|
||||
|
||||
buf_a[(loadr_a * LOAD_VEC_A + 0) * BM + loadc_a + l] = v1.s0;
|
||||
buf_a[(loadr_a * LOAD_VEC_A + 1) * BM + loadc_a + l] = v1.s1;
|
||||
buf_a[(loadr_a * LOAD_VEC_A + 2) * BM + loadc_a + l] = v1.s2;
|
||||
buf_a[(loadr_a * LOAD_VEC_A + 3) * BM + loadc_a + l] = v1.s3;
|
||||
} else {
|
||||
buf_a[(loadr_a * LOAD_VEC_A + 0) * BM + loadc_a + l] = 0.0f;
|
||||
buf_a[(loadr_a * LOAD_VEC_A + 1) * BM + loadc_a + l] = 0.0f;
|
||||
buf_a[(loadr_a * LOAD_VEC_A + 2) * BM + loadc_a + l] = 0.0f;
|
||||
buf_a[(loadr_a * LOAD_VEC_A + 3) * BM + loadc_a + l] = 0.0f;
|
||||
}
|
||||
}
|
||||
|
||||
for (int l = 0; l < BN; l += loadstride_b) {
|
||||
if (ic*BN + loadc_b + l < ne11) {
|
||||
int idx = pos_b + (loadc_b + l) * stride_b / LOAD_VEC_B + loadr_b;
|
||||
buf_b[(loadr_b * LOAD_VEC_B + 0) * BN + loadc_b + l] = src1[idx].s0;
|
||||
buf_b[(loadr_b * LOAD_VEC_B + 1) * BN + loadc_b + l] = src1[idx].s1;
|
||||
buf_b[(loadr_b * LOAD_VEC_B + 2) * BN + loadc_b + l] = src1[idx].s2;
|
||||
buf_b[(loadr_b * LOAD_VEC_B + 3) * BN + loadc_b + l] = src1[idx].s3;
|
||||
} else {
|
||||
buf_b[(loadr_b * LOAD_VEC_B + 0) * BN + loadc_b + l] = 0.0f;
|
||||
buf_b[(loadr_b * LOAD_VEC_B + 1) * BN + loadc_b + l] = 0.0f;
|
||||
buf_b[(loadr_b * LOAD_VEC_B + 2) * BN + loadc_b + l] = 0.0f;
|
||||
buf_b[(loadr_b * LOAD_VEC_B + 3) * BN + loadc_b + l] = 0.0f;
|
||||
}
|
||||
}
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
pos_a += BK / LOAD_VEC_A;
|
||||
pos_b += BK / LOAD_VEC_B;
|
||||
|
||||
for (int i = 0; i < BK; i++) {
|
||||
for (int j = 0; j < TM; j++) {
|
||||
cache_a[j] = buf_a[(i) * BM + th_r * TM + j];
|
||||
}
|
||||
|
||||
for (int j = 0; j < TN; j++) {
|
||||
cache_b[j] = buf_b[(i) * BN + th_c * TN + j];
|
||||
}
|
||||
|
||||
for (int cc = 0; cc < TN; cc++) {
|
||||
for (int cr = 0; cr < TM; cr++) {
|
||||
const int sums_idx = cc*TM + cr;
|
||||
sums[sums_idx] = mad(cache_a[cr], cache_b[cc], sums[sums_idx]);
|
||||
}
|
||||
}
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
}
|
||||
|
||||
const int dr = ir * BM + th_r * TM;
|
||||
const int dc = ic * BN + th_c * TN;
|
||||
|
||||
const int offsets = batch_idx * batch_stride_d;
|
||||
|
||||
for (int cc = 0; cc < TN; cc++) {
|
||||
for (int cr = 0; cr < TM; cr++) {
|
||||
if (dr + cr < ne01 && dc + cc < ne11) {
|
||||
dst[offsets + (dc + cc) * stride_d + dr + cr] = sums[cc * TM + cr];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,196 @@
|
|||
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
||||
|
||||
#ifdef cl_intel_subgroups
|
||||
#pragma OPENCL EXTENSION cl_intel_subgroups : enable
|
||||
#else
|
||||
#pragma OPENCL EXTENSION cl_khr_subgroups : enable
|
||||
#endif
|
||||
|
||||
#ifdef cl_intel_required_subgroup_size
|
||||
#pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
|
||||
#define INTEL_GPU 1
|
||||
#define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
|
||||
#define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
|
||||
#elif defined(cl_qcom_reqd_sub_group_size)
|
||||
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
|
||||
#define ADRENO_GPU 1
|
||||
#define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half")))
|
||||
#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
|
||||
#endif
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// block_q4_K
|
||||
//------------------------------------------------------------------------------
|
||||
#define QK_K 256
|
||||
#define BLOCK_Q4K_SIZE 144
|
||||
#define K_SCALE_SIZE 12
|
||||
|
||||
// 8 blocks of 32 elements each
|
||||
// weight is represented as x = a * q + b
|
||||
typedef struct {
|
||||
half d; // super-block scale for quantized scales
|
||||
half dmin; // super-block scale for quantized mins
|
||||
|
||||
uchar scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
|
||||
uchar qs[QK_K/2]; // 4-bit quants
|
||||
} block_q4_K;
|
||||
|
||||
#undef N_DST
|
||||
#undef N_SIMDGROUP
|
||||
#undef N_SIMDWIDTH
|
||||
|
||||
#ifdef INTEL_GPU
|
||||
#define N_DST 4 // number of rows each SIMD group works on
|
||||
#define N_SIMDGROUP 1 // number of SIMD groups in a thread group
|
||||
#define N_SIMDWIDTH 16 // SIMD group size
|
||||
#elif defined (ADRENO_GPU)
|
||||
#define N_DST 16
|
||||
#define N_SIMDGROUP 2
|
||||
#define N_SIMDWIDTH 64
|
||||
#endif
|
||||
|
||||
#undef BLOCK_STRIDE
|
||||
// number of (super) blocks each subgroup processes
|
||||
// each thread in a subgroup processes a block (32 weights)
|
||||
#define BLOCK_STRIDE (N_SIMDWIDTH/8)
|
||||
|
||||
#ifdef INTEL_GPU
|
||||
REQD_SUBGROUP_SIZE_16
|
||||
#elif defined (ADRENO_GPU)
|
||||
REQD_SUBGROUP_SIZE_64
|
||||
#endif
|
||||
kernel void kernel_mul_mv_q4_K_f32_flat(
|
||||
global uchar * src0_q,
|
||||
global uchar * src0_s,
|
||||
global half * src0_d,
|
||||
global half * src0_dm,
|
||||
global char * src1,
|
||||
int offset1,
|
||||
global char * dst,
|
||||
int offsetd,
|
||||
int ne00,
|
||||
int ne01,
|
||||
ulong nb01,
|
||||
ulong nb02,
|
||||
ulong nb03,
|
||||
int ne12,
|
||||
ulong nb11,
|
||||
ulong nb12,
|
||||
ulong nb13,
|
||||
int ne0,
|
||||
int ne1,
|
||||
int r2,
|
||||
int r3
|
||||
) {
|
||||
src1 = src1 + offset1;
|
||||
dst = dst + offsetd;
|
||||
|
||||
ushort kmask1 = 0x3f3f;
|
||||
ushort kmask2 = 0x0f0f;
|
||||
ushort kmask3 = 0xc0c0;
|
||||
|
||||
int ix = get_sub_group_local_id()/8;
|
||||
int it = get_sub_group_local_id()%8;
|
||||
int iq = it/4;
|
||||
int ir = it%4;
|
||||
|
||||
int nb = ne00/QK_K;
|
||||
|
||||
int r0 = get_group_id(0);
|
||||
int r1 = get_group_id(1);
|
||||
int im = get_group_id(2);
|
||||
int first_row = (r0 * N_SIMDGROUP + get_sub_group_id()) * N_DST;
|
||||
|
||||
int i12 = im%ne12;
|
||||
int i13 = im/ne12;
|
||||
|
||||
int offset_src0 = (first_row*nb01 + (i12/r2)*nb02 + (i13/r3)*nb03)/BLOCK_Q4K_SIZE;
|
||||
uint blk = nb01 / BLOCK_Q4K_SIZE;
|
||||
global uchar * blk_q = (global uchar *)src0_q + offset_src0*(QK_K/2);
|
||||
global uchar * blk_s = (global uchar *)src0_s + offset_src0*K_SCALE_SIZE;
|
||||
global half * blk_d = (global half *)src0_d + offset_src0;
|
||||
global half * blk_dm = (global half *)src0_dm + offset_src0;
|
||||
|
||||
int offset_src1 = r1*nb11 + (i12)*nb12 + (i13)*nb13;
|
||||
global float * y = (global float *)(src1 + offset_src1);
|
||||
|
||||
float yl[16];
|
||||
float yh[16];
|
||||
float sumf[N_DST] = {0.f};
|
||||
float all_sum;
|
||||
|
||||
global float * y4 = y + ix * QK_K + 64 * iq + 8 * ir;
|
||||
|
||||
ushort sc16[4];
|
||||
uchar * sc8 = (uchar *)sc16;
|
||||
|
||||
for (int ib = ix; ib < nb; ib += BLOCK_STRIDE) {
|
||||
float4 sumy = {0.f, 0.f, 0.f, 0.f};
|
||||
for (int i = 0; i < 8; ++i) {
|
||||
yl[i+0] = y4[i+0];
|
||||
sumy.s0 += yl[i+0];
|
||||
|
||||
yl[i+8] = y4[i+32];
|
||||
sumy.s1 += yl[i+8];
|
||||
|
||||
yh[i+0] = y4[i+128];
|
||||
sumy.s2 += yh[i+0];
|
||||
|
||||
yh[i+8] = y4[i+160];
|
||||
sumy.s3 += yh[i+8];
|
||||
}
|
||||
|
||||
global ushort * q1 = (global ushort *)(blk_q + ib * (QK_K/2)) + (16 * iq + 4 * ir);
|
||||
global ushort * sc = (global ushort *)(blk_s + ib * K_SCALE_SIZE) + iq;
|
||||
global half * d = blk_d + ib;
|
||||
global half * dm = blk_dm + ib;
|
||||
|
||||
for (int row = 0; row < N_DST; row++) {
|
||||
sc16[0] = sc[0] & kmask1;
|
||||
sc16[1] = sc[2] & kmask1;
|
||||
sc16[2] = ((sc[4] >> 0) & kmask2) | ((sc[0] & kmask3) >> 2);
|
||||
sc16[3] = ((sc[4] >> 4) & kmask2) | ((sc[2] & kmask3) >> 2);
|
||||
|
||||
global ushort * q2 = q1 + 32;
|
||||
|
||||
float4 acc1 = {0.f, 0.f, 0.f, 0.f};
|
||||
float4 acc2 = {0.f, 0.f, 0.f, 0.f};
|
||||
for (int i = 0; i < 8; i += 2) {
|
||||
acc1.s0 += yl[i+0] * (q1[i/2] & 0x000F);
|
||||
acc1.s1 += yl[i+1] * (q1[i/2] & 0x0F00);
|
||||
acc1.s2 += yl[i+8] * (q1[i/2] & 0x00F0);
|
||||
acc1.s3 += yl[i+9] * (q1[i/2] & 0xF000);
|
||||
acc2.s0 += yh[i+0] * (q2[i/2] & 0x000F);
|
||||
acc2.s1 += yh[i+1] * (q2[i/2] & 0x0F00);
|
||||
acc2.s2 += yh[i+8] * (q2[i/2] & 0x00F0);
|
||||
acc2.s3 += yh[i+9] * (q2[i/2] & 0xF000);
|
||||
}
|
||||
|
||||
float dall = *d;
|
||||
float dmin = *dm;
|
||||
sumf[row] += dall * ((acc1.s0 + 1.f/256.f * acc1.s1) * sc8[0] +
|
||||
(acc1.s2 + 1.f/256.f * acc1.s3) * sc8[1] * 1.f/16.f +
|
||||
(acc2.s0 + 1.f/256.f * acc2.s1) * sc8[4] +
|
||||
(acc2.s2 + 1.f/256.f * acc2.s3) * sc8[5] * 1.f/16.f) -
|
||||
dmin * (sumy.s0 * sc8[2] + sumy.s1 * sc8[3] + sumy.s2 * sc8[6] + sumy.s3 * sc8[7]);
|
||||
|
||||
q1 += blk*64;
|
||||
sc += blk*6;
|
||||
d += blk;
|
||||
dm += blk;
|
||||
}
|
||||
|
||||
y4 += BLOCK_STRIDE * QK_K;
|
||||
}
|
||||
|
||||
global float * dst_f32 = (global float *) dst + im*ne0*ne1 + r1*ne0;
|
||||
|
||||
for (int row = 0; row < N_DST; ++row) {
|
||||
all_sum = sub_group_reduce_add(sumf[row]);
|
||||
if (first_row + row < ne01) {
|
||||
if (get_sub_group_local_id() == 0) {
|
||||
dst_f32[first_row + row] = all_sum;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
Loading…
Reference in New Issue