diff --git a/ggml/src/ggml-opencl/ggml-opencl.cpp b/ggml/src/ggml-opencl/ggml-opencl.cpp
index efdebe2bba..27b2761ef1 100644
--- a/ggml/src/ggml-opencl/ggml-opencl.cpp
+++ b/ggml/src/ggml-opencl/ggml-opencl.cpp
@@ -398,6 +398,7 @@ struct ggml_backend_opencl_context {
     int adreno_wave_size;
 
     cl_bool non_uniform_workgroups;
+    size_t image_max_buffer_size;
 
     cl_context context;
     cl_command_queue queue;
@@ -407,6 +408,10 @@ struct ggml_backend_opencl_context {
    ggml_cl_buffer prealloc_scales_trans;
    ggml_cl_buffer prealloc_act_trans;
 
+    // preallocated buffers for making non-contiguous src0/src1 contiguous
+    ggml_cl_buffer prealloc_src0;
+    ggml_cl_buffer prealloc_src1;
+
    cl_program program_add;
    cl_program program_add_id;
    cl_program program_clamp;
@@ -2658,6 +2663,9 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
     clGetDeviceInfo(device, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(size_t), &backend_ctx->max_alloc_size, NULL);
     GGML_LOG_INFO("ggml_opencl: max mem alloc size: %zu MB\n", backend_ctx->max_alloc_size/1024/1024);
 
+    clGetDeviceInfo(device, CL_DEVICE_IMAGE_MAX_BUFFER_SIZE, sizeof(size_t), &backend_ctx->image_max_buffer_size, NULL);
+    GGML_LOG_INFO("ggml_opencl: device max image buffer size (pixels): %zu\n", backend_ctx->image_max_buffer_size);
+
     clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), &backend_ctx->max_workgroup_size, NULL);
     GGML_LOG_INFO("ggml_opencl: device max workgroup size: %lu\n", backend_ctx->max_workgroup_size);
 
@@ -4711,6 +4719,81 @@ static bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct gg
            (ne0 >= 32 && ne1 >= 32 && ne10 >= 32);
 }
 
+// Copy a non-contiguous tensor into a contiguous buffer. ne[] stays the same,
+// but nb[] is recalculated so that the resulting tensor is contiguous.
+static void ggml_cl_copy_to_contiguous(ggml_backend_t backend, const ggml_tensor * src, cl_mem dst,
+                                       cl_ulong &nb0, cl_ulong &nb1, cl_ulong &nb2, cl_ulong &nb3) {
+    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
+
+    const int tensor_type_size = ggml_type_size(src->type);
+
+    const int ne00 = src->ne[0];
+    const int ne01 = src->ne[1];
+    const int ne02 = src->ne[2];
+    const int ne03 = src->ne[3];
+
+    const cl_ulong nb00 = src->nb[0];
+    const cl_ulong nb01 = src->nb[1];
+    const cl_ulong nb02 = src->nb[2];
+    const cl_ulong nb03 = src->nb[3];
+
+    const int ne0 = src->ne[0];
+    const int ne1 = src->ne[1];
+    const int ne2 = src->ne[2];
+    const int ne3 = src->ne[3];
+
+    nb0 = tensor_type_size;
+    nb1 = tensor_type_size*ne00;
+    nb2 = tensor_type_size*ne00*ne01;
+    nb3 = tensor_type_size*ne00*ne01*ne02;
+
+    ggml_tensor_extra_cl * extra = (ggml_tensor_extra_cl *)src->extra;
+
+    cl_ulong offset0 = extra->offset + src->view_offs;
+    cl_ulong offsetd = 0;
+
+    cl_kernel kernel;
+
+    switch (src->type) {
+        case GGML_TYPE_F32:
+            kernel = backend_ctx->kernel_cpy_f32_f32;
+            break;
+        case GGML_TYPE_F16:
+            kernel = backend_ctx->kernel_cpy_f16_f16;
+            break;
+        default:
+            GGML_ASSERT(false && "not implemented");
+    }
+
+    CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra->data_device));
+    CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
+    CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &dst));
+    CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
+    CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne00));
+    CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &ne01));
+    CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne02));
+    CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne03));
+    CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb00));
+    CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb01));
+    CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb02));
+    CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb03));
+    CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne0));
+    CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne1));
+    CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &ne2));
+    CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &ne3));
+    CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &nb0));
+    CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong), &nb1));
+    CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_ulong), &nb2));
+    CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong), &nb3));
+
+    const int nth = MIN(64, ne00);
+
+    size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03};
+    size_t local_work_size[] = {(size_t)nth, 1, 1};
+
+    backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, src);
+}
+
 static void ggml_cl_nop(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     UNUSED(backend);
     UNUSED(src0);
@@ -7724,9 +7807,12 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
     cl_context context = backend_ctx->context;
 
     if(src0t == GGML_TYPE_F16 && src1t == GGML_TYPE_F32){
-        if (ne01 >= 64 && ne1 >= 32 && ne00 >= 16 && (ne12 % ne02) == 0) {
+        if (ne01 >= 64 && ne1 >= 32 && ne00 >= 16 && (ne12 % ne02) == 0 &&
+            // dst is wrapped in an image1d_buffer, so the size limit applies; src0 is checked below
+            (ne0 * ne1 * dst->ne[2] * dst->nb[0] / 4 <= backend_ctx->image_max_buffer_size)) {
             // For KQ
             if (ggml_is_permuted(src0) && ggml_is_permuted(src1) &&
+                ((nb01 * ne01 / 4)/4 <= backend_ctx->image_max_buffer_size) &&
                 nb00 <= nb02 &&
                 nb02 <= nb01 &&
                 nb01 <= nb03 &&
@@ -7737,7 +7823,8 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
                 return;
             }
             // For KQV
-            if (!ggml_is_contiguous(src0) && ggml_is_contiguous(src1)) {
+            if (!ggml_is_contiguous(src0) && ggml_is_contiguous(src1) &&
+                ((nb02 * ne02 / 4)/4 <= backend_ctx->image_max_buffer_size)) {
                 ggml_cl_mul_mat_kq_kqv_adreno(backend, src0, src1, dst);
                 return;
             }
@@ -8043,9 +8130,7 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
 
         // GEMM using local memory
         // Current BK = 16, so ne00 % 16 == 0
-        if (ggml_is_contiguous(src0) &&
-            ggml_is_contiguous(src1) &&
-            src1t == GGML_TYPE_F32 &&
+        if (src1t == GGML_TYPE_F32 &&
             ne00 % 16 == 0 &&
             ne11 > 1) {
             switch(src0t) {
@@ -8057,10 +8142,42 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
                 int batch_stride_b = ne10*ne11;
                 int batch_stride_d = ne0*ne1;
 
-                CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
-                CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
-                CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
-                CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
+                cl_mem mem_src0 = extra0->data_device;
+                cl_mem mem_src1 = extra1->data_device;
+
+                cl_ulong nb00_cont = nb00;
+                cl_ulong nb01_cont = nb01;
+                cl_ulong nb02_cont = nb02;
+                cl_ulong nb03_cont = nb03;
+
+                cl_ulong nb10_cont = nb10;
+                cl_ulong nb11_cont = nb11;
+                cl_ulong nb12_cont = nb12;
+                cl_ulong nb13_cont = nb13;
+
+                cl_ulong offset0_cont = offset0;
+                cl_ulong offset1_cont = offset1;
+
+                if (!ggml_is_contiguous(src0)) {
+                    backend_ctx->prealloc_src0.allocate(backend_ctx->context, ggml_nbytes(src0));
+                    ggml_cl_copy_to_contiguous(backend, src0, backend_ctx->prealloc_src0.buffer,
+                        nb00_cont, nb01_cont, nb02_cont, nb03_cont);
+                    mem_src0 = backend_ctx->prealloc_src0.buffer;
+                    offset0_cont = 0;
+                }
+
+                if (!ggml_is_contiguous(src1)) {
+                    backend_ctx->prealloc_src1.allocate(backend_ctx->context, ggml_nbytes(src1));
+                    ggml_cl_copy_to_contiguous(backend, src1, backend_ctx->prealloc_src1.buffer,
+                        nb10_cont, nb11_cont, nb12_cont, nb13_cont);
+                    mem_src1 = backend_ctx->prealloc_src1.buffer;
+                    offset1_cont = 0;
+                }
+
+                CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &mem_src0));
+                CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0_cont));
+                CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &mem_src1));
+                CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1_cont));
                 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
                 CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
                 CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne00));
@@ -8092,10 +8209,42 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
                 int batch_stride_b = ne10*ne11;
                 int batch_stride_d = ne0*ne1;
 
-                CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
-                CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
-                CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
-                CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
+                cl_mem mem_src0 = extra0->data_device;
+                cl_mem mem_src1 = extra1->data_device;
+
+                cl_ulong nb00_cont = nb00;
+                cl_ulong nb01_cont = nb01;
+                cl_ulong nb02_cont = nb02;
+                cl_ulong nb03_cont = nb03;
+
+                cl_ulong nb10_cont = nb10;
+                cl_ulong nb11_cont = nb11;
+                cl_ulong nb12_cont = nb12;
+                cl_ulong nb13_cont = nb13;
+
+                cl_ulong offset0_cont = offset0;
+                cl_ulong offset1_cont = offset1;
+
+                if (!ggml_is_contiguous(src0)) {
+                    backend_ctx->prealloc_src0.allocate(backend_ctx->context, ggml_nbytes(src0));
+                    ggml_cl_copy_to_contiguous(backend, src0, backend_ctx->prealloc_src0.buffer,
+                        nb00_cont, nb01_cont, nb02_cont, nb03_cont);
+                    mem_src0 = backend_ctx->prealloc_src0.buffer;
+                    offset0_cont = 0;
+                }
+
+                if (!ggml_is_contiguous(src1)) {
+                    backend_ctx->prealloc_src1.allocate(backend_ctx->context, ggml_nbytes(src1));
+                    ggml_cl_copy_to_contiguous(backend, src1, backend_ctx->prealloc_src1.buffer,
+                        nb10_cont, nb11_cont, nb12_cont, nb13_cont);
+                    mem_src1 = backend_ctx->prealloc_src1.buffer;
+                    offset1_cont = 0;
+                }
+
+                CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &mem_src0));
+                CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0_cont));
+                CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &mem_src1));
+                CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1_cont));
                 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
                 CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
                 CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne00));
@@ -8123,6 +8272,10 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
 
                 if (ne11 < 32) {
                     break;
                 }
+                if (!ggml_is_contiguous(src0) || !ggml_is_contiguous(src1)) {
+                    break;
+                }
+
                 kernel = backend_ctx->kernel_mul_mm_q8_0_f32_l4_lm;
                 nth0 = 128; // calculated as (BM*BN)/(TM*TN)
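
For reference, here is a minimal CPU-side sketch of what ggml_cl_copy_to_contiguous arranges on the device: ne[] is kept as-is, nb[] is rebuilt as packed byte strides, and elements are gathered from the strided source into a packed destination. This standalone helper is illustrative only; it is not part of the patch or of the ggml API.

#include <stdint.h>
#include <string.h>

// Packed (contiguous) strides, in bytes, for element counts ne[] and element
// size type_size -- the same recomputation the patch stores into nb0..nb3
// before launching the copy kernel.
static void contiguous_strides(const int64_t ne[4], size_t type_size, uint64_t nb[4]) {
    nb[0] = type_size;
    nb[1] = nb[0] * (uint64_t) ne[0];
    nb[2] = nb[1] * (uint64_t) ne[1];
    nb[3] = nb[2] * (uint64_t) ne[2];
}

// CPU reference for the gather performed by the cpy kernel: walk the source
// with its original byte strides src_nb[] and write elements back to back.
static void copy_to_contiguous_ref(const uint8_t * src, uint8_t * dst,
                                   const int64_t ne[4], const uint64_t src_nb[4],
                                   size_t type_size) {
    uint8_t * out = dst;
    for (int64_t i3 = 0; i3 < ne[3]; ++i3)
    for (int64_t i2 = 0; i2 < ne[2]; ++i2)
    for (int64_t i1 = 0; i1 < ne[1]; ++i1)
    for (int64_t i0 = 0; i0 < ne[0]; ++i0) {
        const uint8_t * in = src + i0*src_nb[0] + i1*src_nb[1] + i2*src_nb[2] + i3*src_nb[3];
        memcpy(out, in, type_size);
        out += type_size;
    }
}

This is the invariant the GEMM path above relies on: after the copy, the kernel reads the input from offset 0 with the recomputed packed strides (the nbXX_cont values), regardless of how the original view was laid out.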
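
A note on the image_max_buffer_size checks: CL_DEVICE_IMAGE_MAX_BUFFER_SIZE is reported in pixels, not bytes, so each byte count has to be converted to a pixel count before comparing. The sketch below spells out that conversion; the division factors in the patch (once by 4 for dst, twice by 4 for src0) are consistent with 4-byte and 16-byte pixel formats respectively, but the actual CL image formats are defined elsewhere in the backend, so that mapping is an assumption here.

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

// Does a buffer of nbytes bytes, viewed as pixels of bytes_per_pixel bytes,
// fit under the device's image1d_buffer pixel limit?
static bool fits_image1d_buffer(uint64_t nbytes, uint64_t bytes_per_pixel,
                                size_t image_max_buffer_size) {
    return nbytes / bytes_per_pixel <= (uint64_t) image_max_buffer_size;
}

Under that reading, the dst check "ne0 * ne1 * dst->ne[2] * dst->nb[0] / 4" corresponds to fits_image1d_buffer(dst_nbytes, 4, ...), while the src0 checks "(nb01 * ne01 / 4)/4" and "(nb02 * ne02 / 4)/4" correspond to fits_image1d_buffer(src0_nbytes, 16, ...).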