diff --git a/ggml/src/ggml-sycl/common.hpp b/ggml/src/ggml-sycl/common.hpp index 298fddc103..9f0efb6535 100644 --- a/ggml/src/ggml-sycl/common.hpp +++ b/ggml/src/ggml-sycl/common.hpp @@ -874,4 +874,95 @@ static bool fast_fp16_available(const int cc) { return true; //Intel GPUs always support FP16. } +enum class block_reduce_method { + MAX, + SUM, +}; + +template +struct block_reduce_policy; + +template +inline constexpr bool is_any = (std::is_same_v || ...); + +template +inline constexpr bool ggml_sycl_dependent_false_v = false; + +#define WARP_32_SIZE 32 + +template struct block_reduce_policy { + static T reduce(T val) { + if constexpr (is_any) { + return warp_reduce_sum(val); + } else { + static_assert(ggml_sycl_dependent_false_v, "Unsupported type for block reduce sum"); + } + } + + static T sentinel() { + if constexpr (std::is_same_v) { + return 0.0f; + } else if constexpr (std::is_same_v) { + return sycl::float2(0.0f, 0.0f); + } else if constexpr (std::is_same_v) { + return sycl::half2(0.0f, 0.0f); + } else if constexpr (std::is_same_v) { + return 0; + } else { + static_assert(ggml_sycl_dependent_false_v, "Unsupported type for block reduce sum"); + } + } +}; + +template struct block_reduce_policy { + static T reduce(T val) { + if constexpr (is_any) { + return warp_reduce_max(val); + } else { + static_assert(ggml_sycl_dependent_false_v, "Unsupported type for block reduce max"); + } + } + + static T sentinel() { + if constexpr (std::is_same_v) { + return -INFINITY; + } else if constexpr (std::is_same_v) { + return sycl::half2(-INFINITY, -INFINITY); + } else { + static_assert(ggml_sycl_dependent_false_v, "Unsupported type for block reduce max"); + } + } +}; + + +template +static T block_reduce(T val, T * shared_vals, int block_size_template) { + auto item_ct1 = sycl::ext::oneapi::this_work_item::get_nd_item<3>(); + val = block_reduce_policy::reduce(val); + const int block_size = block_size_template == 0 ? item_ct1.get_local_range(2) : block_size_template; + const int nthreads = item_ct1.get_local_range(2); + const int nwarps = nthreads / WARP_SIZE; + + if (block_size > warp_size) { + assert((block_size <= 1024) && (block_size % warp_size) == 0); + const int warp_id = item_ct1.get_local_id(2) / warp_size; + const int lane_id = item_ct1.get_local_id(2) % warp_size; + if (lane_id == 0) { + shared_vals[warp_id] = val; + } + item_ct1.barrier(sycl::access::fence_space::local_space); + + size_t nreduce = nwarps / WARP_SIZE; + float tmp = 0.f; + if (lane_id < (static_cast(block_size) / warp_size)) { + for (size_t i = 0; i < nreduce; i += 1) + { + tmp += shared_vals[lane_id + i * WARP_SIZE]; + } + } + return block_reduce_policy::reduce(tmp); + } + return val; +} + #endif // GGML_SYCL_COMMON_HPP diff --git a/ggml/src/ggml-sycl/element_wise.cpp b/ggml/src/ggml-sycl/element_wise.cpp index 00d54b83f8..acd51bf45b 100644 --- a/ggml/src/ggml-sycl/element_wise.cpp +++ b/ggml/src/ggml-sycl/element_wise.cpp @@ -9,23 +9,32 @@ #define SYCL_LOCAL_ID_CALC(ITEM, IDX) \ (ITEM.get_local_range(IDX) * ITEM.get_group(IDX) + ITEM.get_local_id(IDX)) +static void acc_f32(const float * x, const float * y, float * dst, const int64_t ne, + const int64_t ne10, const int64_t ne11, const int64_t ne12, const int64_t ne13, + const int64_t s11, const int64_t s12, const int64_t s13, const int64_t offset) { + auto item_ct1 = sycl::ext::oneapi::this_work_item::get_nd_item<3>(); + const int64_t i = SYCL_LOCAL_ID_CALC(item_ct1, 2); -static void acc_f32(const float * x, const float * y, float * dst, const int ne, - const int ne10, const int ne11, const int ne12, - const int nb1, const int nb2, int offset, const sycl::nd_item<1> &item_ct1) { - const int i = SYCL_LOCAL_ID_CALC(item_ct1, 0); if (i >= ne) { return; } - int src1_idx = i - offset; - int oz = src1_idx / nb2; - int oy = (src1_idx - (oz * nb2)) / nb1; - int ox = src1_idx % nb1; - if (src1_idx >= 0 && ox < ne10 && oy < ne11 && oz < ne12) { - dst[i] = x[i] + y[ox + oy * ne10 + oz * ne10 * ne11]; - } else { - dst[i] = x[i]; + + int64_t src1_idx = i - offset; + + int64_t tmp = src1_idx; + const int64_t i13 = tmp / s13; + tmp -= i13 * s13; + const int64_t i12 = tmp / s12; + tmp -= i12 * s12; + const int64_t i11 = tmp / s11; + tmp -= i11 * s11; + const int64_t i10 = tmp; + + float val = x[i]; + if (src1_idx >= 0 && i10 < ne10 && i11 < ne11 && i12 < ne12 && i13 < ne13) { + val += y[((i13*ne12 + i12) * ne11 + i11) * ne10 + i10]; } + dst[i] = val; } /* Unary OP funcs */ @@ -364,18 +373,15 @@ static void gated_op_fused_geglu_quick(const T * x, const T * g, T * dst, const namespace ggml_sycl_detail { static void acc_f32_sycl(const float *x, const float *y, float *dst, - const int n_elements, const int ne10, const int ne11, - const int ne12, const int nb1, const int nb2, - const int offset, queue_ptr stream) { - int num_blocks = ceil_div(n_elements, SYCL_ACC_BLOCK_SIZE); - stream->parallel_for( - sycl::nd_range<1>(sycl::range<1>(num_blocks) * - sycl::range<1>(SYCL_ACC_BLOCK_SIZE), - sycl::range<1>(SYCL_ACC_BLOCK_SIZE)), - [=](sycl::nd_item<1> item_ct1) { - acc_f32(x, y, dst, n_elements, ne10, ne11, ne12, nb1, nb2, offset, - item_ct1); - }); + const int64_t n_elements, const int64_t ne10, const int64_t ne11, + const int64_t ne12, const int64_t ne13, const int64_t s1, const int64_t s2, const int64_t s3, + const int64_t offset, queue_ptr stream) { + const int num_blocks = (n_elements + SYCL_ACC_BLOCK_SIZE - 1) / SYCL_ACC_BLOCK_SIZE; + stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_ACC_BLOCK_SIZE), + sycl::range<3>(1, 1, SYCL_ACC_BLOCK_SIZE)), + [=](sycl::nd_item<3> item_ct1) { + acc_f32(x, y, dst, n_elements, ne10, ne11, ne12, ne13, s1, s2, s3, offset); + }); } template @@ -402,25 +408,19 @@ static void upscale_sycl(const T *x, T *dst, const int nb00, const int nb01, template static inline void dispatch_ggml_sycl_op_unary(ggml_backend_sycl_context & ctx, ggml_tensor * dst, KernelInvoker kernel_invoker, Args&&... args) { -#if defined (GGML_SYCL_F16) GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16); GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16); -#else - GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); - GGML_ASSERT(dst->type == GGML_TYPE_F32); -#endif GGML_ASSERT(dst->src[0]->type == dst->type); + dpct::queue_ptr main_stream = ctx.stream(); SYCL_CHECK(ggml_sycl_set_device(ctx.device)); switch (dst->type) { -#if defined (GGML_SYCL_F16) case GGML_TYPE_F16: { auto data_pts = cast_data(dst); kernel_invoker(data_pts.src, data_pts.dst, (int)ggml_nelements(dst->src[0]), main_stream, std::forward(args)...); break; } -#endif case GGML_TYPE_F32: { auto data_pts = cast_data(dst); @@ -434,14 +434,10 @@ static inline void dispatch_ggml_sycl_op_unary(ggml_backend_sycl_context & ctx, template static inline void dispatch_ggml_sycl_op_fused_glu(ggml_backend_sycl_context & ctx, ggml_tensor * dst, KernelInvoker kernel_invoker, Args&&... args) { -#if defined (GGML_SYCL_F16) GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16); GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16); -#else - GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); - GGML_ASSERT(dst->type == GGML_TYPE_F32); -#endif GGML_ASSERT(dst->src[0]->type == dst->type); + dpct::queue_ptr main_stream = ctx.stream(); SYCL_CHECK(ggml_sycl_set_device(ctx.device)); const ggml_tensor * src0 = dst->src[0]; @@ -463,7 +459,6 @@ static inline void dispatch_ggml_sycl_op_fused_glu(ggml_backend_sycl_context & c GGML_ASSERT(src0->type == src1->type); } switch (dst->type) { -#if defined (GGML_SYCL_F16) case GGML_TYPE_F16: { sycl::half * src0_p = (sycl::half *) src0_d; @@ -484,7 +479,6 @@ static inline void dispatch_ggml_sycl_op_fused_glu(ggml_backend_sycl_context & c std::forward(args)...); break; } -#endif case GGML_TYPE_F32: { float * src0_p = (float *) src0_d; @@ -513,13 +507,9 @@ static inline void dispatch_ggml_sycl_op_fused_glu(ggml_backend_sycl_context & c template static inline void dispatch_ggml_sycl_op_upscale(ggml_backend_sycl_context & ctx, ggml_tensor * dst, KernelInvoker kernel_invoker, Args&&... args) { -#if defined (GGML_SYCL_F16) GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16); GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16); -#else - GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); - GGML_ASSERT(dst->type == GGML_TYPE_F32); -#endif + GGML_ASSERT(dst->src[0]->type == dst->type); dpct::queue_ptr main_stream = ctx.stream(); @@ -530,7 +520,6 @@ static inline void dispatch_ggml_sycl_op_upscale(ggml_backend_sycl_context & ctx const float sf2 = (float) dst->ne[2] / dst->src[0]->ne[2]; const float sf3 = (float) dst->ne[3] / dst->src[0]->ne[3]; switch (dst->type) { -#if defined (GGML_SYCL_F16) case GGML_TYPE_F16: { auto data_pts = cast_data(dst); @@ -539,7 +528,6 @@ static inline void dispatch_ggml_sycl_op_upscale(ggml_backend_sycl_context & ctx main_stream, std::forward(args)...); break; } -#endif case GGML_TYPE_F32: { auto data_pts = cast_data(dst); @@ -868,22 +856,31 @@ static inline void ggml_sycl_op_trunc(ggml_backend_sycl_context & ctx, ggml_tens } static inline void ggml_sycl_op_acc(ggml_backend_sycl_context & ctx, ggml_tensor *dst) { - GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); - GGML_ASSERT(dst->src[1]->type == GGML_TYPE_F32); + const ggml_tensor * src0 = dst->src[0]; + const ggml_tensor * src1 = dst->src[1]; + + const float * src0_d = (const float *) src0->data; + const float * src1_d = (const float *) src1->data; + float * dst_d = (float *) dst->data; + + dpct::queue_ptr stream = ctx.stream(); + + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT(src1->type == GGML_TYPE_F32); GGML_ASSERT( dst->type == GGML_TYPE_F32); - GGML_ASSERT(dst->ne[3] == 1); // just 3D tensors supported - dpct::queue_ptr main_stream = ctx.stream(); - SYCL_CHECK(ggml_sycl_set_device(ctx.device)); - const float * src0_dd = static_cast(dst->src[0]->data); - const float * src1_dd = static_cast(dst->src[1]->data); - float * dst_dd = static_cast(dst->data); - int nb1 = dst->op_params[0] / 4; // 4 bytes of float32 - int nb2 = dst->op_params[1] / 4; // 4 bytes of float32 - // int nb3 = dst->op_params[2] / 4; // 4 bytes of float32 - unused - int offset = dst->op_params[3] / 4; // offset in bytes + GGML_ASSERT(ggml_is_contiguous(src1)); + GGML_ASSERT(dst->nb[0] == ggml_element_size(dst)); + GGML_ASSERT(ggml_is_contiguously_allocated(dst)); - ggml_sycl_detail::acc_f32_sycl(src0_dd, src1_dd, dst_dd, (int)ggml_nelements(dst), (int)dst->src[1]->ne[0], (int)dst->src[1]->ne[1], (int)dst->src[1]->ne[2], nb1, nb2, offset, main_stream); + const int64_t s1 = dst->op_params[0] / sizeof(float); + const int64_t s2 = dst->op_params[1] / sizeof(float); + const int64_t s3 = dst->op_params[2] / sizeof(float); + const int64_t offset = dst->op_params[3] / sizeof(float); + + ggml_sycl_detail::acc_f32_sycl(src0_d, src1_d, dst_d, ggml_nelements(dst), + src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3], + s1, s2, s3, offset, stream); } static inline void ggml_sycl_op_geglu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp index dfacde0af3..66dfc4532c 100644 --- a/ggml/src/ggml-sycl/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl/ggml-sycl.cpp @@ -4872,8 +4872,9 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g k > 0 && k <= 32; } case GGML_OP_POOL_2D: - case GGML_OP_ACC: return true; + case GGML_OP_ACC: + return ggml_is_contiguous(op->src[0]) && ggml_is_contiguous(op->src[1]); case GGML_OP_PAD: // TODO: add circular padding support for syscl, see https://github.com/ggml-org/llama.cpp/pull/16985 if (ggml_get_op_params_i32(op, 8) != 0) { diff --git a/ggml/src/ggml-sycl/norm.cpp b/ggml/src/ggml-sycl/norm.cpp index 00702b5d09..09fce1280a 100644 --- a/ggml/src/ggml-sycl/norm.cpp +++ b/ggml/src/ggml-sycl/norm.cpp @@ -202,47 +202,34 @@ static void rms_norm_f32(const float* x, float* dst, const int ncols, const int6 } } -static void l2_norm_f32(const float* x, float* dst, const int ncols, const float eps, - const sycl::nd_item<3>& item_ct1, float* s_sum, int block_size) { - const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) + - item_ct1.get_local_id(1); - const int tid = item_ct1.get_local_id(2); - const int nthreads = item_ct1.get_local_range(2); - const int nwarps = nthreads / WARP_SIZE; +template +static void l2_norm_f32(const float * x, float * dst, const int ncols, + const int64_t stride_row, const int64_t stride_channel, + const int64_t stride_sample, const float eps, + const sycl::nd_item<3>& item_ct1, float* s_sum, const int block_size) { + const int nrows = item_ct1.get_group_range(2); + const int nchannels = item_ct1.get_group_range(1); + + const int row = item_ct1.get_group(2); + const int channel = item_ct1.get_group(1); + const int sample = item_ct1.get_group(0); + const int tid = item_ct1.get_local_id(2); + + x += sample*stride_sample + channel*stride_channel + row*stride_row; + dst += ((sample*nchannels + channel)*nrows + row)*ncols; + float tmp = 0.0f; // partial sum for thread in warp for (int col = tid; col < ncols; col += block_size) { - const float xi = x[row * ncols + col]; + const float xi = x[col]; tmp += xi * xi; } - // sum up partial sums - tmp = warp_reduce_sum(tmp, item_ct1); - if (block_size > WARP_SIZE) { - - int warp_id = item_ct1.get_local_id(2) / WARP_SIZE; - int lane_id = item_ct1.get_local_id(2) % WARP_SIZE; - if (lane_id == 0) { - s_sum[warp_id] = tmp; - } - /* - DPCT1118:3: SYCL group functions and algorithms must be encountered in - converged control flow. You may need to adjust the code. - */ - item_ct1.barrier(sycl::access::fence_space::local_space); - size_t nreduce = nwarps / WARP_SIZE; - tmp = 0.f; - for (size_t i = 0; i < nreduce; i += 1) - { - tmp += s_sum[lane_id + i * WARP_SIZE]; - } - tmp = warp_reduce_sum(tmp, item_ct1); - } - - const float scale = sycl::rsqrt(sycl::max(tmp, eps * eps)); + tmp = block_reduce(tmp, s_sum, block_size); + const float scale = sycl::rsqrt(sycl::fmax(tmp, eps * eps)); for (int col = tid; col < ncols; col += block_size) { - dst[row * ncols + col] = scale * x[row * ncols + col]; + dst[col] = scale * x[col]; } } @@ -369,42 +356,50 @@ static void rms_norm_f32_sycl(const float* x, float* dst, const int ncols, const } } -static void l2_norm_f32_sycl(const float* x, float* dst, const int ncols, - const int nrows, const float eps, - queue_ptr stream, int device) { - // printf("%s ncols=%d, nrows=%d, WARP_SIZE=%d\n", __func__, ncols, nrows, WARP_SIZE); +template +static void l2_norm_f32_sycl(const float * x, + float * dst, + const int ncols, + const int nrows, + const int nchannels, + const int nsamples, + const int64_t stride_row, + const int64_t stride_channel, + const int64_t stride_sample, + const float eps, + queue_ptr stream, + int device) { + const dpct::dim3 blocks_num(nrows, nchannels, nsamples); + if (ncols < 1024) { - const sycl::range<3> block_dims(1, 1, WARP_SIZE); + const dpct::dim3 block_dims(warp_size, 1, 1); stream->submit([&](sycl::handler& cgh) { cgh.parallel_for( - sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims, + sycl::nd_range<3>(blocks_num * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) - [[sycl::reqd_sub_group_size(WARP_SIZE)]] { - l2_norm_f32(x, dst, ncols, eps, item_ct1, - nullptr, WARP_SIZE); + [[sycl::reqd_sub_group_size(warp_size)]] { + l2_norm_f32(x, dst, ncols, stride_row, stride_channel, stride_sample, eps, item_ct1, + nullptr, warp_size); }); }); } else { const int work_group_size = ggml_sycl_info().max_work_group_sizes[device]; - assert(work_group_size % (WARP_SIZE * WARP_SIZE) == 0); + assert(work_group_size % (warp_size * warp_size) == 0); const sycl::range<3> block_dims(1, 1, work_group_size); - /* - DPCT1049:19: The work-group size passed to the SYCL kernel may exceed - the limit. To get the device limit, query - info::device::max_work_group_size. Adjust the work-group size if needed. - */ + int lsm_size = block_dims[2] > warp_size ? work_group_size / warp_size * sizeof(float): 0; stream->submit([&](sycl::handler& cgh) { - sycl::local_accessor s_sum_acc_ct1(sycl::range<1>(work_group_size / WARP_SIZE), + sycl::local_accessor s_sum_acc_ct1(sycl::range<1>(lsm_size), cgh); + cgh.parallel_for( - sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims, + sycl::nd_range<3>(blocks_num * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) - [[sycl::reqd_sub_group_size(WARP_SIZE)]] { - l2_norm_f32(x, dst, ncols, eps, item_ct1, - get_pointer(s_sum_acc_ct1), work_group_size); + [[sycl::reqd_sub_group_size(warp_size)]] { + l2_norm_f32(x, dst, ncols, stride_row, stride_channel, stride_sample, + eps, item_ct1, get_pointer(s_sum_acc_ct1), work_group_size); }); }); } @@ -634,21 +629,28 @@ void ggml_sycl_op_rms_norm_back(ggml_backend_sycl_context & ctx, ggml_tensor * d } void ggml_sycl_op_l2_norm(ggml_backend_sycl_context& ctx, ggml_tensor* dst) { + const ggml_tensor * src0 = dst->src[0]; + const float * src0_d = (const float *) src0->data; + float * dst_d = (float *) dst->data; + dpct::queue_ptr stream = ctx.stream(); - GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); - GGML_ASSERT(dst->type == GGML_TYPE_F32); + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); - dpct::queue_ptr main_stream = ctx.stream(); - SYCL_CHECK(ggml_sycl_set_device(ctx.device)); - - const int64_t ne00 = dst->src[0]->ne[0]; - const int64_t nrows = ggml_nrows(dst->src[0]); - const float * src0_dd = static_cast(dst->src[0]->data); - float * dst_dd = static_cast(dst->data); + GGML_TENSOR_UNARY_OP_LOCALS; float eps; memcpy(&eps, dst->op_params, sizeof(float)); + GGML_ASSERT(eps >= 0.0f); - l2_norm_f32_sycl(src0_dd, dst_dd, ne00, nrows, eps, main_stream, ctx.device); + const size_t ts0 = ggml_type_size(src0->type); + GGML_ASSERT(nb00 == ts0); + const int64_t s01 = nb01 / ts0; + const int64_t s02 = nb02 / ts0; + const int64_t s03 = nb03 / ts0; + /*support both WARP_SIZE or WARP_32_SIZE in code + choose by hardware for better performance + */ + l2_norm_f32_sycl(src0_d, dst_d, ne00, ne01, ne02, ne03, s01, s02, s03, eps, stream, ctx.device); }