mirror of https://github.com/google/gemma.cpp.git
512 lines
17 KiB
Common Lisp
512 lines
17 KiB
Common Lisp
// Two-dimensional box blur.
//
// Every work-item produces one output pixel: the rounded mean of all input
// pixels inside the (2*size+1) x (2*size+1) window centred on its global id.
// The window is clipped at the image border, and the divisor is the number
// of pixels that were actually read.
//
//   input_image   image that is read (unsigned-integer channels)
//   output_image  image that receives the blurred pixel at the same coordinate
//   size          blur radius in pixels; presumably >= 0 (a negative radius
//                 reads nothing and leaves the divisor at zero) - TODO confirm
kernel void blur_box(
    read_only image2d_t input_image,
    write_only image2d_t output_image,
    int size
)
{
    const int width = get_image_width(input_image);
    const int height = get_image_height(input_image);
    // coordinates of the pixel to work on
    const int2 coord = (int2)((int)get_global_id(0), (int)get_global_id(1));

    // Work-items that fall outside the image (possible when the host launches
    // more work-items than there are pixels) would read no pixel at all,
    // divide by zero and write out of range, so they stop here.
    if ((coord.x >= width) || (coord.y >= height)) {
        return;
    }

    uint4 sum = (uint4)(0);
    uint num = 0;
    for (int dx = -size; dx <= size; ++dx) {
        for (int dy = -size; dy <= size; ++dy) {
            const int2 cur = coord + (int2)(dx, dy);
            // skip window positions that lie outside the image
            if ((0 <= cur.x) && (cur.x < width) && (0 <= cur.y) && (cur.y < height)) {
                ++num;
                sum += read_imageui(input_image, cur);
            }
        }
    }
    // adding num / 2 before dividing rounds the mean to the nearest integer
    write_imageui(output_image, coord, (sum + num / 2) / num);
}
|
|
|
|
|
|
// Horizontal pass of a separable box blur.
//
// Every work-item produces one output pixel: the rounded mean of the input
// pixels in its own row whose x lies in [coord.x - size, coord.x + size].
// The window is clipped at the left and right image border, and the divisor
// is the number of pixels that were actually read.
//
//   input_image   image that is read (unsigned-integer channels)
//   output_image  image that receives the blurred pixel at the same coordinate
//   size          blur radius in pixels; presumably >= 0 - TODO confirm
kernel void blur_box_horizontal(
    read_only image2d_t input_image,
    write_only image2d_t output_image,
    int size
)
{
    // The clamp limits come from the image itself rather than from the launch
    // size, so the border handling stays correct when the two differ.
    const int width = get_image_width(input_image);
    const int height = get_image_height(input_image);
    const int2 coord = (int2)((int)get_global_id(0), (int)get_global_id(1));

    // Work-items outside the image would read nothing and divide by zero.
    if ((coord.x >= width) || (coord.y >= height)) {
        return;
    }

    uint4 sum = (uint4)(0);
    uint num = 0;
    for (int dx = -size; dx <= size; ++dx) {
        const int2 cur = coord + (int2)(dx, 0);
        if ((0 <= cur.x) && (cur.x < width)) {
            ++num;
            sum += read_imageui(input_image, cur);
        }
    }
    // adding num / 2 before dividing rounds the mean to the nearest integer
    write_imageui(output_image, coord, (sum + num / 2) / num);
}
|
|
|
|
// Vertical pass of a separable box blur.
//
// Every work-item produces one output pixel: the rounded mean of the input
// pixels in its own column whose y lies in [coord.y - size, coord.y + size].
// The window is clipped at the top and bottom image border, and the divisor
// is the number of pixels that were actually read.
//
//   input_image   image that is read (unsigned-integer channels)
//   output_image  image that receives the blurred pixel at the same coordinate
//   size          blur radius in pixels; presumably >= 0 - TODO confirm
kernel void blur_box_vertical(
    read_only image2d_t input_image,
    write_only image2d_t output_image,
    int size
)
{
    // The clamp limits come from the image itself rather than from the launch
    // size, so the border handling stays correct when the two differ.
    const int width = get_image_width(input_image);
    const int height = get_image_height(input_image);
    const int2 coord = (int2)((int)get_global_id(0), (int)get_global_id(1));

    // Work-items outside the image would read nothing and divide by zero.
    if ((coord.x >= width) || (coord.y >= height)) {
        return;
    }

    uint4 sum = (uint4)(0);
    uint num = 0;
    for (int dy = -size; dy <= size; ++dy) {
        const int2 cur = coord + (int2)(0, dy);
        if ((0 <= cur.y) && (cur.y < height)) {
            ++num;
            sum += read_imageui(input_image, cur);
        }
    }
    // adding num / 2 before dividing rounds the mean to the nearest integer
    write_imageui(output_image, coord, (sum + num / 2) / num);
}
|
|
|
|
|
|
// Horizontal pass of a separable weighted blur.
//
// Every work-item produces one output pixel: the weighted mean of the input
// pixels in its own row whose x lies in [coord.x - size, coord.x + size].
// Window positions outside the image are skipped, and the result is
// normalised by the sum of the weights that were actually used.
//
//   input_image   image that is read (unsigned-integer channels)
//   output_image  image that receives the blurred pixel at the same coordinate
//   size          blur radius in pixels; presumably >= 0 - TODO confirm
//   kern          weights, read at indices 0 .. 2*size; kern[size] belongs to
//                 the centre pixel
kernel void blur_kernel_horizontal(
    read_only image2d_t input_image,
    write_only image2d_t output_image,
    int size,
    constant float * kern
)
{
    const int width = get_image_width(input_image);
    const int height = get_image_height(input_image);
    const int2 coord = (int2)((int)get_global_id(0), (int)get_global_id(1));

    // Work-items outside the image would accumulate no weight and end up
    // dividing zero by zero, so they stop here.
    if ((coord.x >= width) || (coord.y >= height)) {
        return;
    }

    float4 sum = (float4)(0.0f);
    float weight = 0.0f;
    for (int dx = -size; dx <= size; ++dx) {
        const int2 cur = coord + (int2)(dx, 0);
        if ((0 <= cur.x) && (cur.x < width)) {
            const float w = kern[size + dx];
            weight += w;
            sum += convert_float4(read_imageui(input_image, cur)) * w;
        }
    }
    // renormalise by the weights actually used, then round to nearest
    const uint4 res = convert_uint4(round(sum / weight));
    write_imageui(output_image, coord, res);
}
|
|
|
|
// Vertical pass of a separable weighted blur.
//
// Every work-item produces one output pixel: the weighted mean of the input
// pixels in its own column whose y lies in [coord.y - size, coord.y + size].
// Window positions outside the image are skipped, and the result is
// normalised by the sum of the weights that were actually used.
//
//   input_image   image that is read (unsigned-integer channels)
//   output_image  image that receives the blurred pixel at the same coordinate
//   size          blur radius in pixels; presumably >= 0 - TODO confirm
//   kern          weights, read at indices 0 .. 2*size; kern[size] belongs to
//                 the centre pixel
kernel void blur_kernel_vertical(
    read_only image2d_t input_image,
    write_only image2d_t output_image,
    int size,
    constant float * kern
)
{
    const int width = get_image_width(input_image);
    const int height = get_image_height(input_image);
    const int2 coord = (int2)((int)get_global_id(0), (int)get_global_id(1));

    // Work-items outside the image would accumulate no weight and end up
    // dividing zero by zero, so they stop here.
    if ((coord.x >= width) || (coord.y >= height)) {
        return;
    }

    float4 sum = (float4)(0.0f);
    float weight = 0.0f;
    for (int dy = -size; dy <= size; ++dy) {
        const int2 cur = coord + (int2)(0, dy);
        if ((0 <= cur.y) && (cur.y < height)) {
            const float w = kern[size + dy];
            weight += w;
            sum += convert_float4(read_imageui(input_image, cur)) * w;
        }
    }
    // renormalise by the weights actually used, then round to nearest
    write_imageui(output_image, coord, convert_uint4(round(sum / weight)));
}
|
|
|
|
|
|
// Horizontal box blur that stages the pixels of a work-group in local memory.
//
// Phase 1: the work-items of a group cooperatively copy the row segment the
//          group needs (its own columns plus `size` columns on either side,
//          clipped to the image) into `line`.
// Phase 2: after a barrier, every work-item averages its window out of
//          `line` instead of re-reading the image.
//
//   input_image   image that is read (unsigned-integer channels)
//   output_image  image that receives the blurred pixel at the same coordinate
//   size          blur radius in pixels; presumably >= 0 - TODO confirm
//   line          local scratch buffer; at most get_local_size(0) + 2*size
//                 elements are written. Pixels are narrowed to uchar4, so
//                 this presumably expects 8-bit channels - TODO confirm
//
// NOTE(review): `line` is indexed by x only, so all work-items of a group are
// assumed to share one row (local size 1 in dimension 1) - confirm on the host.
kernel void blur_box_horizontal_exchange(
    read_only image2d_t input_image,
    write_only image2d_t output_image,
    int size,
    local uchar4 * line
)
{
    const int width = get_image_width(input_image);
    const int2 coord = (int2)((int)get_global_id(0), (int)get_global_id(1));

    const int grs = get_local_size(0);
    const int lid = get_local_id(0);
    // x of the leftmost and rightmost pixels needed for the workgroup (both
    // inclusive). The group owns columns start .. start + grs - 1, so the
    // last column any of its windows touches is start + grs - 1 + size.
    const int start = get_group_id(0) * grs;
    const int first_x = max(start - size, 0);
    const int last_x = min(start + grs - 1 + size, width - 1);

    // copy all pixels needed for workgroup into local memory; work-item `lid`
    // handles the columns first_x + lid, first_x + lid + grs, ...
    for (int x = first_x + lid; x <= last_x; x += grs) {
        line[x - first_x] = convert_uchar4(read_imageui(input_image, (int2)(x, coord.y)));
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    // blur
    if (coord.x < width) {
        // window clipped to the image; it always lies inside [first_x, last_x]
        const int lo = max(coord.x - size, 0);
        const int hi = min(coord.x + size, width - 1);
        uint4 sum = (uint4)(0);
        uint num = 0;
        for (int x = lo; x <= hi; ++x) {
            ++num;
            sum += convert_uint4(line[x - first_x]);
        }
        // adding num / 2 before dividing rounds the mean to the nearest integer
        write_imageui(output_image, coord, (sum + num / 2) / num);
    }
}
|
|
|
|
// Vertical box blur that stages the pixels of a work-group in local memory.
//
// Phase 1: the work-items of a group cooperatively copy the column segment
//          the group needs (its own rows plus `size` rows above and below,
//          clipped to the image) into `line`.
// Phase 2: after a barrier, every work-item averages its window out of
//          `line` instead of re-reading the image.
//
//   input_image   image that is read (unsigned-integer channels)
//   output_image  image that receives the blurred pixel at the same coordinate
//   size          blur radius in pixels; presumably >= 0 - TODO confirm
//   line          local scratch buffer; at most get_local_size(1) + 2*size
//                 elements are written. Pixels are narrowed to uchar4, so
//                 this presumably expects 8-bit channels - TODO confirm
//
// NOTE(review): `line` is indexed by y only, so all work-items of a group are
// assumed to share one column (local size 1 in dimension 0) - confirm on the
// host.
kernel void blur_box_vertical_exchange(
    read_only image2d_t input_image,
    write_only image2d_t output_image,
    int size,
    local uchar4 * line
)
{
    const int height = get_image_height(input_image);
    const int2 coord = (int2)((int)get_global_id(0), (int)get_global_id(1));

    const int grs = get_local_size(1);
    const int lid = get_local_id(1);
    // y of the topmost and lowest pixels needed for the workgroup (both
    // inclusive). The group owns rows start .. start + grs - 1, so the last
    // row any of its windows touches is start + grs - 1 + size.
    const int start = get_group_id(1) * grs;
    const int first_y = max(start - size, 0);
    const int last_y = min(start + grs - 1 + size, height - 1);

    // copy all pixels needed for workgroup into local memory; work-item `lid`
    // handles the rows first_y + lid, first_y + lid + grs, ...
    for (int y = first_y + lid; y <= last_y; y += grs) {
        line[y - first_y] = convert_uchar4(read_imageui(input_image, (int2)(coord.x, y)));
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    // blur
    if (coord.y < height) {
        // window clipped to the image; it always lies inside [first_y, last_y]
        const int lo = max(coord.y - size, 0);
        const int hi = min(coord.y + size, height - 1);
        uint4 sum = (uint4)(0);
        uint num = 0;
        for (int y = lo; y <= hi; ++y) {
            ++num;
            sum += convert_uint4(line[y - first_y]);
        }
        // adding num / 2 before dividing rounds the mean to the nearest integer
        write_imageui(output_image, coord, (sum + num / 2) / num);
    }
}
|
|
|
|
|
|
// Horizontal weighted blur that stages the pixels of a work-group in local
// memory.
//
// Phase 1: the work-items of a group cooperatively copy the row segment the
//          group needs (its own columns plus `size` columns on either side,
//          clipped to the image) into `line`.
// Phase 2: after a barrier, every work-item forms the weighted mean of its
//          window out of `line`, normalised by the weights actually used.
//
//   input_image   image that is read (unsigned-integer channels)
//   output_image  image that receives the blurred pixel at the same coordinate
//   size          blur radius in pixels; presumably >= 0 - TODO confirm
//   kern          weights, read at indices 0 .. 2*size; kern[size] belongs to
//                 the centre pixel
//   line          local scratch buffer; at most get_local_size(0) + 2*size
//                 elements are written. Pixels are narrowed to uchar4, so
//                 this presumably expects 8-bit channels - TODO confirm
//
// NOTE(review): `line` is indexed by x only, so all work-items of a group are
// assumed to share one row (local size 1 in dimension 1) - confirm on the host.
kernel void blur_kernel_horizontal_exchange(
    read_only image2d_t input_image,
    write_only image2d_t output_image,
    int size,
    constant float * kern,
    local uchar4 * line
)
{
    const int width = get_image_width(input_image);
    const int2 coord = (int2)((int)get_global_id(0), (int)get_global_id(1));

    const int grs = get_local_size(0);
    const int lid = get_local_id(0);
    // x of the leftmost and rightmost pixels needed for the workgroup (both
    // inclusive). The group owns columns start .. start + grs - 1, so the
    // last column any of its windows touches is start + grs - 1 + size.
    const int start = get_group_id(0) * grs;
    const int first_x = max(start - size, 0);
    const int last_x = min(start + grs - 1 + size, width - 1);

    // copy all pixels needed for workgroup into local memory; work-item `lid`
    // handles the columns first_x + lid, first_x + lid + grs, ...
    for (int x = first_x + lid; x <= last_x; x += grs) {
        line[x - first_x] = convert_uchar4(read_imageui(input_image, (int2)(x, coord.y)));
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    // blur
    if (coord.x < width) {
        // window clipped to the image; it always lies inside [first_x, last_x]
        const int lo = max(coord.x - size, 0);
        const int hi = min(coord.x + size, width - 1);
        float4 sum = (float4)(0.0f);
        float weight = 0.0f;
        for (int x = lo; x <= hi; ++x) {
            // x - coord.x is the signed offset from the centre pixel
            const float w = kern[size + (x - coord.x)];
            weight += w;
            sum += convert_float4(line[x - first_x]) * w;
        }
        // renormalise by the weights actually used, then round to nearest
        write_imageui(output_image, coord, convert_uint4(round(sum / weight)));
    }
}
|
|
|
|
// Vertical weighted blur that stages the pixels of a work-group in local
// memory.
//
// Phase 1: the work-items of a group cooperatively copy the column segment
//          the group needs (its own rows plus `size` rows above and below,
//          clipped to the image) into `line`.
// Phase 2: after a barrier, every work-item forms the weighted mean of its
//          window out of `line`, normalised by the weights actually used.
//
//   input_image   image that is read (unsigned-integer channels)
//   output_image  image that receives the blurred pixel at the same coordinate
//   size          blur radius in pixels; presumably >= 0 - TODO confirm
//   kern          weights, read at indices 0 .. 2*size; kern[size] belongs to
//                 the centre pixel
//   line          local scratch buffer; at most get_local_size(1) + 2*size
//                 elements are written. Pixels are narrowed to uchar4, so
//                 this presumably expects 8-bit channels - TODO confirm
//
// NOTE(review): `line` is indexed by y only, so all work-items of a group are
// assumed to share one column (local size 1 in dimension 0) - confirm on the
// host.
kernel void blur_kernel_vertical_exchange(
    read_only image2d_t input_image,
    write_only image2d_t output_image,
    int size,
    constant float * kern,
    local uchar4 * line
)
{
    const int height = get_image_height(input_image);
    const int2 coord = (int2)((int)get_global_id(0), (int)get_global_id(1));

    const int grs = get_local_size(1);
    const int lid = get_local_id(1);
    // y of the topmost and lowest pixels needed for the workgroup (both
    // inclusive). The group owns rows start .. start + grs - 1, so the last
    // row any of its windows touches is start + grs - 1 + size.
    const int start = get_group_id(1) * grs;
    const int first_y = max(start - size, 0);
    const int last_y = min(start + grs - 1 + size, height - 1);

    // copy all pixels needed for workgroup into local memory; work-item `lid`
    // handles the rows first_y + lid, first_y + lid + grs, ...
    for (int y = first_y + lid; y <= last_y; y += grs) {
        line[y - first_y] = convert_uchar4(read_imageui(input_image, (int2)(coord.x, y)));
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    // blur
    if (coord.y < height) {
        // window clipped to the image; it always lies inside [first_y, last_y]
        const int lo = max(coord.y - size, 0);
        const int hi = min(coord.y + size, height - 1);
        float4 sum = (float4)(0.0f);
        float weight = 0.0f;
        for (int y = lo; y <= hi; ++y) {
            // y - coord.y is the signed offset from the centre pixel
            const float w = kern[size + (y - coord.y)];
            weight += w;
            sum += convert_float4(line[y - first_y]) * w;
        }
        // renormalise by the weights actually used, then round to nearest
        write_imageui(output_image, coord, convert_uint4(round(sum / weight)));
    }
}
|
|
|
|
|
|
#if defined(USE_SUBGROUP_EXCHANGE_RELATIVE) || defined(USE_SUBGROUP_EXCHANGE)
|
|
|
|
// Horizontal box blur that passes pixels between the work-items of a
// sub-group instead of having every work-item read its whole window.
//
// Each work-item starts with the pixel `size` columns to its right and then
// walks its window leftwards one column per step. On every step a work-item
// takes over the pixel its predecessor in the sub-group held on the previous
// step; only sub-group lane 0 reads a new pixel from the image. Window
// positions outside the image are neither added nor counted.
//
//   input_image   image that is read (unsigned-integer channels)
//   output_image  image that receives the rounded mean at the same coordinate
//   size          blur radius in pixels
//
// NOTE(review): this presumably relies on consecutive sub-group lanes sitting
// at consecutive x in the same row - confirm the work-group shape on the host.
kernel void blur_box_horizontal_subgroup_exchange(
    read_only image2d_t input_image,
    write_only image2d_t output_image,
    int size
)
{
    const int2 coord = (int2)((int)get_global_id(0), (int)get_global_id(1));
    const int width = get_image_width(input_image);

    if (coord.x < width) {
        uint4 px = (uint4)(0);
        uint4 acc = (uint4)(0);
        uint count = 0;

        // rightmost window position is read directly by every work-item
        int2 tap = (int2)(coord.x + size, coord.y);
        if (tap.x < width) {
            px = read_imageui(input_image, tap);
            acc = px;
            count = 1;
        }

        const uint lane = get_sub_group_local_id();
        // 1 for every lane that has a predecessor, 0 for lane 0
        const uint delta = (lane != 0) ? 1u : 0u;

        // the remaining 2 * size window positions, right to left
        for (int remaining = size; remaining > -size; --remaining) {
            tap.x -= 1;
            // take over the pixel held by the previous lane (lane 0 keeps its own)
#if defined(USE_SUBGROUP_EXCHANGE_RELATIVE)
            px = (uint4)(sub_group_shuffle_up(px.s0, delta),
                         sub_group_shuffle_up(px.s1, delta),
                         sub_group_shuffle_up(px.s2, delta),
                         sub_group_shuffle_up(px.s3, delta));
#elif defined(USE_SUBGROUP_EXCHANGE)
            px = (uint4)(sub_group_shuffle(px.s0, lane - delta),
                         sub_group_shuffle(px.s1, lane - delta),
                         sub_group_shuffle(px.s2, lane - delta),
                         sub_group_shuffle(px.s3, lane - delta));
#endif
            if ((tap.x >= 0) && (tap.x < width)) {
                if (lane == 0) {
                    // lane 0 has no predecessor and fetches the new pixel itself
                    px = read_imageui(input_image, tap);
                }
                acc += px;
                ++count;
            }
        }

        // adding count / 2 before dividing rounds the mean to the nearest integer
        write_imageui(output_image, coord, (acc + count / 2) / count);
    }
}
|
|
|
|
// Vertical box blur that passes pixels between the work-items of a sub-group
// instead of having every work-item read its whole window.
//
// Each work-item starts with the pixel `size` rows below it and then walks
// its window upwards one row per step. On every step a work-item takes over
// the pixel its predecessor in the sub-group held on the previous step; only
// sub-group lane 0 reads a new pixel from the image. Window positions outside
// the image are neither added nor counted.
//
//   input_image   image that is read (unsigned-integer channels)
//   output_image  image that receives the rounded mean at the same coordinate
//   size          blur radius in pixels
//
// NOTE(review): this presumably relies on consecutive sub-group lanes sitting
// at consecutive y in the same column - confirm the work-group shape on the
// host.
kernel void blur_box_vertical_subgroup_exchange(
    read_only image2d_t input_image,
    write_only image2d_t output_image,
    int size
)
{
    const int2 coord = (int2)((int)get_global_id(0), (int)get_global_id(1));
    const int height = get_image_height(input_image);

    if (coord.y < height) {
        uint4 px = (uint4)(0);
        uint4 acc = (uint4)(0);
        uint count = 0;

        // lowest window position is read directly by every work-item
        int2 tap = (int2)(coord.x, coord.y + size);
        if (tap.y < height) {
            px = read_imageui(input_image, tap);
            acc = px;
            count = 1;
        }

        const uint lane = get_sub_group_local_id();
        // 1 for every lane that has a predecessor, 0 for lane 0
        const uint delta = (lane != 0) ? 1u : 0u;

        // the remaining 2 * size window positions, bottom to top
        for (int remaining = size; remaining > -size; --remaining) {
            tap.y -= 1;
            // take over the pixel held by the previous lane (lane 0 keeps its own)
#if defined(USE_SUBGROUP_EXCHANGE_RELATIVE)
            px = (uint4)(sub_group_shuffle_up(px.s0, delta),
                         sub_group_shuffle_up(px.s1, delta),
                         sub_group_shuffle_up(px.s2, delta),
                         sub_group_shuffle_up(px.s3, delta));
#elif defined(USE_SUBGROUP_EXCHANGE)
            px = (uint4)(sub_group_shuffle(px.s0, lane - delta),
                         sub_group_shuffle(px.s1, lane - delta),
                         sub_group_shuffle(px.s2, lane - delta),
                         sub_group_shuffle(px.s3, lane - delta));
#endif
            if ((tap.y >= 0) && (tap.y < height)) {
                if (lane == 0) {
                    // lane 0 has no predecessor and fetches the new pixel itself
                    px = read_imageui(input_image, tap);
                }
                acc += px;
                ++count;
            }
        }

        // adding count / 2 before dividing rounds the mean to the nearest integer
        write_imageui(output_image, coord, (acc + count / 2) / count);
    }
}
|
|
|
|
|
|
// Horizontal weighted blur that passes pixels between the work-items of a
// sub-group instead of having every work-item read its whole window.
//
// Each work-item starts with the pixel `size` columns to its right (weight
// kern[2 * size]) and then walks its window leftwards one column per step.
// On every step a work-item takes over the pixel its predecessor in the
// sub-group held on the previous step; only sub-group lane 0 reads a new
// pixel from the image. Window positions outside the image contribute
// neither a value nor a weight, and the result is normalised by the weights
// that were used.
//
//   input_image   image that is read (unsigned-integer channels)
//   output_image  image that receives the rounded result at the same coordinate
//   size          blur radius in pixels
//   kern          weights, read at indices 0 .. 2*size; kern[size] belongs to
//                 the centre pixel
//
// NOTE(review): this presumably relies on consecutive sub-group lanes sitting
// at consecutive x in the same row - confirm the work-group shape on the host.
kernel void blur_kernel_horizontal_subgroup_exchange(
    read_only image2d_t input_image,
    write_only image2d_t output_image,
    int size,
    constant float * kern
)
{
    const int2 coord = (int2)((int)get_global_id(0), (int)get_global_id(1));
    const int width = get_image_width(input_image);

    if (coord.x < width) {
        uint4 px = (uint4)(0);
        float4 acc = (float4)(0.0f);
        float total = 0.0f;

        // rightmost window position is read directly by every work-item
        int2 tap = (int2)(coord.x + size, coord.y);
        if (tap.x < width) {
            px = read_imageui(input_image, tap);
            total = kern[2 * size];
            acc = convert_float4(px) * total;
        }

        const uint lane = get_sub_group_local_id();
        // 1 for every lane that has a predecessor, 0 for lane 0
        const uint delta = (lane != 0) ? 1u : 0u;

        // the remaining window offsets size - 1 .. -size, right to left
        int offset = size - 1;
        while (offset >= -size) {
            tap.x -= 1;
            // take over the pixel held by the previous lane (lane 0 keeps its own)
#if defined(USE_SUBGROUP_EXCHANGE_RELATIVE)
            px = (uint4)(sub_group_shuffle_up(px.s0, delta),
                         sub_group_shuffle_up(px.s1, delta),
                         sub_group_shuffle_up(px.s2, delta),
                         sub_group_shuffle_up(px.s3, delta));
#elif defined(USE_SUBGROUP_EXCHANGE)
            px = (uint4)(sub_group_shuffle(px.s0, lane - delta),
                         sub_group_shuffle(px.s1, lane - delta),
                         sub_group_shuffle(px.s2, lane - delta),
                         sub_group_shuffle(px.s3, lane - delta));
#endif
            if ((tap.x >= 0) && (tap.x < width)) {
                if (lane == 0) {
                    // lane 0 has no predecessor and fetches the new pixel itself
                    px = read_imageui(input_image, tap);
                }
                const float w = kern[size + offset];
                total += w;
                acc += convert_float4(px) * w;
            }
            --offset;
        }

        // renormalise by the weights actually used, then round to nearest
        write_imageui(output_image, coord, convert_uint4(round(acc / total)));
    }
}
|
|
|
|
// Vertical weighted blur that passes pixels between the work-items of a
// sub-group instead of having every work-item read its whole window.
//
// Each work-item starts with the pixel `size` rows below it (weight
// kern[2 * size]) and then walks its window upwards one row per step. On
// every step a work-item takes over the pixel its predecessor in the
// sub-group held on the previous step; only sub-group lane 0 reads a new
// pixel from the image. Window positions outside the image contribute
// neither a value nor a weight, and the result is normalised by the weights
// that were used.
//
//   input_image   image that is read (unsigned-integer channels)
//   output_image  image that receives the rounded result at the same coordinate
//   size          blur radius in pixels
//   kern          weights, read at indices 0 .. 2*size; kern[size] belongs to
//                 the centre pixel
//
// NOTE(review): this presumably relies on consecutive sub-group lanes sitting
// at consecutive y in the same column - confirm the work-group shape on the
// host.
kernel void blur_kernel_vertical_subgroup_exchange(
    read_only image2d_t input_image,
    write_only image2d_t output_image,
    int size,
    constant float * kern
)
{
    const int2 coord = (int2)((int)get_global_id(0), (int)get_global_id(1));
    const int height = get_image_height(input_image);

    if (coord.y < height) {
        uint4 px = (uint4)(0);
        float4 acc = (float4)(0.0f);
        float total = 0.0f;

        // lowest window position is read directly by every work-item
        int2 tap = (int2)(coord.x, coord.y + size);
        if (tap.y < height) {
            px = read_imageui(input_image, tap);
            total = kern[2 * size];
            acc = convert_float4(px) * total;
        }

        const uint lane = get_sub_group_local_id();
        // 1 for every lane that has a predecessor, 0 for lane 0
        const uint delta = (lane != 0) ? 1u : 0u;

        // the remaining window offsets size - 1 .. -size, bottom to top
        int offset = size - 1;
        while (offset >= -size) {
            tap.y -= 1;
            // take over the pixel held by the previous lane (lane 0 keeps its own)
#if defined(USE_SUBGROUP_EXCHANGE_RELATIVE)
            px = (uint4)(sub_group_shuffle_up(px.s0, delta),
                         sub_group_shuffle_up(px.s1, delta),
                         sub_group_shuffle_up(px.s2, delta),
                         sub_group_shuffle_up(px.s3, delta));
#elif defined(USE_SUBGROUP_EXCHANGE)
            px = (uint4)(sub_group_shuffle(px.s0, lane - delta),
                         sub_group_shuffle(px.s1, lane - delta),
                         sub_group_shuffle(px.s2, lane - delta),
                         sub_group_shuffle(px.s3, lane - delta));
#endif
            if ((tap.y >= 0) && (tap.y < height)) {
                if (lane == 0) {
                    // lane 0 has no predecessor and fetches the new pixel itself
                    px = read_imageui(input_image, tap);
                }
                const float w = kern[size + offset];
                acc += convert_float4(px) * w;
                total += w;
            }
            --offset;
        }

        // renormalise by the weights actually used, then round to nearest
        write_imageui(output_image, coord, convert_uint4(round(acc / total)));
    }
}
|
|
|
|
#endif // USE_SUBGROUP_EXCHANGE_RELATIVE || USE_SUBGROUP_EXCHANGE
|