vulkan: Add copy_transpose shader (#17371)
Parent: 99c53d6558 · Commit: 2eba631b81
ggml/src/ggml-vulkan/ggml-vulkan.cpp

@@ -638,6 +638,7 @@ struct vk_device_struct {
     vk_pipeline pipeline_contig_cpy_f32_f32, pipeline_contig_cpy_f32_f16, pipeline_contig_cpy_f16_f16, pipeline_contig_cpy_f16_f32, pipeline_contig_cpy_f32_bf16, pipeline_contig_cpy_f32_i32, pipeline_contig_cpy_i32_f32;
     vk_pipeline pipeline_cpy_f32_quant[GGML_TYPE_COUNT];
     vk_pipeline pipeline_cpy_quant_f32[GGML_TYPE_COUNT];
+    vk_pipeline pipeline_cpy_transpose_16, pipeline_cpy_transpose_32;
     vk_pipeline pipeline_set_rows_i32[GGML_TYPE_COUNT];
     vk_pipeline pipeline_set_rows_i64[GGML_TYPE_COUNT];
     vk_pipeline pipeline_norm_f32;
@@ -3697,6 +3698,9 @@ static void ggml_vk_load_shaders(vk_device& device) {
     ggml_vk_create_pipeline(device, device->pipeline_contig_cpy_i32_f32, "contig_cpy_i32_f32", contig_cpy_i32_f32_len, contig_cpy_i32_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
     ggml_vk_create_pipeline(device, device->pipeline_contig_cpy_f32_i32, "contig_cpy_f32_i32", contig_cpy_f32_i32_len, contig_cpy_f32_i32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
 
+    ggml_vk_create_pipeline(device, device->pipeline_cpy_transpose_32, "cpy_transpose_32", cpy_transpose_32_len, cpy_transpose_32_data, "main", 2, sizeof(vk_op_unary_push_constants), {1, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_cpy_transpose_16, "cpy_transpose_16", cpy_transpose_16_len, cpy_transpose_16_data, "main", 2, sizeof(vk_op_unary_push_constants), {1, 1, 1}, {}, 1);
+
     if (device->float_controls_rte_fp16) {
         ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q4_0], "cpy_f32_q4_0", cpy_f32_q4_0_rte_len, cpy_f32_q4_0_rte_data, "main", 2, sizeof(vk_op_unary_push_constants), {32, 1, 1}, {}, 1);
         ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q4_1], "cpy_f32_q4_1", cpy_f32_q4_1_rte_len, cpy_f32_q4_1_rte_data, "main", 2, sizeof(vk_op_unary_push_constants), {32, 1, 1}, {}, 1);
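Note: the {1, 1, 1} passed for the two transpose pipelines sits where the other copy pipelines pass their workgroup-denominator triple ({512, 1, 1}, {32, 1, 1}). Since the dispatch for these pipelines is computed explicitly from 32x32 tiles in ggml_vk_op_f32 (see the hunk at 8858 below), the neutral value appears to be a placeholder; that is a reading of the surrounding code, not something the commit states.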
@@ -6247,6 +6251,17 @@ static vk_pipeline ggml_vk_get_cpy_pipeline(ggml_backend_vk_context * ctx, const
     // Choose "contiguous copy" shader if src/dst are contiguous
     bool contig = ggml_is_contiguous(src) && (!dst || ggml_is_contiguous(dst));
 
+    // Use optimized "transpose" shader if src dim1 is the innermost dimension.
+    bool transpose = dst && src->nb[1] == ggml_type_size(to) && ggml_are_same_shape(dst, src);
+
+    if (transpose && src->type == to) {
+        if (ggml_type_size(to) == 4) {
+            return ctx->device->pipeline_cpy_transpose_32;
+        } else if (ggml_type_size(to) == 2) {
+            return ctx->device->pipeline_cpy_transpose_16;
+        }
+    }
+
     if (src->type == GGML_TYPE_F32 && to == GGML_TYPE_F32) {
         if (contig) {
             return ctx->device->pipeline_contig_cpy_f32_f32;
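Why `src->nb[1] == ggml_type_size(to)` detects a transposed source: ggml stores byte strides in nb[], and ggml_transpose() returns a view with ne[0]/ne[1] and nb[0]/nb[1] swapped without moving data, so a transposed view is exactly one whose dim1 stride equals the element size. A minimal standalone sketch of that invariant (the extent and local arrays are illustrative stand-ins for ggml's ne/nb fields, not ggml API):

    #include <cstddef>
    #include <cstdio>

    int main() {
        // Contiguous f32 tensor: dim0 has unit (element-sized) stride.
        const size_t ne0 = 64;                       // illustrative dim0 extent
        size_t nb[2] = { sizeof(float), sizeof(float) * ne0 };

        // ggml_transpose() swaps ne[0]/ne[1] and nb[0]/nb[1] in the view:
        size_t t_nb[2] = { nb[1], nb[0] };

        // dim1 of the view now has element-sized stride -- the condition the
        // new `transpose` flag keys on (src->nb[1] == ggml_type_size(to)).
        printf("transposed view detected: %s\n",
               t_nb[1] == sizeof(float) ? "yes" : "no");
        return 0;
    }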
@@ -8858,6 +8873,17 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
             } else {
                 elements = { ne, 1, 1 };
             }
+
+            if (pipeline == ctx->device->pipeline_cpy_transpose_32 ||
+                pipeline == ctx->device->pipeline_cpy_transpose_16) {
+                // 32x32 tiles
+                elements[0] = (uint32_t)CEIL_DIV(dst->ne[0], 32);
+                elements[1] = (uint32_t)CEIL_DIV(dst->ne[1], 32);
+                elements[2] = (uint32_t)(dst->ne[2]*dst->ne[3]);
+                elements[0] = std::min(elements[0], ctx->device->properties.limits.maxComputeWorkGroupCount[0]);
+                elements[1] = std::min(elements[1], ctx->device->properties.limits.maxComputeWorkGroupCount[1]);
+                elements[2] = std::min(elements[2], ctx->device->properties.limits.maxComputeWorkGroupCount[2]);
+            }
         } break;
     case GGML_OP_ADD_ID:
         {
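Two details in this hunk interact with the new shader: the grid is sized in 32x32 tiles (TILE_DIM in copy_transpose.comp below), and each axis is then clamped to the device's maxComputeWorkGroupCount. The clamp is safe because the shader's main() walks the tile grid with a gl_NumWorkGroups stride, so tiles beyond the clamped dispatch still get processed. A host-side sketch of the same arithmetic, with a hypothetical tensor shape and a typical limit (both made up for illustration):

    #include <algorithm>
    #include <cstdint>
    #include <cstdio>

    #define CEIL_DIV(a, b) (((a) + (b) - 1) / (b))

    int main() {
        const int64_t ne[4] = {4096, 4096, 2, 1};         // hypothetical dst shape
        const uint32_t max_wg[3] = {65535, 65535, 65535}; // typical device limit
        uint32_t elements[3] = {
            std::min((uint32_t)CEIL_DIV(ne[0], 32), max_wg[0]), // 128 tile columns
            std::min((uint32_t)CEIL_DIV(ne[1], 32), max_wg[1]), // 128 tile rows
            std::min((uint32_t)(ne[2] * ne[3]),     max_wg[2]), // 2 batch slices
        };
        printf("dispatch %u x %u x %u workgroups\n",
               elements[0], elements[1], elements[2]);
        return 0;
    }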
ggml/src/ggml-vulkan/vulkan-shaders/copy_transpose.comp (new file)

@@ -0,0 +1,67 @@
+#version 450
+
+#include "types.glsl"
+#include "generic_unary_head.glsl"
+
+// workgroup does 32x32 tile, but uses 32x8 threads
+#define TILE_DIM 32
+layout(local_size_x = 32, local_size_y = 8, local_size_z = 1) in;
+
+shared uint sh[TILE_DIM][TILE_DIM + 1];
+
+void iter(uvec3 wg_id) {
+    const uint tile_col = wg_id.x;
+    const uint tile_row = wg_id.y;
+
+    const uint tid_col = gl_LocalInvocationID.x;
+    const uint tid_row = gl_LocalInvocationID.y;
+
+    const uint i2 = wg_id.z % p.ne12;
+    const uint i3 = wg_id.z / p.ne12;
+    const uint i02 = i2;
+    const uint i03 = i3;
+
+    // The workgroup does TILE_DIM x TILE_DIM, but swaps the LSBs of the
+    // src coords to make memory accesses contiguous, dst has tid.x in i0,
+    // src has tid.x in i01
+
+    [[unroll]] for (uint y = 0; y < 4; ++y) {
+        const uint i00 = tile_col * TILE_DIM + tid_row + 8 * y;
+        const uint i01 = tile_row * TILE_DIM + tid_col;
+        if (i00 < p.ne00 && i01 < p.ne01 && i02 < p.ne02 && i03 < p.ne03) {
+            const uint src_idx = i00 * p.nb00 + i01 * p.nb01 + i02 * p.nb02 + i03 * p.nb03;
+            sh[tid_row + 8 * y][tid_col] = uint(data_a[get_aoffset() + src_idx]);
+        }
+    }
+
+    barrier();
+
+    [[unroll]] for (uint y = 0; y < 4; ++y) {
+        const uint i0 = tile_col * TILE_DIM + tid_col;
+        const uint i1 = tile_row * TILE_DIM + tid_row + 8 * y;
+        if (i0 < p.ne10 && i1 < p.ne11 && i2 < p.ne12 && i3 < p.ne13) {
+            const uint dst_idx = i0 * p.nb10 + i1 * p.nb11 + i2 * p.nb12 + i3 * p.nb13;
+            // load transposed
+            data_d[get_doffset() + dst_idx] = D_TYPE(sh[tid_col][tid_row + 8 * y]);
+        }
+    }
+}
+
+#define CEIL_DIV(a, b) (((a) + (b) - 1) / (b))
+
+void main() {
+    uint z = gl_WorkGroupID.z;
+    uint y = gl_WorkGroupID.y;
+    bool need_barrier = false;
+    for (uint z = gl_WorkGroupID.z; z < p.ne12 * p.ne13; z += gl_NumWorkGroups.z) {
+        for (uint y = gl_WorkGroupID.y; y < CEIL_DIV(p.ne11, TILE_DIM); y += gl_NumWorkGroups.y) {
+            for (uint x = gl_WorkGroupID.x; x < CEIL_DIV(p.ne10, TILE_DIM); x += gl_NumWorkGroups.x) {
+                if (need_barrier) {
+                    barrier();
+                }
+                need_barrier = true;
+                iter(uvec3(x, y, z));
+            }
+        }
+    }
+}
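Two implementation notes on the shader: the shared tile is declared TILE_DIM x (TILE_DIM + 1), the classic padding that keeps a column read from hitting a single shared-memory bank, and the 32x8 workgroup covers a 32x32 tile by giving each thread four rows (the `8 * y` terms). Per tile, the data movement reduces to the following, sketched on the CPU for the 2-D case (transpose_tile and its layout assumptions are illustrative, not part of the commit: src is taken to store dim1 contiguously, dst dim0):

    constexpr int TILE_DIM = 32;

    // One tile of the copy: src element (i0, i1) sits at src[i0 * ne1 + i1]
    // (dim1 contiguous, i.e. a transposed view), dst element (i0, i1) at
    // dst[i1 * ne0 + i0] (dim0 contiguous). Staging through sh[][] lets both
    // the read and the write walk memory contiguously.
    void transpose_tile(const float* src, float* dst, int ne0, int ne1,
                        int tile_col, int tile_row) {
        float sh[TILE_DIM][TILE_DIM + 1]; // +1 column mirrors the shader's padding
        for (int r = 0; r < TILE_DIM; ++r) {
            for (int c = 0; c < TILE_DIM; ++c) {
                const int i0 = tile_col * TILE_DIM + r; // outer stride in src
                const int i1 = tile_row * TILE_DIM + c; // unit stride in src
                if (i0 < ne0 && i1 < ne1) {
                    sh[r][c] = src[i0 * ne1 + i1];
                }
            }
        }
        for (int r = 0; r < TILE_DIM; ++r) {
            for (int c = 0; c < TILE_DIM; ++c) {
                const int i0 = tile_col * TILE_DIM + c; // unit stride in dst
                const int i1 = tile_row * TILE_DIM + r;
                if (i0 < ne0 && i1 < ne1) {
                    dst[i1 * ne0 + i0] = sh[c][r];      // read the tile transposed
                }
            }
        }
    }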
ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp

@@ -734,6 +734,9 @@ void process_shaders() {
     string_to_spv("cpy_f32_i32", "copy.comp", {{"A_TYPE", "float"}, {"D_TYPE", "int"}});
     string_to_spv("cpy_i32_f32", "copy.comp", {{"A_TYPE", "int"}, {"D_TYPE", "float"}});
 
+    string_to_spv("cpy_transpose_16", "copy_transpose.comp", {{"A_TYPE", "uint16_t"}, {"D_TYPE", "uint16_t"}});
+    string_to_spv("cpy_transpose_32", "copy_transpose.comp", {{"A_TYPE", "uint"}, {"D_TYPE", "uint"}});
+
     for (std::string t : {"q4_0", "q4_1", "q5_0", "q5_1", "q8_0", "iq4_nl"}) {
         string_to_spv("cpy_f32_" + t, "copy_to_quant.comp", {{"DATA_A_" + to_uppercase(t), "1"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
         string_to_spv("cpy_f32_" + t + "_rte", "copy_to_quant.comp", {{"DATA_A_" + to_uppercase(t), "1"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}, {"RTE16", "1"}});
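Note that the two variants generated here are keyed by element width, not data type: uint16_t/uint simply move raw 16- and 32-bit words. That is why the selection code above only requires src->type == to and then dispatches on ggml_type_size(to); one pair of pipelines covers f16 and bf16 as well as f32 and i32.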