From 8c70b8fece445cdc9a8c660dbddbf201e52da2bb Mon Sep 17 00:00:00 2001 From: James Contini Date: Wed, 15 Oct 2025 16:14:20 -0700 Subject: [PATCH] responded and dealt with PR comments --- ggml/src/ggml-webgpu/ggml-webgpu.cpp | 258 ++++----- .../ggml-webgpu/wgsl-shaders/unary_op.wgsl | 488 ++++++------------ 2 files changed, 296 insertions(+), 450 deletions(-) diff --git a/ggml/src/ggml-webgpu/ggml-webgpu.cpp b/ggml/src/ggml-webgpu/ggml-webgpu.cpp index 6a4d520203..2f4fdc1c3c 100644 --- a/ggml/src/ggml-webgpu/ggml-webgpu.cpp +++ b/ggml/src/ggml-webgpu/ggml-webgpu.cpp @@ -62,7 +62,6 @@ #define WEBGPU_MUL_MAT_WG_SIZE 256 #define WEBGPU_NUM_PARAM_BUFS 32u #define WEBGPU_COMMAND_SUBMIT_BATCH_SIZE 8u -#define WEBGPU_WAIT_ANY_BATCH_SIZE 64 #define WEBGPU_WAIT_ANY_TIMEOUT_MS 0 // Maximum number of in-flight submissions per-thread, to avoid exhausting the parameter buffer pool #define WEBGPU_MAX_INFLIGHT_SUBS_PER_THREAD WEBGPU_NUM_PARAM_BUFS / WEBGPU_COMMAND_SUBMIT_BATCH_SIZE @@ -252,19 +251,18 @@ struct webgpu_context_struct { webgpu_pipeline set_rows_pipeline; webgpu_pipeline get_rows_pipeline[30]; webgpu_pipeline get_rows_f32_no_vec_pipeline; - webgpu_pipeline cpy_pipeline[2][2]; // src type, dst type - webgpu_pipeline add_pipeline[2][2]; // type, inplace - webgpu_pipeline sub_pipeline[2][2]; // type, inplace - webgpu_pipeline mul_pipeline[2][2]; // type, inplace - webgpu_pipeline div_pipeline[2][2]; // type, inplace - webgpu_pipeline rms_norm_pipeline[2]; // inplace - webgpu_pipeline rope_pipeline[2][2][2]; // type, ff, inplace - webgpu_pipeline glu_pipeline[7][2][2]; // glu-op, type, split - webgpu_pipeline scale_pipeline[2]; // inplace + webgpu_pipeline cpy_pipeline[2][2]; // src type, dst type + webgpu_pipeline add_pipeline[2][2]; // type, inplace + webgpu_pipeline sub_pipeline[2][2]; // type, inplace + webgpu_pipeline mul_pipeline[2][2]; // type, inplace + webgpu_pipeline div_pipeline[2][2]; // type, inplace + webgpu_pipeline rms_norm_pipeline[2]; // inplace + webgpu_pipeline rope_pipeline[2][2][2]; // type, ff, inplace + webgpu_pipeline glu_pipeline[7][2][2]; // glu-op, type, split + webgpu_pipeline scale_pipeline[2]; // inplace webgpu_pipeline soft_max_pipeline[3][2][2]; // (no_mask, f32_mask, f16_mask), has_sink, inplace webgpu_pipeline unary_pipeline[GGML_UNARY_OP_COUNT][2][2]; - size_t memset_bytes_per_thread; // Staging buffer for reading data from the GPU @@ -345,7 +343,6 @@ static void ggml_webgpu_create_pipeline(wgpu::Device & pipeline_desc.compute.constantCount = constants.size(); } - pipeline = { device.CreateComputePipeline(&pipeline_desc), label }; } @@ -721,10 +718,8 @@ static webgpu_command ggml_webgpu_cpy(webgpu_context & ctx, ggml_tensor * src, g size_t max_wg_size = ctx->max_wg_size_x; uint32_t wg_x = (ne + max_wg_size - 1) / max_wg_size; return ggml_backend_webgpu_build(ctx, ctx->cpy_pipeline[src->type][dst->type], params, entries, wg_x); - } - static std::optional ggml_webgpu_set_rows(webgpu_context & ctx, ggml_tensor * src, ggml_tensor * idx, @@ -865,14 +860,12 @@ static webgpu_command ggml_webgpu_mul_mat(webgpu_context & ctx, return ggml_backend_webgpu_build(ctx, ctx->mul_mat_pipeline[src0->type][src1->type], params, entries, wg_x); } -static webgpu_command ggml_webgpu_unary_op( webgpu_context & ctx, - ggml_tensor * src, - ggml_tensor * dst, - webgpu_pipeline & pipeline, - bool in_place, - bool additional_params=false) { - - +static webgpu_command ggml_webgpu_unary_op(webgpu_context & ctx, + ggml_tensor * src, + ggml_tensor * dst, + webgpu_pipeline & pipeline, + bool in_place, + const std::vector & xielu_params = {}) { uint32_t ne = (uint32_t) ggml_nelements(dst); std::vector params = { @@ -888,18 +881,13 @@ static webgpu_command ggml_webgpu_unary_op( webgpu_context & ctx, (uint32_t) dst->ne[1], (uint32_t) dst->ne[2] }; - if (additional_params) { - for (uint i = 1; i < 5; i++) { - params.push_back((uint32_t)(ggml_get_op_params_f32(dst, i))); // alpha_n, alpha_p, beta, eps - } - } + params.insert(params.end(), xielu_params.begin(), xielu_params.end()); std::vector entries = { { .binding = 0, .buffer = ggml_webgpu_tensor_buf(src), .offset = ggml_webgpu_tensor_align_offset(ctx, src), .size = ggml_webgpu_tensor_binding_size(ctx, src) }, - }; if (!in_place) { entries.push_back({ .binding = 1, @@ -1258,8 +1246,6 @@ static std::optional ggml_webgpu_encode_node(webgpu_context ctx, ggml_tensor * src1 = node->src[1]; ggml_tensor * src2 = node->src[2]; - - switch (node->op) { // no-ops case GGML_OP_NONE: @@ -1309,11 +1295,24 @@ static std::optional ggml_webgpu_encode_node(webgpu_context ctx, return ggml_webgpu_soft_max(ctx, src0, src1, src2, node); case GGML_OP_UNARY: { - const ggml_unary_op UNARY_OP = ggml_get_unary_op(node); + const ggml_unary_op UNARY_OP = ggml_get_unary_op(node); + int in_place = ggml_webgpu_tensor_equal(src0, node); + std::vector xielu_params; - int in_place = ggml_webgpu_tensor_equal(src0, node); - bool XIELU = (UNARY_OP == GGML_UNARY_OP_XIELU); - return ggml_webgpu_unary_op(ctx, src0, node, ctx->unary_pipeline[UNARY_OP][node->type][in_place], in_place, XIELU); + switch (UNARY_OP) { + case GGML_UNARY_OP_XIELU: + xielu_params = { + static_cast(ggml_get_op_params_f32(node, 1)), // alpha_n + static_cast(ggml_get_op_params_f32(node, 2)), // alpha_p + static_cast(ggml_get_op_params_f32(node, 3)), // beta + static_cast(ggml_get_op_params_f32(node, 4)) // eps + }; + break; + default: + break; + } + return ggml_webgpu_unary_op(ctx, src0, node, ctx->unary_pipeline[UNARY_OP][node->type][in_place], + in_place, xielu_params); } default: @@ -1322,7 +1321,6 @@ static std::optional ggml_webgpu_encode_node(webgpu_context ctx, } static ggml_status ggml_backend_webgpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { - WEBGPU_LOG_DEBUG("ggml_backend_webgpu_graph_compute(" << cgraph->n_nodes << " nodes)"); ggml_backend_webgpu_context * backend_ctx = static_cast(backend->context); @@ -1541,8 +1539,6 @@ static const char * ggml_backend_webgpu_buffer_type_get_name(ggml_backend_buffer static ggml_backend_buffer_t ggml_backend_webgpu_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { - - WEBGPU_LOG_DEBUG("ggml_backend_webgpu_buffer_type_alloc_buffer(" << size << ")"); ggml_backend_webgpu_device_context * ctx = static_cast(buft->device->context); @@ -1554,8 +1550,6 @@ static ggml_backend_buffer_t ggml_backend_webgpu_buffer_type_alloc_buffer(ggml_b ggml_backend_webgpu_buffer_context * buf_ctx = new ggml_backend_webgpu_buffer_context(ctx->webgpu_ctx, buf); - - return ggml_backend_buffer_init(buft, ggml_backend_webgpu_buffer_interface, buf_ctx, size); } @@ -1886,163 +1880,179 @@ static void ggml_webgpu_init_unary_pipeline(webgpu_context & webgpu_ctx) { // ABS ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->unary_pipeline[GGML_UNARY_OP_ABS][GGML_TYPE_F32][0], - wgsl_abs_f32, "abs_f32", constants); + wgsl_abs_f32, "abs_f32", constants); ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->unary_pipeline[GGML_UNARY_OP_ABS][GGML_TYPE_F16][0], - wgsl_abs_f16, "abs_f16", constants); + wgsl_abs_f16, "abs_f16", constants); ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->unary_pipeline[GGML_UNARY_OP_ABS][GGML_TYPE_F32][1], - wgsl_abs_in_place_f32, "abs_in_place_f32", constants); + wgsl_abs_in_place_f32, "abs_in_place_f32", constants); ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->unary_pipeline[GGML_UNARY_OP_ABS][GGML_TYPE_F16][1], - wgsl_abs_in_place_f16, "abs_in_place_f16", constants); + wgsl_abs_in_place_f16, "abs_in_place_f16", constants); // SGN ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->unary_pipeline[GGML_UNARY_OP_SGN][GGML_TYPE_F32][0], - wgsl_sgn_f32, "sgn_f32", constants); + wgsl_sgn_f32, "sgn_f32", constants); ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->unary_pipeline[GGML_UNARY_OP_SGN][GGML_TYPE_F16][0], - wgsl_sgn_f16, "sgn_f16", constants); + wgsl_sgn_f16, "sgn_f16", constants); ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->unary_pipeline[GGML_UNARY_OP_SGN][GGML_TYPE_F32][1], - wgsl_sgn_in_place_f32, "sgn_in_place_f32", constants); + wgsl_sgn_in_place_f32, "sgn_in_place_f32", constants); ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->unary_pipeline[GGML_UNARY_OP_SGN][GGML_TYPE_F16][1], - wgsl_sgn_in_place_f16, "sgn_in_place_f16", constants); + wgsl_sgn_in_place_f16, "sgn_in_place_f16", constants); // NEG ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->unary_pipeline[GGML_UNARY_OP_NEG][GGML_TYPE_F32][0], - wgsl_neg_f32, "neg_f32", constants); + wgsl_neg_f32, "neg_f32", constants); ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->unary_pipeline[GGML_UNARY_OP_NEG][GGML_TYPE_F16][0], - wgsl_neg_f16, "neg_f16", constants); + wgsl_neg_f16, "neg_f16", constants); ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->unary_pipeline[GGML_UNARY_OP_NEG][GGML_TYPE_F32][1], - wgsl_neg_in_place_f32, "neg_in_place_f32", constants); + wgsl_neg_in_place_f32, "neg_in_place_f32", constants); ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->unary_pipeline[GGML_UNARY_OP_NEG][GGML_TYPE_F16][1], - wgsl_neg_in_place_f16, "neg_in_place_f16", constants); + wgsl_neg_in_place_f16, "neg_in_place_f16", constants); // STEP ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->unary_pipeline[GGML_UNARY_OP_STEP][GGML_TYPE_F32][0], - wgsl_step_f32, "step_f32", constants); + wgsl_step_f32, "step_f32", constants); ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->unary_pipeline[GGML_UNARY_OP_STEP][GGML_TYPE_F16][0], - wgsl_step_f16, "step_f16", constants); + wgsl_step_f16, "step_f16", constants); ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->unary_pipeline[GGML_UNARY_OP_STEP][GGML_TYPE_F32][1], - wgsl_step_in_place_f32, "step_in_place_f32", constants); + wgsl_step_in_place_f32, "step_in_place_f32", constants); ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->unary_pipeline[GGML_UNARY_OP_STEP][GGML_TYPE_F16][1], - wgsl_step_in_place_f16, "step_in_place_f16", constants); + wgsl_step_in_place_f16, "step_in_place_f16", constants); // TANH ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->unary_pipeline[GGML_UNARY_OP_TANH][GGML_TYPE_F32][0], - wgsl_tanh_f32, "tanh_f32", constants); + wgsl_tanh_f32, "tanh_f32", constants); ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->unary_pipeline[GGML_UNARY_OP_TANH][GGML_TYPE_F16][0], - wgsl_tanh_f16, "tanh_f16", constants); + wgsl_tanh_f16, "tanh_f16", constants); ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->unary_pipeline[GGML_UNARY_OP_TANH][GGML_TYPE_F32][1], - wgsl_tanh_in_place_f32, "tanh_in_place_f32", constants); + wgsl_tanh_in_place_f32, "tanh_in_place_f32", constants); ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->unary_pipeline[GGML_UNARY_OP_TANH][GGML_TYPE_F16][1], - wgsl_tanh_in_place_f16, "tanh_in_place_f16", constants); + wgsl_tanh_in_place_f16, "tanh_in_place_f16", constants); // ELU ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->unary_pipeline[GGML_UNARY_OP_ELU][GGML_TYPE_F32][0], - wgsl_elu_f32, "elu_f32", constants); + wgsl_elu_f32, "elu_f32", constants); ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->unary_pipeline[GGML_UNARY_OP_ELU][GGML_TYPE_F16][0], - wgsl_elu_f16, "elu_f16", constants); + wgsl_elu_f16, "elu_f16", constants); ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->unary_pipeline[GGML_UNARY_OP_ELU][GGML_TYPE_F32][1], - wgsl_elu_in_place_f32, "elu_in_place_f32", constants); + wgsl_elu_in_place_f32, "elu_in_place_f32", constants); ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->unary_pipeline[GGML_UNARY_OP_ELU][GGML_TYPE_F16][1], - wgsl_elu_in_place_f16, "elu_in_place_f16", constants); + wgsl_elu_in_place_f16, "elu_in_place_f16", constants); // RELU ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->unary_pipeline[GGML_UNARY_OP_RELU][GGML_TYPE_F32][0], - wgsl_relu_f32, "relu_f32", constants); + wgsl_relu_f32, "relu_f32", constants); ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->unary_pipeline[GGML_UNARY_OP_RELU][GGML_TYPE_F16][0], - wgsl_relu_f16, "relu_f16", constants); + wgsl_relu_f16, "relu_f16", constants); ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->unary_pipeline[GGML_UNARY_OP_RELU][GGML_TYPE_F32][1], - wgsl_relu_in_place_f32, "relu_in_place_f32", constants); + wgsl_relu_in_place_f32, "relu_in_place_f32", constants); ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->unary_pipeline[GGML_UNARY_OP_RELU][GGML_TYPE_F16][1], - wgsl_relu_in_place_f16, "relu_in_place_f16", constants); + wgsl_relu_in_place_f16, "relu_in_place_f16", constants); // SIGMOID ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->unary_pipeline[GGML_UNARY_OP_SIGMOID][GGML_TYPE_F32][0], - wgsl_sigmoid_f32, "sigmoid_f32", constants); + wgsl_sigmoid_f32, "sigmoid_f32", constants); ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->unary_pipeline[GGML_UNARY_OP_SIGMOID][GGML_TYPE_F16][0], - wgsl_sigmoid_f16, "sigmoid_f16", constants); + wgsl_sigmoid_f16, "sigmoid_f16", constants); ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->unary_pipeline[GGML_UNARY_OP_SIGMOID][GGML_TYPE_F32][1], - wgsl_sigmoid_in_place_f32, "sigmoid_in_place_f32", constants); + wgsl_sigmoid_in_place_f32, "sigmoid_in_place_f32", constants); ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->unary_pipeline[GGML_UNARY_OP_SIGMOID][GGML_TYPE_F16][1], - wgsl_sigmoid_in_place_f16, "sigmoid_in_place_f16", constants); + wgsl_sigmoid_in_place_f16, "sigmoid_in_place_f16", constants); // GELU ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->unary_pipeline[GGML_UNARY_OP_GELU][GGML_TYPE_F32][0], - wgsl_gelu_f32, "gelu_f32", constants); + wgsl_gelu_f32, "gelu_f32", constants); ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->unary_pipeline[GGML_UNARY_OP_GELU][GGML_TYPE_F16][0], - wgsl_gelu_f16, "gelu_f16", constants); + wgsl_gelu_f16, "gelu_f16", constants); ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->unary_pipeline[GGML_UNARY_OP_GELU][GGML_TYPE_F32][1], - wgsl_gelu_in_place_f32, "gelu_in_place_f32", constants); + wgsl_gelu_in_place_f32, "gelu_in_place_f32", constants); ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->unary_pipeline[GGML_UNARY_OP_GELU][GGML_TYPE_F16][1], - wgsl_gelu_in_place_f16, "gelu_in_place_f16", constants); + wgsl_gelu_in_place_f16, "gelu_in_place_f16", constants); // GELU_QUICK - ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->unary_pipeline[GGML_UNARY_OP_GELU_QUICK][GGML_TYPE_F32][0], - wgsl_gelu_quick_f32, "gelu_quick_f32", constants); - ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->unary_pipeline[GGML_UNARY_OP_GELU_QUICK][GGML_TYPE_F16][0], - wgsl_gelu_quick_f16, "gelu_quick_f16", constants); - ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->unary_pipeline[GGML_UNARY_OP_GELU_QUICK][GGML_TYPE_F32][1], - wgsl_gelu_quick_in_place_f32, "gelu_quick_in_place_f32", constants); - ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->unary_pipeline[GGML_UNARY_OP_GELU_QUICK][GGML_TYPE_F16][1], - wgsl_gelu_quick_in_place_f16, "gelu_quick_in_place_f16", constants); + ggml_webgpu_create_pipeline(webgpu_ctx->device, + webgpu_ctx->unary_pipeline[GGML_UNARY_OP_GELU_QUICK][GGML_TYPE_F32][0], + wgsl_gelu_quick_f32, "gelu_quick_f32", constants); + ggml_webgpu_create_pipeline(webgpu_ctx->device, + webgpu_ctx->unary_pipeline[GGML_UNARY_OP_GELU_QUICK][GGML_TYPE_F16][0], + wgsl_gelu_quick_f16, "gelu_quick_f16", constants); + ggml_webgpu_create_pipeline(webgpu_ctx->device, + webgpu_ctx->unary_pipeline[GGML_UNARY_OP_GELU_QUICK][GGML_TYPE_F32][1], + wgsl_gelu_quick_in_place_f32, "gelu_quick_in_place_f32", constants); + ggml_webgpu_create_pipeline(webgpu_ctx->device, + webgpu_ctx->unary_pipeline[GGML_UNARY_OP_GELU_QUICK][GGML_TYPE_F16][1], + wgsl_gelu_quick_in_place_f16, "gelu_quick_in_place_f16", constants); // SILU ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->unary_pipeline[GGML_UNARY_OP_SILU][GGML_TYPE_F32][0], - wgsl_silu_f32, "silu_f32", constants); + wgsl_silu_f32, "silu_f32", constants); ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->unary_pipeline[GGML_UNARY_OP_SILU][GGML_TYPE_F16][0], - wgsl_silu_f16, "silu_f16", constants); + wgsl_silu_f16, "silu_f16", constants); ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->unary_pipeline[GGML_UNARY_OP_SILU][GGML_TYPE_F32][1], - wgsl_silu_in_place_f32, "silu_in_place_f32", constants); + wgsl_silu_in_place_f32, "silu_in_place_f32", constants); ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->unary_pipeline[GGML_UNARY_OP_SILU][GGML_TYPE_F16][1], - wgsl_silu_in_place_f16, "silu_in_place_f16", constants); + wgsl_silu_in_place_f16, "silu_in_place_f16", constants); // HARDSWISH - ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->unary_pipeline[GGML_UNARY_OP_HARDSWISH][GGML_TYPE_F32][0], - wgsl_hardswish_f32, "hardswish_f32", constants); - ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->unary_pipeline[GGML_UNARY_OP_HARDSWISH][GGML_TYPE_F16][0], - wgsl_hardswish_f16, "hardswish_f16", constants); - ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->unary_pipeline[GGML_UNARY_OP_HARDSWISH][GGML_TYPE_F32][1], - wgsl_hardswish_in_place_f32, "hardswish_in_place_f32", constants); - ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->unary_pipeline[GGML_UNARY_OP_HARDSWISH][GGML_TYPE_F16][1], - wgsl_hardswish_in_place_f16, "hardswish_in_place_f16", constants); + ggml_webgpu_create_pipeline(webgpu_ctx->device, + webgpu_ctx->unary_pipeline[GGML_UNARY_OP_HARDSWISH][GGML_TYPE_F32][0], + wgsl_hardswish_f32, "hardswish_f32", constants); + ggml_webgpu_create_pipeline(webgpu_ctx->device, + webgpu_ctx->unary_pipeline[GGML_UNARY_OP_HARDSWISH][GGML_TYPE_F16][0], + wgsl_hardswish_f16, "hardswish_f16", constants); + ggml_webgpu_create_pipeline(webgpu_ctx->device, + webgpu_ctx->unary_pipeline[GGML_UNARY_OP_HARDSWISH][GGML_TYPE_F32][1], + wgsl_hardswish_in_place_f32, "hardswish_in_place_f32", constants); + ggml_webgpu_create_pipeline(webgpu_ctx->device, + webgpu_ctx->unary_pipeline[GGML_UNARY_OP_HARDSWISH][GGML_TYPE_F16][1], + wgsl_hardswish_in_place_f16, "hardswish_in_place_f16", constants); // HARDSIGMOID - ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->unary_pipeline[GGML_UNARY_OP_HARDSIGMOID][GGML_TYPE_F32][0], - wgsl_hardsigmoid_f32, "hardsigmoid_f32", constants); - ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->unary_pipeline[GGML_UNARY_OP_HARDSIGMOID][GGML_TYPE_F16][0], - wgsl_hardsigmoid_f16, "hardsigmoid_f16", constants); - ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->unary_pipeline[GGML_UNARY_OP_HARDSIGMOID][GGML_TYPE_F32][1], - wgsl_hardsigmoid_in_place_f32, "hardsigmoid_in_place_f32", constants); - ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->unary_pipeline[GGML_UNARY_OP_HARDSIGMOID][GGML_TYPE_F16][1], - wgsl_hardsigmoid_in_place_f16, "hardsigmoid_in_place_f16", constants); + ggml_webgpu_create_pipeline(webgpu_ctx->device, + webgpu_ctx->unary_pipeline[GGML_UNARY_OP_HARDSIGMOID][GGML_TYPE_F32][0], + wgsl_hardsigmoid_f32, "hardsigmoid_f32", constants); + ggml_webgpu_create_pipeline(webgpu_ctx->device, + webgpu_ctx->unary_pipeline[GGML_UNARY_OP_HARDSIGMOID][GGML_TYPE_F16][0], + wgsl_hardsigmoid_f16, "hardsigmoid_f16", constants); + ggml_webgpu_create_pipeline(webgpu_ctx->device, + webgpu_ctx->unary_pipeline[GGML_UNARY_OP_HARDSIGMOID][GGML_TYPE_F32][1], + wgsl_hardsigmoid_in_place_f32, "hardsigmoid_in_place_f32", constants); + ggml_webgpu_create_pipeline(webgpu_ctx->device, + webgpu_ctx->unary_pipeline[GGML_UNARY_OP_HARDSIGMOID][GGML_TYPE_F16][1], + wgsl_hardsigmoid_in_place_f16, "hardsigmoid_in_place_f16", constants); // EXP ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->unary_pipeline[GGML_UNARY_OP_EXP][GGML_TYPE_F32][0], - wgsl_exp_f32, "exp_f32", constants); + wgsl_exp_f32, "exp_f32", constants); ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->unary_pipeline[GGML_UNARY_OP_EXP][GGML_TYPE_F16][0], - wgsl_exp_f16, "exp_f16", constants); + wgsl_exp_f16, "exp_f16", constants); ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->unary_pipeline[GGML_UNARY_OP_EXP][GGML_TYPE_F32][1], - wgsl_exp_in_place_f32, "exp_in_place_f32", constants); + wgsl_exp_in_place_f32, "exp_in_place_f32", constants); ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->unary_pipeline[GGML_UNARY_OP_EXP][GGML_TYPE_F16][1], - wgsl_exp_in_place_f16, "exp_in_place_f16", constants); + wgsl_exp_in_place_f16, "exp_in_place_f16", constants); // GELU_ERF - ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->unary_pipeline[GGML_UNARY_OP_GELU_ERF][GGML_TYPE_F32][0], - wgsl_gelu_erf_f32, "gelu_erf_f32", constants); - ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->unary_pipeline[GGML_UNARY_OP_GELU_ERF][GGML_TYPE_F16][0], - wgsl_gelu_erf_f16, "gelu_erf_f16", constants); - ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->unary_pipeline[GGML_UNARY_OP_GELU_ERF][GGML_TYPE_F32][1], - wgsl_gelu_erf_in_place_f32, "gelu_erf_in_place_f32", constants); - ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->unary_pipeline[GGML_UNARY_OP_GELU_ERF][GGML_TYPE_F16][1], - wgsl_gelu_erf_in_place_f16, "gelu_erf_in_place_f16", constants); + ggml_webgpu_create_pipeline(webgpu_ctx->device, + webgpu_ctx->unary_pipeline[GGML_UNARY_OP_GELU_ERF][GGML_TYPE_F32][0], wgsl_gelu_erf_f32, + "gelu_erf_f32", constants); + ggml_webgpu_create_pipeline(webgpu_ctx->device, + webgpu_ctx->unary_pipeline[GGML_UNARY_OP_GELU_ERF][GGML_TYPE_F16][0], wgsl_gelu_erf_f16, + "gelu_erf_f16", constants); + ggml_webgpu_create_pipeline(webgpu_ctx->device, + webgpu_ctx->unary_pipeline[GGML_UNARY_OP_GELU_ERF][GGML_TYPE_F32][1], + wgsl_gelu_erf_in_place_f32, "gelu_erf_in_place_f32", constants); + ggml_webgpu_create_pipeline(webgpu_ctx->device, + webgpu_ctx->unary_pipeline[GGML_UNARY_OP_GELU_ERF][GGML_TYPE_F16][1], + wgsl_gelu_erf_in_place_f16, "gelu_erf_in_place_f16", constants); // XIELU ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->unary_pipeline[GGML_UNARY_OP_XIELU][GGML_TYPE_F32][0], - wgsl_xielu_f32, "xielu_f32", constants); + wgsl_xielu_f32, "xielu_f32", constants); ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->unary_pipeline[GGML_UNARY_OP_XIELU][GGML_TYPE_F16][0], - wgsl_xielu_f16, "xielu_f16", constants); + wgsl_xielu_f16, "xielu_f16", constants); ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->unary_pipeline[GGML_UNARY_OP_XIELU][GGML_TYPE_F32][1], - wgsl_xielu_in_place_f32, "xielu_in_place_f32", constants); + wgsl_xielu_in_place_f32, "xielu_in_place_f32", constants); ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->unary_pipeline[GGML_UNARY_OP_XIELU][GGML_TYPE_F16][1], - wgsl_xielu_in_place_f16, "xielu_in_place_f16", constants); + wgsl_xielu_in_place_f16, "xielu_in_place_f16", constants); } static void ggml_webgpu_init_scale_pipeline(webgpu_context & webgpu_ctx) { @@ -2084,7 +2094,6 @@ static void ggml_webgpu_init_soft_max_pipeline(webgpu_context & webgpu_ctx) { } static ggml_backend_t ggml_backend_webgpu_device_init(ggml_backend_dev_t dev, const char * params) { - GGML_UNUSED(params); WEBGPU_LOG_DEBUG("ggml_backend_webgpu_device_init()"); @@ -2103,7 +2112,6 @@ static ggml_backend_t ggml_backend_webgpu_device_init(ggml_backend_dev_t dev, co /* .device = */ dev, /* .context = */ &backend_ctx, }; - //tried return &backend; } @@ -2160,7 +2168,6 @@ static bool ggml_webgpu_supported_qtype(ggml_type type) { } static bool ggml_backend_webgpu_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) { - ggml_backend_webgpu_device_context * ctx = static_cast(dev->context); webgpu_context webgpu_ctx = ctx->webgpu_ctx; @@ -2294,9 +2301,9 @@ static bool ggml_backend_webgpu_device_supports_op(ggml_backend_dev_t dev, const case GGML_UNARY_OP_EXP: case GGML_UNARY_OP_GELU_ERF: case GGML_UNARY_OP_XIELU: - supports_op = supports_op = (op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_F16) && (src0->type == op->type); + supports_op = supports_op = + (op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_F16) && (src0->type == op->type); break; - case GGML_UNARY_OP_COUNT: default: break; } @@ -2448,7 +2455,6 @@ static ggml_backend_dev_t ggml_backend_webgpu_reg_get_device(ggml_backend_reg_t wgpu::BufferUsage::CopySrc | wgpu::BufferUsage::Storage, wgpu::BufferUsage::CopyDst | wgpu::BufferUsage::MapRead); - ggml_webgpu_init_memset_pipeline(ctx); ggml_webgpu_init_mul_mat_pipeline(ctx); ggml_webgpu_init_set_rows_pipeline(ctx); @@ -2505,7 +2511,6 @@ static const struct ggml_backend_reg_i ggml_backend_webgpu_reg_i = { /* End GGML Backend Registration Interface */ ggml_backend_reg_t ggml_backend_webgpu_reg() { - WEBGPU_LOG_DEBUG("ggml_backend_webgpu_reg()"); webgpu_context webgpu_ctx = std::make_shared(); @@ -2531,7 +2536,6 @@ ggml_backend_reg_t ggml_backend_webgpu_reg() { } ggml_backend_t ggml_backend_webgpu_init(void) { - ggml_backend_dev_t dev = ggml_backend_reg_dev_get(ggml_backend_webgpu_reg(), 0); return ggml_backend_webgpu_device_init(dev, nullptr); diff --git a/ggml/src/ggml-webgpu/wgsl-shaders/unary_op.wgsl b/ggml/src/ggml-webgpu/wgsl-shaders/unary_op.wgsl index f9fe9cb3c4..7f632a24e5 100644 --- a/ggml/src/ggml-webgpu/wgsl-shaders/unary_op.wgsl +++ b/ggml/src/ggml-webgpu/wgsl-shaders/unary_op.wgsl @@ -3,396 +3,339 @@ [ { "SHADER_NAME": "abs_f32", - "REPLS": { "TYPE": "f32", "FUNC": "dst[dst_i] = abs(src[src_i]);" }, - "DECLS": ["NOT_INPLACE_DFLT_PARAMS"] - + "REPLS": { "TYPE": "f32", "FUNC": "dst[dst_i] = abs(src[src_i]);", "EXT_PARAMS": "" }, + "DECLS": ["NOT_INPLACE"] }, { "SHADER_NAME": "abs_f16", - "REPLS": { "TYPE": "f16", "FUNC": "dst[dst_i] = abs(src[src_i]);" }, - "DECLS": ["NOT_INPLACE_DFLT_PARAMS"] + "REPLS": { "TYPE": "f16", "FUNC": "dst[dst_i] = abs(src[src_i]);", "EXT_PARAMS": "" }, + "DECLS": ["NOT_INPLACE"] }, { "SHADER_NAME": "abs_in_place_f32", - "REPLS": { "TYPE": "f32", "FUNC": "src[dst_i] = abs(src[src_i]);" }, - "DECLS": ["INPLACE_DFLT_PARAMS"] + "REPLS": { "TYPE": "f32", "FUNC": "src[dst_i] = abs(src[src_i]);", "EXT_PARAMS": "" }, + "DECLS": ["INPLACE"] }, { "SHADER_NAME": "abs_in_place_f16", - "REPLS": { "TYPE": "f16", "FUNC": "src[dst_i] = abs(src[src_i]);" }, - "DECLS": ["INPLACE_DFLT_PARAMS"] + "REPLS": { "TYPE": "f16", "FUNC": "src[dst_i] = abs(src[src_i]);", "EXT_PARAMS": "" }, + "DECLS": ["INPLACE"] }, { "SHADER_NAME": "sgn_f32", - "REPLS": { - "TYPE": "f32", - "FUNC": "dst[dst_i] = select(select(0.0, -1.0, src[src_i] < 0.0), 1.0, src[src_i] > 0.0);" - }, - "DECLS": ["NOT_INPLACE_DFLT_PARAMS"] + "REPLS": { "TYPE": "f32", "FUNC": "dst[dst_i] = select(select(0.0, -1.0, src[src_i] < 0.0), 1.0, src[src_i] > 0.0);", "EXT_PARAMS": "" }, + "DECLS": ["NOT_INPLACE"] }, { "SHADER_NAME": "sgn_f16", - "REPLS": { - "TYPE": "f16", - "FUNC": "dst[dst_i] = select(select(0.0h, -1.0h, src[src_i] < 0.0h), 1.0h, src[src_i] > 0.0h);" - }, - "DECLS": ["NOT_INPLACE_DFLT_PARAMS"] + "REPLS": { "TYPE": "f16", "FUNC": "dst[dst_i] = select(select(0.0h, -1.0h, src[src_i] < 0.0h), 1.0h, src[src_i] > 0.0h);", "EXT_PARAMS": "" }, + "DECLS": ["NOT_INPLACE"] }, { "SHADER_NAME": "sgn_in_place_f32", - "REPLS": { - "TYPE": "f32", - "FUNC": "src[dst_i] = select(select(0.0, -1.0, src[src_i] < 0.0), 1.0, src[src_i] > 0.0);" - }, - "DECLS": ["INPLACE_DFLT_PARAMS"] + "REPLS": { "TYPE": "f32", "FUNC": "src[dst_i] = select(select(0.0, -1.0, src[src_i] < 0.0), 1.0, src[src_i] > 0.0);", "EXT_PARAMS": "" }, + "DECLS": ["INPLACE"] }, { "SHADER_NAME": "sgn_in_place_f16", - "REPLS": { - "TYPE": "f16", - "FUNC": "src[dst_i] = select(select(0.0h, -1.0h, src[src_i] < 0.0h), 1.0h, src[src_i] > 0.0h);" - }, - "DECLS": ["INPLACE_DFLT_PARAMS"] + "REPLS": { "TYPE": "f16", "FUNC": "src[dst_i] = select(select(0.0h, -1.0h, src[src_i] < 0.0h), 1.0h, src[src_i] > 0.0h);", "EXT_PARAMS": "" }, + "DECLS": ["INPLACE"] }, { "SHADER_NAME": "neg_f32", - "REPLS": { "TYPE": "f32", "FUNC": "dst[dst_i] = -src[src_i];" }, - "DECLS": ["NOT_INPLACE_DFLT_PARAMS"] + "REPLS": { "TYPE": "f32", "FUNC": "dst[dst_i] = -src[src_i];", "EXT_PARAMS": "" }, + "DECLS": ["NOT_INPLACE"] }, { "SHADER_NAME": "neg_f16", - "REPLS": { "TYPE": "f16", "FUNC": "dst[dst_i] = -src[src_i];" }, - "DECLS": ["NOT_INPLACE_DFLT_PARAMS"] + "REPLS": { "TYPE": "f16", "FUNC": "dst[dst_i] = -src[src_i];", "EXT_PARAMS": "" }, + "DECLS": ["NOT_INPLACE"] }, { "SHADER_NAME": "neg_in_place_f32", - "REPLS": { "TYPE": "f32", "FUNC": "src[dst_i] = -src[src_i];" }, - "DECLS": ["INPLACE_DFLT_PARAMS"] + "REPLS": { "TYPE": "f32", "FUNC": "src[dst_i] = -src[src_i];", "EXT_PARAMS": "" }, + "DECLS": ["INPLACE"] }, { "SHADER_NAME": "neg_in_place_f16", - "REPLS": { "TYPE": "f16", "FUNC": "src[dst_i] = -src[src_i];" }, - "DECLS": ["INPLACE_DFLT_PARAMS"] + "REPLS": { "TYPE": "f16", "FUNC": "src[dst_i] = -src[src_i];", "EXT_PARAMS": "" }, + "DECLS": ["INPLACE"] }, { "SHADER_NAME": "step_f32", - "REPLS": { - "TYPE": "f32", - "FUNC": "dst[dst_i] = select(0.0, 1.0, src[src_i] > 0.0);" - }, - "DECLS": ["NOT_INPLACE_DFLT_PARAMS"] + "REPLS": { "TYPE": "f32", "FUNC": "dst[dst_i] = select(0.0, 1.0, src[src_i] > 0.0);", "EXT_PARAMS": "" }, + "DECLS": ["NOT_INPLACE"] }, { "SHADER_NAME": "step_f16", - "REPLS": { - "TYPE": "f16", - "FUNC": "dst[dst_i] = select(0.0h, 1.0h, src[src_i] > 0.0h);" - }, - "DECLS": ["NOT_INPLACE_DFLT_PARAMS"] + "REPLS": { "TYPE": "f16", "FUNC": "dst[dst_i] = select(0.0h, 1.0h, src[src_i] > 0.0h);", "EXT_PARAMS": "" }, + "DECLS": ["NOT_INPLACE"] }, { "SHADER_NAME": "step_in_place_f32", - "REPLS": { - "TYPE": "f32", - "FUNC": "src[dst_i] = select(0.0, 1.0, src[src_i] > 0.0);" - }, - "DECLS": ["INPLACE_DFLT_PARAMS"] + "REPLS": { "TYPE": "f32", "FUNC": "src[dst_i] = select(0.0, 1.0, src[src_i] > 0.0);", "EXT_PARAMS": "" }, + "DECLS": ["INPLACE"] }, { "SHADER_NAME": "step_in_place_f16", - "REPLS": { - "TYPE": "f16", - "FUNC": "src[dst_i] = select(0.0h, 1.0h, src[src_i] > 0.0h);" - }, - "DECLS": ["INPLACE_DFLT_PARAMS"] + "REPLS": { "TYPE": "f16", "FUNC": "src[dst_i] = select(0.0h, 1.0h, src[src_i] > 0.0h);", "EXT_PARAMS": "" }, + "DECLS": ["INPLACE"] }, { "SHADER_NAME": "tanh_f32", - "REPLS": { "TYPE": "f32", "FUNC": "dst[dst_i] = tanh(clamp(src[src_i], -9.010913, 9.010913)); // Regarding tanh() domain restrictions in wgsl https://github.com/gpuweb/gpuweb/issues/4458" }, - "DECLS": ["NOT_INPLACE_DFLT_PARAMS"] + "REPLS": { "TYPE": "f32", "FUNC": "dst[dst_i] = tanh(clamp(src[src_i], -9.010913, 9.010913)); // Regarding tanh() domain restrictions in wgsl https://github.com/gpuweb/gpuweb/issues/4458", "EXT_PARAMS": "" }, + "DECLS": ["NOT_INPLACE"] }, { "SHADER_NAME": "tanh_f16", - "REPLS": { "TYPE": "f16", "FUNC": "dst[dst_i] = tanh(clamp(src[src_i], -9.010913, 9.010913)); // Regarding tanh() domain restrictions in wgsl https://github.com/gpuweb/gpuweb/issues/4458" }, - "DECLS": ["NOT_INPLACE_DFLT_PARAMS"] + "REPLS": { "TYPE": "f16", "FUNC": "dst[dst_i] = tanh(clamp(src[src_i], -9.010913, 9.010913)); // Regarding tanh() domain restrictions in wgsl https://github.com/gpuweb/gpuweb/issues/4458", "EXT_PARAMS": "" }, + "DECLS": ["NOT_INPLACE"] }, { "SHADER_NAME": "tanh_in_place_f32", - "REPLS": { "TYPE": "f32", "FUNC": "src[dst_i] = tanh(clamp(src[src_i], -9.010913, 9.010913)); // Regarding tanh() domain restrictions in wgsl https://github.com/gpuweb/gpuweb/issues/4458" }, - "DECLS": ["INPLACE_DFLT_PARAMS"] + "REPLS": { "TYPE": "f32", "FUNC": "src[dst_i] = tanh(clamp(src[src_i], -9.010913, 9.010913)); // Regarding tanh() domain restrictions in wgsl https://github.com/gpuweb/gpuweb/issues/4458", "EXT_PARAMS": "" }, + "DECLS": ["INPLACE"] }, { "SHADER_NAME": "tanh_in_place_f16", - "REPLS": { "TYPE": "f16", "FUNC": "src[dst_i] = tanh(clamp(src[src_i], -9.010913, 9.010913)); // Regarding tanh() domain restrictions in wgsl https://github.com/gpuweb/gpuweb/issues/4458" }, - "DECLS": ["INPLACE_DFLT_PARAMS"] + "REPLS": { "TYPE": "f16", "FUNC": "src[dst_i] = tanh(clamp(src[src_i], -9.010913, 9.010913)); // Regarding tanh() domain restrictions in wgsl https://github.com/gpuweb/gpuweb/issues/4458", "EXT_PARAMS": "" }, + "DECLS": ["INPLACE"] }, { "SHADER_NAME": "elu_f32", - "REPLS": { - "TYPE": "f32", - "FUNC": "dst[dst_i] = select(exp(src[src_i]) - 1.0, src[src_i], src[src_i] > 0.0);" - }, - "DECLS": ["NOT_INPLACE_DFLT_PARAMS"] + "REPLS": { "TYPE": "f32", "FUNC": "dst[dst_i] = select(exp(src[src_i]) - 1.0, src[src_i], src[src_i] > 0.0);", "EXT_PARAMS": "" }, + "DECLS": ["NOT_INPLACE"] }, { "SHADER_NAME": "elu_f16", - "REPLS": { - "TYPE": "f16", - "FUNC": "dst[dst_i] = select(exp(src[src_i]) - 1.0h, src[src_i], src[src_i] > 0.0h);" - }, - "DECLS": ["NOT_INPLACE_DFLT_PARAMS"] + "REPLS": { "TYPE": "f16", "FUNC": "dst[dst_i] = select(exp(src[src_i]) - 1.0h, src[src_i], src[src_i] > 0.0h);", "EXT_PARAMS": "" }, + "DECLS": ["NOT_INPLACE"] }, { "SHADER_NAME": "elu_in_place_f32", - "REPLS": { - "TYPE": "f32", - "FUNC": "src[dst_i] = select(exp(src[src_i]) - 1.0, src[src_i], src[src_i] > 0.0);" - }, - "DECLS": ["INPLACE_DFLT_PARAMS"] + "REPLS": { "TYPE": "f32", "FUNC": "src[dst_i] = select(exp(src[src_i]) - 1.0, src[src_i], src[src_i] > 0.0);", "EXT_PARAMS": "" }, + "DECLS": ["INPLACE"] }, { "SHADER_NAME": "elu_in_place_f16", - "REPLS": { - "TYPE": "f16", - "FUNC": "src[dst_i] = select(exp(src[src_i]) - 1.0h, src[src_i], src[src_i] > 0.0h);" - }, - "DECLS": ["INPLACE_DFLT_PARAMS"] + "REPLS": { "TYPE": "f16", "FUNC": "src[dst_i] = select(exp(src[src_i]) - 1.0h, src[src_i], src[src_i] > 0.0h);", "EXT_PARAMS": "" }, + "DECLS": ["INPLACE"] }, { "SHADER_NAME": "relu_f32", - "REPLS": { - "TYPE": "f32", - "FUNC": "dst[dst_i] = select(0.0, src[src_i], src[src_i] > 0.0);" - }, - "DECLS": ["NOT_INPLACE_DFLT_PARAMS"] + "REPLS": { "TYPE": "f32", "FUNC": "dst[dst_i] = select(0.0, src[src_i], src[src_i] > 0.0);", "EXT_PARAMS": "" }, + "DECLS": ["NOT_INPLACE"] }, { "SHADER_NAME": "relu_f16", - "REPLS": { - "TYPE": "f16", - "FUNC": "dst[dst_i] = select(0.0h, src[src_i], src[src_i] > 0.0h);" - }, - "DECLS": ["NOT_INPLACE_DFLT_PARAMS"] + "REPLS": { "TYPE": "f16", "FUNC": "dst[dst_i] = select(0.0h, src[src_i], src[src_i] > 0.0h);", "EXT_PARAMS": "" }, + "DECLS": ["NOT_INPLACE"] }, { "SHADER_NAME": "relu_in_place_f32", - "REPLS": { - "TYPE": "f32", - "FUNC": "src[dst_i] = select(0.0, src[src_i], src[src_i] > 0.0);" - }, - "DECLS": ["INPLACE_DFLT_PARAMS"] + "REPLS": { "TYPE": "f32", "FUNC": "src[dst_i] = select(0.0, src[src_i], src[src_i] > 0.0);", "EXT_PARAMS": "" }, + "DECLS": ["INPLACE"] }, { "SHADER_NAME": "relu_in_place_f16", - "REPLS": { - "TYPE": "f16", - "FUNC": "src[dst_i] = select(0.0h, src[src_i], src[src_i] > 0.0h);" - }, - "DECLS": ["INPLACE_DFLT_PARAMS"] + "REPLS": { "TYPE": "f16", "FUNC": "src[dst_i] = select(0.0h, src[src_i], src[src_i] > 0.0h);", "EXT_PARAMS": "" }, + "DECLS": ["INPLACE"] }, { "SHADER_NAME": "sigmoid_f32", - "REPLS": { "TYPE": "f32", "FUNC": "dst[dst_i] = 1.0 / (1.0 + exp(-src[src_i]));" }, - "DECLS": ["NOT_INPLACE_DFLT_PARAMS"] + "REPLS": { "TYPE": "f32", "FUNC": "dst[dst_i] = 1.0 / (1.0 + exp(-src[src_i]));", "EXT_PARAMS": "" }, + "DECLS": ["NOT_INPLACE"] }, { "SHADER_NAME": "sigmoid_f16", - "REPLS": { "TYPE": "f16", "FUNC": "dst[dst_i] = 1.0h / (1.0h + exp(-src[src_i]));" }, - "DECLS": ["NOT_INPLACE_DFLT_PARAMS"] + "REPLS": { "TYPE": "f16", "FUNC": "dst[dst_i] = 1.0h / (1.0h + exp(-src[src_i]));", "EXT_PARAMS": "" }, + "DECLS": ["NOT_INPLACE"] }, { "SHADER_NAME": "sigmoid_in_place_f32", - "REPLS": { "TYPE": "f32", "FUNC": "src[dst_i] = 1.0 / (1.0 + exp(-src[src_i]));" }, - "DECLS": ["INPLACE_DFLT_PARAMS"] + "REPLS": { "TYPE": "f32", "FUNC": "src[dst_i] = 1.0 / (1.0 + exp(-src[src_i]));", "EXT_PARAMS": "" }, + "DECLS": ["INPLACE"] }, { "SHADER_NAME": "sigmoid_in_place_f16", - "REPLS": { "TYPE": "f16", "FUNC": "src[dst_i] = 1.0h / (1.0h + exp(-src[src_i]));" }, - "DECLS": ["INPLACE_DFLT_PARAMS"] + "REPLS": { "TYPE": "f16", "FUNC": "src[dst_i] = 1.0h / (1.0h + exp(-src[src_i]));", "EXT_PARAMS": "" }, + "DECLS": ["INPLACE"] }, { "SHADER_NAME": "gelu_f32", - "REPLS": { "TYPE": "f32", "FUNC": "dst[dst_i] = 0.5 * src[src_i] * (1.0 + tanh(clamp(sqrt(2.0 / 3.14159265) * (src[src_i] + 0.044715 * pow(src[src_i], 3.0)), -9.010913, 9.010913))); // Regarding tanh() domain restrictions in wgsl https://github.com/gpuweb/gpuweb/issues/4458" }, - "DECLS": ["NOT_INPLACE_DFLT_PARAMS"] + "REPLS": { "TYPE": "f32", "FUNC": "dst[dst_i] = 0.5 * src[src_i] * (1.0 + tanh(clamp(sqrt(2.0 / 3.14159265) * (src[src_i] + 0.044715 * pow(src[src_i], 3.0)), -9.010913, 9.010913))); // Regarding tanh() domain restrictions in wgsl https://github.com/gpuweb/gpuweb/issues/4458", "EXT_PARAMS": "" }, + "DECLS": ["NOT_INPLACE"] }, { "SHADER_NAME": "gelu_f16", - "REPLS": { "TYPE": "f16", "FUNC": "dst[dst_i] = 0.5h * src[src_i] * (1.0h + tanh(clamp(sqrt(2.0h / 3.14159265h) * (src[src_i] + 0.044715h * pow(src[src_i], 3.0h)), -9.010913, 9.010913))); // Regarding tanh() domain restrictions in wgsl https://github.com/gpuweb/gpuweb/issues/4458" }, - "DECLS": ["NOT_INPLACE_DFLT_PARAMS"] + "REPLS": { "TYPE": "f16", "FUNC": "dst[dst_i] = 0.5h * src[src_i] * (1.0h + tanh(clamp(sqrt(2.0h / 3.14159265h) * (src[src_i] + 0.044715h * pow(src[src_i], 3.0h)), -9.010913, 9.010913))); // Regarding tanh() domain restrictions in wgsl https://github.com/gpuweb/gpuweb/issues/4458", "EXT_PARAMS": "" }, + "DECLS": ["NOT_INPLACE"] }, { "SHADER_NAME": "gelu_in_place_f32", - "REPLS": { "TYPE": "f32", "FUNC": "src[dst_i] = 0.5 * src[src_i] * (1.0 + tanh(clamp(sqrt(2.0 / 3.14159265) * (src[src_i] + 0.044715 * pow(src[src_i], 3.0)), -9.010913, 9.010913))); // Regarding tanh() domain restrictions in wgsl https://github.com/gpuweb/gpuweb/issues/4458" }, - "DECLS": ["INPLACE_DFLT_PARAMS"] + "REPLS": { "TYPE": "f32", "FUNC": "src[dst_i] = 0.5 * src[src_i] * (1.0 + tanh(clamp(sqrt(2.0 / 3.14159265) * (src[src_i] + 0.044715 * pow(src[src_i], 3.0)), -9.010913, 9.010913))); // Regarding tanh() domain restrictions in wgsl https://github.com/gpuweb/gpuweb/issues/4458", "EXT_PARAMS": "" }, + "DECLS": ["INPLACE"] }, { "SHADER_NAME": "gelu_in_place_f16", - "REPLS": { "TYPE": "f16", "FUNC": "src[dst_i] = 0.5h * src[src_i] * (1.0h + tanh(clamp(sqrt(2.0h / 3.14159265h) * (src[src_i] + 0.044715h * pow(src[src_i], 3.0h)), -9.010913, 9.010913))); // Regarding tanh() domain restrictions in wgsl https://github.com/gpuweb/gpuweb/issues/4458" }, - "DECLS": ["INPLACE_DFLT_PARAMS"] + "REPLS": { "TYPE": "f16", "FUNC": "src[dst_i] = 0.5h * src[src_i] * (1.0h + tanh(clamp(sqrt(2.0h / 3.14159265h) * (src[src_i] + 0.044715h * pow(src[src_i], 3.0h)), -9.010913, 9.010913))); // Regarding tanh() domain restrictions in wgsl https://github.com/gpuweb/gpuweb/issues/4458", "EXT_PARAMS": "" }, + "DECLS": ["INPLACE"] }, { "SHADER_NAME": "gelu_quick_f32", - "REPLS": { "TYPE": "f32", "FUNC": "dst[dst_i] = src[src_i] * 0.5 * (1.0 + tanh(clamp(0.79788456 * src[src_i] * (1.0 + 0.044715 * src[src_i] * src[src_i]), -9.010913, 9.010913))); // Regarding tanh() domain restrictions in wgsl https://github.com/gpuweb/gpuweb/issues/4458" }, - "DECLS": ["NOT_INPLACE_DFLT_PARAMS"] + "REPLS": { "TYPE": "f32", "FUNC": "dst[dst_i] = src[src_i] * 0.5 * (1.0 + tanh(clamp(0.79788456 * src[src_i] * (1.0 + 0.044715 * src[src_i] * src[src_i]), -9.010913, 9.010913))); // Regarding tanh() domain restrictions in wgsl https://github.com/gpuweb/gpuweb/issues/4458", "EXT_PARAMS": "" }, + "DECLS": ["NOT_INPLACE"] }, { "SHADER_NAME": "gelu_quick_f16", - "REPLS": { "TYPE": "f16", "FUNC": "dst[dst_i] = src[src_i] * 0.5h * (1.0h + tanh(clamp(0.79788456h * src[src_i] * (1.0h + 0.044715h * src[src_i] * src[src_i]), -9.010913, 9.010913))); // Regarding tanh() domain restrictions in wgsl https://github.com/gpuweb/gpuweb/issues/4458" }, - "DECLS": ["NOT_INPLACE_DFLT_PARAMS"] + "REPLS": { "TYPE": "f16", "FUNC": "dst[dst_i] = src[src_i] * 0.5h * (1.0h + tanh(clamp(0.79788456h * src[src_i] * (1.0h + 0.044715h * src[src_i] * src[src_i]), -9.010913, 9.010913))); // Regarding tanh() domain restrictions in wgsl https://github.com/gpuweb/gpuweb/issues/4458", "EXT_PARAMS": "" }, + "DECLS": ["NOT_INPLACE"] }, { "SHADER_NAME": "gelu_quick_in_place_f32", - "REPLS": { "TYPE": "f32", "FUNC": "src[dst_i] = src[src_i] * 0.5 * (1.0 + tanh(clamp(0.79788456 * src[src_i] * (1.0 + 0.044715 * src[src_i] * src[src_i]), -9.010913, 9.010913))); // Regarding tanh() domain restrictions in wgsl https://github.com/gpuweb/gpuweb/issues/4458" }, - "DECLS": ["INPLACE_DFLT_PARAMS"] + "REPLS": { "TYPE": "f32", "FUNC": "src[dst_i] = src[src_i] * 0.5 * (1.0 + tanh(clamp(0.79788456 * src[src_i] * (1.0 + 0.044715 * src[src_i] * src[src_i]), -9.010913, 9.010913))); // Regarding tanh() domain restrictions in wgsl https://github.com/gpuweb/gpuweb/issues/4458", "EXT_PARAMS": "" }, + "DECLS": ["INPLACE"] }, { "SHADER_NAME": "gelu_quick_in_place_f16", - "REPLS": { "TYPE": "f16", "FUNC": "src[dst_i] = src[src_i] * 0.5h * (1.0h + tanh(0.79788456h * src[src_i] * (1.0h + 0.044715h * src[src_i] * src[src_i]))); // Regarding tanh() domain restrictions in wgsl https://github.com/gpuweb/gpuweb/issues/4458" }, - "DECLS": ["INPLACE_DFLT_PARAMS"] + "REPLS": { "TYPE": "f16", "FUNC": "src[dst_i] = src[src_i] * 0.5h * (1.0h + tanh(0.79788456h * src[src_i] * (1.0h + 0.044715h * src[src_i] * src[src_i]))); // Regarding tanh() domain restrictions in wgsl https://github.com/gpuweb/gpuweb/issues/4458", "EXT_PARAMS": "" }, + "DECLS": ["INPLACE"] }, { "SHADER_NAME": "silu_f32", - "REPLS": { "TYPE": "f32", "FUNC": "dst[dst_i] = src[src_i] / (1.0 + exp(-src[src_i]));" }, - "DECLS": ["NOT_INPLACE_DFLT_PARAMS"] + "REPLS": { "TYPE": "f32", "FUNC": "dst[dst_i] = src[src_i] / (1.0 + exp(-src[src_i]));", "EXT_PARAMS": "" }, + "DECLS": ["NOT_INPLACE"] }, { "SHADER_NAME": "silu_f16", - "REPLS": { "TYPE": "f16", "FUNC": "dst[dst_i] = src[src_i] / (1.0h + exp(-src[src_i]));" }, - "DECLS": ["NOT_INPLACE_DFLT_PARAMS"] + "REPLS": { "TYPE": "f16", "FUNC": "dst[dst_i] = src[src_i] / (1.0h + exp(-src[src_i]));", "EXT_PARAMS": "" }, + "DECLS": ["NOT_INPLACE"] }, { "SHADER_NAME": "silu_in_place_f32", - "REPLS": { "TYPE": "f32", "FUNC": "src[dst_i] = src[src_i] / (1.0 + exp(-src[src_i]));" }, - "DECLS": ["INPLACE_DFLT_PARAMS"] + "REPLS": { "TYPE": "f32", "FUNC": "src[dst_i] = src[src_i] / (1.0 + exp(-src[src_i]));", "EXT_PARAMS": "" }, + "DECLS": ["INPLACE"] }, { "SHADER_NAME": "silu_in_place_f16", - "REPLS": { "TYPE": "f16", "FUNC": "src[dst_i] = src[src_i] / (1.0h + exp(-src[src_i]));" }, - "DECLS": ["INPLACE_DFLT_PARAMS"] + "REPLS": { "TYPE": "f16", "FUNC": "src[dst_i] = src[src_i] / (1.0h + exp(-src[src_i]));", "EXT_PARAMS": "" }, + "DECLS": ["INPLACE"] }, { "SHADER_NAME": "hardswish_f32", - "REPLS": { "TYPE": "f32", "FUNC": "dst[dst_i] = src[src_i] * min(1.0, max(0.0, (src[src_i] + 3.0) / 6.0));" }, - "DECLS": ["NOT_INPLACE_DFLT_PARAMS"] + "REPLS": { "TYPE": "f32", "FUNC": "dst[dst_i] = src[src_i] * min(1.0, max(0.0, (src[src_i] + 3.0) / 6.0));", "EXT_PARAMS": "" }, + "DECLS": ["NOT_INPLACE"] }, { "SHADER_NAME": "hardswish_f16", - "REPLS": { "TYPE": "f16", "FUNC": "dst[dst_i] = src[src_i] * min(1.0h, max(0.0h, (src[src_i] + 3.0h) / 6.0h));" }, - "DECLS": ["NOT_INPLACE_DFLT_PARAMS"] + "REPLS": { "TYPE": "f16", "FUNC": "dst[dst_i] = src[src_i] * min(1.0h, max(0.0h, (src[src_i] + 3.0h) / 6.0h));", "EXT_PARAMS": "" }, + "DECLS": ["NOT_INPLACE"] }, { "SHADER_NAME": "hardswish_in_place_f32", - "REPLS": { "TYPE": "f32", "FUNC": "src[dst_i] = src[src_i] * min(1.0, max(0.0, (src[src_i] + 3.0) / 6.0));" }, - "DECLS": ["INPLACE_DFLT_PARAMS"] + "REPLS": { "TYPE": "f32", "FUNC": "src[dst_i] = src[src_i] * min(1.0, max(0.0, (src[src_i] + 3.0) / 6.0));", "EXT_PARAMS": "" }, + "DECLS": ["INPLACE"] }, { "SHADER_NAME": "hardswish_in_place_f16", - "REPLS": { "TYPE": "f16", "FUNC": "src[dst_i] = src[src_i] * min(1.0h, max(0.0h, (src[src_i] + 3.0h) / 6.0h));" }, - "DECLS": ["INPLACE_DFLT_PARAMS"] + "REPLS": { "TYPE": "f16", "FUNC": "src[dst_i] = src[src_i] * min(1.0h, max(0.0h, (src[src_i] + 3.0h) / 6.0h));", "EXT_PARAMS": "" }, + "DECLS": ["INPLACE"] }, { "SHADER_NAME": "hardsigmoid_f32", - "REPLS": { "TYPE": "f32", "FUNC": "dst[dst_i] = min(1.0, max(0.0, (src[src_i] + 3.0) / 6.0));" }, - "DECLS": ["NOT_INPLACE_DFLT_PARAMS"] + "REPLS": { "TYPE": "f32", "FUNC": "dst[dst_i] = min(1.0, max(0.0, (src[src_i] + 3.0) / 6.0));", "EXT_PARAMS": "" }, + "DECLS": ["NOT_INPLACE"] }, { "SHADER_NAME": "hardsigmoid_f16", - "REPLS": { "TYPE": "f16", "FUNC": "dst[dst_i] = min(1.0h, max(0.0h, (src[src_i] + 3.0h) / 6.0h));" }, - "DECLS": ["NOT_INPLACE_DFLT_PARAMS"] + "REPLS": { "TYPE": "f16", "FUNC": "dst[dst_i] = min(1.0h, max(0.0h, (src[src_i] + 3.0h) / 6.0h));", "EXT_PARAMS": "" }, + "DECLS": ["NOT_INPLACE"] }, { "SHADER_NAME": "hardsigmoid_in_place_f32", - "REPLS": { "TYPE": "f32", "FUNC": "src[dst_i] = min(1.0, max(0.0, (src[src_i] + 3.0) / 6.0));" }, - "DECLS": ["INPLACE_DFLT_PARAMS"] + "REPLS": { "TYPE": "f32", "FUNC": "src[dst_i] = min(1.0, max(0.0, (src[src_i] + 3.0) / 6.0));", "EXT_PARAMS": "" }, + "DECLS": ["INPLACE"] }, { "SHADER_NAME": "hardsigmoid_in_place_f16", - "REPLS": { "TYPE": "f16", "FUNC": "src[dst_i] = min(1.0h, max(0.0h, (src[src_i] + 3.0h) / 6.0h));" }, - "DECLS": ["INPLACE_DFLT_PARAMS"] + "REPLS": { "TYPE": "f16", "FUNC": "src[dst_i] = min(1.0h, max(0.0h, (src[src_i] + 3.0h) / 6.0h));", "EXT_PARAMS": "" }, + "DECLS": ["INPLACE"] }, { "SHADER_NAME": "exp_f32", - "REPLS": { "TYPE": "f32", "FUNC": "dst[dst_i] = exp(src[src_i]);" }, - "DECLS": ["NOT_INPLACE_DFLT_PARAMS"] + "REPLS": { "TYPE": "f32", "FUNC": "dst[dst_i] = exp(src[src_i]);", "EXT_PARAMS": "" }, + "DECLS": ["NOT_INPLACE"] }, { "SHADER_NAME": "exp_f16", - "REPLS": { "TYPE": "f16", "FUNC": "dst[dst_i] = exp(src[src_i]);" }, - "DECLS": ["NOT_INPLACE_DFLT_PARAMS"] + "REPLS": { "TYPE": "f16", "FUNC": "dst[dst_i] = exp(src[src_i]);", "EXT_PARAMS": "" }, + "DECLS": ["NOT_INPLACE"] }, { "SHADER_NAME": "exp_in_place_f32", - "REPLS": { "TYPE": "f32", "FUNC": "src[dst_i] = exp(src[src_i]);" }, - "DECLS": ["INPLACE_DFLT_PARAMS"] + "REPLS": { "TYPE": "f32", "FUNC": "src[dst_i] = exp(src[src_i]);", "EXT_PARAMS": "" }, + "DECLS": ["INPLACE"] }, { "SHADER_NAME": "exp_in_place_f16", - "REPLS": { "TYPE": "f16", "FUNC": "src[dst_i] = exp(src[src_i]);" }, - "DECLS": ["INPLACE_DFLT_PARAMS"] + "REPLS": { "TYPE": "f16", "FUNC": "src[dst_i] = exp(src[src_i]);", "EXT_PARAMS": "" }, + "DECLS": ["INPLACE"] }, { "SHADER_NAME": "gelu_erf_f32", - "REPLS": { - "TYPE": "f32", - "FUNC": "dst[dst_i] = 0.5 * src[src_i] * (1.0 + tanh(clamp(0.79788456 * (src[src_i] + 0.044715 * src[src_i] * src[src_i] * src[src_i]), -9.010913, 9.010913))); // Regarding tanh() domain restrictions in wgsl https://github.com/gpuweb/gpuweb/issues/4458" - }, - "DECLS": ["NOT_INPLACE_DFLT_PARAMS"] + "REPLS": { "TYPE": "f32", "FUNC": "dst[dst_i] = 0.5 * src[src_i] * (1.0 + tanh(clamp(0.79788456 * (src[src_i] + 0.044715 * src[src_i] * src[src_i] * src[src_i]), -9.010913, 9.010913))); // Regarding tanh() domain restrictions in wgsl https://github.com/gpuweb/gpuweb/issues/4458", "EXT_PARAMS": "" }, + "DECLS": ["NOT_INPLACE"] }, { "SHADER_NAME": "gelu_erf_f16", - "REPLS": { - "TYPE": "f16", - "FUNC": "dst[dst_i] = 0.5h * src[src_i] * (1.0h + tanh(clamp(0.79788456h * (src[src_i] + 0.044715h * src[src_i] * src[src_i] * src[src_i]), -9.010913, 9.010913))); // Regarding tanh() domain restrictions in wgsl https://github.com/gpuweb/gpuweb/issues/4458" - }, - "DECLS": ["NOT_INPLACE_DFLT_PARAMS"] + "REPLS": { "TYPE": "f16", "FUNC": "dst[dst_i] = 0.5h * src[src_i] * (1.0h + tanh(clamp(0.79788456h * (src[src_i] + 0.044715h * src[src_i] * src[src_i] * src[src_i]), -9.010913, 9.010913))); // Regarding tanh() domain restrictions in wgsl https://github.com/gpuweb/gpuweb/issues/4458", "EXT_PARAMS": "" }, + "DECLS": ["NOT_INPLACE"] }, { "SHADER_NAME": "gelu_erf_in_place_f32", - "REPLS": { - "TYPE": "f32", - "FUNC": "src[dst_i] = 0.5 * src[src_i] * (1.0 + tanh(clamp(0.79788456 * (src[src_i] + 0.044715 * src[src_i] * src[src_i] * src[src_i]), -9.010913, 9.010913))); // Regarding tanh() domain restrictions in wgsl https://github.com/gpuweb/gpuweb/issues/4458" - }, - "DECLS": ["INPLACE_DFLT_PARAMS"] + "REPLS": { "TYPE": "f32", "FUNC": "src[dst_i] = 0.5 * src[src_i] * (1.0 + tanh(clamp(0.79788456 * (src[src_i] + 0.044715 * src[src_i] * src[src_i] * src[src_i]), -9.010913, 9.010913))); // Regarding tanh() domain restrictions in wgsl https://github.com/gpuweb/gpuweb/issues/4458", "EXT_PARAMS": "" }, + "DECLS": ["INPLACE"] }, { "SHADER_NAME": "gelu_erf_in_place_f16", - "REPLS": { - "TYPE": "f16", - "FUNC": "src[dst_i] = 0.5h * src[src_i] * (1.0h + tanh(clamp(0.79788456h * (src[src_i] + 0.044715h * src[src_i] * src[src_i] * src[src_i]), -9.010913, 9.010913))); // Regarding tanh() domain restrictions in wgsl https://github.com/gpuweb/gpuweb/issues/4458" - }, - "DECLS": ["INPLACE_DFLT_PARAMS"] + "REPLS": { "TYPE": "f16", "FUNC": "src[dst_i] = 0.5h * src[src_i] * (1.0h + tanh(clamp(0.79788456h * (src[src_i] + 0.044715h * src[src_i] * src[src_i] * src[src_i]), -9.010913, 9.010913))); // Regarding tanh() domain restrictions in wgsl https://github.com/gpuweb/gpuweb/issues/4458", "EXT_PARAMS": "" }, + "DECLS": ["INPLACE"] }, { "SHADER_NAME": "xielu_f32", "REPLS": { "TYPE": "f32", - "FUNC": "dst[dst_i] = select(((exp(min(src[src_i], f32(params.eps))) - 1.0) - src[src_i]) * f32(params.alpha_n) + f32(params.beta) * src[src_i], f32(params.alpha_p) * src[src_i] * src[src_i] + f32(params.beta) * src[src_i], src[src_i] > 0.0);" + "FUNC": "dst[dst_i] = select(((exp(min(src[src_i], f32(params.eps))) - 1.0) - src[src_i]) * f32(params.alpha_n) + f32(params.beta) * src[src_i], f32(params.alpha_p) * src[src_i] * src[src_i] + f32(params.beta) * src[src_i], src[src_i] > 0.0);", + "EXT_PARAMS": "alpha_n: u32, alpha_p: u32, beta: u32, eps: u32" }, - "DECLS": ["NOT_INPLACE_EXT_PARAMS"] + "DECLS": ["NOT_INPLACE"] }, { "SHADER_NAME": "xielu_f16", "REPLS": { "TYPE": "f16", - "FUNC": "dst[dst_i] = select(((exp(min(src[src_i], f16(params.eps))) - 1.0h) - src[src_i]) * f16(params.alpha_n) + f16(params.beta) * src[src_i], f16(params.alpha_p) * src[src_i] * src[src_i] + f16(params.beta) * src[src_i], src[src_i] > 0.0h);" + "FUNC": "dst[dst_i] = select(((exp(min(src[src_i], f16(params.eps))) - 1.0h) - src[src_i]) * f16(params.alpha_n) + f16(params.beta) * src[src_i], f16(params.alpha_p) * src[src_i] * src[src_i] + f16(params.beta) * src[src_i], src[src_i] > 0.0h);", + "EXT_PARAMS": "alpha_n: u32, alpha_p: u32, beta: u32, eps: u32" }, - "DECLS": ["NOT_INPLACE_EXT_PARAMS"] + "DECLS": ["NOT_INPLACE"] }, { "SHADER_NAME": "xielu_in_place_f32", "REPLS": { "TYPE": "f32", - "FUNC": "src[dst_i] = select(((exp(min(src[src_i], f32(params.eps))) - 1.0) - src[src_i]) * f32(params.alpha_n) + f32(params.beta) * src[src_i], f32(params.alpha_p) * src[src_i] * src[src_i] + f32(params.beta) * src[src_i], src[src_i] > 0.0);" + "FUNC": "src[dst_i] = select(((exp(min(src[src_i], f32(params.eps))) - 1.0) - src[src_i]) * f32(params.alpha_n) + f32(params.beta) * src[src_i], f32(params.alpha_p) * src[src_i] * src[src_i] + f32(params.beta) * src[src_i], src[src_i] > 0.0);", + "EXT_PARAMS": "alpha_n: u32, alpha_p: u32, beta: u32, eps: u32" }, - "DECLS": ["INPLACE_EXT_PARAMS"] + "DECLS": ["INPLACE"] }, { "SHADER_NAME": "xielu_in_place_f16", "REPLS": { "TYPE": "f16", - "FUNC": "src[dst_i] = select(((exp(min(src[src_i], f16(params.eps))) - 1.0h) - src[src_i]) * f16(params.alpha_n) + f16(params.beta) * src[src_i], f16(params.alpha_p) * src[src_i] * src[src_i] + f16(params.beta) * src[src_i], src[src_i] > 0.0h);" + "FUNC": "src[dst_i] = select(((exp(min(src[src_i], f16(params.eps))) - 1.0h) - src[src_i]) * f16(params.alpha_n) + f16(params.beta) * src[src_i], f16(params.alpha_p) * src[src_i] * src[src_i] + f16(params.beta) * src[src_i], src[src_i] > 0.0h);", + "EXT_PARAMS": "alpha_n: u32, alpha_p: u32, beta: u32, eps: u32" }, - "DECLS": ["INPLACE_EXT_PARAMS"] + "DECLS": ["INPLACE"] } ] @@ -400,7 +343,7 @@ #define(DECLS) -#decl(NOT_INPLACE_DFLT_PARAMS) +#decl(INPLACE) @group(0) @binding(1) var dst: array<{{TYPE}}>; @@ -408,68 +351,9 @@ var dst: array<{{TYPE}}>; @group(0) @binding(2) var params: Params; -struct Params { - ne: u32, // total number of elements - offset_src: u32, // in elements - offset_dst: u32, // in elements +#enddecl(INPLACE) - // Strides (in elements) — may be permuted - stride_src0: u32, - stride_src1: u32, - stride_src2: u32, - stride_src3: u32, - - stride_dst0: u32, - stride_dst1: u32, - stride_dst2: u32, - stride_dst3: u32, - - // Logical shapes - src_ne0: u32, - src_ne1: u32, - src_ne2: u32, - - dst_ne0: u32, - dst_ne1: u32, - dst_ne2: u32 -}; - -#enddecl(NOT_INPLACE_DFLT_PARAMS) - -#decl(INPLACE_DFLT_PARAMS) - -@group(0) @binding(1) -var params: Params; - -struct Params { - ne: u32, // total number of elements - offset_src: u32, // in elements - offset_dst: u32, // in elements - - // Strides (in elements) — may be permuted - stride_src0: u32, - stride_src1: u32, - stride_src2: u32, - stride_src3: u32, - - stride_dst0: u32, - stride_dst1: u32, - stride_dst2: u32, - stride_dst3: u32, - - // Logical shapes - src_ne0: u32, - src_ne1: u32, - src_ne2: u32, - - dst_ne0: u32, - dst_ne1: u32, - dst_ne2: u32 -}; - -#enddecl(INPLACE_DFLT_PARAMS) - -#decl(NOT_INPLACE_EXT_PARAMS) +#decl(NOT_INPLACE) @group(0) @binding(1) var dst: array<{{TYPE}}>; @@ -477,78 +361,7 @@ var dst: array<{{TYPE}}>; @group(0) @binding(2) var params: Params; -struct Params { - ne: u32, // total number of elements - offset_src: u32, // in elements - offset_dst: u32, // in elements - - // Strides (in elements) — may be permuted - stride_src0: u32, - stride_src1: u32, - stride_src2: u32, - stride_src3: u32, - - stride_dst0: u32, - stride_dst1: u32, - stride_dst2: u32, - stride_dst3: u32, - - // Logical shapes - src_ne0: u32, - src_ne1: u32, - src_ne2: u32, - - dst_ne0: u32, - dst_ne1: u32, - dst_ne2: u32, - - // XIELU params - alpha_n: u32, - alpha_p: u32, - beta: u32, - eps: u32 -}; - -#enddecl(NOT_INPLACE_EXT_PARAMS) - -#decl(INPLACE_EXT_PARAMS) - -@group(0) @binding(1) -var params: Params; - -struct Params { - ne: u32, // total number of elements - offset_src: u32, // in elements - offset_dst: u32, // in elements - - // Strides (in elements) — may be permuted - stride_src0: u32, - stride_src1: u32, - stride_src2: u32, - stride_src3: u32, - - stride_dst0: u32, - stride_dst1: u32, - stride_dst2: u32, - stride_dst3: u32, - - // Logical shapes - src_ne0: u32, - src_ne1: u32, - src_ne2: u32, - - dst_ne0: u32, - dst_ne1: u32, - dst_ne2: u32, - - // XIELU params - alpha_n: u32, - alpha_p: u32, - beta: u32, - eps: u32 -}; - -#enddecl(INPLACE_EXT_PARAMS) +#enddecl(NOT_INPLACE) #end(DECLS) @@ -565,6 +378,34 @@ var src: array<{{TYPE}}>; DECLS +struct Params { + ne: u32, // total number of elements + offset_src: u32, // in elements + offset_dst: u32, // in elements + + // Strides (in elements) — may be permuted + stride_src0: u32, + stride_src1: u32, + stride_src2: u32, + stride_src3: u32, + + stride_dst0: u32, + stride_dst1: u32, + stride_dst2: u32, + stride_dst3: u32, + + // Logical shapes + src_ne0: u32, + src_ne1: u32, + src_ne2: u32, + + dst_ne0: u32, + dst_ne1: u32, + dst_ne2: u32, + + {{EXT_PARAMS}} +}; + override wg_size: u32; @compute @workgroup_size(wg_size) fn main(@builtin(global_invocation_id) gid: vec3) { @@ -599,3 +440,4 @@ fn main(@builtin(global_invocation_id) gid: vec3) { } #end(SHADER) +