neg f16xf32xip builds and runs, havent actually ran a model that uses neg kernel yet though

2025-09-30 23:55:27 -07:00 · 2025-09-30 23:55:27 -07:00 · aa1c9b2f88
parent 5d8e6784e2
commit aa1c9b2f88
3 changed files with 139 additions and 0 deletions
--- a/ggml/src/ggml-webgpu/ggml-webgpu.cpp
+++ b/ggml/src/ggml-webgpu/ggml-webgpu.cpp
@ -144,6 +144,8 @@ struct webgpu_context_struct {
    wgpu::ComputePipeline glu_pipeline[7][2][2];       // glu-op, type, split
    wgpu::ComputePipeline scale_pipeline[2];           // inplace
    wgpu::ComputePipeline soft_max_pipeline[3][2][2];  // (no_mask, f32_mask, f16_mask), has_sink, inplace
+    wgpu::ComputePipeline neg_pipeline;                
+    wgpu::ComputePipeline neg_ip_pipeline;

    size_t memset_bytes_per_thread;

@ -992,6 +994,36 @@ static void ggml_webgpu_soft_max(webgpu_context & ctx,
                                          ggml_nrows(dst), ggml_op_name(dst->op));
 }

+static void ggml_webgpu_neg(      webgpu_context &        ctx,
+                                  ggml_tensor *           src,
+                                  ggml_tensor *           dst,
+                                  wgpu::ComputePipeline & pipeline,
+                                  bool                    in_place) {
+    std::vector<uint32_t> params = {
+        (uint32_t) ggml_nelements(dst)
+    };
+
+    std::vector<wgpu::BindGroupEntry> entries = {
+        { .binding = 0,
+         .buffer  = ggml_webgpu_tensor_buf(src),
+         .offset  = ggml_webgpu_tensor_align_offset(ctx, src),
+         .size    = ggml_webgpu_tensor_binding_size(ctx, src) },
+
+    };
+    if (!in_place) {
+        entries.push_back({ .binding = 1,
+                            .buffer  = ggml_webgpu_tensor_buf(dst),
+                            .offset  = ggml_webgpu_tensor_align_offset(ctx, dst),
+                            .size    = ggml_webgpu_tensor_binding_size(ctx, dst) });
+    }
+
+    size_t   max_wg_size = ctx->max_wg_size_x;
+    uint32_t wg_x        = (ggml_nelements(dst) + max_wg_size - 1) / max_wg_size;
+    
+    ggml_backend_webgpu_build_and_enqueue(ctx, pipeline, params, entries, wg_x, ggml_op_name(dst->op));
+}
+
+
 // Returns true if node has enqueued work into the queue, false otherwise
 static bool ggml_webgpu_encode_node(webgpu_context ctx, ggml_tensor * node) {
    if (ggml_is_empty(node)) {
@ -1060,6 +1092,22 @@ static bool ggml_webgpu_encode_node(webgpu_context ctx, ggml_tensor * node) {
        case GGML_OP_SCALE:
            ggml_webgpu_scale(ctx, src0, node);
            break;
+        case GGML_OP_UNARY: {
+            // if unary, switch on unary operators
+            const ggml_unary_op unary_op = ggml_get_unary_op(node);
+            switch (unary_op) {
+                case GGML_UNARY_OP_NEG:
+                    if (ggml_webgpu_tensor_equal(src0, node)) {
+                        ggml_webgpu_neg(ctx, src0, node, ctx->neg_ip_pipeline, true);
+                    } else {
+                        ggml_webgpu_neg(ctx, src0, src1, ctx->neg_pipeline, false);
+                    }
+                    break;
+                default:
+                    return false;
+            }
+            break;
+        }
        default:
            return false;
    }
@ -1622,6 +1670,18 @@ static void ggml_webgpu_init_soft_max_pipeline(webgpu_context & webgpu_ctx) {
                                constants);
 }

+static void ggml_webgpu_init_neg_pipeline(webgpu_context & webgpu_ctx) {
+    ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->neg_pipeline, wgsl_neg_f32, "neg_f32",
+                                ggml_webgpu_wg_size_entry(webgpu_ctx->max_wg_size_x));
+    ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->neg_pipeline, wgsl_neg_f16, "neg_f16",
+                                ggml_webgpu_wg_size_entry(webgpu_ctx->max_wg_size_x));
+    ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->neg_ip_pipeline, wgsl_neg_in_place_f32, "neg_in_place_f32",
+                                ggml_webgpu_wg_size_entry(webgpu_ctx->max_wg_size_x));
+    ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->neg_ip_pipeline, wgsl_neg_in_place_f16, "neg_in_place_f16",
+                                ggml_webgpu_wg_size_entry(webgpu_ctx->max_wg_size_x));
+    
+}
+
 static ggml_backend_t ggml_backend_webgpu_device_init(ggml_backend_dev_t dev, const char * params) {
    GGML_UNUSED(params);

--- a/ggml/src/ggml-webgpu/wgsl-shaders/neg.wgsl
+++ b/ggml/src/ggml-webgpu/wgsl-shaders/neg.wgsl
@ -0,0 +1,41 @@
+#define(VARIANTS)
+
+[
+  {
+    "REPLS": {
+      "TYPE" : "f32",
+    }
+  },
+  {
+    "REPLS": {
+      "TYPE" : "f16",
+    }
+  }
+]
+
+#end(VARIANTS)
+
+#define(SHADER)
+
+enable f16;
+
+@group(0) @binding(0)
+var<storage, read_write> src: array<{{TYPE}}>;
+
+@group(0) @binding(1)
+var<storage, read_write> dst: array<{{TYPE}}>;
+
+@group(0) @binding(2)
+var<uniform> params: Params;
+
+
+override wg_size: u32;
+@compute @workgroup_size(wg_size)
+fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
+    if (gid.x < params.ne) {
+        dst[gid.x] = -src[gid.x];
+    }
+
+}
+
+#end(SHADER)
--- a/ggml/src/ggml-webgpu/wgsl-shaders/neg_in_place.wgsl
+++ b/ggml/src/ggml-webgpu/wgsl-shaders/neg_in_place.wgsl
@ -0,0 +1,38 @@
+#define(VARIANTS)
+
+[
+  {
+    "REPLS": {
+      "TYPE" : "f32",
+    }
+  },
+  {
+    "REPLS": {
+      "TYPE" : "f16",
+    }
+  }
+]
+
+#end(VARIANTS)
+
+#define(SHADER)
+
+enable f16;
+
+@group(0) @binding(0)
+var<storage, read_write> src: array<{{TYPE}}>;
+
+@group(0) @binding(1)
+var<uniform> params: Params;
+
+
+override wg_size: u32;
+@compute @workgroup_size(wg_size)
+fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
+    if (gid.x < params.ne) {
+        src[gid.x] = -src[gid.x];
+    }
+
+}
+
+#end(SHADER)