neg (f16 x f32 x in-place) builds and runs; haven't actually run a model that uses the neg kernel yet, though

2025-09-30 23:55:27 -07:00 · 2025-09-30 23:55:27 -07:00 · aa1c9b2f88
parent 5d8e6784e2
commit aa1c9b2f88
3 changed files with 139 additions and 0 deletions
--- a/ggml/src/ggml-webgpu/ggml-webgpu.cpp
+++ b/ggml/src/ggml-webgpu/ggml-webgpu.cpp
@ -144,6 +144,8 @@ struct webgpu_context_struct {
    wgpu::ComputePipeline glu_pipeline[7][2][2];       // glu-op, type, split
    wgpu::ComputePipeline scale_pipeline[2];           // inplace
    wgpu::ComputePipeline soft_max_pipeline[3][2][2];  // (no_mask, f32_mask, f16_mask), has_sink, inplace
    wgpu::ComputePipeline neg_pipeline;                
    wgpu::ComputePipeline neg_ip_pipeline;
    size_t memset_bytes_per_thread;
@ -992,6 +994,36 @@ static void ggml_webgpu_soft_max(webgpu_context & ctx,
                                          ggml_nrows(dst), ggml_op_name(dst->op));
 }
 // Enqueues an element-wise negation using the supplied compute pipeline.
 //
 //   ctx      - WebGPU backend context; supplies the maximum workgroup size.
 //   src      - input tensor, bound at binding 0.
 //   dst      - output tensor; its element count and op name drive the dispatch.
 //              Bound at binding 1 only when `in_place` is false.
 //   pipeline - pipeline to run; the caller picks the in-place or out-of-place one.
 //   in_place - when true, only `src` is bound.
 static void ggml_webgpu_neg(webgpu_context &        ctx,
                             ggml_tensor *           src,
                             ggml_tensor *           dst,
                             wgpu::ComputePipeline & pipeline,
                             bool                    in_place) {
    const int64_t n_elements = ggml_nelements(dst);

    // The only shader parameter is the element count, truncated to 32 bits.
    std::vector<uint32_t> params;
    params.push_back((uint32_t) n_elements);

    // Builds the bind group entry for one tensor at the given binding slot.
    const auto tensor_entry = [&ctx](uint32_t slot, ggml_tensor * tensor) {
        return wgpu::BindGroupEntry{ .binding = slot,
                                     .buffer  = ggml_webgpu_tensor_buf(tensor),
                                     .offset  = ggml_webgpu_tensor_align_offset(ctx, tensor),
                                     .size    = ggml_webgpu_tensor_binding_size(ctx, tensor) };
    };

    std::vector<wgpu::BindGroupEntry> entries;
    entries.push_back(tensor_entry(0, src));
    if (!in_place) {
        entries.push_back(tensor_entry(1, dst));
    }

    // One invocation per element: round the workgroup count up.
    const size_t   wg_size = ctx->max_wg_size_x;
    const uint32_t wg_x    = (n_elements + wg_size - 1) / wg_size;
    ggml_backend_webgpu_build_and_enqueue(ctx, pipeline, params, entries, wg_x, ggml_op_name(dst->op));
 }
 // Returns true if node has enqueued work into the queue, false otherwise
 static bool ggml_webgpu_encode_node(webgpu_context ctx, ggml_tensor * node) {
    if (ggml_is_empty(node)) {
@ -1060,6 +1092,22 @@ static bool ggml_webgpu_encode_node(webgpu_context ctx, ggml_tensor * node) {
        case GGML_OP_SCALE:
            ggml_webgpu_scale(ctx, src0, node);
            break;
        case GGML_OP_UNARY: {
            // Unary ops share one ggml op code; the concrete operator is stored on the
            // node, so dispatch on it here. Operators without a kernel report "no work
            // enqueued" by returning false.
            const ggml_unary_op unary_op = ggml_get_unary_op(node);
            switch (unary_op) {
                case GGML_UNARY_OP_NEG:
                    if (ggml_webgpu_tensor_equal(src0, node)) {
                        // Source and destination are the same tensor: negate in place.
                        ggml_webgpu_neg(ctx, src0, node, ctx->neg_ip_pipeline, true);
                    } else {
                        // The destination is the node itself. A unary op has a single
                        // source, so src1 must not be used as the output tensor.
                        ggml_webgpu_neg(ctx, src0, node, ctx->neg_pipeline, false);
                    }
                    break;
                default:
                    return false;
            }
            break;
        }
        default:
            return false;
    }
@ -1622,6 +1670,18 @@ static void ggml_webgpu_init_soft_max_pipeline(webgpu_context & webgpu_ctx) {
                                constants);
 }
 // Creates the compute pipelines for the NEG unary op: an out-of-place pipeline
 // (neg_pipeline) and an in-place pipeline (neg_ip_pipeline), each specialized with
 // the context's maximum workgroup size.
 //
 // NOTE(review): each member is targeted twice, first with the f32 shader and then
 // with the f16 shader. Assuming ggml_webgpu_create_pipeline assigns to the pipeline
 // it is given, the f16 pipeline replaces the f32 one, so only the f16 variants
 // remain. Supporting both types likely needs per-type storage (e.g. an array
 // indexed by type) and a type-aware lookup at dispatch - confirm.
 static void ggml_webgpu_init_neg_pipeline(webgpu_context & webgpu_ctx) {
    // Out-of-place variants (src -> dst).
    ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->neg_pipeline, wgsl_neg_f32, "neg_f32",
                                ggml_webgpu_wg_size_entry(webgpu_ctx->max_wg_size_x));
    ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->neg_pipeline, wgsl_neg_f16, "neg_f16",
                                ggml_webgpu_wg_size_entry(webgpu_ctx->max_wg_size_x));
    // In-place variants (src is overwritten).
    ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->neg_ip_pipeline, wgsl_neg_in_place_f32, "neg_in_place_f32",
                                ggml_webgpu_wg_size_entry(webgpu_ctx->max_wg_size_x));
    ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->neg_ip_pipeline, wgsl_neg_in_place_f16, "neg_in_place_f16",
                                ggml_webgpu_wg_size_entry(webgpu_ctx->max_wg_size_x));
 }
 static ggml_backend_t ggml_backend_webgpu_device_init(ggml_backend_dev_t dev, const char * params) {
    GGML_UNUSED(params);
--- a/ggml/src/ggml-webgpu/wgsl-shaders/neg.wgsl
+++ b/ggml/src/ggml-webgpu/wgsl-shaders/neg.wgsl
@ -0,0 +1,41 @@
 #define(VARIANTS)
 [
  {
    "REPLS": {
      "TYPE" : "f32",
    }
  },
  {
    "REPLS": {
      "TYPE" : "f16",
    }
  }
 ]
 #end(VARIANTS)
 #define(SHADER)
 // Out-of-place element-wise negation: dst[i] = -src[i] for i in [0, params.ne).
 // One variant is generated per element type listed in VARIANTS.
 enable f16;
 // Uniform parameters. The host side uploads a single u32: the element count.
 struct Params {
     ne: u32,
 };
@group(0) @binding(0)
 var<storage, read_write> src: array<{{TYPE}}>;
@group(0) @binding(1)
 var<storage, read_write> dst: array<{{TYPE}}>;
@group(0) @binding(2)
 var<uniform> params: Params;
 // Workgroup size is supplied by the host as a pipeline-overridable constant.
 override wg_size: u32;
@compute @workgroup_size(wg_size)
 fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
    // The dispatch is rounded up to whole workgroups; skip the excess invocations.
    if (gid.x < params.ne) {
        dst[gid.x] = -src[gid.x];
    }
 }
 #end(SHADER)
--- a/ggml/src/ggml-webgpu/wgsl-shaders/neg_in_place.wgsl
+++ b/ggml/src/ggml-webgpu/wgsl-shaders/neg_in_place.wgsl
@ -0,0 +1,38 @@
 #define(VARIANTS)
 [
  {
    "REPLS": {
      "TYPE" : "f32",
    }
  },
  {
    "REPLS": {
      "TYPE" : "f16",
    }
  }
 ]
 #end(VARIANTS)
 #define(SHADER)
 // In-place element-wise negation: src[i] = -src[i] for i in [0, params.ne).
 // One variant is generated per element type listed in VARIANTS.
 enable f16;
 // Uniform parameters. The host side uploads a single u32: the element count.
 struct Params {
     ne: u32,
 };
@group(0) @binding(0)
 var<storage, read_write> src: array<{{TYPE}}>;
@group(0) @binding(1)
 var<uniform> params: Params;
 // Workgroup size is supplied by the host as a pipeline-overridable constant.
 override wg_size: u32;
@compute @workgroup_size(wg_size)
 fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
    // The dispatch is rounded up to whole workgroups; skip the excess invocations.
    if (gid.x < params.ne) {
        src[gid.x] = -src[gid.x];
    }
 }
 #end(SHADER)