Merge 17c99108e6 into 9db77a020c

2026-04-16 15:50:41 +00:00 · 2026-04-16 15:50:41 +00:00 · a984d87457
parent 9db77a020c 17c99108e6
commit a984d87457
3 changed files with 338 additions and 11 deletions
--- a/ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp
+++ b/ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp
@ -248,6 +248,27 @@ struct ggml_webgpu_ssm_conv_pipeline_key {
    }
 };
 /** CONV 2D */
 struct ggml_webgpu_conv2d_pipeline_key {
    ggml_type weight_type;
    ggml_type input_type;
    ggml_type output_type;
    bool operator==(const ggml_webgpu_conv2d_pipeline_key & other) const {
        return weight_type == other.weight_type && input_type == other.input_type && output_type == other.output_type;
    }
 };
 struct ggml_webgpu_conv2d_pipeline_key_hash {
    size_t operator()(const ggml_webgpu_conv2d_pipeline_key & key) const {
        size_t seed = 0;
        ggml_webgpu_hash_combine(seed, key.weight_type);
        ggml_webgpu_hash_combine(seed, key.input_type);
        ggml_webgpu_hash_combine(seed, key.output_type);
        return seed;
    }
 };
 /** Gated Delta Net **/
 struct ggml_webgpu_gated_delta_net_pipeline_key {
    int type;
@ -831,6 +852,8 @@ class ggml_webgpu_shader_lib {
        rope_pipelines;
    std::unordered_map<ggml_webgpu_soft_max_pipeline_key, webgpu_pipeline, ggml_webgpu_soft_max_pipeline_key_hash>
        soft_max_pipelines;
    std::unordered_map<ggml_webgpu_conv2d_pipeline_key, webgpu_pipeline, ggml_webgpu_conv2d_pipeline_key_hash>
        conv2d_pipelines;
  public:
    ggml_webgpu_shader_lib(wgpu::Device device) { this->device = device; }
@ -1115,8 +1138,7 @@ class ggml_webgpu_shader_lib {
                    std::string type_upper = type_str;
                    std::transform(type_upper.begin(), type_upper.end(), type_upper.begin(), ::toupper);
-                    switch (key.src_type)
+                    switch (key.src_type) {
                    {
                        case GGML_TYPE_Q4_0:
                        case GGML_TYPE_Q5_0:
                        case GGML_TYPE_Q8_0:
@ -1136,9 +1158,9 @@ class ggml_webgpu_shader_lib {
                                break;
                            }
                        default:
-                        {
+                            {
-                            defines.push_back(std::string("SRC_TYPE=") + type_str);
+                                defines.push_back(std::string("SRC_TYPE=") + type_str);
-                        }
+                            }
                    }
                    defines.push_back("BYTE_HELPERS");
@ -1621,8 +1643,7 @@ class ggml_webgpu_shader_lib {
                    std::string type_upper = src0_name;
                    std::transform(type_upper.begin(), type_upper.end(), type_upper.begin(), ::toupper);
-                    switch (context.src0->type)
+                    switch (context.src0->type) {
                    {
                        case GGML_TYPE_Q4_0:
                        case GGML_TYPE_Q5_0:
                        case GGML_TYPE_Q8_0:
@ -1642,9 +1663,9 @@ class ggml_webgpu_shader_lib {
                                break;
                            }
                        default:
-                        {
+                            {
-                            defines.push_back(std::string("SRC0_TYPE=") + src0_name);
+                                defines.push_back(std::string("SRC0_TYPE=") + src0_name);
-                        }
+                            }
                    }
                    defines.push_back("BYTE_HELPERS");
@ -2340,6 +2361,47 @@ class ggml_webgpu_shader_lib {
        return soft_max_pipelines[key];
    }
    webgpu_pipeline get_conv2d_pipeline(const ggml_webgpu_shader_lib_context & context) {
        ggml_webgpu_conv2d_pipeline_key key = {
            .weight_type = context.src0->type,
            .input_type  = context.src1->type,
            .output_type = context.dst->type,
        };
        auto it = conv2d_pipelines.find(key);
        if (it != conv2d_pipelines.end()) {
            return it->second;
        }
        std::vector<std::string> defines;
        std::string              variant = "conv_2d";
        auto push_type_defines = [&](const char * prefix, ggml_type type) {
            std::string s_prefix = prefix;
            if (type == GGML_TYPE_F32) {
                defines.push_back(s_prefix + "_F32");
            } else if (type == GGML_TYPE_F16) {
                defines.push_back(s_prefix + "_F16");
            } else {
                GGML_ABORT("Unsupported type for CONV_2D shader");
            }
        };
        push_type_defines("WEIGHT", key.weight_type);
        push_type_defines("INPUT", key.input_type);
        push_type_defines("OUTPUT", key.output_type);
        defines.push_back(std::string("WG_SIZE=") + std::to_string(context.max_wg_size));
        auto processed           = preprocessor.preprocess(wgsl_conv2d, defines);
        auto decisions           = std::make_shared<ggml_webgpu_generic_shader_decisions>();
        decisions->wg_size       = context.max_wg_size;
        webgpu_pipeline pipeline = ggml_webgpu_create_pipeline(device, processed, variant);
        pipeline.context         = decisions;
        conv2d_pipelines[key]    = pipeline;
        return conv2d_pipelines[key];
    }
  private:
    static webgpu_pipeline ggml_webgpu_create_pipeline(wgpu::Device & device,
                                                       std::string    shader_code,
--- a/ggml/src/ggml-webgpu/ggml-webgpu.cpp
+++ b/ggml/src/ggml-webgpu/ggml-webgpu.cpp
@ -8,6 +8,7 @@
 #include "ggml-backend-impl.h"
 #include "ggml-impl.h"
 #include "ggml-webgpu-shader-lib.hpp"
 #include "ggml.h"
 #ifdef __EMSCRIPTEN__
 #    include <emscripten/emscripten.h>
@ -83,7 +84,7 @@ static inline void compute_2d_workgroups(uint32_t total_wg, uint32_t max_per_dim
 #define WEBGPU_NUM_PARAM_SLOT_SAFETY_MARGIN      10u
 #define WEBGPU_RUNTIME_WAIT_TIMEOUT_MS           30000u
 #define WEBGPU_RUNTIME_WAIT_TIMEOUT_NS           (WEBGPU_RUNTIME_WAIT_TIMEOUT_MS * 1e6)
-#define WEBGPU_PARAMS_BUF_SIZE_BYTES             128  // enough for 32 parameters
+#define WEBGPU_PARAMS_BUF_SIZE_BYTES             256  // enough for 64 parameters
 #define WEBGPU_SET_ROWS_ERROR_BUF_SIZE_BYTES     4
 #define WEBGPU_STORAGE_BUF_BINDING_MULT          4    // a storage buffer binding size must be a multiple of 4
@ -907,6 +908,97 @@ static webgpu_encoded_op ggml_webgpu_solve_tri(webgpu_context & ctx,
    return ggml_backend_webgpu_build(ctx, pipeline, params, entries, wg_x, wg_y);
 }
 static webgpu_encoded_op ggml_webgpu_conv_2d(webgpu_context & ctx,
                                             ggml_tensor *    src0,
                                             ggml_tensor *    src1,
                                             ggml_tensor *    dst) {
    const int32_t s0 = ggml_get_op_params_i32(dst, 0);
    const int32_t s1 = ggml_get_op_params_i32(dst, 1);
    const int32_t p0 = ggml_get_op_params_i32(dst, 2);
    const int32_t p1 = ggml_get_op_params_i32(dst, 3);
    const int32_t d0 = ggml_get_op_params_i32(dst, 4);
    const int32_t d1 = ggml_get_op_params_i32(dst, 5);
    std::vector<uint32_t> params = {
        (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, src0) / ggml_type_size(src0->type)),
        (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, src1) / ggml_type_size(src1->type)),
        (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, dst) / ggml_type_size(dst->type)),
        (uint32_t) (src0->nb[0] / ggml_type_size(src0->type)),
        (uint32_t) (src0->nb[1] / ggml_type_size(src0->type)),
        (uint32_t) (src0->nb[2] / ggml_type_size(src0->type)),
        (uint32_t) (src0->nb[3] / ggml_type_size(src0->type)),
        (uint32_t) (src1->nb[0] / ggml_type_size(src1->type)),
        (uint32_t) (src1->nb[1] / ggml_type_size(src1->type)),
        (uint32_t) (src1->nb[2] / ggml_type_size(src1->type)),
        (uint32_t) (src1->nb[3] / ggml_type_size(src1->type)),
        (uint32_t) (dst->nb[0] / ggml_type_size(dst->type)),
        (uint32_t) (dst->nb[1] / ggml_type_size(dst->type)),
        (uint32_t) (dst->nb[2] / ggml_type_size(dst->type)),
        (uint32_t) (dst->nb[3] / ggml_type_size(dst->type)),
        (uint32_t) src0->ne[0],
        (uint32_t) src0->ne[1],
        (uint32_t) src0->ne[2],
        (uint32_t) src1->ne[0],
        (uint32_t) src1->ne[1],
        (uint32_t) dst->ne[0],
        (uint32_t) dst->ne[1],
        (uint32_t) dst->ne[2],
        (uint32_t) dst->ne[3],
        (uint32_t) s0,
        (uint32_t) s1,
        (uint32_t) p0,
        (uint32_t) p1,
        (uint32_t) d0,
        (uint32_t) d1,
    };
    std::vector<wgpu::BindGroupEntry> entries = {
        { .binding = 0,
         .buffer  = ggml_webgpu_tensor_buf(src0),
         .offset  = ggml_webgpu_tensor_align_offset(ctx, src0),
         .size    = ggml_webgpu_tensor_binding_size(ctx, src0) },
        { .binding = 1,
         .buffer  = ggml_webgpu_tensor_buf(src1),
         .offset  = ggml_webgpu_tensor_align_offset(ctx, src1),
         .size    = ggml_webgpu_tensor_binding_size(ctx, src1) },
        { .binding = 2,
         .buffer  = ggml_webgpu_tensor_buf(dst),
         .offset  = ggml_webgpu_tensor_align_offset(ctx, dst),
         .size    = ggml_webgpu_tensor_binding_size(ctx, dst)  },
    };
    uint32_t max_wg_size =
        std::min((uint32_t) WEBGPU_MAX_WG_SIZE, ctx->global_ctx->capabilities.limits.maxComputeWorkgroupSizeX);
    uint32_t wg_size =
        std::min((uint32_t) ctx->global_ctx->capabilities.limits.maxComputeInvocationsPerWorkgroup, max_wg_size);
    ggml_webgpu_shader_lib_context shader_lib_ctx = {
        .src0        = src0,
        .src1        = src1,
        .dst         = dst,
        .max_wg_size = wg_size,
    };
    webgpu_pipeline pipeline = ctx->shader_lib->get_conv2d_pipeline(shader_lib_ctx);
    auto * decisions = static_cast<ggml_webgpu_generic_shader_decisions *>(pipeline.context.get());
    uint32_t n_out    = ggml_nelements(dst);
    uint32_t total_wg = CEIL_DIV(n_out, decisions->wg_size);
    uint32_t max_wg   = ctx->global_ctx->capabilities.limits.maxComputeWorkgroupsPerDimension;
    uint32_t wg_x     = std::min(total_wg, max_wg);
    uint32_t wg_y     = CEIL_DIV(total_wg, wg_x);
    return ggml_backend_webgpu_build(ctx, pipeline, params, entries, wg_x, wg_y);
 }
 static webgpu_encoded_op ggml_webgpu_ssm_conv(webgpu_context & ctx,
                                              ggml_tensor *    src0,
                                              ggml_tensor *    src1,
@ -2753,6 +2845,8 @@ static std::optional<webgpu_encoded_op> ggml_webgpu_encode_node(webgpu_context c
        case GGML_OP_SUM:
        case GGML_OP_SUM_ROWS:
            return ggml_webgpu_sum_rows(ctx, src0, node);
        case GGML_OP_CONV_2D:
            return ggml_webgpu_conv_2d(ctx, src0, src1, node);
        default:
            return std::nullopt;
    }
@ -3781,6 +3875,11 @@ static bool ggml_backend_webgpu_device_supports_op(ggml_backend_dev_t dev, const
        case GGML_OP_SOLVE_TRI:
            supports_op = op->type == GGML_TYPE_F32 && src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32;
            break;
        case GGML_OP_CONV_2D:
            supports_op = (op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_F16) &&
                          (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16) &&
                          (src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16);
            break;
        case GGML_OP_SSM_CONV:
            supports_op = op->type == GGML_TYPE_F32;
            break;
--- a/ggml/src/ggml-webgpu/wgsl-shaders/conv2d.wgsl
+++ b/ggml/src/ggml-webgpu/wgsl-shaders/conv2d.wgsl
@ -0,0 +1,166 @@
 #include "common_decls.tmpl"
 enable f16;
@group(0) @binding(0)
 #if defined(WEIGHT_F32)
 var<storage, read_write> weights: array<f32>;
 #elif defined(WEIGHT_F16)
 var<storage, read_write> weights: array<f16>;
 #endif
@group(0) @binding(1)
 #if defined(INPUT_F32)
 var<storage, read_write> input: array<f32>;
 #elif defined(INPUT_F16)
 var<storage, read_write> input: array<f16>;
 #endif
@group(0) @binding(2)
 #if defined(OUTPUT_F32)
 var<storage, read_write> output: array<f32>;
 #elif defined(OUTPUT_F16)
 var<storage, read_write> output: array<f16>;
 #endif
 struct Params {
    offset_w: u32,
    offset_i: u32,
    offset_o: u32,
    // element strides
    sw0: u32, sw1: u32, sw2: u32, sw3: u32,
    si0: u32, si1: u32, si2: u32, si3: u32,
    so0: u32, so1: u32, so2: u32, so3: u32,
    // kernel dimensions
    KW: u32, KH: u32, IC: u32,
    // input dimensions
    IW: u32, IH: u32,
    // output dimensions
    OW: u32, OH: u32, OC_out: u32, N_out: u32,
    // stride
    s0: u32, s1: u32,
    // padding
    p0: u32, p1: u32,
    // dilation
    d0: u32, d1: u32,
 };
@group(0) @binding(3)
 var<uniform> params: Params;
 fn load_weight(idx: u32) -> f32 {
    #if defined(WEIGHT_F32)
        return weights[idx];
    #elif defined(WEIGHT_F16)
        return f32(weights[idx]);
    #endif
 }
 fn load_input(idx: u32) -> f32 {
    #if defined(INPUT_F32)
        return input[idx];
    #elif defined(INPUT_F16)
        return f32(input[idx]);
    #endif
 }
 fn store_output(idx: u32, val: f32) {
    #if defined(OUTPUT_F32)
        output[idx] = val;
    #elif defined(OUTPUT_F16)
        output[idx] = f16(val);
    #endif
 }
 fn ceil_div_u32(x: u32, y: u32) -> u32 {
    return (x + y - 1) / y;
 }   
 // returns the first valid kernel index k such that base + k * step >= 0
 fn first_valid_k(base: i32, step: u32) -> u32 {
    if (base >= 0) {
        return 0;
    }
    return ceil_div_u32(u32(-base), step);
 }
 // returns the first invalid kernel index k such that base + k * step >= limit so valid k are in [0, end_valid_k)
 fn end_valid_k(base: i32, step: u32, limit: u32, k_max: u32) -> u32 {
    let remaining = i32(limit) - base;
    if (remaining <= 0) {
        return 0;
    }
    return min(k_max, ceil_div_u32(u32(remaining), step));
 }
@compute @workgroup_size(WG_SIZE)
 fn main(
    @builtin(global_invocation_id) gid: vec3<u32>,
    @builtin(num_workgroups) num_wg: vec3<u32>
 ) {
    let threads_per_group = u32(WG_SIZE);
    let i_out = gid.x + (num_wg.x * threads_per_group) * gid.y;
    let n_out = params.OW * params.OH * params.OC_out * params.N_out;
    var sum: f32 = 0.0;
    if (i_out >= n_out) {
        return;
    }
    // Kernel layout: [KW, KH, IC, ..]
    // Input layout:  [IW, IH, .., ..]
    // Output layout: [OW, OH, OC, N]
    var i = i_out;
    let n = i / (params.OC_out * params.OH * params.OW);
    i = i % (params.OC_out * params.OH * params.OW);
    let oc = i / (params.OH * params.OW);
    i = i % (params.OH * params.OW);
    let oh = i / params.OW;
    let ow = i % params.OW;
    let ow_base = i32(ow * params.s0) - i32(params.p0);
    let oh_base = i32(oh * params.s1) - i32(params.p1);
    // clip the valid kernel window once
    let kw_begin = first_valid_k(ow_base, params.d0);
    let kw_end = end_valid_k(ow_base, params.d0, params.IW, params.KW);
    let kh_begin = first_valid_k(oh_base, params.d1);
    let kh_end = end_valid_k(oh_base, params.d1, params.IH, params.KH);
    // entire receptive field is out of bounds
    if (kw_begin >= kw_end || kh_begin >= kh_end) {
        let out_idx = params.offset_o + ow * params.so0 + oh * params.so1 + oc * params.so2 + n * params.so3;
        store_output(out_idx, 0.0);
        return;
    }
    let weight_oc_base = params.offset_w + oc * params.sw3;
    let input_n_base = params.offset_i + n * params.si3;
    for (var ic: u32 = 0; ic < params.IC; ic += 1) {
        let w_base_ic = ic * params.sw2 + weight_oc_base;
        let in_base = ic * params.si2 + input_n_base;
       for (var kh: u32 = kh_begin; kh < kh_end; kh += 1)  {
        let ih = u32(oh_base + i32(kh * params.d1));
        let w_row_base = w_base_ic + kh * params.sw1;
        let in_row_base = in_base + ih * params.si1;
        for (var kw: u32 = kw_begin; kw < kw_end; kw += 1) {
            let iw = u32(ow_base + i32(kw * params.d0));
            let w_idx = w_row_base + kw * params.sw0;
            let in_idx = in_row_base + iw * params.si0;
            sum += load_weight(w_idx) * load_input(in_idx);
        }
       }
    }
    let out_idx = params.offset_o + ow * params.so0 + oh * params.so1 + oc * params.so2 + n * params.so3;
    store_output(out_idx, sum);
 }