From 30ba139e5b973f056ff8c51347d4715ab0e3df93 Mon Sep 17 00:00:00 2001
From: Reese Levine <reeselevine1@gmail.com>
Date: Wed, 30 Jul 2025 12:33:06 -0700
Subject: [PATCH 01/17] Add parameter buffer pool, batching of submissions,
 refactor command building/submission

---
 ggml/src/ggml-webgpu/ggml-webgpu.cpp | 684 +++++++++++++--------------
 1 file changed, 337 insertions(+), 347 deletions(-)
diff --git a/ggml/src/ggml-webgpu/ggml-webgpu.cpp b/ggml/src/ggml-webgpu/ggml-webgpu.cpp
index c5abc69343..e35c865ea7 100644
--- a/ggml/src/ggml-webgpu/ggml-webgpu.cpp
+++ b/ggml/src/ggml-webgpu/ggml-webgpu.cpp
@@ -8,7 +8,6 @@
 #include "ggml-wgsl-shaders.hpp"
 
 #include <cstring>
-#include <iostream>
 #include <mutex>
 #include <vector>
 
@@ -20,26 +19,78 @@
 
 /* Constants */
 
+#define WEBGPU_COMMAND_SUBMIT_BATCH_SIZE 16
 #define WEBGPU_MUL_MAT_WG_SIZE 64
-#define WEBGPU_MUL_MAT_PARAMS_SIZE (13 * sizeof(uint32_t)) // M, N, K, batch sizes, broadcasts
-#define WEBGPU_CPY_PARAMS_SIZE (15 * sizeof(uint32_t)) // strides and offsets
+#define WEBGPU_NUM_PARAM_BUFS 100
+#define WEBGPU_PARAMS_BUF_SIZE_BYTES 256
 #define WEBGPU_STORAGE_BUF_BINDING_MULT 4 // a storage buffer binding size must be a multiple of 4
 
 /* End Constants */
 
 // This is a "fake" base pointer, since WebGPU buffers do not have pointers to their locations.
-static void * const webgpu_ptr_base = (void *)(uintptr_t) 0x1000;  // NOLINT
+static void* const webgpu_ptr_base = (void*)(uintptr_t)0x1000;  // NOLINT
 
 // Always returns the base offset of a tensor, regardless of views.
-static uint64_t webgpu_tensor_offset(const ggml_tensor * tensor) {
+static uint64_t webgpu_tensor_offset(const ggml_tensor* tensor) {
     if (tensor->view_src) {
-        return (uint8_t *) tensor->view_src->data - (uint8_t *) webgpu_ptr_base;
+        return (uint8_t*)tensor->view_src->data - (uint8_t*)webgpu_ptr_base;
     }
-    return (uint8_t *) tensor->data - (uint8_t *) webgpu_ptr_base;
+    return (uint8_t*)tensor->data - (uint8_t*)webgpu_ptr_base;
 }
 
 /* Struct definitions */
 
+// Forward reference
+static void ggml_webgpu_create_buffer(wgpu::Device& device, wgpu::Buffer& buffer, size_t size, wgpu::BufferUsage usage, const char* label);
+
+struct webgpu_param_bufs {
+    wgpu::Buffer host_buf;
+    wgpu::Buffer dev_buf;
+};
+
+// Holds a pool of parameter buffers for WebGPU operations
+struct webgpu_param_buf_pool {
+    std::vector<webgpu_param_bufs> free;
+
+    std::mutex mutex;
+    std::condition_variable cv;
+
+    void init(wgpu::Device device) {
+        for (int i = 0; i < WEBGPU_NUM_PARAM_BUFS; i++) {
+            wgpu::Buffer host_buf;
+            wgpu::Buffer dev_buf;
+            ggml_webgpu_create_buffer(device, host_buf, WEBGPU_PARAMS_BUF_SIZE_BYTES, wgpu::BufferUsage::CopySrc | wgpu::BufferUsage::MapWrite, "ggml_webgpu_host_params_buf");
+            ggml_webgpu_create_buffer(device, dev_buf, WEBGPU_PARAMS_BUF_SIZE_BYTES, wgpu::BufferUsage::CopyDst | wgpu::BufferUsage::Uniform, "ggml_webgpu_dev_params_buf");
+            free.push_back({ host_buf, dev_buf });
+        }
+    }
+
+    webgpu_param_bufs alloc_bufs() {
+        std::unique_lock<std::mutex> lock(mutex);
+        cv.wait(lock, [this] {
+            return !free.empty();
+            });
+        webgpu_param_bufs bufs = free.back();
+        free.pop_back();
+        return bufs;
+    }
+
+    void free_bufs(const webgpu_param_bufs& bufs) {
+        std::lock_guard<std::mutex> lock(mutex);
+        free.push_back(bufs);
+        cv.notify_one();
+    }
+
+    void cleanup() {
+        std::lock_guard<std::mutex> lock(mutex);
+        for (auto& bufs : free) {
+            bufs.host_buf.Destroy();
+            bufs.dev_buf.Destroy();
+        }
+        free.clear();
+    }
+};
+
 // All the base objects needed to run operations on a WebGPU device
 struct webgpu_context_struct {
     wgpu::Instance instance;
@@ -49,25 +100,27 @@ struct webgpu_context_struct {
     wgpu::Limits limits;
     wgpu::SupportedFeatures features;
 
-    std::mutex mutex;
-    bool device_initialized = false;
+    std::recursive_mutex submit_mutex;
+    std::mutex get_tensor_mutex;
+    std::mutex init_mutex;
+    bool device_init = false;
+
+    // Parameter buffer pool
+    webgpu_param_buf_pool param_buf_pool;
 
-    // pipelines and parameter buffers
-    // TODO: reuse params buffers for different pipelines when possible
     wgpu::ComputePipeline memset_pipeline;
-    wgpu::Buffer memset_params_dev_buf;
-    wgpu::Buffer memset_params_host_buf;
     wgpu::ComputePipeline mul_mat_pipeline;
-    wgpu::Buffer mul_mat_params_dev_buf;
-    wgpu::Buffer mul_mat_params_host_buf;
     wgpu::ComputePipeline cpy_pipeline;
-    wgpu::Buffer cpy_params_dev_buf;
-    wgpu::Buffer cpy_params_host_buf;
 
     size_t memset_bytes_per_thread;
 
     // Staging buffer for reading data from the GPU
     wgpu::Buffer get_tensor_staging_buf;
+
+    // Command buffers which need to be submitted
+    std::vector<wgpu::CommandBuffer> staged_command_bufs;
+    // Parameter buffers associated with the staged command buffers
+    std::vector<webgpu_param_bufs> staged_param_bufs;
 };
 
 typedef std::shared_ptr<webgpu_context_struct> webgpu_context;
@@ -76,7 +129,7 @@ struct ggml_backend_webgpu_reg_context {
     webgpu_context webgpu_ctx;
 
     size_t device_count;
-    const char * name;
+    const char* name;
 };
 
 struct ggml_backend_webgpu_device_context {
@@ -98,7 +151,7 @@ struct ggml_backend_webgpu_buffer_context {
     wgpu::Buffer buffer;
 
     ggml_backend_webgpu_buffer_context(webgpu_context ctx, wgpu::Buffer buf) :
-        webgpu_ctx(ctx), buffer(buf) {
+        webgpu_ctx(std::move(ctx)), buffer(std::move(buf)) {
     }
 };
 
@@ -106,7 +159,7 @@ struct ggml_backend_webgpu_buffer_context {
 
 /* WebGPU object initializations */
 
-static void ggml_webgpu_create_pipeline(wgpu::Device &device, wgpu::ComputePipeline &pipeline, const char * shader_code, const char * label, const std::vector<wgpu::ConstantEntry> &constants = {}) {
+static void ggml_webgpu_create_pipeline(wgpu::Device& device, wgpu::ComputePipeline& pipeline, const char* shader_code, const char* label, const std::vector<wgpu::ConstantEntry>& constants = {}) {
     WEBGPU_LOG_DEBUG("ggml_webgpu_create_pipeline()");
     wgpu::ShaderSourceWGSL shader_source;
     shader_source.code = shader_code;
@@ -126,7 +179,7 @@ static void ggml_webgpu_create_pipeline(wgpu::Device &device, wgpu::ComputePipel
     pipeline = device.CreateComputePipeline(&pipeline_desc);
 }
 
-static void ggml_webgpu_create_buffer(wgpu::Device &device, wgpu::Buffer &buffer, size_t size, wgpu::BufferUsage usage, const char* label) {
+static void ggml_webgpu_create_buffer(wgpu::Device& device, wgpu::Buffer& buffer, size_t size, wgpu::BufferUsage usage, const char* label) {
     WEBGPU_LOG_DEBUG("ggml_webgpu_create_buffer()");
 
     wgpu::BufferDescriptor buffer_desc;
@@ -142,68 +195,9 @@ static void ggml_webgpu_create_buffer(wgpu::Device &device, wgpu::Buffer &buffer
 
 /** WebGPU Actions */
 
-static void ggml_backend_webgpu_map_buffer(webgpu_context ctx, wgpu::Buffer buffer, wgpu::MapMode mode, size_t offset, size_t size) {
-    ctx->instance.WaitAny(buffer.MapAsync(
-        mode, offset, size, wgpu::CallbackMode::WaitAnyOnly,
-        [](wgpu::MapAsyncStatus status, wgpu::StringView message) {
-            if (status != wgpu::MapAsyncStatus::Success) {
-                GGML_LOG_ERROR("ggml_webgpu: Failed to map buffer: %s\n", message.data);
-            }
-        }),
-        UINT64_MAX
-    );
-}
-
-static void ggml_backend_webgpu_buffer_memset(webgpu_context ctx, wgpu::Buffer buf, uint32_t value, size_t offset, size_t size) {
-    std::lock_guard<std::mutex> lock(ctx->mutex);
-    wgpu::Device device = ctx->device;
-
-    // map the host parameters buffer
-    ggml_backend_webgpu_map_buffer(ctx, ctx->memset_params_host_buf, wgpu::MapMode::Write, 0, ctx->memset_params_host_buf.GetSize());
-    uint32_t * params = (uint32_t *) ctx->memset_params_host_buf.GetMappedRange();
-
-    params[0] = (uint32_t)offset;
-    params[1] = (uint32_t)size;
-    params[2] = value;
-    ctx->memset_params_host_buf.Unmap();
-
-    wgpu::BindGroupEntry entries[2];
-    entries[0].binding = 0; // binding for the buffer to memset
-    entries[0].buffer = buf;
-    entries[0].offset = 0;
-    entries[0].size = buf.GetSize();
-    entries[1].binding = 1; // binding for the parameters
-    entries[1].buffer = ctx->memset_params_dev_buf;
-    entries[1].offset = 0;
-    entries[1].size = ctx->memset_params_dev_buf.GetSize();
-
-    wgpu::BindGroupDescriptor bind_group_desc;
-    bind_group_desc.layout = ctx->memset_pipeline.GetBindGroupLayout(0);
-    bind_group_desc.entryCount = 2;
-    bind_group_desc.label = "ggml_memset";
-    bind_group_desc.entries = entries;
-    wgpu::BindGroup bind_group = device.CreateBindGroup(&bind_group_desc);
-
-    wgpu::CommandEncoder encoder = device.CreateCommandEncoder();
-    encoder.CopyBufferToBuffer(
-        ctx->memset_params_host_buf, 0,
-        ctx->memset_params_dev_buf, 0,
-        ctx->memset_params_dev_buf.GetSize()
-    );
-    wgpu::ComputePassEncoder pass = encoder.BeginComputePass();
-    pass.SetPipeline(ctx->memset_pipeline);
-    pass.SetBindGroup(0, bind_group);
-    size_t bytes_per_wg = ctx->limits.maxComputeWorkgroupSizeX * ctx->memset_bytes_per_thread;
-    pass.DispatchWorkgroups(((size + 3) + bytes_per_wg - 1) / bytes_per_wg, 1, 1);
-    pass.End();
-    wgpu::CommandBuffer commands = encoder.Finish();
-
-    ctx->queue.Submit(1, &commands);
-}
-
-static void ggml_backend_webgpu_wait_on_submission(webgpu_context ctx) {
+static void ggml_backend_webgpu_wait_on_submission(webgpu_context& ctx) {
     // Wait for the queue to finish processing all commands
-    ctx->instance.WaitAny(ctx->queue.OnSubmittedWorkDone(wgpu::CallbackMode::WaitAnyOnly,
+    ctx->instance.WaitAny(ctx->queue.OnSubmittedWorkDone(wgpu::CallbackMode::AllowSpontaneous,
         [](wgpu::QueueWorkDoneStatus status, wgpu::StringView message) {
             if (status != wgpu::QueueWorkDoneStatus::Success) {
                 GGML_LOG_ERROR("ggml_webgpu: Failed to wait on queue: %s\n", message.data);
@@ -213,223 +207,233 @@ static void ggml_backend_webgpu_wait_on_submission(webgpu_context ctx) {
     );
 }
 
+static void ggml_backend_webgpu_submit_queue(webgpu_context& ctx) {
+    std::lock_guard<std::recursive_mutex> lock(ctx->submit_mutex);
+
+    ctx->queue.Submit(ctx->staged_command_bufs.size(), ctx->staged_command_bufs.data());
+    ctx->staged_command_bufs.clear();
+    std::vector<webgpu_param_bufs> staged_param_bufs = std::move(ctx->staged_param_bufs);
+    // Free the staged parameter buffers once the submission completes
+    ctx->queue.OnSubmittedWorkDone(
+        wgpu::CallbackMode::AllowSpontaneous,
+        [ctx, staged_param_bufs](wgpu::QueueWorkDoneStatus status, wgpu::StringView message) {
+            if (status != wgpu::QueueWorkDoneStatus::Success) {
+                GGML_LOG_ERROR("ggml_webgpu: Failed to submit commands: %s\n", message.data);
+            }
+            // Free the staged parameter buffers
+            for (const auto& bufs : staged_param_bufs) {
+                ctx->param_buf_pool.free_bufs(bufs);
+            }
+        });
+}
+
+static void ggml_backend_webgpu_map_buffer(webgpu_context& ctx, wgpu::Buffer& buffer, wgpu::MapMode mode, size_t offset, size_t size) {
+    ctx->instance.WaitAny(buffer.MapAsync(
+        mode, offset, size, wgpu::CallbackMode::AllowSpontaneous,
+        [](wgpu::MapAsyncStatus status, wgpu::StringView message) {
+            if (status != wgpu::MapAsyncStatus::Success) {
+                GGML_LOG_ERROR("ggml_webgpu: Failed to map buffer: %s\n", message.data);
+            }
+        }),
+        UINT64_MAX
+    );
+}
+
+static void ggml_backend_webgpu_build_and_enqueue(webgpu_context& ctx, wgpu::ComputePipeline& pipeline, std::vector<uint32_t> params, std::vector<wgpu::BindGroupEntry> bind_group_entries, uint32_t wg_x, bool submit_imm = false) {
+    webgpu_param_bufs params_bufs = ctx->param_buf_pool.alloc_bufs();
+
+    ggml_backend_webgpu_map_buffer(ctx, params_bufs.host_buf,
+        wgpu::MapMode::Write, 0, params_bufs.host_buf.GetSize());
+    uint32_t* _params = (uint32_t*)params_bufs.host_buf.GetMappedRange();
+    for (size_t i = 0; i < params.size(); i++) {
+        _params[i] = params[i];
+    };
+
+    params_bufs.host_buf.Unmap();
+
+    uint32_t params_bufs_binding_num = bind_group_entries.size();
+    bind_group_entries.push_back({
+        .binding = params_bufs_binding_num,
+        .buffer = params_bufs.dev_buf,
+        .offset = 0,
+        .size = params_bufs.dev_buf.GetSize()
+    });
+
+    wgpu::BindGroupDescriptor bind_group_desc;
+    bind_group_desc.layout = pipeline.GetBindGroupLayout(0);
+    bind_group_desc.entryCount = bind_group_entries.size();
+    bind_group_desc.entries = bind_group_entries.data();
+    wgpu::BindGroup bind_group = ctx->device.CreateBindGroup(&bind_group_desc);
+
+    wgpu::CommandEncoder encoder = ctx->device.CreateCommandEncoder();
+    encoder.CopyBufferToBuffer(
+        params_bufs.host_buf, 0,
+        params_bufs.dev_buf, 0,
+        params_bufs.dev_buf.GetSize()
+    );
+    wgpu::ComputePassEncoder pass = encoder.BeginComputePass();
+    pass.SetPipeline(pipeline);
+    pass.SetBindGroup(0, bind_group);
+    pass.DispatchWorkgroups(wg_x, 1, 1);
+    pass.End();
+    wgpu::CommandBuffer commands =  encoder.Finish();
+    if (submit_imm) {
+        // Submit immediately
+        ctx->queue.Submit(1, &commands);
+        ctx->queue.OnSubmittedWorkDone(
+            wgpu::CallbackMode::AllowSpontaneous,
+            [ctx, params_bufs](wgpu::QueueWorkDoneStatus status, wgpu::StringView message) {
+                if (status != wgpu::QueueWorkDoneStatus::Success) {
+                    GGML_LOG_ERROR("ggml_webgpu: Failed to submit commands: %s\n", message.data);
+                }
+                ctx->param_buf_pool.free_bufs(params_bufs);
+        });
+    } else {
+        // Enqueue commands and only submit if we have enough staged commands
+        std::lock_guard<std::recursive_mutex> lock(ctx->submit_mutex);
+        ctx->staged_command_bufs.push_back(commands);
+        ctx->staged_param_bufs.push_back(params_bufs);
+        if (ctx->staged_command_bufs.size() == WEBGPU_COMMAND_SUBMIT_BATCH_SIZE) {
+            ggml_backend_webgpu_submit_queue(ctx);
+        }
+    }
+}
+
+static void ggml_backend_webgpu_buffer_memset(webgpu_context& ctx, wgpu::Buffer& buf, uint32_t value, size_t offset, size_t size) {
+    std::vector<uint32_t> params = {(uint32_t)offset, (uint32_t)size, value};
+    std::vector<wgpu::BindGroupEntry> entries = {{ .binding = 0, .buffer = buf, .offset = 0, .size = buf.GetSize() }};
+    size_t bytes_per_wg = ctx->limits.maxComputeWorkgroupSizeX * ctx->memset_bytes_per_thread;
+    uint32_t wg_x = ((size + 3) + bytes_per_wg - 1) / bytes_per_wg;
+    ggml_backend_webgpu_build_and_enqueue(ctx, ctx->memset_pipeline, params, entries, wg_x, true);
+}
+
+static size_t ggml_backend_webgpu_tensor_offset(const ggml_tensor* tensor) {
+    return webgpu_tensor_offset(tensor) + tensor->view_offs;
+}
+
+static wgpu::Buffer ggml_backend_webgpu_tensor_buf(const ggml_tensor* tensor) {
+    ggml_backend_webgpu_buffer_context* ctx = (ggml_backend_webgpu_buffer_context*)tensor->buffer->context;
+    return ctx->buffer;
+}
+
 /** End WebGPU Actions */
 
 /** GGML Backend Interface */
 
-static const char * ggml_backend_webgpu_name(ggml_backend_t backend) {
-    ggml_backend_webgpu_context * ctx = (ggml_backend_webgpu_context *)backend->context;
+static const char* ggml_backend_webgpu_name(ggml_backend_t backend) {
+    ggml_backend_webgpu_context* ctx = (ggml_backend_webgpu_context*)backend->context;
     return ctx->name.c_str();
 }
 
 static void ggml_backend_webgpu_free(ggml_backend_t backend) {
-    ggml_backend_webgpu_context * ctx = (ggml_backend_webgpu_context *)backend->context;
+    ggml_backend_webgpu_context* ctx = (ggml_backend_webgpu_context*)backend->context;
     WEBGPU_LOG_DEBUG("ggml_backend_webgpu_free(" << ctx->name << ")");
 
     // TODO: cleanup
     GGML_UNUSED(ctx);
 }
 
+static void ggml_webgpu_cpy(webgpu_context& ctx, ggml_tensor* src, ggml_tensor* dst) {
+    size_t src_offset = ggml_backend_webgpu_tensor_offset(src);
+    // assumes power of 2 offset alignment
+    size_t src_misalignment = src_offset & (ctx->limits.minStorageBufferOffsetAlignment - 1);
+    // align to minimum offset alignment
+    src_offset &= ~(ctx->limits.minStorageBufferOffsetAlignment - 1);
+    size_t dst_offset = ggml_backend_webgpu_tensor_offset(dst);
+    size_t dst_misalignment = dst_offset & (ctx->limits.minStorageBufferOffsetAlignment - 1);
+    dst_offset &= ~(ctx->limits.minStorageBufferOffsetAlignment - 1);
+    uint32_t ne = (uint32_t)ggml_nelements(dst);
+    std::vector<uint32_t> params = {
+        ne, (uint32_t)(src_misalignment / ggml_type_size(src->type)), (uint32_t)(dst_misalignment / ggml_type_size(dst->type)),
+        // Convert byte-strides to element-strides
+        (uint32_t)(src->nb[0] / ggml_type_size(src->type)), (uint32_t)(src->nb[1] / ggml_type_size(src->type)),
+        (uint32_t)(src->nb[2] / ggml_type_size(src->type)), (uint32_t)(src->nb[3] / ggml_type_size(src->type)),
+        (uint32_t)(dst->nb[0] / ggml_type_size(dst->type)), (uint32_t)(dst->nb[1] / ggml_type_size(dst->type)),
+        (uint32_t)(dst->nb[2] / ggml_type_size(dst->type)), (uint32_t)(dst->nb[3] / ggml_type_size(dst->type)),
+        // Logical shape — same for both tensors even if permuted
+        (uint32_t)src->ne[0], (uint32_t)src->ne[1], (uint32_t)src->ne[2], (uint32_t)src->ne[3]
+    };
+
+    std::vector<wgpu::BindGroupEntry> entries = {
+        { .binding = 0, .buffer = ggml_backend_webgpu_tensor_buf(src), .offset = src_offset, .size = (ggml_nbytes(src) + src_misalignment + WEBGPU_STORAGE_BUF_BINDING_MULT - 1) & ~(WEBGPU_STORAGE_BUF_BINDING_MULT - 1) },
+        { .binding = 1, .buffer = ggml_backend_webgpu_tensor_buf(dst), .offset = dst_offset, .size = (ggml_nbytes(dst) + dst_misalignment + WEBGPU_STORAGE_BUF_BINDING_MULT - 1) & ~(WEBGPU_STORAGE_BUF_BINDING_MULT - 1) }
+    };
+
+    size_t max_wg_size = ctx->limits.maxComputeWorkgroupSizeX;
+    uint32_t wg_x = (ne + max_wg_size - 1) / max_wg_size;
+    ggml_backend_webgpu_build_and_enqueue(ctx, ctx->cpy_pipeline, params, entries, wg_x);
+}
+
+static void ggml_webgpu_mul_mat(webgpu_context& ctx, ggml_tensor* src0, ggml_tensor* src1, ggml_tensor* dst) {
+    std::vector<uint32_t> params = {
+        (uint32_t)dst->ne[1], // number of rows in result (M)
+        (uint32_t)dst->ne[0], // number of columns in result (N)
+        (uint32_t)src0->ne[0], // number of columns in src0/src1 (K)
+        (uint32_t)(src0->nb[1] / ggml_type_size(src0->type)), // stride (elements) of src0 in dimension 1
+        (uint32_t)(src1->nb[1] / ggml_type_size(src1->type)), // stride (elements) of src1 in dimension 1
+        (uint32_t)(src0->nb[2] / ggml_type_size(src0->type)), // stride (elements) of src0 in dimension 2
+        (uint32_t)(src1->nb[2] / ggml_type_size(src1->type)), // stride (elements) of src1 in dimension 2
+        (uint32_t)(src0->nb[3] / ggml_type_size(src0->type)), // stride (elements) of src0 in dimension 3
+        (uint32_t)(src1->nb[3] / ggml_type_size(src1->type)), // stride (elements) of src1 in dimension 3
+        (uint32_t)src0->ne[2], // batch size in dimension 2
+        (uint32_t)src0->ne[3], // batch size in dimension 3
+        (uint32_t)(src1->ne[2] / src0->ne[2]), // broadcast in dimension 2
+        (uint32_t)(src1->ne[3] / src0->ne[3]) // broadcast in dimension 3
+    };
+
+    std::vector<wgpu::BindGroupEntry> entries = {
+        { .binding = 0, .buffer = ggml_backend_webgpu_tensor_buf(src0), .offset = ggml_backend_webgpu_tensor_offset(src0), .size = ggml_nbytes(src0) },
+        { .binding = 1, .buffer = ggml_backend_webgpu_tensor_buf(src1), .offset = ggml_backend_webgpu_tensor_offset(src1), .size = ggml_nbytes(src1) },
+        { .binding = 2, .buffer = ggml_backend_webgpu_tensor_buf(dst), .offset = ggml_backend_webgpu_tensor_offset(dst), .size = ggml_nbytes(dst) }
+    };
+
+    uint32_t wg_x = (dst->ne[0] * dst->ne[1] * dst->ne[2] * dst->ne[3] + WEBGPU_MUL_MAT_WG_SIZE - 1) / WEBGPU_MUL_MAT_WG_SIZE;
+    ggml_backend_webgpu_build_and_enqueue(ctx, ctx->mul_mat_pipeline, params, entries, wg_x);
+}
+
 // Returns true if node has enqueued work into the queue, false otherwise
-static bool ggml_webgpu_encode_node(webgpu_context ctx, ggml_tensor * node){
+static bool ggml_webgpu_encode_node(webgpu_context ctx, ggml_tensor* node) {
     if (ggml_is_empty(node)) {
         return false;
     }
-
     WEBGPU_LOG_DEBUG("ggml_webgpu_encode_node(" << node << ", " << ggml_op_name(node->op) << ")");
 
+    ggml_tensor* src0 = node->src[0];
+    ggml_tensor* src1 = node->src[1];
 
     switch (node->op) {
         // no-ops
-        case GGML_OP_NONE:
-        case GGML_OP_VIEW:
-        case GGML_OP_PERMUTE:
-            return false;
-
-        case GGML_OP_CPY: {
-            std::lock_guard<std::mutex> lock(ctx->mutex);
-            const ggml_tensor * src = node->src[0];
-            ggml_backend_webgpu_buffer_context * src_ctx = (ggml_backend_webgpu_buffer_context *) src->buffer->context;
-            size_t src_offset = webgpu_tensor_offset(src) + src->view_offs;
-            // assumes power of 2 offset alignment
-            size_t src_misalignment = src_offset & (ctx->limits.minStorageBufferOffsetAlignment - 1);
-            // align to minimum offset alignment
-            src_offset &= ~(ctx->limits.minStorageBufferOffsetAlignment - 1);
-            ggml_backend_webgpu_buffer_context * dst_ctx = (ggml_backend_webgpu_buffer_context *) node->buffer->context;
-            size_t dst_offset = webgpu_tensor_offset(node) + node->view_offs;
-            size_t dst_misalignment = dst_offset & (ctx->limits.minStorageBufferOffsetAlignment - 1);
-            dst_offset &= ~(ctx->limits.minStorageBufferOffsetAlignment - 1);
-
-            wgpu::Device device = ctx->device;
-            ggml_backend_webgpu_map_buffer(ctx, ctx->cpy_params_host_buf,
-                wgpu::MapMode::Write, 0, ctx->cpy_params_host_buf.GetSize());
-            uint32_t * params = (uint32_t *) ctx->cpy_params_host_buf.GetMappedRange();
-            uint32_t ne = (uint32_t)ggml_nelements(node);
-            params[0] = ne;
-            params[1] = src_misalignment/ggml_type_size(src->type);
-            params[2] = dst_misalignment/ggml_type_size(node->type);
-
-            // Convert byte-strides to element-strides
-            params[3] = (uint32_t)src->nb[0]/ggml_type_size(src->type);
-            params[4] = (uint32_t)src->nb[1]/ggml_type_size(src->type);
-            params[5] = (uint32_t)src->nb[2]/ggml_type_size(src->type);
-            params[6] = (uint32_t)src->nb[3]/ggml_type_size(src->type);
-            params[7] = (uint32_t)node->nb[0]/ggml_type_size(node->type);
-            params[8] = (uint32_t)node->nb[1]/ggml_type_size(node->type);
-            params[9] = (uint32_t)node->nb[2]/ggml_type_size(node->type);
-            params[10] = (uint32_t)node->nb[3]/ggml_type_size(node->type);
-            // Logical shape — same for both tensors even if permuted
-            params[11] = (uint32_t)(src->ne[0]);
-            params[12] = (uint32_t)(src->ne[1]);
-            params[13] = (uint32_t)(src->ne[2]);
-            params[14] = (uint32_t)(src->ne[3]);
-
-            ctx->cpy_params_host_buf.Unmap();
-
-            wgpu::BindGroupEntry entries[3];
-            entries[0].binding = 0;
-            entries[0].buffer = src_ctx->buffer;
-            entries[0].offset = src_offset;
-            entries[0].size = (ggml_nbytes(src) + src_misalignment + WEBGPU_STORAGE_BUF_BINDING_MULT - 1) & ~(WEBGPU_STORAGE_BUF_BINDING_MULT - 1);
-
-            entries[1].binding = 1;
-            entries[1].buffer = dst_ctx->buffer;
-            entries[1].offset = dst_offset;
-            entries[1].size = (ggml_nbytes(node) + dst_misalignment + WEBGPU_STORAGE_BUF_BINDING_MULT - 1) & ~(WEBGPU_STORAGE_BUF_BINDING_MULT - 1);
-
-            entries[2].binding = 2;
-            entries[2].buffer = ctx->cpy_params_dev_buf;
-            entries[2].offset = 0;
-            entries[2].size = ctx->cpy_params_dev_buf.GetSize();
-
-            wgpu::BindGroupDescriptor bind_group_desc;
-            bind_group_desc.layout = ctx->cpy_pipeline.GetBindGroupLayout(0);
-            bind_group_desc.label = "ggml_op_cpy";
-            bind_group_desc.entryCount = 3;
-            bind_group_desc.entries = entries;
-            wgpu::BindGroup bind_group = device.CreateBindGroup(&bind_group_desc);
-
-            wgpu::CommandEncoder encoder = device.CreateCommandEncoder();
-            encoder.CopyBufferToBuffer(
-                ctx->cpy_params_host_buf, 0,
-                ctx->cpy_params_dev_buf, 0,
-                ctx->cpy_params_dev_buf.GetSize()
-            );
-            wgpu::ComputePassEncoder pass = encoder.BeginComputePass();
-            pass.SetPipeline(ctx->cpy_pipeline);
-            pass.SetBindGroup(0, bind_group);
-            size_t max_wg_size = ctx->limits.maxComputeWorkgroupSizeX;
-            pass.DispatchWorkgroups((ne + max_wg_size - 1) / max_wg_size);
-            pass.End();
-            wgpu::CommandBuffer commands = encoder.Finish();
-
-            // TODO, don't submit here, batch submissions
-            ctx->queue.Submit(1, &commands);
-            // TODO, don't wait on submission here
-            ggml_backend_webgpu_wait_on_submission(ctx);
-            return true;
-        }
-
-        case GGML_OP_MUL_MAT:
-         {
-            const ggml_tensor * src0 = node->src[0];
-            ggml_backend_webgpu_buffer_context * src0_ctx = (ggml_backend_webgpu_buffer_context *) src0->buffer->context;
-            size_t src0_offset = webgpu_tensor_offset(src0) + src0->view_offs;
-            const ggml_tensor * src1 = node->src[1];
-            ggml_backend_webgpu_buffer_context * src1_ctx = (ggml_backend_webgpu_buffer_context *) src1->buffer->context;
-            size_t src1_offset = webgpu_tensor_offset(src1) + src1->view_offs;
-            ggml_backend_webgpu_buffer_context * dst_ctx = (ggml_backend_webgpu_buffer_context *) node->buffer->context;
-
-            size_t dst_offset = webgpu_tensor_offset(node) + node->view_offs;
-
-            wgpu::Device device = ctx->device;
-
-            // map the host parameters buffer
-            ggml_backend_webgpu_map_buffer(ctx, ctx->mul_mat_params_host_buf,
-                wgpu::MapMode::Write, 0, ctx->mul_mat_params_host_buf.GetSize());
-            uint32_t * params = (uint32_t *) ctx->mul_mat_params_host_buf.GetMappedRange();
-
-            params[0] = (uint32_t)node->ne[1]; // number of rows in result (M)
-            params[1] = (uint32_t)node->ne[0]; // number of columns in result (N)
-            params[2] = (uint32_t)src0->ne[0]; // number of columns in src0/src1 (K)
-
-            params[3] = (uint32_t)src0->nb[1]/ggml_type_size(src0->type); // stride (elements) of src0 in dimension 1
-            params[4] = (uint32_t)src1->nb[1]/ggml_type_size(src1->type); // stride (elements) of src1 in dimension 1
-            params[5] = (uint32_t)src0->nb[2]/ggml_type_size(src0->type); // stride (elements) of src0 in dimension 2
-            params[6] = (uint32_t)src1->nb[2]/ggml_type_size(src1->type); // stride (elements) of src1 in dimension 2
-            params[7] = (uint32_t)src0->nb[3]/ggml_type_size(src0->type); // stride (elements) of src0 in dimension 3
-            params[8] = (uint32_t)src1->nb[3]/ggml_type_size(src1->type); // stride (elements) of src1 in dimension 3
-
-            params[9] = (uint32_t)src0->ne[2]; // batch size in dimension 2
-            params[10] = (uint32_t)src0->ne[3]; // batch size in dimension 3
-            params[11] = (uint32_t)(src1->ne[2]/src0->ne[2]); // broadcast in dimension 2
-            params[12] = (uint32_t)(src1->ne[3]/src0->ne[3]); // broadcast in dimension 3
-
-            ctx->mul_mat_params_host_buf.Unmap();
-
-            wgpu::BindGroupEntry entries[4];
-            entries[0].binding = 0;
-            entries[0].buffer = src0_ctx->buffer;
-            entries[0].offset = src0_offset;
-            entries[0].size = ggml_nbytes(src0);
-
-            entries[1].binding = 1;
-            entries[1].buffer = src1_ctx->buffer;
-            entries[1].offset = src1_offset;
-            entries[1].size = ggml_nbytes(src1);
-
-            entries[2].binding = 2;
-            entries[2].buffer = dst_ctx->buffer;
-            entries[2].offset = dst_offset;
-            entries[2].size = ggml_nbytes(node);
-
-            entries[3].binding = 3;
-            entries[3].buffer = ctx->mul_mat_params_dev_buf;
-            entries[3].offset = 0;
-            entries[3].size = ctx->mul_mat_params_dev_buf.GetSize();
-
-            wgpu::BindGroupDescriptor bind_group_desc;
-            bind_group_desc.layout = ctx->mul_mat_pipeline.GetBindGroupLayout(0);
-            bind_group_desc.entryCount = 4;
-            bind_group_desc.label = "ggml_op_mul_mat";
-            bind_group_desc.entries = entries;
-            wgpu::BindGroup bind_group = device.CreateBindGroup(&bind_group_desc);
-
-            wgpu::CommandEncoder encoder = device.CreateCommandEncoder();
-            encoder.CopyBufferToBuffer(
-                ctx->mul_mat_params_host_buf, 0,
-                ctx->mul_mat_params_dev_buf, 0,
-                ctx->mul_mat_params_dev_buf.GetSize()
-            );
-            wgpu::ComputePassEncoder pass = encoder.BeginComputePass();
-            pass.SetPipeline(ctx->mul_mat_pipeline);
-            pass.SetBindGroup(0, bind_group);
-            pass.DispatchWorkgroups((node->ne[0] * node->ne[1] * node->ne[2] * node->ne[3] + WEBGPU_MUL_MAT_WG_SIZE - 1) / WEBGPU_MUL_MAT_WG_SIZE);
-            pass.End();
-            wgpu::CommandBuffer commands = encoder.Finish();
-
-            // TODO, don't submit here, batch submissions
-            ctx->queue.Submit(1, &commands);
-            // TODO, don't wait on submission here
-            ggml_backend_webgpu_wait_on_submission(ctx);
-            return true;
-        }
-
-        default:
-            return false;
+    case GGML_OP_NONE:
+    case GGML_OP_VIEW:
+    case GGML_OP_PERMUTE:
+        return false;
+    case GGML_OP_CPY: {
+        ggml_webgpu_cpy(ctx, src0, node);
+        break;
     }
+    case GGML_OP_MUL_MAT: {
+        ggml_webgpu_mul_mat(ctx, src0, src1, node);
+        break;
+    }
+    default:
+        return false;
+    }
+    return true;
 }
 
-static ggml_status ggml_backend_webgpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
+static ggml_status ggml_backend_webgpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph* cgraph) {
     WEBGPU_LOG_DEBUG("ggml_backend_webgpu_graph_compute(" << cgraph->n_nodes << " nodes)");
 
-    ggml_backend_webgpu_context * backend_ctx = static_cast<ggml_backend_webgpu_context *>(backend->context);
+    ggml_backend_webgpu_context* backend_ctx = static_cast<ggml_backend_webgpu_context*>(backend->context);
     webgpu_context ctx = backend_ctx->webgpu_ctx;
 
     for (int i = 0; i < cgraph->n_nodes; i++) {
         ggml_webgpu_encode_node(ctx, cgraph->nodes[i]);
     }
 
+    ggml_backend_webgpu_submit_queue(ctx);
+    ggml_backend_webgpu_wait_on_submission(ctx);
+
     return GGML_STATUS_SUCCESS;
 }
 
@@ -455,17 +459,17 @@ static ggml_backend_i ggml_backend_webgpu_i = {
 
 static void ggml_backend_webgpu_buffer_free_buffer(ggml_backend_buffer_t buffer) {
     WEBGPU_LOG_DEBUG("ggml_backend_webgpu_buffer_free_buffer()");
-    ggml_backend_webgpu_buffer_context * ctx = static_cast<ggml_backend_webgpu_buffer_context *>(buffer->context);
+    ggml_backend_webgpu_buffer_context* ctx = static_cast<ggml_backend_webgpu_buffer_context*>(buffer->context);
     ctx->buffer.Destroy();
 }
 
 // Returns the "fake" base pointer.
-static void * ggml_backend_webgpu_buffer_get_base(ggml_backend_buffer_t buffer) {
+static void* ggml_backend_webgpu_buffer_get_base(ggml_backend_buffer_t buffer) {
     GGML_UNUSED(buffer);
     return webgpu_ptr_base;
 }
 
-static void ggml_backend_webgpu_buffer_memset_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
+static void ggml_backend_webgpu_buffer_memset_tensor(ggml_backend_buffer_t buffer, ggml_tensor* tensor, uint8_t value, size_t offset, size_t size) {
     if (size == 0) {
         WEBGPU_LOG_DEBUG("ggml_backend_webgpu_buffer_memset_tensor: size is zero, nothing to do.");
         return;
@@ -473,21 +477,21 @@ static void ggml_backend_webgpu_buffer_memset_tensor(ggml_backend_buffer_t buffe
 
     WEBGPU_LOG_DEBUG("ggml_backend_webgpu_buffer_memset_tensor(" << buffer << ", " << tensor << ", " << value << ", " << offset << ", " << size << ")");
 
-    ggml_backend_webgpu_buffer_context * buf_ctx = (ggml_backend_webgpu_buffer_context *) buffer->context;
+    ggml_backend_webgpu_buffer_context* buf_ctx = (ggml_backend_webgpu_buffer_context*)buffer->context;
     size_t total_offset = webgpu_tensor_offset(tensor) + tensor->view_offs + offset;
     // This is a trick to set all bytes of a u32 to the same 1 byte value.
     uint32_t val32 = (uint32_t)value * 0x01010101;
     ggml_backend_webgpu_buffer_memset(buf_ctx->webgpu_ctx, buf_ctx->buffer, val32, total_offset, size);
 }
 
-static void ggml_backend_webgpu_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
+static void ggml_backend_webgpu_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor* tensor, const void* data, size_t offset, size_t size) {
     WEBGPU_LOG_DEBUG("ggml_backend_webgpu_buffer_set_tensor(" << buffer << ", " << tensor << ", " << data << ", " << offset << ", " << size << ")");
-    ggml_backend_webgpu_buffer_context * buf_ctx = (ggml_backend_webgpu_buffer_context *) buffer->context;
+    ggml_backend_webgpu_buffer_context* buf_ctx = (ggml_backend_webgpu_buffer_context*)buffer->context;
     webgpu_context webgpu_ctx = buf_ctx->webgpu_ctx;
 
     size_t total_offset = webgpu_tensor_offset(tensor) + tensor->view_offs + offset;
 
-    webgpu_ctx->queue.WriteBuffer(buf_ctx->buffer, total_offset, data, (size/4)*4);
+    webgpu_ctx->queue.WriteBuffer(buf_ctx->buffer, total_offset, data, (size / 4) * 4);
 
     if (size % 4 != 0) {
         // If size is not a multiple of 4, we need to memset the remaining bytes
@@ -495,17 +499,17 @@ static void ggml_backend_webgpu_buffer_set_tensor(ggml_backend_buffer_t buffer,
         // pack the remaining bytes into a uint32_t
         uint32_t val32 = 0;
         for (size_t i = 0; i < remaining_size; i++) {
-            ((uint8_t *)&val32)[i] = ((const uint8_t *)data)[size - remaining_size + i];
+            ((uint8_t*)&val32)[i] = ((const uint8_t*)data)[size - remaining_size + i];
         }
         // memset the remaining bytes
         ggml_backend_webgpu_buffer_memset(webgpu_ctx, buf_ctx->buffer, val32, total_offset + (size - remaining_size), remaining_size);
     }
 }
 
-static void ggml_backend_webgpu_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
+static void ggml_backend_webgpu_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor* tensor, void* data, size_t offset, size_t size) {
     WEBGPU_LOG_DEBUG("ggml_backend_webgpu_buffer_get_tensor(" << buffer << ", " << tensor << ", " << data << ", " << offset << ", " << size << ")");
 
-    ggml_backend_webgpu_buffer_context * buf_ctx = (ggml_backend_webgpu_buffer_context *) buffer->context;
+    ggml_backend_webgpu_buffer_context* buf_ctx = (ggml_backend_webgpu_buffer_context*)buffer->context;
     webgpu_context webgpu_ctx = buf_ctx->webgpu_ctx;
     wgpu::Device device = webgpu_ctx->device;
 
@@ -517,7 +521,7 @@ static void ggml_backend_webgpu_buffer_get_tensor(ggml_backend_buffer_t buffer,
         final_size = size + (4 - (size % 4));
     }
 
-    std::lock_guard<std::mutex> lock(webgpu_ctx->mutex);
+    std::lock_guard<std::mutex> lock(webgpu_ctx->get_tensor_mutex);
 
     if (webgpu_ctx->get_tensor_staging_buf == nullptr ||
         webgpu_ctx->get_tensor_staging_buf.GetSize() < final_size) {
@@ -539,7 +543,7 @@ static void ggml_backend_webgpu_buffer_get_tensor(ggml_backend_buffer_t buffer,
     // Map the staging buffer to read the data
     ggml_backend_webgpu_map_buffer(webgpu_ctx, webgpu_ctx->get_tensor_staging_buf, wgpu::MapMode::Read, 0, final_size);
     // Must specify size here since the staging buffer might be larger than the tensor size
-    const void * mapped_range = webgpu_ctx->get_tensor_staging_buf.GetConstMappedRange(0, final_size);
+    const void* mapped_range = webgpu_ctx->get_tensor_staging_buf.GetConstMappedRange(0, final_size);
 
     // Copy the data from the mapped range to the output buffer
     std::memcpy(data, mapped_range, size);
@@ -547,9 +551,9 @@ static void ggml_backend_webgpu_buffer_get_tensor(ggml_backend_buffer_t buffer,
 }
 
 static void ggml_backend_webgpu_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
-    WEBGPU_LOG_DEBUG("ggml_backend_webgpu_buffer_clear(" << buffer << ", " << (uint32_t) value << ")");
+    WEBGPU_LOG_DEBUG("ggml_backend_webgpu_buffer_clear(" << buffer << ", " << (uint32_t)value << ")");
 
-    ggml_backend_webgpu_buffer_context * buf_ctx = (ggml_backend_webgpu_buffer_context *) buffer->context;
+    ggml_backend_webgpu_buffer_context* buf_ctx = (ggml_backend_webgpu_buffer_context*)buffer->context;
     ggml_backend_webgpu_buffer_memset(buf_ctx->webgpu_ctx, buf_ctx->buffer, value, 0, buffer->size);
 }
 
@@ -569,32 +573,32 @@ static ggml_backend_buffer_i ggml_backend_webgpu_buffer_interface = {
 
 /* GGML Backend Buffer Type Interface */
 
-static const char * ggml_backend_webgpu_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
-    ggml_backend_webgpu_device_context * ctx = static_cast<ggml_backend_webgpu_device_context *>(buft->device->context);
+static const char* ggml_backend_webgpu_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
+    ggml_backend_webgpu_device_context* ctx = static_cast<ggml_backend_webgpu_device_context*>(buft->device->context);
     return ctx->device_name.c_str();
 }
 
 static ggml_backend_buffer_t ggml_backend_webgpu_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
     WEBGPU_LOG_DEBUG("ggml_backend_webgpu_buffer_type_alloc_buffer(" << size << ")");
-    ggml_backend_webgpu_device_context * ctx = static_cast<ggml_backend_webgpu_device_context *>(buft->device->context);
+    ggml_backend_webgpu_device_context* ctx = static_cast<ggml_backend_webgpu_device_context*>(buft->device->context);
 
     wgpu::Buffer buf;
     ggml_webgpu_create_buffer(ctx->webgpu_ctx->device, buf, size,
         wgpu::BufferUsage::Storage | wgpu::BufferUsage::CopySrc | wgpu::BufferUsage::CopyDst, "allocated_buffer");
 
-    ggml_backend_webgpu_buffer_context * buf_ctx = new ggml_backend_webgpu_buffer_context(ctx->webgpu_ctx, buf);
+    ggml_backend_webgpu_buffer_context* buf_ctx = new ggml_backend_webgpu_buffer_context(ctx->webgpu_ctx, buf);
 
     return ggml_backend_buffer_init(buft, ggml_backend_webgpu_buffer_interface, buf_ctx, size);
 }
 
 static size_t ggml_backend_webgpu_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
-    ggml_backend_webgpu_device_context * ctx = static_cast<ggml_backend_webgpu_device_context *>(buft->device->context);
+    ggml_backend_webgpu_device_context* ctx = static_cast<ggml_backend_webgpu_device_context*>(buft->device->context);
     return ctx->webgpu_ctx->limits.minStorageBufferOffsetAlignment;
 }
 
 // maxBufferSize might be larger, but you can't bind more than maxStorageBufferBindingSize to a single binding.
 static size_t ggml_backend_webgpu_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) {
-    ggml_backend_webgpu_device_context * ctx = static_cast<ggml_backend_webgpu_device_context *>(buft->device->context);
+    ggml_backend_webgpu_device_context* ctx = static_cast<ggml_backend_webgpu_device_context*>(buft->device->context);
     return ctx->webgpu_ctx->limits.maxStorageBufferBindingSize;
 }
 
@@ -602,18 +606,18 @@ static size_t ggml_backend_webgpu_buffer_type_get_max_size(ggml_backend_buffer_t
 
 /* GGML Backend Device Interface */
 
-static const char * ggml_backend_webgpu_device_get_name(ggml_backend_dev_t dev) {
-    ggml_backend_webgpu_device_context * ctx = static_cast<ggml_backend_webgpu_device_context *>(dev->context);
+static const char* ggml_backend_webgpu_device_get_name(ggml_backend_dev_t dev) {
+    ggml_backend_webgpu_device_context* ctx = static_cast<ggml_backend_webgpu_device_context*>(dev->context);
     return ctx->device_name.c_str();
 }
 
-static const char * ggml_backend_webgpu_device_get_description(ggml_backend_dev_t dev) {
-    ggml_backend_webgpu_device_context * ctx = static_cast<ggml_backend_webgpu_device_context *>(dev->context);
+static const char* ggml_backend_webgpu_device_get_description(ggml_backend_dev_t dev) {
+    ggml_backend_webgpu_device_context* ctx = static_cast<ggml_backend_webgpu_device_context*>(dev->context);
     return ctx->device_desc.c_str();
 }
 
-static void ggml_backend_webgpu_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
-    ggml_backend_webgpu_device_context * ctx = static_cast<ggml_backend_webgpu_device_context *>(dev->context);
+static void ggml_backend_webgpu_device_get_memory(ggml_backend_dev_t dev, size_t* free, size_t* total) {
+    ggml_backend_webgpu_device_context* ctx = static_cast<ggml_backend_webgpu_device_context*>(dev->context);
     // TODO: what do we actually want to return here? maxBufferSize might not be the full available memory.
     *free = ctx->webgpu_ctx->limits.maxBufferSize;
     *total = ctx->webgpu_ctx->limits.maxBufferSize;
@@ -624,10 +628,10 @@ static enum ggml_backend_dev_type ggml_backend_webgpu_device_get_type(ggml_backe
     return GGML_BACKEND_DEVICE_TYPE_GPU;
 }
 
-static void ggml_backend_webgpu_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) {
-    props->name        = ggml_backend_webgpu_device_get_name(dev);
+static void ggml_backend_webgpu_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props* props) {
+    props->name = ggml_backend_webgpu_device_get_name(dev);
     props->description = ggml_backend_webgpu_device_get_description(dev);
-    props->type        = ggml_backend_webgpu_device_get_type(dev);
+    props->type = ggml_backend_webgpu_device_get_type(dev);
     ggml_backend_webgpu_device_get_memory(dev, &props->memory_free, &props->memory_total);
     props->caps = {
         /* .async                 = */ false,
@@ -638,11 +642,11 @@ static void ggml_backend_webgpu_device_get_props(ggml_backend_dev_t dev, struct
 }
 
 static ggml_guid_t ggml_backend_webgpu_guid(void) {
-    static const char * guid_str = "__ggml_webgpu :)";
-    return reinterpret_cast<ggml_guid_t>((void *)guid_str);
+    static const char* guid_str = "__ggml_webgpu :)";
+    return reinterpret_cast<ggml_guid_t>((void*)guid_str);
 }
 
-static void ggml_webgpu_init_memset_pipeline(webgpu_context webgpu_ctx) {
+static void ggml_webgpu_init_memset_pipeline(webgpu_context& webgpu_ctx) {
     // we use the maximum workgroup size for the memset pipeline
     size_t max_wg_size = webgpu_ctx->limits.maxComputeWorkgroupSizeX;
     size_t max_threads = max_wg_size * webgpu_ctx->limits.maxComputeWorkgroupsPerDimension;
@@ -654,45 +658,30 @@ static void ggml_webgpu_init_memset_pipeline(webgpu_context webgpu_ctx) {
     constants[1].key = "bytes_per_thread";
     constants[1].value = webgpu_ctx->memset_bytes_per_thread;
     ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->memset_pipeline, wgsl_memset, "memset", constants);
-    ggml_webgpu_create_buffer(webgpu_ctx->device, webgpu_ctx->memset_params_dev_buf,
-        3 * sizeof(uint32_t), // 3 parameters: buffer size, offset, value
-        wgpu::BufferUsage::Uniform | wgpu::BufferUsage::CopyDst, "memset_params_dev_buf");
-    ggml_webgpu_create_buffer(webgpu_ctx->device, webgpu_ctx->memset_params_host_buf,
-        3 * sizeof(uint32_t), wgpu::BufferUsage::MapWrite | wgpu::BufferUsage::CopySrc, "memset_params_host_buf");
 }
 
-static void ggml_webgpu_init_mul_mat_pipeline(webgpu_context webgpu_ctx) {
+static void ggml_webgpu_init_mul_mat_pipeline(webgpu_context& webgpu_ctx) {
     ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->mul_mat_pipeline, wgsl_mul_mat, "mul_mat");
-    ggml_webgpu_create_buffer(webgpu_ctx->device, webgpu_ctx->mul_mat_params_dev_buf, WEBGPU_MUL_MAT_PARAMS_SIZE,
-        wgpu::BufferUsage::Uniform | wgpu::BufferUsage::CopyDst, "mul_mat_params_dev_buf");
-    ggml_webgpu_create_buffer(webgpu_ctx->device, webgpu_ctx->mul_mat_params_host_buf, WEBGPU_MUL_MAT_PARAMS_SIZE,
-        wgpu::BufferUsage::MapWrite | wgpu::BufferUsage::CopySrc, "mul_mat_params_host_buf");
 }
 
-static void ggml_webgpu_init_cpy_pipeline(webgpu_context webgpu_ctx) {
+static void ggml_webgpu_init_cpy_pipeline(webgpu_context& webgpu_ctx) {
     std::vector<wgpu::ConstantEntry> constants(1);
     constants[0].key = "wg_size";
     constants[0].value = webgpu_ctx->limits.maxComputeWorkgroupSizeX;
-
     ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->cpy_pipeline, wgsl_cpy, "cpy", constants);
-    ggml_webgpu_create_buffer(webgpu_ctx->device, webgpu_ctx->cpy_params_dev_buf, WEBGPU_CPY_PARAMS_SIZE,
-        wgpu::BufferUsage::Uniform | wgpu::BufferUsage::CopyDst, "cpy_params_dev_buf");
-    ggml_webgpu_create_buffer(webgpu_ctx->device, webgpu_ctx->cpy_params_host_buf, WEBGPU_CPY_PARAMS_SIZE,
-        wgpu::BufferUsage::MapWrite | wgpu::BufferUsage::CopySrc, "cpy_params_host_buf");
 }
 
-// TODO: Make thread safe if multiple devices are used
-static ggml_backend_t ggml_backend_webgpu_device_init(ggml_backend_dev_t dev, const char * params) {
+static ggml_backend_t ggml_backend_webgpu_device_init(ggml_backend_dev_t dev, const char* params) {
     GGML_UNUSED(params);
 
     WEBGPU_LOG_DEBUG("ggml_backend_webgpu_device_init()");
 
-    ggml_backend_webgpu_device_context * dev_ctx = static_cast<ggml_backend_webgpu_device_context *>(dev->context);
+    ggml_backend_webgpu_device_context* dev_ctx = static_cast<ggml_backend_webgpu_device_context*>(dev->context);
     webgpu_context webgpu_ctx = dev_ctx->webgpu_ctx;
 
-    std::lock_guard<std::mutex> lock(webgpu_ctx->mutex);
-
-    if (!webgpu_ctx->device_initialized) {
+    // Multiple threads may try to initialize the device
+    std::lock_guard<std::mutex> lock(webgpu_ctx->init_mutex);
+    if (!webgpu_ctx->device_init) {
         // Initialize device
         wgpu::DeviceDescriptor dev_desc;
         dev_desc.requiredLimits = &webgpu_ctx->limits;
@@ -702,19 +691,19 @@ static ggml_backend_t ggml_backend_webgpu_device_init(ggml_backend_dev_t dev, co
             [](const wgpu::Device& device, wgpu::DeviceLostReason reason, wgpu::StringView message) {
                 GGML_UNUSED(device);
                 GGML_LOG_ERROR("ggml_webgpu: Device lost! Reason: %d, Message: %s\n", static_cast<int>(reason), message.data);
-        });
+            });
         dev_desc.SetUncapturedErrorCallback(
             [](const wgpu::Device& device, wgpu::ErrorType reason, wgpu::StringView message) {
                 GGML_UNUSED(device);
                 GGML_LOG_ERROR("ggml_webgpu: Device error! Reason: %d, Message: %s\n", static_cast<int>(reason), message.data);
-        });
-        webgpu_ctx->instance.WaitAny(webgpu_ctx->adapter.RequestDevice(&dev_desc, wgpu::CallbackMode::WaitAnyOnly,
+            });
+        webgpu_ctx->instance.WaitAny(webgpu_ctx->adapter.RequestDevice(&dev_desc, wgpu::CallbackMode::AllowSpontaneous,
             [webgpu_ctx](wgpu::RequestDeviceStatus status, wgpu::Device device, wgpu::StringView message) {
                 if (status != wgpu::RequestDeviceStatus::Success) {
                     GGML_LOG_ERROR("ggml_webgpu: Failed to get a device: %s\n", message.data);
                     return;
                 }
-                webgpu_ctx->device = device;
+                webgpu_ctx->device = std::move(device);
             }),
             UINT64_MAX
         );
@@ -723,10 +712,13 @@ static ggml_backend_t ggml_backend_webgpu_device_init(ggml_backend_dev_t dev, co
         // Initialize (compute) queue
         webgpu_ctx->queue = webgpu_ctx->device.GetQueue();
 
+        // Create buffer pool for shader parameters
+        webgpu_ctx->param_buf_pool.init(webgpu_ctx->device);
+
         ggml_webgpu_init_memset_pipeline(webgpu_ctx);
         ggml_webgpu_init_mul_mat_pipeline(webgpu_ctx);
         ggml_webgpu_init_cpy_pipeline(webgpu_ctx);
-        webgpu_ctx->device_initialized = true;
+        webgpu_ctx->device_init = true;
     }
 
     static ggml_backend_webgpu_context backend_ctx;
@@ -767,20 +759,20 @@ static bool ggml_backend_webgpu_device_supports_buft(ggml_backend_dev_t dev, ggm
     return  buft->iface.get_name == ggml_backend_webgpu_buffer_type_get_name;
 }
 
-static bool ggml_backend_webgpu_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
+static bool ggml_backend_webgpu_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor* op) {
     GGML_UNUSED(dev);
 
     switch (op->op) {
-        case GGML_OP_NONE:
-        case GGML_OP_VIEW:
-        case GGML_OP_PERMUTE:
-            return true;
-        case GGML_OP_CPY:
-            return op->type == GGML_TYPE_F16 && op->src[0]->type == GGML_TYPE_F32;
-        case GGML_OP_MUL_MAT:
-            return op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32;
-        default:
-            return false;
+    case GGML_OP_NONE:
+    case GGML_OP_VIEW:
+    case GGML_OP_PERMUTE:
+        return true;
+    case GGML_OP_CPY:
+        return op->type == GGML_TYPE_F16 && op->src[0]->type == GGML_TYPE_F32;
+    case GGML_OP_MUL_MAT:
+        return op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32;
+    default:
+        return false;
     }
 }
 
@@ -806,13 +798,13 @@ static struct ggml_backend_device_i ggml_backend_webgpu_device_i = {
 
 /* GGML Backend Registration Interface */
 
-static const char * ggml_backend_webgpu_reg_get_name(ggml_backend_reg_t reg) {
-    ggml_backend_webgpu_reg_context * ctx = static_cast<ggml_backend_webgpu_reg_context *>(reg->context);
+static const char* ggml_backend_webgpu_reg_get_name(ggml_backend_reg_t reg) {
+    ggml_backend_webgpu_reg_context* ctx = static_cast<ggml_backend_webgpu_reg_context*>(reg->context);
     return ctx->name;
 }
 
 static size_t ggml_backend_webgpu_reg_get_device_count(ggml_backend_reg_t reg) {
-    ggml_backend_webgpu_reg_context * ctx = static_cast<ggml_backend_webgpu_reg_context *>(reg->context);
+    ggml_backend_webgpu_reg_context* ctx = static_cast<ggml_backend_webgpu_reg_context*>(reg->context);
     return ctx->device_count;
 }
 
@@ -822,20 +814,20 @@ static ggml_backend_dev_t ggml_backend_webgpu_reg_get_device(ggml_backend_reg_t
     GGML_ASSERT(index == 0);
     WEBGPU_LOG_DEBUG("ggml_backend_reg_get_device()");
 
-    ggml_backend_webgpu_reg_context * reg_ctx = static_cast<ggml_backend_webgpu_reg_context *>(reg->context);
+    ggml_backend_webgpu_reg_context* reg_ctx = static_cast<ggml_backend_webgpu_reg_context*>(reg->context);
 
     webgpu_context ctx = reg_ctx->webgpu_ctx;
 
     wgpu::RequestAdapterOptions options = {};
-    auto callback = [](wgpu::RequestAdapterStatus status, wgpu::Adapter adapter, const char *message, void *userdata) {
+    auto callback = [](wgpu::RequestAdapterStatus status, wgpu::Adapter adapter, const char* message, void* userdata) {
         if (status != wgpu::RequestAdapterStatus::Success) {
             GGML_LOG_ERROR("ggml_webgpu: Failed to get an adapter: %s\n", message);
             return;
         }
-        *static_cast<wgpu::Adapter *>(userdata) = adapter;
-    };
-    void *userdata = &ctx->adapter;
-    ctx->instance.WaitAny(ctx->instance.RequestAdapter(&options, wgpu::CallbackMode::WaitAnyOnly, callback, userdata), UINT64_MAX);
+        *static_cast<wgpu::Adapter*>(userdata) = std::move(adapter);
+        };
+    void* userdata = &ctx->adapter;
+    ctx->instance.WaitAny(ctx->instance.RequestAdapter(&options, wgpu::CallbackMode::AllowSpontaneous, callback, userdata), UINT64_MAX);
     GGML_ASSERT(ctx->adapter != nullptr);
 
     ctx->adapter.GetLimits(&ctx->limits);
@@ -871,12 +863,10 @@ static const struct ggml_backend_reg_i ggml_backend_webgpu_reg_i = {
 
 /* End GGML Backend Registration Interface */
 
-// TODO: Does this need to be thread safe? Is it only called once?
 ggml_backend_reg_t ggml_backend_webgpu_reg() {
     WEBGPU_LOG_DEBUG("ggml_backend_webgpu_reg()");
 
     webgpu_context webgpu_ctx = std::make_shared<webgpu_context_struct>();
-    webgpu_ctx->device_initialized = false;
 
     static ggml_backend_webgpu_reg_context ctx;
     ctx.webgpu_ctx = webgpu_ctx;
@@ -884,7 +874,7 @@ ggml_backend_reg_t ggml_backend_webgpu_reg() {
     ctx.device_count = 1;
 
     wgpu::InstanceDescriptor instance_descriptor{};
-    std::vector<wgpu::InstanceFeatureName> instance_features = {wgpu::InstanceFeatureName::TimedWaitAny};
+    std::vector<wgpu::InstanceFeatureName> instance_features = { wgpu::InstanceFeatureName::TimedWaitAny };
     instance_descriptor.requiredFeatures = instance_features.data();
     instance_descriptor.requiredFeatureCount = instance_features.size();
     webgpu_ctx->instance = wgpu::CreateInstance(&instance_descriptor);

From 04d7b272d65f19e65ec271e6f3442c758f53f3c0 Mon Sep 17 00:00:00 2001
From: Reese Levine <reeselevine1@gmail.com>
Date: Wed, 30 Jul 2025 13:45:58 -0700
Subject: [PATCH 02/17] Add header for linux builds

---
 ggml/src/ggml-webgpu/ggml-webgpu.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/ggml/src/ggml-webgpu/ggml-webgpu.cpp b/ggml/src/ggml-webgpu/ggml-webgpu.cpp
index e35c865ea7..a3f41da1b1 100644
--- a/ggml/src/ggml-webgpu/ggml-webgpu.cpp
+++ b/ggml/src/ggml-webgpu/ggml-webgpu.cpp
@@ -7,6 +7,7 @@
 
 #include "ggml-wgsl-shaders.hpp"
 
+#include <condition_variable>
 #include <cstring>
 #include <mutex>
 #include <vector>

From 01c8ced232fffdd670a3919741cabdc4e9a63b9f Mon Sep 17 00:00:00 2001
From: Reese Levine <reeselevine1@gmail.com>
Date: Wed, 30 Jul 2025 14:27:29 -0700
Subject: [PATCH 03/17] Free staged parameter buffers at once

---
 ggml/src/ggml-webgpu/ggml-webgpu.cpp | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/ggml/src/ggml-webgpu/ggml-webgpu.cpp b/ggml/src/ggml-webgpu/ggml-webgpu.cpp
index a3f41da1b1..dcc32e88ef 100644
--- a/ggml/src/ggml-webgpu/ggml-webgpu.cpp
+++ b/ggml/src/ggml-webgpu/ggml-webgpu.cpp
@@ -76,10 +76,10 @@ struct webgpu_param_buf_pool {
         return bufs;
     }
 
-    void free_bufs(const webgpu_param_bufs& bufs) {
+    void free_bufs(std::vector<webgpu_param_bufs> bufs) {
         std::lock_guard<std::mutex> lock(mutex);
-        free.push_back(bufs);
-        cv.notify_one();
+        free.insert(free.end(), bufs.begin(), bufs.end());
+        cv.notify_all();
     }
 
     void cleanup() {
@@ -222,9 +222,7 @@ static void ggml_backend_webgpu_submit_queue(webgpu_context& ctx) {
                 GGML_LOG_ERROR("ggml_webgpu: Failed to submit commands: %s\n", message.data);
             }
             // Free the staged parameter buffers
-            for (const auto& bufs : staged_param_bufs) {
-                ctx->param_buf_pool.free_bufs(bufs);
-            }
+            ctx->param_buf_pool.free_bufs(staged_param_bufs);
         });
 }
 
@@ -287,7 +285,7 @@ static void ggml_backend_webgpu_build_and_enqueue(webgpu_context& ctx, wgpu::Com
                 if (status != wgpu::QueueWorkDoneStatus::Success) {
                     GGML_LOG_ERROR("ggml_webgpu: Failed to submit commands: %s\n", message.data);
                 }
-                ctx->param_buf_pool.free_bufs(params_bufs);
+                ctx->param_buf_pool.free_bufs({params_bufs});
         });
     } else {
         // Enqueue commands and only submit if we have enough staged commands

From bfff27f130e818fdf7aade536843c9d1e2aa54b4 Mon Sep 17 00:00:00 2001
From: Reese Levine <reeselevine1@gmail.com>
Date: Wed, 30 Jul 2025 15:06:09 -0700
Subject: [PATCH 04/17] Format with clang-format

---
 ggml/src/ggml-webgpu/ggml-webgpu.cpp | 662 +++++++++++++++------------
 1 file changed, 379 insertions(+), 283 deletions(-)

diff --git a/ggml/src/ggml-webgpu/ggml-webgpu.cpp b/ggml/src/ggml-webgpu/ggml-webgpu.cpp
index dcc32e88ef..666bfbe183 100644
--- a/ggml/src/ggml-webgpu/ggml-webgpu.cpp
+++ b/ggml/src/ggml-webgpu/ggml-webgpu.cpp
@@ -1,48 +1,56 @@
+/*
+    WebGPU backend implementation.
+    Note: Use ClangFormat to format this file.
+*/
+
 #include "ggml-webgpu.h"
 
-#include <webgpu/webgpu_cpp.h>
-
-#include "ggml-impl.h"
 #include "ggml-backend-impl.h"
-
+#include "ggml-impl.h"
 #include "ggml-wgsl-shaders.hpp"
 
+#include <webgpu/webgpu_cpp.h>
+
 #include <condition_variable>
 #include <cstring>
 #include <mutex>
 #include <vector>
 
 #ifdef GGML_WEBGPU_DEBUG
-#define WEBGPU_LOG_DEBUG(msg) std::cout << msg << std::endl
+#    define WEBGPU_LOG_DEBUG(msg) std::cout << msg << std::endl
 #else
-#define WEBGPU_LOG_DEBUG(msg) ((void) 0)
-#endif // GGML_WEBGPU_DEBUG
+#    define WEBGPU_LOG_DEBUG(msg) ((void) 0)
+#endif  // GGML_WEBGPU_DEBUG
 
 /* Constants */
 
 #define WEBGPU_COMMAND_SUBMIT_BATCH_SIZE 16
-#define WEBGPU_MUL_MAT_WG_SIZE 64
-#define WEBGPU_NUM_PARAM_BUFS 100
-#define WEBGPU_PARAMS_BUF_SIZE_BYTES 256
-#define WEBGPU_STORAGE_BUF_BINDING_MULT 4 // a storage buffer binding size must be a multiple of 4
+#define WEBGPU_MUL_MAT_WG_SIZE           64
+#define WEBGPU_NUM_PARAM_BUFS            100
+#define WEBGPU_PARAMS_BUF_SIZE_BYTES     256
+#define WEBGPU_STORAGE_BUF_BINDING_MULT  4  // a storage buffer binding size must be a multiple of 4
 
 /* End Constants */
 
 // This is a "fake" base pointer, since WebGPU buffers do not have pointers to their locations.
-static void* const webgpu_ptr_base = (void*)(uintptr_t)0x1000;  // NOLINT
+static void * const webgpu_ptr_base = (void *) (uintptr_t) 0x1000;  // NOLINT
 
 // Always returns the base offset of a tensor, regardless of views.
-static uint64_t webgpu_tensor_offset(const ggml_tensor* tensor) {
+static uint64_t webgpu_tensor_offset(const ggml_tensor * tensor) {
     if (tensor->view_src) {
-        return (uint8_t*)tensor->view_src->data - (uint8_t*)webgpu_ptr_base;
+        return (uint8_t *) tensor->view_src->data - (uint8_t *) webgpu_ptr_base;
     }
-    return (uint8_t*)tensor->data - (uint8_t*)webgpu_ptr_base;
+    return (uint8_t *) tensor->data - (uint8_t *) webgpu_ptr_base;
 }
 
 /* Struct definitions */
 
 // Forward reference
-static void ggml_webgpu_create_buffer(wgpu::Device& device, wgpu::Buffer& buffer, size_t size, wgpu::BufferUsage usage, const char* label);
+static void ggml_webgpu_create_buffer(wgpu::Device &    device,
+                                      wgpu::Buffer &    buffer,
+                                      size_t            size,
+                                      wgpu::BufferUsage usage,
+                                      const char *      label);
 
 struct webgpu_param_bufs {
     wgpu::Buffer host_buf;
@@ -53,24 +61,30 @@ struct webgpu_param_bufs {
 struct webgpu_param_buf_pool {
     std::vector<webgpu_param_bufs> free;
 
-    std::mutex mutex;
+    std::mutex              mutex;
     std::condition_variable cv;
 
     void init(wgpu::Device device) {
         for (int i = 0; i < WEBGPU_NUM_PARAM_BUFS; i++) {
             wgpu::Buffer host_buf;
             wgpu::Buffer dev_buf;
-            ggml_webgpu_create_buffer(device, host_buf, WEBGPU_PARAMS_BUF_SIZE_BYTES, wgpu::BufferUsage::CopySrc | wgpu::BufferUsage::MapWrite, "ggml_webgpu_host_params_buf");
-            ggml_webgpu_create_buffer(device, dev_buf, WEBGPU_PARAMS_BUF_SIZE_BYTES, wgpu::BufferUsage::CopyDst | wgpu::BufferUsage::Uniform, "ggml_webgpu_dev_params_buf");
+            ggml_webgpu_create_buffer(device,
+                                      host_buf,
+                                      WEBGPU_PARAMS_BUF_SIZE_BYTES,
+                                      wgpu::BufferUsage::CopySrc | wgpu::BufferUsage::MapWrite,
+                                      "ggml_webgpu_host_params_buf");
+            ggml_webgpu_create_buffer(device,
+                                      dev_buf,
+                                      WEBGPU_PARAMS_BUF_SIZE_BYTES,
+                                      wgpu::BufferUsage::CopyDst | wgpu::BufferUsage::Uniform,
+                                      "ggml_webgpu_dev_params_buf");
             free.push_back({ host_buf, dev_buf });
         }
     }
 
     webgpu_param_bufs alloc_bufs() {
         std::unique_lock<std::mutex> lock(mutex);
-        cv.wait(lock, [this] {
-            return !free.empty();
-            });
+        cv.wait(lock, [this] { return !free.empty(); });
         webgpu_param_bufs bufs = free.back();
         free.pop_back();
         return bufs;
@@ -84,7 +98,7 @@ struct webgpu_param_buf_pool {
 
     void cleanup() {
         std::lock_guard<std::mutex> lock(mutex);
-        for (auto& bufs : free) {
+        for (auto & bufs : free) {
             bufs.host_buf.Destroy();
             bufs.dev_buf.Destroy();
         }
@@ -94,17 +108,17 @@ struct webgpu_param_buf_pool {
 
 // All the base objects needed to run operations on a WebGPU device
 struct webgpu_context_struct {
-    wgpu::Instance instance;
-    wgpu::Adapter adapter;
-    wgpu::Device device;
-    wgpu::Queue queue;
-    wgpu::Limits limits;
+    wgpu::Instance          instance;
+    wgpu::Adapter           adapter;
+    wgpu::Device            device;
+    wgpu::Queue             queue;
+    wgpu::Limits            limits;
     wgpu::SupportedFeatures features;
 
     std::recursive_mutex submit_mutex;
-    std::mutex get_tensor_mutex;
-    std::mutex init_mutex;
-    bool device_init = false;
+    std::mutex           get_tensor_mutex;
+    std::mutex           init_mutex;
+    bool                 device_init = false;
 
     // Parameter buffer pool
     webgpu_param_buf_pool param_buf_pool;
@@ -121,7 +135,7 @@ struct webgpu_context_struct {
     // Command buffers which need to be submitted
     std::vector<wgpu::CommandBuffer> staged_command_bufs;
     // Parameter buffers associated with the staged command buffers
-    std::vector<webgpu_param_bufs> staged_param_bufs;
+    std::vector<webgpu_param_bufs>   staged_param_bufs;
 };
 
 typedef std::shared_ptr<webgpu_context_struct> webgpu_context;
@@ -129,8 +143,8 @@ typedef std::shared_ptr<webgpu_context_struct> webgpu_context;
 struct ggml_backend_webgpu_reg_context {
     webgpu_context webgpu_ctx;
 
-    size_t device_count;
-    const char* name;
+    size_t       device_count;
+    const char * name;
 };
 
 struct ggml_backend_webgpu_device_context {
@@ -152,63 +166,71 @@ struct ggml_backend_webgpu_buffer_context {
     wgpu::Buffer buffer;
 
     ggml_backend_webgpu_buffer_context(webgpu_context ctx, wgpu::Buffer buf) :
-        webgpu_ctx(std::move(ctx)), buffer(std::move(buf)) {
-    }
+        webgpu_ctx(std::move(ctx)),
+        buffer(std::move(buf)) {}
 };
 
 /* End struct definitions */
 
 /* WebGPU object initializations */
 
-static void ggml_webgpu_create_pipeline(wgpu::Device& device, wgpu::ComputePipeline& pipeline, const char* shader_code, const char* label, const std::vector<wgpu::ConstantEntry>& constants = {}) {
+static void ggml_webgpu_create_pipeline(wgpu::Device &                           device,
+                                        wgpu::ComputePipeline &                  pipeline,
+                                        const char *                             shader_code,
+                                        const char *                             label,
+                                        const std::vector<wgpu::ConstantEntry> & constants = {}) {
     WEBGPU_LOG_DEBUG("ggml_webgpu_create_pipeline()");
     wgpu::ShaderSourceWGSL shader_source;
     shader_source.code = shader_code;
     wgpu::ShaderModuleDescriptor shader_desc;
-    shader_desc.nextInChain = &shader_source;
+    shader_desc.nextInChain          = &shader_source;
     wgpu::ShaderModule shader_module = device.CreateShaderModule(&shader_desc);
 
     wgpu::ComputePipelineDescriptor pipeline_desc;
-    pipeline_desc.label = label;
-    pipeline_desc.compute.module = shader_module;
-    pipeline_desc.compute.entryPoint = "main"; // Entry point in the WGSL code
-    pipeline_desc.layout = nullptr; // nullptr means auto layout
+    pipeline_desc.label              = label;
+    pipeline_desc.compute.module     = shader_module;
+    pipeline_desc.compute.entryPoint = "main";   // Entry point in the WGSL code
+    pipeline_desc.layout             = nullptr;  // nullptr means auto layout
     if (constants.size() > 0) {
-        pipeline_desc.compute.constants = constants.data();
+        pipeline_desc.compute.constants     = constants.data();
         pipeline_desc.compute.constantCount = constants.size();
     }
     pipeline = device.CreateComputePipeline(&pipeline_desc);
 }
 
-static void ggml_webgpu_create_buffer(wgpu::Device& device, wgpu::Buffer& buffer, size_t size, wgpu::BufferUsage usage, const char* label) {
+static void ggml_webgpu_create_buffer(wgpu::Device &    device,
+                                      wgpu::Buffer &    buffer,
+                                      size_t            size,
+                                      wgpu::BufferUsage usage,
+                                      const char *      label) {
     WEBGPU_LOG_DEBUG("ggml_webgpu_create_buffer()");
 
     wgpu::BufferDescriptor buffer_desc;
-    buffer_desc.size = size;
-    buffer_desc.usage = usage;
-    buffer_desc.label = label;
+    buffer_desc.size             = size;
+    buffer_desc.usage            = usage;
+    buffer_desc.label            = label;
     buffer_desc.mappedAtCreation = false;
     // TODO: error handling
-    buffer = device.CreateBuffer(&buffer_desc);
+    buffer                       = device.CreateBuffer(&buffer_desc);
 }
 
 /** End WebGPU object initializations */
 
 /** WebGPU Actions */
 
-static void ggml_backend_webgpu_wait_on_submission(webgpu_context& ctx) {
+static void ggml_backend_webgpu_wait_on_submission(webgpu_context & ctx) {
     // Wait for the queue to finish processing all commands
-    ctx->instance.WaitAny(ctx->queue.OnSubmittedWorkDone(wgpu::CallbackMode::AllowSpontaneous,
-        [](wgpu::QueueWorkDoneStatus status, wgpu::StringView message) {
-            if (status != wgpu::QueueWorkDoneStatus::Success) {
-                GGML_LOG_ERROR("ggml_webgpu: Failed to wait on queue: %s\n", message.data);
-            }
-        }),
-        UINT64_MAX
-    );
+    ctx->instance.WaitAny(ctx->queue.OnSubmittedWorkDone(
+                              wgpu::CallbackMode::AllowSpontaneous,
+                              [](wgpu::QueueWorkDoneStatus status, wgpu::StringView message) {
+                                  if (status != wgpu::QueueWorkDoneStatus::Success) {
+                                      GGML_LOG_ERROR("ggml_webgpu: Failed to wait on queue: %s\n", message.data);
+                                  }
+                              }),
+                          UINT64_MAX);
 }
 
-static void ggml_backend_webgpu_submit_queue(webgpu_context& ctx) {
+static void ggml_backend_webgpu_submit_queue(webgpu_context & ctx) {
     std::lock_guard<std::recursive_mutex> lock(ctx->submit_mutex);
 
     ctx->queue.Submit(ctx->staged_command_bufs.size(), ctx->staged_command_bufs.data());
@@ -226,24 +248,34 @@ static void ggml_backend_webgpu_submit_queue(webgpu_context& ctx) {
         });
 }
 
-static void ggml_backend_webgpu_map_buffer(webgpu_context& ctx, wgpu::Buffer& buffer, wgpu::MapMode mode, size_t offset, size_t size) {
-    ctx->instance.WaitAny(buffer.MapAsync(
-        mode, offset, size, wgpu::CallbackMode::AllowSpontaneous,
-        [](wgpu::MapAsyncStatus status, wgpu::StringView message) {
-            if (status != wgpu::MapAsyncStatus::Success) {
-                GGML_LOG_ERROR("ggml_webgpu: Failed to map buffer: %s\n", message.data);
-            }
-        }),
-        UINT64_MAX
-    );
+static void ggml_backend_webgpu_map_buffer(webgpu_context & ctx,
+                                           wgpu::Buffer &   buffer,
+                                           wgpu::MapMode    mode,
+                                           size_t           offset,
+                                           size_t           size) {
+    ctx->instance.WaitAny(buffer.MapAsync(mode,
+                                          offset,
+                                          size,
+                                          wgpu::CallbackMode::AllowSpontaneous,
+                                          [](wgpu::MapAsyncStatus status, wgpu::StringView message) {
+                                              if (status != wgpu::MapAsyncStatus::Success) {
+                                                  GGML_LOG_ERROR("ggml_webgpu: Failed to map buffer: %s\n",
+                                                                 message.data);
+                                              }
+                                          }),
+                          UINT64_MAX);
 }
 
-static void ggml_backend_webgpu_build_and_enqueue(webgpu_context& ctx, wgpu::ComputePipeline& pipeline, std::vector<uint32_t> params, std::vector<wgpu::BindGroupEntry> bind_group_entries, uint32_t wg_x, bool submit_imm = false) {
+static void ggml_backend_webgpu_build_and_enqueue(webgpu_context &                  ctx,
+                                                  wgpu::ComputePipeline &           pipeline,
+                                                  std::vector<uint32_t>             params,
+                                                  std::vector<wgpu::BindGroupEntry> bind_group_entries,
+                                                  uint32_t                          wg_x,
+                                                  bool                              submit_imm = false) {
     webgpu_param_bufs params_bufs = ctx->param_buf_pool.alloc_bufs();
 
-    ggml_backend_webgpu_map_buffer(ctx, params_bufs.host_buf,
-        wgpu::MapMode::Write, 0, params_bufs.host_buf.GetSize());
-    uint32_t* _params = (uint32_t*)params_bufs.host_buf.GetMappedRange();
+    ggml_backend_webgpu_map_buffer(ctx, params_bufs.host_buf, wgpu::MapMode::Write, 0, params_bufs.host_buf.GetSize());
+    uint32_t * _params = (uint32_t *) params_bufs.host_buf.GetMappedRange();
     for (size_t i = 0; i < params.size(); i++) {
         _params[i] = params[i];
     };
@@ -251,42 +283,36 @@ static void ggml_backend_webgpu_build_and_enqueue(webgpu_context& ctx, wgpu::Com
     params_bufs.host_buf.Unmap();
 
     uint32_t params_bufs_binding_num = bind_group_entries.size();
-    bind_group_entries.push_back({
-        .binding = params_bufs_binding_num,
-        .buffer = params_bufs.dev_buf,
-        .offset = 0,
-        .size = params_bufs.dev_buf.GetSize()
-    });
+    bind_group_entries.push_back({ .binding = params_bufs_binding_num,
+                                   .buffer  = params_bufs.dev_buf,
+                                   .offset  = 0,
+                                   .size    = params_bufs.dev_buf.GetSize() });
 
     wgpu::BindGroupDescriptor bind_group_desc;
-    bind_group_desc.layout = pipeline.GetBindGroupLayout(0);
+    bind_group_desc.layout     = pipeline.GetBindGroupLayout(0);
     bind_group_desc.entryCount = bind_group_entries.size();
-    bind_group_desc.entries = bind_group_entries.data();
+    bind_group_desc.entries    = bind_group_entries.data();
     wgpu::BindGroup bind_group = ctx->device.CreateBindGroup(&bind_group_desc);
 
     wgpu::CommandEncoder encoder = ctx->device.CreateCommandEncoder();
-    encoder.CopyBufferToBuffer(
-        params_bufs.host_buf, 0,
-        params_bufs.dev_buf, 0,
-        params_bufs.dev_buf.GetSize()
-    );
+    encoder.CopyBufferToBuffer(params_bufs.host_buf, 0, params_bufs.dev_buf, 0, params_bufs.dev_buf.GetSize());
     wgpu::ComputePassEncoder pass = encoder.BeginComputePass();
     pass.SetPipeline(pipeline);
     pass.SetBindGroup(0, bind_group);
     pass.DispatchWorkgroups(wg_x, 1, 1);
     pass.End();
-    wgpu::CommandBuffer commands =  encoder.Finish();
+    wgpu::CommandBuffer commands = encoder.Finish();
     if (submit_imm) {
         // Submit immediately
         ctx->queue.Submit(1, &commands);
-        ctx->queue.OnSubmittedWorkDone(
-            wgpu::CallbackMode::AllowSpontaneous,
-            [ctx, params_bufs](wgpu::QueueWorkDoneStatus status, wgpu::StringView message) {
-                if (status != wgpu::QueueWorkDoneStatus::Success) {
-                    GGML_LOG_ERROR("ggml_webgpu: Failed to submit commands: %s\n", message.data);
-                }
-                ctx->param_buf_pool.free_bufs({params_bufs});
-        });
+        ctx->queue.OnSubmittedWorkDone(wgpu::CallbackMode::AllowSpontaneous,
+                                       [ctx, params_bufs](wgpu::QueueWorkDoneStatus status, wgpu::StringView message) {
+                                           if (status != wgpu::QueueWorkDoneStatus::Success) {
+                                               GGML_LOG_ERROR("ggml_webgpu: Failed to submit commands: %s\n",
+                                                              message.data);
+                                           }
+                                           ctx->param_buf_pool.free_bufs({ params_bufs });
+                                       });
     } else {
         // Enqueue commands and only submit if we have enough staged commands
         std::lock_guard<std::recursive_mutex> lock(ctx->submit_mutex);
@@ -298,20 +324,26 @@ static void ggml_backend_webgpu_build_and_enqueue(webgpu_context& ctx, wgpu::Com
     }
 }
 
-static void ggml_backend_webgpu_buffer_memset(webgpu_context& ctx, wgpu::Buffer& buf, uint32_t value, size_t offset, size_t size) {
-    std::vector<uint32_t> params = {(uint32_t)offset, (uint32_t)size, value};
-    std::vector<wgpu::BindGroupEntry> entries = {{ .binding = 0, .buffer = buf, .offset = 0, .size = buf.GetSize() }};
-    size_t bytes_per_wg = ctx->limits.maxComputeWorkgroupSizeX * ctx->memset_bytes_per_thread;
-    uint32_t wg_x = ((size + 3) + bytes_per_wg - 1) / bytes_per_wg;
+static void ggml_backend_webgpu_buffer_memset(webgpu_context & ctx,
+                                              wgpu::Buffer &   buf,
+                                              uint32_t         value,
+                                              size_t           offset,
+                                              size_t           size) {
+    std::vector<uint32_t>             params  = { (uint32_t) offset, (uint32_t) size, value };
+    std::vector<wgpu::BindGroupEntry> entries = {
+        { .binding = 0, .buffer = buf, .offset = 0, .size = buf.GetSize() }
+    };
+    size_t   bytes_per_wg = ctx->limits.maxComputeWorkgroupSizeX * ctx->memset_bytes_per_thread;
+    uint32_t wg_x         = ((size + 3) + bytes_per_wg - 1) / bytes_per_wg;
     ggml_backend_webgpu_build_and_enqueue(ctx, ctx->memset_pipeline, params, entries, wg_x, true);
 }
 
-static size_t ggml_backend_webgpu_tensor_offset(const ggml_tensor* tensor) {
+static size_t ggml_backend_webgpu_tensor_offset(const ggml_tensor * tensor) {
     return webgpu_tensor_offset(tensor) + tensor->view_offs;
 }
 
-static wgpu::Buffer ggml_backend_webgpu_tensor_buf(const ggml_tensor* tensor) {
-    ggml_backend_webgpu_buffer_context* ctx = (ggml_backend_webgpu_buffer_context*)tensor->buffer->context;
+static wgpu::Buffer ggml_backend_webgpu_tensor_buf(const ggml_tensor * tensor) {
+    ggml_backend_webgpu_buffer_context * ctx = (ggml_backend_webgpu_buffer_context *) tensor->buffer->context;
     return ctx->buffer;
 }
 
@@ -319,112 +351,139 @@ static wgpu::Buffer ggml_backend_webgpu_tensor_buf(const ggml_tensor* tensor) {
 
 /** GGML Backend Interface */
 
-static const char* ggml_backend_webgpu_name(ggml_backend_t backend) {
-    ggml_backend_webgpu_context* ctx = (ggml_backend_webgpu_context*)backend->context;
+static const char * ggml_backend_webgpu_name(ggml_backend_t backend) {
+    ggml_backend_webgpu_context * ctx = (ggml_backend_webgpu_context *) backend->context;
     return ctx->name.c_str();
 }
 
 static void ggml_backend_webgpu_free(ggml_backend_t backend) {
-    ggml_backend_webgpu_context* ctx = (ggml_backend_webgpu_context*)backend->context;
+    ggml_backend_webgpu_context * ctx = (ggml_backend_webgpu_context *) backend->context;
     WEBGPU_LOG_DEBUG("ggml_backend_webgpu_free(" << ctx->name << ")");
 
     // TODO: cleanup
     GGML_UNUSED(ctx);
 }
 
-static void ggml_webgpu_cpy(webgpu_context& ctx, ggml_tensor* src, ggml_tensor* dst) {
-    size_t src_offset = ggml_backend_webgpu_tensor_offset(src);
+static void ggml_webgpu_cpy(webgpu_context & ctx, ggml_tensor * src, ggml_tensor * dst) {
+    size_t src_offset       = ggml_backend_webgpu_tensor_offset(src);
     // assumes power of 2 offset alignment
     size_t src_misalignment = src_offset & (ctx->limits.minStorageBufferOffsetAlignment - 1);
     // align to minimum offset alignment
     src_offset &= ~(ctx->limits.minStorageBufferOffsetAlignment - 1);
-    size_t dst_offset = ggml_backend_webgpu_tensor_offset(dst);
+    size_t dst_offset       = ggml_backend_webgpu_tensor_offset(dst);
     size_t dst_misalignment = dst_offset & (ctx->limits.minStorageBufferOffsetAlignment - 1);
     dst_offset &= ~(ctx->limits.minStorageBufferOffsetAlignment - 1);
-    uint32_t ne = (uint32_t)ggml_nelements(dst);
-    std::vector<uint32_t> params = {
-        ne, (uint32_t)(src_misalignment / ggml_type_size(src->type)), (uint32_t)(dst_misalignment / ggml_type_size(dst->type)),
-        // Convert byte-strides to element-strides
-        (uint32_t)(src->nb[0] / ggml_type_size(src->type)), (uint32_t)(src->nb[1] / ggml_type_size(src->type)),
-        (uint32_t)(src->nb[2] / ggml_type_size(src->type)), (uint32_t)(src->nb[3] / ggml_type_size(src->type)),
-        (uint32_t)(dst->nb[0] / ggml_type_size(dst->type)), (uint32_t)(dst->nb[1] / ggml_type_size(dst->type)),
-        (uint32_t)(dst->nb[2] / ggml_type_size(dst->type)), (uint32_t)(dst->nb[3] / ggml_type_size(dst->type)),
-        // Logical shape — same for both tensors even if permuted
-        (uint32_t)src->ne[0], (uint32_t)src->ne[1], (uint32_t)src->ne[2], (uint32_t)src->ne[3]
-    };
+    uint32_t              ne     = (uint32_t) ggml_nelements(dst);
+    std::vector<uint32_t> params = { ne,
+                                     (uint32_t) (src_misalignment / ggml_type_size(src->type)),
+                                     (uint32_t) (dst_misalignment / ggml_type_size(dst->type)),
+                                     // Convert byte-strides to element-strides
+                                     (uint32_t) (src->nb[0] / ggml_type_size(src->type)),
+                                     (uint32_t) (src->nb[1] / ggml_type_size(src->type)),
+                                     (uint32_t) (src->nb[2] / ggml_type_size(src->type)),
+                                     (uint32_t) (src->nb[3] / ggml_type_size(src->type)),
+                                     (uint32_t) (dst->nb[0] / ggml_type_size(dst->type)),
+                                     (uint32_t) (dst->nb[1] / ggml_type_size(dst->type)),
+                                     (uint32_t) (dst->nb[2] / ggml_type_size(dst->type)),
+                                     (uint32_t) (dst->nb[3] / ggml_type_size(dst->type)),
+                                     // Logical shape — same for both tensors even if permuted
+                                     (uint32_t) src->ne[0],
+                                     (uint32_t) src->ne[1],
+                                     (uint32_t) src->ne[2],
+                                     (uint32_t) src->ne[3] };
 
     std::vector<wgpu::BindGroupEntry> entries = {
-        { .binding = 0, .buffer = ggml_backend_webgpu_tensor_buf(src), .offset = src_offset, .size = (ggml_nbytes(src) + src_misalignment + WEBGPU_STORAGE_BUF_BINDING_MULT - 1) & ~(WEBGPU_STORAGE_BUF_BINDING_MULT - 1) },
-        { .binding = 1, .buffer = ggml_backend_webgpu_tensor_buf(dst), .offset = dst_offset, .size = (ggml_nbytes(dst) + dst_misalignment + WEBGPU_STORAGE_BUF_BINDING_MULT - 1) & ~(WEBGPU_STORAGE_BUF_BINDING_MULT - 1) }
+        { .binding = 0,
+         .buffer  = ggml_backend_webgpu_tensor_buf(src),
+         .offset  = src_offset,
+         .size    = (ggml_nbytes(src) + src_misalignment + WEBGPU_STORAGE_BUF_BINDING_MULT - 1) &
+                  ~(WEBGPU_STORAGE_BUF_BINDING_MULT - 1) },
+        { .binding = 1,
+         .buffer  = ggml_backend_webgpu_tensor_buf(dst),
+         .offset  = dst_offset,
+         .size    = (ggml_nbytes(dst) + dst_misalignment + WEBGPU_STORAGE_BUF_BINDING_MULT - 1) &
+                  ~(WEBGPU_STORAGE_BUF_BINDING_MULT - 1) }
     };
 
-    size_t max_wg_size = ctx->limits.maxComputeWorkgroupSizeX;
-    uint32_t wg_x = (ne + max_wg_size - 1) / max_wg_size;
+    size_t   max_wg_size = ctx->limits.maxComputeWorkgroupSizeX;
+    uint32_t wg_x        = (ne + max_wg_size - 1) / max_wg_size;
     ggml_backend_webgpu_build_and_enqueue(ctx, ctx->cpy_pipeline, params, entries, wg_x);
 }
 
-static void ggml_webgpu_mul_mat(webgpu_context& ctx, ggml_tensor* src0, ggml_tensor* src1, ggml_tensor* dst) {
+static void ggml_webgpu_mul_mat(webgpu_context & ctx, ggml_tensor * src0, ggml_tensor * src1, ggml_tensor * dst) {
     std::vector<uint32_t> params = {
-        (uint32_t)dst->ne[1], // number of rows in result (M)
-        (uint32_t)dst->ne[0], // number of columns in result (N)
-        (uint32_t)src0->ne[0], // number of columns in src0/src1 (K)
-        (uint32_t)(src0->nb[1] / ggml_type_size(src0->type)), // stride (elements) of src0 in dimension 1
-        (uint32_t)(src1->nb[1] / ggml_type_size(src1->type)), // stride (elements) of src1 in dimension 1
-        (uint32_t)(src0->nb[2] / ggml_type_size(src0->type)), // stride (elements) of src0 in dimension 2
-        (uint32_t)(src1->nb[2] / ggml_type_size(src1->type)), // stride (elements) of src1 in dimension 2
-        (uint32_t)(src0->nb[3] / ggml_type_size(src0->type)), // stride (elements) of src0 in dimension 3
-        (uint32_t)(src1->nb[3] / ggml_type_size(src1->type)), // stride (elements) of src1 in dimension 3
-        (uint32_t)src0->ne[2], // batch size in dimension 2
-        (uint32_t)src0->ne[3], // batch size in dimension 3
-        (uint32_t)(src1->ne[2] / src0->ne[2]), // broadcast in dimension 2
-        (uint32_t)(src1->ne[3] / src0->ne[3]) // broadcast in dimension 3
+        (uint32_t) dst->ne[1],                                  // number of rows in result (M)
+        (uint32_t) dst->ne[0],                                  // number of columns in result (N)
+        (uint32_t) src0->ne[0],                                 // number of columns in src0/src1 (K)
+        (uint32_t) (src0->nb[1] / ggml_type_size(src0->type)),  // stride (elements) of src0 in dimension 1
+        (uint32_t) (src1->nb[1] / ggml_type_size(src1->type)),  // stride (elements) of src1 in dimension 1
+        (uint32_t) (src0->nb[2] / ggml_type_size(src0->type)),  // stride (elements) of src0 in dimension 2
+        (uint32_t) (src1->nb[2] / ggml_type_size(src1->type)),  // stride (elements) of src1 in dimension 2
+        (uint32_t) (src0->nb[3] / ggml_type_size(src0->type)),  // stride (elements) of src0 in dimension 3
+        (uint32_t) (src1->nb[3] / ggml_type_size(src1->type)),  // stride (elements) of src1 in dimension 3
+        (uint32_t) src0->ne[2],                                 // batch size in dimension 2
+        (uint32_t) src0->ne[3],                                 // batch size in dimension 3
+        (uint32_t) (src1->ne[2] / src0->ne[2]),                 // broadcast in dimension 2
+        (uint32_t) (src1->ne[3] / src0->ne[3])                  // broadcast in dimension 3
     };
 
     std::vector<wgpu::BindGroupEntry> entries = {
-        { .binding = 0, .buffer = ggml_backend_webgpu_tensor_buf(src0), .offset = ggml_backend_webgpu_tensor_offset(src0), .size = ggml_nbytes(src0) },
-        { .binding = 1, .buffer = ggml_backend_webgpu_tensor_buf(src1), .offset = ggml_backend_webgpu_tensor_offset(src1), .size = ggml_nbytes(src1) },
-        { .binding = 2, .buffer = ggml_backend_webgpu_tensor_buf(dst), .offset = ggml_backend_webgpu_tensor_offset(dst), .size = ggml_nbytes(dst) }
+        { .binding = 0,
+         .buffer  = ggml_backend_webgpu_tensor_buf(src0),
+         .offset  = ggml_backend_webgpu_tensor_offset(src0),
+         .size    = ggml_nbytes(src0) },
+        { .binding = 1,
+         .buffer  = ggml_backend_webgpu_tensor_buf(src1),
+         .offset  = ggml_backend_webgpu_tensor_offset(src1),
+         .size    = ggml_nbytes(src1) },
+        { .binding = 2,
+         .buffer  = ggml_backend_webgpu_tensor_buf(dst),
+         .offset  = ggml_backend_webgpu_tensor_offset(dst),
+         .size    = ggml_nbytes(dst)  }
     };
 
-    uint32_t wg_x = (dst->ne[0] * dst->ne[1] * dst->ne[2] * dst->ne[3] + WEBGPU_MUL_MAT_WG_SIZE - 1) / WEBGPU_MUL_MAT_WG_SIZE;
+    uint32_t wg_x =
+        (dst->ne[0] * dst->ne[1] * dst->ne[2] * dst->ne[3] + WEBGPU_MUL_MAT_WG_SIZE - 1) / WEBGPU_MUL_MAT_WG_SIZE;
     ggml_backend_webgpu_build_and_enqueue(ctx, ctx->mul_mat_pipeline, params, entries, wg_x);
 }
 
 // Returns true if node has enqueued work into the queue, false otherwise
-static bool ggml_webgpu_encode_node(webgpu_context ctx, ggml_tensor* node) {
+static bool ggml_webgpu_encode_node(webgpu_context ctx, ggml_tensor * node) {
     if (ggml_is_empty(node)) {
         return false;
     }
     WEBGPU_LOG_DEBUG("ggml_webgpu_encode_node(" << node << ", " << ggml_op_name(node->op) << ")");
 
-    ggml_tensor* src0 = node->src[0];
-    ggml_tensor* src1 = node->src[1];
+    ggml_tensor * src0 = node->src[0];
+    ggml_tensor * src1 = node->src[1];
 
     switch (node->op) {
-        // no-ops
-    case GGML_OP_NONE:
-    case GGML_OP_VIEW:
-    case GGML_OP_PERMUTE:
-        return false;
-    case GGML_OP_CPY: {
-        ggml_webgpu_cpy(ctx, src0, node);
-        break;
-    }
-    case GGML_OP_MUL_MAT: {
-        ggml_webgpu_mul_mat(ctx, src0, src1, node);
-        break;
-    }
-    default:
-        return false;
+            // no-ops
+        case GGML_OP_NONE:
+        case GGML_OP_VIEW:
+        case GGML_OP_PERMUTE:
+            return false;
+        case GGML_OP_CPY:
+            {
+                ggml_webgpu_cpy(ctx, src0, node);
+                break;
+            }
+        case GGML_OP_MUL_MAT:
+            {
+                ggml_webgpu_mul_mat(ctx, src0, src1, node);
+                break;
+            }
+        default:
+            return false;
     }
     return true;
 }
 
-static ggml_status ggml_backend_webgpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph* cgraph) {
+static ggml_status ggml_backend_webgpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
     WEBGPU_LOG_DEBUG("ggml_backend_webgpu_graph_compute(" << cgraph->n_nodes << " nodes)");
 
-    ggml_backend_webgpu_context* backend_ctx = static_cast<ggml_backend_webgpu_context*>(backend->context);
-    webgpu_context ctx = backend_ctx->webgpu_ctx;
+    ggml_backend_webgpu_context * backend_ctx = static_cast<ggml_backend_webgpu_context *>(backend->context);
+    webgpu_context                ctx         = backend_ctx->webgpu_ctx;
 
     for (int i = 0; i < cgraph->n_nodes; i++) {
         ggml_webgpu_encode_node(ctx, cgraph->nodes[i]);
@@ -458,35 +517,45 @@ static ggml_backend_i ggml_backend_webgpu_i = {
 
 static void ggml_backend_webgpu_buffer_free_buffer(ggml_backend_buffer_t buffer) {
     WEBGPU_LOG_DEBUG("ggml_backend_webgpu_buffer_free_buffer()");
-    ggml_backend_webgpu_buffer_context* ctx = static_cast<ggml_backend_webgpu_buffer_context*>(buffer->context);
+    ggml_backend_webgpu_buffer_context * ctx = static_cast<ggml_backend_webgpu_buffer_context *>(buffer->context);
     ctx->buffer.Destroy();
 }
 
 // Returns the "fake" base pointer.
-static void* ggml_backend_webgpu_buffer_get_base(ggml_backend_buffer_t buffer) {
+static void * ggml_backend_webgpu_buffer_get_base(ggml_backend_buffer_t buffer) {
     GGML_UNUSED(buffer);
     return webgpu_ptr_base;
 }
 
-static void ggml_backend_webgpu_buffer_memset_tensor(ggml_backend_buffer_t buffer, ggml_tensor* tensor, uint8_t value, size_t offset, size_t size) {
+static void ggml_backend_webgpu_buffer_memset_tensor(ggml_backend_buffer_t buffer,
+                                                     ggml_tensor *         tensor,
+                                                     uint8_t               value,
+                                                     size_t                offset,
+                                                     size_t                size) {
     if (size == 0) {
         WEBGPU_LOG_DEBUG("ggml_backend_webgpu_buffer_memset_tensor: size is zero, nothing to do.");
         return;
     }
 
-    WEBGPU_LOG_DEBUG("ggml_backend_webgpu_buffer_memset_tensor(" << buffer << ", " << tensor << ", " << value << ", " << offset << ", " << size << ")");
+    WEBGPU_LOG_DEBUG("ggml_backend_webgpu_buffer_memset_tensor(" << buffer << ", " << tensor << ", " << value << ", "
+                                                                 << offset << ", " << size << ")");
 
-    ggml_backend_webgpu_buffer_context* buf_ctx = (ggml_backend_webgpu_buffer_context*)buffer->context;
-    size_t total_offset = webgpu_tensor_offset(tensor) + tensor->view_offs + offset;
+    ggml_backend_webgpu_buffer_context * buf_ctx      = (ggml_backend_webgpu_buffer_context *) buffer->context;
+    size_t                               total_offset = webgpu_tensor_offset(tensor) + tensor->view_offs + offset;
     // This is a trick to set all bytes of a u32 to the same 1 byte value.
-    uint32_t val32 = (uint32_t)value * 0x01010101;
+    uint32_t                             val32        = (uint32_t) value * 0x01010101;
     ggml_backend_webgpu_buffer_memset(buf_ctx->webgpu_ctx, buf_ctx->buffer, val32, total_offset, size);
 }
 
-static void ggml_backend_webgpu_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor* tensor, const void* data, size_t offset, size_t size) {
-    WEBGPU_LOG_DEBUG("ggml_backend_webgpu_buffer_set_tensor(" << buffer << ", " << tensor << ", " << data << ", " << offset << ", " << size << ")");
-    ggml_backend_webgpu_buffer_context* buf_ctx = (ggml_backend_webgpu_buffer_context*)buffer->context;
-    webgpu_context webgpu_ctx = buf_ctx->webgpu_ctx;
+static void ggml_backend_webgpu_buffer_set_tensor(ggml_backend_buffer_t buffer,
+                                                  ggml_tensor *         tensor,
+                                                  const void *          data,
+                                                  size_t                offset,
+                                                  size_t                size) {
+    WEBGPU_LOG_DEBUG("ggml_backend_webgpu_buffer_set_tensor(" << buffer << ", " << tensor << ", " << data << ", "
+                                                              << offset << ", " << size << ")");
+    ggml_backend_webgpu_buffer_context * buf_ctx    = (ggml_backend_webgpu_buffer_context *) buffer->context;
+    webgpu_context                       webgpu_ctx = buf_ctx->webgpu_ctx;
 
     size_t total_offset = webgpu_tensor_offset(tensor) + tensor->view_offs + offset;
 
@@ -494,23 +563,29 @@ static void ggml_backend_webgpu_buffer_set_tensor(ggml_backend_buffer_t buffer,
 
     if (size % 4 != 0) {
         // If size is not a multiple of 4, we need to memset the remaining bytes
-        size_t remaining_size = size % 4;
+        size_t   remaining_size = size % 4;
         // pack the remaining bytes into a uint32_t
-        uint32_t val32 = 0;
+        uint32_t val32          = 0;
         for (size_t i = 0; i < remaining_size; i++) {
-            ((uint8_t*)&val32)[i] = ((const uint8_t*)data)[size - remaining_size + i];
+            ((uint8_t *) &val32)[i] = ((const uint8_t *) data)[size - remaining_size + i];
         }
         // memset the remaining bytes
-        ggml_backend_webgpu_buffer_memset(webgpu_ctx, buf_ctx->buffer, val32, total_offset + (size - remaining_size), remaining_size);
+        ggml_backend_webgpu_buffer_memset(
+            webgpu_ctx, buf_ctx->buffer, val32, total_offset + (size - remaining_size), remaining_size);
     }
 }
 
-static void ggml_backend_webgpu_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor* tensor, void* data, size_t offset, size_t size) {
-    WEBGPU_LOG_DEBUG("ggml_backend_webgpu_buffer_get_tensor(" << buffer << ", " << tensor << ", " << data << ", " << offset << ", " << size << ")");
+static void ggml_backend_webgpu_buffer_get_tensor(ggml_backend_buffer_t buffer,
+                                                  const ggml_tensor *   tensor,
+                                                  void *                data,
+                                                  size_t                offset,
+                                                  size_t                size) {
+    WEBGPU_LOG_DEBUG("ggml_backend_webgpu_buffer_get_tensor(" << buffer << ", " << tensor << ", " << data << ", "
+                                                              << offset << ", " << size << ")");
 
-    ggml_backend_webgpu_buffer_context* buf_ctx = (ggml_backend_webgpu_buffer_context*)buffer->context;
-    webgpu_context webgpu_ctx = buf_ctx->webgpu_ctx;
-    wgpu::Device device = webgpu_ctx->device;
+    ggml_backend_webgpu_buffer_context * buf_ctx    = (ggml_backend_webgpu_buffer_context *) buffer->context;
+    webgpu_context                       webgpu_ctx = buf_ctx->webgpu_ctx;
+    wgpu::Device                         device     = webgpu_ctx->device;
 
     size_t total_offset = webgpu_tensor_offset(tensor) + tensor->view_offs + offset;
 
@@ -522,14 +597,16 @@ static void ggml_backend_webgpu_buffer_get_tensor(ggml_backend_buffer_t buffer,
 
     std::lock_guard<std::mutex> lock(webgpu_ctx->get_tensor_mutex);
 
-    if (webgpu_ctx->get_tensor_staging_buf == nullptr ||
-        webgpu_ctx->get_tensor_staging_buf.GetSize() < final_size) {
+    if (webgpu_ctx->get_tensor_staging_buf == nullptr || webgpu_ctx->get_tensor_staging_buf.GetSize() < final_size) {
         // Create a new staging buffer if it doesn't exist or is too small
         if (webgpu_ctx->get_tensor_staging_buf) {
             webgpu_ctx->get_tensor_staging_buf.Destroy();
         }
-        ggml_webgpu_create_buffer(device, webgpu_ctx->get_tensor_staging_buf, final_size,
-            wgpu::BufferUsage::CopyDst | wgpu::BufferUsage::MapRead, "get_tensor_staging_buf");
+        ggml_webgpu_create_buffer(device,
+                                  webgpu_ctx->get_tensor_staging_buf,
+                                  final_size,
+                                  wgpu::BufferUsage::CopyDst | wgpu::BufferUsage::MapRead,
+                                  "get_tensor_staging_buf");
     }
 
     // Copy the data from the buffer to the staging buffer
@@ -542,7 +619,7 @@ static void ggml_backend_webgpu_buffer_get_tensor(ggml_backend_buffer_t buffer,
     // Map the staging buffer to read the data
     ggml_backend_webgpu_map_buffer(webgpu_ctx, webgpu_ctx->get_tensor_staging_buf, wgpu::MapMode::Read, 0, final_size);
     // Must specify size here since the staging buffer might be larger than the tensor size
-    const void* mapped_range = webgpu_ctx->get_tensor_staging_buf.GetConstMappedRange(0, final_size);
+    const void * mapped_range = webgpu_ctx->get_tensor_staging_buf.GetConstMappedRange(0, final_size);
 
     // Copy the data from the mapped range to the output buffer
     std::memcpy(data, mapped_range, size);
@@ -550,54 +627,58 @@ static void ggml_backend_webgpu_buffer_get_tensor(ggml_backend_buffer_t buffer,
 }
 
 static void ggml_backend_webgpu_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
-    WEBGPU_LOG_DEBUG("ggml_backend_webgpu_buffer_clear(" << buffer << ", " << (uint32_t)value << ")");
+    WEBGPU_LOG_DEBUG("ggml_backend_webgpu_buffer_clear(" << buffer << ", " << (uint32_t) value << ")");
 
-    ggml_backend_webgpu_buffer_context* buf_ctx = (ggml_backend_webgpu_buffer_context*)buffer->context;
+    ggml_backend_webgpu_buffer_context * buf_ctx = (ggml_backend_webgpu_buffer_context *) buffer->context;
     ggml_backend_webgpu_buffer_memset(buf_ctx->webgpu_ctx, buf_ctx->buffer, value, 0, buffer->size);
 }
 
 static ggml_backend_buffer_i ggml_backend_webgpu_buffer_interface = {
     /* .free_buffer     = */ ggml_backend_webgpu_buffer_free_buffer,
     /* .get_base        = */ ggml_backend_webgpu_buffer_get_base,
-    /* .init_tensor     = */ NULL, // TODO: optional, needed?
+    /* .init_tensor     = */ NULL,  // TODO: optional, needed?
     /* .memset_tensor   = */ ggml_backend_webgpu_buffer_memset_tensor,
     /* .set_tensor      = */ ggml_backend_webgpu_buffer_set_tensor,
     /* .get_tensor      = */ ggml_backend_webgpu_buffer_get_tensor,
-    /* .cpy_tensor      = */ NULL, // TODO: optional, implement this
+    /* .cpy_tensor      = */ NULL,  // TODO: optional, implement this
     /* .clear           = */ ggml_backend_webgpu_buffer_clear,
-    /* .reset           = */ NULL, // TODO: optional, think it coordinates with .init_tensor
+    /* .reset           = */ NULL,  // TODO: optional, think it coordinates with .init_tensor
 };
 
 /* End GGML Backend Buffer Interface */
 
 /* GGML Backend Buffer Type Interface */
 
-static const char* ggml_backend_webgpu_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
-    ggml_backend_webgpu_device_context* ctx = static_cast<ggml_backend_webgpu_device_context*>(buft->device->context);
+static const char * ggml_backend_webgpu_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
+    ggml_backend_webgpu_device_context * ctx = static_cast<ggml_backend_webgpu_device_context *>(buft->device->context);
     return ctx->device_name.c_str();
 }
 
-static ggml_backend_buffer_t ggml_backend_webgpu_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
+static ggml_backend_buffer_t ggml_backend_webgpu_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft,
+                                                                          size_t                     size) {
     WEBGPU_LOG_DEBUG("ggml_backend_webgpu_buffer_type_alloc_buffer(" << size << ")");
-    ggml_backend_webgpu_device_context* ctx = static_cast<ggml_backend_webgpu_device_context*>(buft->device->context);
+    ggml_backend_webgpu_device_context * ctx = static_cast<ggml_backend_webgpu_device_context *>(buft->device->context);
 
     wgpu::Buffer buf;
-    ggml_webgpu_create_buffer(ctx->webgpu_ctx->device, buf, size,
-        wgpu::BufferUsage::Storage | wgpu::BufferUsage::CopySrc | wgpu::BufferUsage::CopyDst, "allocated_buffer");
+    ggml_webgpu_create_buffer(ctx->webgpu_ctx->device,
+                              buf,
+                              size,
+                              wgpu::BufferUsage::Storage | wgpu::BufferUsage::CopySrc | wgpu::BufferUsage::CopyDst,
+                              "allocated_buffer");
 
-    ggml_backend_webgpu_buffer_context* buf_ctx = new ggml_backend_webgpu_buffer_context(ctx->webgpu_ctx, buf);
+    ggml_backend_webgpu_buffer_context * buf_ctx = new ggml_backend_webgpu_buffer_context(ctx->webgpu_ctx, buf);
 
     return ggml_backend_buffer_init(buft, ggml_backend_webgpu_buffer_interface, buf_ctx, size);
 }
 
 static size_t ggml_backend_webgpu_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
-    ggml_backend_webgpu_device_context* ctx = static_cast<ggml_backend_webgpu_device_context*>(buft->device->context);
+    ggml_backend_webgpu_device_context * ctx = static_cast<ggml_backend_webgpu_device_context *>(buft->device->context);
     return ctx->webgpu_ctx->limits.minStorageBufferOffsetAlignment;
 }
 
 // maxBufferSize might be larger, but you can't bind more than maxStorageBufferBindingSize to a single binding.
 static size_t ggml_backend_webgpu_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) {
-    ggml_backend_webgpu_device_context* ctx = static_cast<ggml_backend_webgpu_device_context*>(buft->device->context);
+    ggml_backend_webgpu_device_context * ctx = static_cast<ggml_backend_webgpu_device_context *>(buft->device->context);
     return ctx->webgpu_ctx->limits.maxStorageBufferBindingSize;
 }
 
@@ -605,21 +686,21 @@ static size_t ggml_backend_webgpu_buffer_type_get_max_size(ggml_backend_buffer_t
 
 /* GGML Backend Device Interface */
 
-static const char* ggml_backend_webgpu_device_get_name(ggml_backend_dev_t dev) {
-    ggml_backend_webgpu_device_context* ctx = static_cast<ggml_backend_webgpu_device_context*>(dev->context);
+static const char * ggml_backend_webgpu_device_get_name(ggml_backend_dev_t dev) {
+    ggml_backend_webgpu_device_context * ctx = static_cast<ggml_backend_webgpu_device_context *>(dev->context);
     return ctx->device_name.c_str();
 }
 
-static const char* ggml_backend_webgpu_device_get_description(ggml_backend_dev_t dev) {
-    ggml_backend_webgpu_device_context* ctx = static_cast<ggml_backend_webgpu_device_context*>(dev->context);
+static const char * ggml_backend_webgpu_device_get_description(ggml_backend_dev_t dev) {
+    ggml_backend_webgpu_device_context * ctx = static_cast<ggml_backend_webgpu_device_context *>(dev->context);
     return ctx->device_desc.c_str();
 }
 
-static void ggml_backend_webgpu_device_get_memory(ggml_backend_dev_t dev, size_t* free, size_t* total) {
-    ggml_backend_webgpu_device_context* ctx = static_cast<ggml_backend_webgpu_device_context*>(dev->context);
+static void ggml_backend_webgpu_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
+    ggml_backend_webgpu_device_context * ctx = static_cast<ggml_backend_webgpu_device_context *>(dev->context);
     // TODO: what do we actually want to return here? maxBufferSize might not be the full available memory.
-    *free = ctx->webgpu_ctx->limits.maxBufferSize;
-    *total = ctx->webgpu_ctx->limits.maxBufferSize;
+    *free                                    = ctx->webgpu_ctx->limits.maxBufferSize;
+    *total                                   = ctx->webgpu_ctx->limits.maxBufferSize;
 }
 
 static enum ggml_backend_dev_type ggml_backend_webgpu_device_get_type(ggml_backend_dev_t dev) {
@@ -627,10 +708,10 @@ static enum ggml_backend_dev_type ggml_backend_webgpu_device_get_type(ggml_backe
     return GGML_BACKEND_DEVICE_TYPE_GPU;
 }
 
-static void ggml_backend_webgpu_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props* props) {
-    props->name = ggml_backend_webgpu_device_get_name(dev);
+static void ggml_backend_webgpu_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) {
+    props->name        = ggml_backend_webgpu_device_get_name(dev);
     props->description = ggml_backend_webgpu_device_get_description(dev);
-    props->type = ggml_backend_webgpu_device_get_type(dev);
+    props->type        = ggml_backend_webgpu_device_get_type(dev);
     ggml_backend_webgpu_device_get_memory(dev, &props->memory_free, &props->memory_total);
     props->caps = {
         /* .async                 = */ false,
@@ -641,71 +722,77 @@ static void ggml_backend_webgpu_device_get_props(ggml_backend_dev_t dev, struct
 }
 
 static ggml_guid_t ggml_backend_webgpu_guid(void) {
-    static const char* guid_str = "__ggml_webgpu :)";
-    return reinterpret_cast<ggml_guid_t>((void*)guid_str);
+    static const char * guid_str = "__ggml_webgpu :)";
+    return reinterpret_cast<ggml_guid_t>((void *) guid_str);
 }
 
-static void ggml_webgpu_init_memset_pipeline(webgpu_context& webgpu_ctx) {
+static void ggml_webgpu_init_memset_pipeline(webgpu_context & webgpu_ctx) {
     // we use the maximum workgroup size for the memset pipeline
     size_t max_wg_size = webgpu_ctx->limits.maxComputeWorkgroupSizeX;
     size_t max_threads = max_wg_size * webgpu_ctx->limits.maxComputeWorkgroupsPerDimension;
     // Size the bytes_per_thread so that the largest buffer size can be handled
-    webgpu_ctx->memset_bytes_per_thread = (webgpu_ctx->limits.maxStorageBufferBindingSize + max_threads - 1) / max_threads;
+    webgpu_ctx->memset_bytes_per_thread =
+        (webgpu_ctx->limits.maxStorageBufferBindingSize + max_threads - 1) / max_threads;
     std::vector<wgpu::ConstantEntry> constants(2);
-    constants[0].key = "wg_size";
+    constants[0].key   = "wg_size";
     constants[0].value = max_wg_size;
-    constants[1].key = "bytes_per_thread";
+    constants[1].key   = "bytes_per_thread";
     constants[1].value = webgpu_ctx->memset_bytes_per_thread;
     ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->memset_pipeline, wgsl_memset, "memset", constants);
 }
 
-static void ggml_webgpu_init_mul_mat_pipeline(webgpu_context& webgpu_ctx) {
+static void ggml_webgpu_init_mul_mat_pipeline(webgpu_context & webgpu_ctx) {
     ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->mul_mat_pipeline, wgsl_mul_mat, "mul_mat");
 }
 
-static void ggml_webgpu_init_cpy_pipeline(webgpu_context& webgpu_ctx) {
+static void ggml_webgpu_init_cpy_pipeline(webgpu_context & webgpu_ctx) {
     std::vector<wgpu::ConstantEntry> constants(1);
-    constants[0].key = "wg_size";
+    constants[0].key   = "wg_size";
     constants[0].value = webgpu_ctx->limits.maxComputeWorkgroupSizeX;
     ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->cpy_pipeline, wgsl_cpy, "cpy", constants);
 }
 
-static ggml_backend_t ggml_backend_webgpu_device_init(ggml_backend_dev_t dev, const char* params) {
+static ggml_backend_t ggml_backend_webgpu_device_init(ggml_backend_dev_t dev, const char * params) {
     GGML_UNUSED(params);
 
     WEBGPU_LOG_DEBUG("ggml_backend_webgpu_device_init()");
 
-    ggml_backend_webgpu_device_context* dev_ctx = static_cast<ggml_backend_webgpu_device_context*>(dev->context);
-    webgpu_context webgpu_ctx = dev_ctx->webgpu_ctx;
+    ggml_backend_webgpu_device_context * dev_ctx    = static_cast<ggml_backend_webgpu_device_context *>(dev->context);
+    webgpu_context                       webgpu_ctx = dev_ctx->webgpu_ctx;
 
     // Multiple threads may try to initialize the device
     std::lock_guard<std::mutex> lock(webgpu_ctx->init_mutex);
     if (!webgpu_ctx->device_init) {
         // Initialize device
         wgpu::DeviceDescriptor dev_desc;
-        dev_desc.requiredLimits = &webgpu_ctx->limits;
-        dev_desc.requiredFeatures = webgpu_ctx->features.features;
+        dev_desc.requiredLimits       = &webgpu_ctx->limits;
+        dev_desc.requiredFeatures     = webgpu_ctx->features.features;
         dev_desc.requiredFeatureCount = webgpu_ctx->features.featureCount;
-        dev_desc.SetDeviceLostCallback(wgpu::CallbackMode::AllowSpontaneous,
-            [](const wgpu::Device& device, wgpu::DeviceLostReason reason, wgpu::StringView message) {
+        dev_desc.SetDeviceLostCallback(
+            wgpu::CallbackMode::AllowSpontaneous,
+            [](const wgpu::Device & device, wgpu::DeviceLostReason reason, wgpu::StringView message) {
                 GGML_UNUSED(device);
-                GGML_LOG_ERROR("ggml_webgpu: Device lost! Reason: %d, Message: %s\n", static_cast<int>(reason), message.data);
+                GGML_LOG_ERROR(
+                    "ggml_webgpu: Device lost! Reason: %d, Message: %s\n", static_cast<int>(reason), message.data);
             });
         dev_desc.SetUncapturedErrorCallback(
-            [](const wgpu::Device& device, wgpu::ErrorType reason, wgpu::StringView message) {
+            [](const wgpu::Device & device, wgpu::ErrorType reason, wgpu::StringView message) {
                 GGML_UNUSED(device);
-                GGML_LOG_ERROR("ggml_webgpu: Device error! Reason: %d, Message: %s\n", static_cast<int>(reason), message.data);
+                GGML_LOG_ERROR(
+                    "ggml_webgpu: Device error! Reason: %d, Message: %s\n", static_cast<int>(reason), message.data);
             });
-        webgpu_ctx->instance.WaitAny(webgpu_ctx->adapter.RequestDevice(&dev_desc, wgpu::CallbackMode::AllowSpontaneous,
-            [webgpu_ctx](wgpu::RequestDeviceStatus status, wgpu::Device device, wgpu::StringView message) {
-                if (status != wgpu::RequestDeviceStatus::Success) {
-                    GGML_LOG_ERROR("ggml_webgpu: Failed to get a device: %s\n", message.data);
-                    return;
-                }
-                webgpu_ctx->device = std::move(device);
-            }),
-            UINT64_MAX
-        );
+        webgpu_ctx->instance.WaitAny(
+            webgpu_ctx->adapter.RequestDevice(
+                &dev_desc,
+                wgpu::CallbackMode::AllowSpontaneous,
+                [webgpu_ctx](wgpu::RequestDeviceStatus status, wgpu::Device device, wgpu::StringView message) {
+                    if (status != wgpu::RequestDeviceStatus::Success) {
+                        GGML_LOG_ERROR("ggml_webgpu: Failed to get a device: %s\n", message.data);
+                        return;
+                    }
+                    webgpu_ctx->device = std::move(device);
+                }),
+            UINT64_MAX);
         GGML_ASSERT(webgpu_ctx->device != nullptr);
 
         // Initialize (compute) queue
@@ -721,7 +808,7 @@ static ggml_backend_t ggml_backend_webgpu_device_init(ggml_backend_dev_t dev, co
     }
 
     static ggml_backend_webgpu_context backend_ctx;
-    backend_ctx.name = GGML_WEBGPU_NAME + std::string(": ") + dev_ctx->device_name;
+    backend_ctx.name       = GGML_WEBGPU_NAME + std::string(": ") + dev_ctx->device_name;
     backend_ctx.webgpu_ctx = webgpu_ctx;
 
     // See GGML Backend Interface section
@@ -739,14 +826,15 @@ static ggml_backend_buffer_type_t ggml_backend_webgpu_device_get_buffer_type(ggm
     // See GGML Backend Buffer Type Interface section
     static struct ggml_backend_buffer_type ggml_backend_webgpu_buffer_type = {
         /* .iface = */ {
-            /* .get_name         = */ ggml_backend_webgpu_buffer_type_get_name,
-            /* .alloc_buffer     = */ ggml_backend_webgpu_buffer_type_alloc_buffer,
-            /* .get_alignment    = */ ggml_backend_webgpu_buffer_type_get_alignment,
-            /* .get_max_size     = */ ggml_backend_webgpu_buffer_type_get_max_size,
-            /* .get_alloc_size   = */ NULL, // defaults to ggml_nbytes
-            /* .is_host          = */ NULL, // defaults to false
+                        /* .get_name         = */ ggml_backend_webgpu_buffer_type_get_name,
+                        /* .alloc_buffer     = */ ggml_backend_webgpu_buffer_type_alloc_buffer,
+                        /* .get_alignment    = */ ggml_backend_webgpu_buffer_type_get_alignment,
+                        /* .get_max_size     = */ ggml_backend_webgpu_buffer_type_get_max_size,
+                        /* .get_alloc_size   = */ NULL,  // defaults to ggml_nbytes
+            /* .is_host          = */ NULL,  // defaults to false
         },
-        /* .device  = */ dev,
+        /* .device  = */
+        dev,
         /* .context = */ NULL,
     };
 
@@ -755,23 +843,23 @@ static ggml_backend_buffer_type_t ggml_backend_webgpu_device_get_buffer_type(ggm
 
 static bool ggml_backend_webgpu_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
     GGML_UNUSED(dev);
-    return  buft->iface.get_name == ggml_backend_webgpu_buffer_type_get_name;
+    return buft->iface.get_name == ggml_backend_webgpu_buffer_type_get_name;
 }
 
-static bool ggml_backend_webgpu_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor* op) {
+static bool ggml_backend_webgpu_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
     GGML_UNUSED(dev);
 
     switch (op->op) {
-    case GGML_OP_NONE:
-    case GGML_OP_VIEW:
-    case GGML_OP_PERMUTE:
-        return true;
-    case GGML_OP_CPY:
-        return op->type == GGML_TYPE_F16 && op->src[0]->type == GGML_TYPE_F32;
-    case GGML_OP_MUL_MAT:
-        return op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32;
-    default:
-        return false;
+        case GGML_OP_NONE:
+        case GGML_OP_VIEW:
+        case GGML_OP_PERMUTE:
+            return true;
+        case GGML_OP_CPY:
+            return op->type == GGML_TYPE_F16 && op->src[0]->type == GGML_TYPE_F32;
+        case GGML_OP_MUL_MAT:
+            return op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32;
+        default:
+            return false;
     }
 }
 
@@ -797,13 +885,13 @@ static struct ggml_backend_device_i ggml_backend_webgpu_device_i = {
 
 /* GGML Backend Registration Interface */
 
-static const char* ggml_backend_webgpu_reg_get_name(ggml_backend_reg_t reg) {
-    ggml_backend_webgpu_reg_context* ctx = static_cast<ggml_backend_webgpu_reg_context*>(reg->context);
+static const char * ggml_backend_webgpu_reg_get_name(ggml_backend_reg_t reg) {
+    ggml_backend_webgpu_reg_context * ctx = static_cast<ggml_backend_webgpu_reg_context *>(reg->context);
     return ctx->name;
 }
 
 static size_t ggml_backend_webgpu_reg_get_device_count(ggml_backend_reg_t reg) {
-    ggml_backend_webgpu_reg_context* ctx = static_cast<ggml_backend_webgpu_reg_context*>(reg->context);
+    ggml_backend_webgpu_reg_context * ctx = static_cast<ggml_backend_webgpu_reg_context *>(reg->context);
     return ctx->device_count;
 }
 
@@ -813,20 +901,22 @@ static ggml_backend_dev_t ggml_backend_webgpu_reg_get_device(ggml_backend_reg_t
     GGML_ASSERT(index == 0);
     WEBGPU_LOG_DEBUG("ggml_backend_reg_get_device()");
 
-    ggml_backend_webgpu_reg_context* reg_ctx = static_cast<ggml_backend_webgpu_reg_context*>(reg->context);
+    ggml_backend_webgpu_reg_context * reg_ctx = static_cast<ggml_backend_webgpu_reg_context *>(reg->context);
 
     webgpu_context ctx = reg_ctx->webgpu_ctx;
 
     wgpu::RequestAdapterOptions options = {};
-    auto callback = [](wgpu::RequestAdapterStatus status, wgpu::Adapter adapter, const char* message, void* userdata) {
-        if (status != wgpu::RequestAdapterStatus::Success) {
-            GGML_LOG_ERROR("ggml_webgpu: Failed to get an adapter: %s\n", message);
-            return;
-        }
-        *static_cast<wgpu::Adapter*>(userdata) = std::move(adapter);
+    auto                        callback =
+        [](wgpu::RequestAdapterStatus status, wgpu::Adapter adapter, const char * message, void * userdata) {
+            if (status != wgpu::RequestAdapterStatus::Success) {
+                GGML_LOG_ERROR("ggml_webgpu: Failed to get an adapter: %s\n", message);
+                return;
+            }
+            *static_cast<wgpu::Adapter *>(userdata) = std::move(adapter);
         };
-    void* userdata = &ctx->adapter;
-    ctx->instance.WaitAny(ctx->instance.RequestAdapter(&options, wgpu::CallbackMode::AllowSpontaneous, callback, userdata), UINT64_MAX);
+    void * userdata = &ctx->adapter;
+    ctx->instance.WaitAny(
+        ctx->instance.RequestAdapter(&options, wgpu::CallbackMode::AllowSpontaneous, callback, userdata), UINT64_MAX);
     GGML_ASSERT(ctx->adapter != nullptr);
 
     ctx->adapter.GetLimits(&ctx->limits);
@@ -836,12 +926,19 @@ static ggml_backend_dev_t ggml_backend_webgpu_reg_get_device(ggml_backend_reg_t
     ctx->adapter.GetInfo(&info);
 
     static ggml_backend_webgpu_device_context device_ctx;
-    device_ctx.webgpu_ctx = ctx;
+    device_ctx.webgpu_ctx  = ctx;
     device_ctx.device_name = GGML_WEBGPU_NAME;
     device_ctx.device_desc = std::string(info.description.data);
 
-    GGML_LOG_INFO("ggml_webgpu: adapter_info: vendor_id: %u | vendor: %s | architecture: %s | device_id: %u | name: %s | device_desc: %s\n",
-        info.vendorID, info.vendor.data, info.architecture.data, info.deviceID, info.device.data, info.description.data);
+    GGML_LOG_INFO(
+        "ggml_webgpu: adapter_info: vendor_id: %u | vendor: %s | architecture: %s | device_id: %u | name: %s | "
+        "device_desc: %s\n",
+        info.vendorID,
+        info.vendor.data,
+        info.architecture.data,
+        info.deviceID,
+        info.device.data,
+        info.description.data);
 
     // See GGML Backend Device Interface section
     static ggml_backend_device device = {
@@ -852,7 +949,6 @@ static ggml_backend_dev_t ggml_backend_webgpu_reg_get_device(ggml_backend_reg_t
     return &device;
 }
 
-
 static const struct ggml_backend_reg_i ggml_backend_webgpu_reg_i = {
     /* .get_name         = */ ggml_backend_webgpu_reg_get_name,
     /* .get_device_count = */ ggml_backend_webgpu_reg_get_device_count,
@@ -868,15 +964,15 @@ ggml_backend_reg_t ggml_backend_webgpu_reg() {
     webgpu_context webgpu_ctx = std::make_shared<webgpu_context_struct>();
 
     static ggml_backend_webgpu_reg_context ctx;
-    ctx.webgpu_ctx = webgpu_ctx;
-    ctx.name = GGML_WEBGPU_NAME;
+    ctx.webgpu_ctx   = webgpu_ctx;
+    ctx.name         = GGML_WEBGPU_NAME;
     ctx.device_count = 1;
 
-    wgpu::InstanceDescriptor instance_descriptor{};
+    wgpu::InstanceDescriptor               instance_descriptor{};
     std::vector<wgpu::InstanceFeatureName> instance_features = { wgpu::InstanceFeatureName::TimedWaitAny };
-    instance_descriptor.requiredFeatures = instance_features.data();
-    instance_descriptor.requiredFeatureCount = instance_features.size();
-    webgpu_ctx->instance = wgpu::CreateInstance(&instance_descriptor);
+    instance_descriptor.requiredFeatures                     = instance_features.data();
+    instance_descriptor.requiredFeatureCount                 = instance_features.size();
+    webgpu_ctx->instance                                     = wgpu::CreateInstance(&instance_descriptor);
     GGML_ASSERT(webgpu_ctx->instance != nullptr);
 
     static ggml_backend_reg reg = {

From b8012ecc0a6b1972bd6cf307c9ca30cbe9b68b5f Mon Sep 17 00:00:00 2001
From: Reese Levine <reeselevine1@gmail.com>
Date: Thu, 31 Jul 2025 11:02:08 -0700
Subject: [PATCH 05/17] Fix thread-safe implementation

---
 ggml/src/ggml-webgpu/ggml-webgpu.cpp | 87 ++++++++++++++++------------
 1 file changed, 49 insertions(+), 38 deletions(-)

diff --git a/ggml/src/ggml-webgpu/ggml-webgpu.cpp b/ggml/src/ggml-webgpu/ggml-webgpu.cpp
index 666bfbe183..61f0a19f70 100644
--- a/ggml/src/ggml-webgpu/ggml-webgpu.cpp
+++ b/ggml/src/ggml-webgpu/ggml-webgpu.cpp
@@ -13,7 +13,9 @@
 
 #include <condition_variable>
 #include <cstring>
+#include <iostream>
 #include <mutex>
+#include <string>
 #include <vector>
 
 #ifdef GGML_WEBGPU_DEBUG
@@ -61,7 +63,8 @@ struct webgpu_param_bufs {
 struct webgpu_param_buf_pool {
     std::vector<webgpu_param_bufs> free;
 
-    std::mutex              mutex;
+    std::mutex mutex;
+
     std::condition_variable cv;
 
     void init(wgpu::Device device) {
@@ -108,19 +111,18 @@ struct webgpu_param_buf_pool {
 
 // All the base objects needed to run operations on a WebGPU device
 struct webgpu_context_struct {
-    wgpu::Instance          instance;
-    wgpu::Adapter           adapter;
-    wgpu::Device            device;
-    wgpu::Queue             queue;
-    wgpu::Limits            limits;
-    wgpu::SupportedFeatures features;
+    wgpu::Instance instance;
+    wgpu::Adapter  adapter;
+    wgpu::Device   device;
+    wgpu::Queue    queue;
+    wgpu::Limits   limits;
 
-    std::recursive_mutex submit_mutex;
+    std::recursive_mutex mutex;
     std::mutex           get_tensor_mutex;
     std::mutex           init_mutex;
-    bool                 device_init = false;
 
-    // Parameter buffer pool
+    bool device_init = false;
+
     webgpu_param_buf_pool param_buf_pool;
 
     wgpu::ComputePipeline memset_pipeline;
@@ -134,36 +136,33 @@ struct webgpu_context_struct {
 
     // Command buffers which need to be submitted
     std::vector<wgpu::CommandBuffer> staged_command_bufs;
+
     // Parameter buffers associated with the staged command buffers
-    std::vector<webgpu_param_bufs>   staged_param_bufs;
+    std::vector<webgpu_param_bufs> staged_param_bufs;
 };
 
 typedef std::shared_ptr<webgpu_context_struct> webgpu_context;
 
 struct ggml_backend_webgpu_reg_context {
     webgpu_context webgpu_ctx;
-
-    size_t       device_count;
-    const char * name;
+    size_t         device_count;
+    const char *   name;
 };
 
 struct ggml_backend_webgpu_device_context {
     webgpu_context webgpu_ctx;
-
-    std::string device_name;
-    std::string device_desc;
+    std::string    device_name;
+    std::string    device_desc;
 };
 
 struct ggml_backend_webgpu_context {
     webgpu_context webgpu_ctx;
-
-    std::string name;
+    std::string    name;
 };
 
 struct ggml_backend_webgpu_buffer_context {
     webgpu_context webgpu_ctx;
-
-    wgpu::Buffer buffer;
+    wgpu::Buffer   buffer;
 
     ggml_backend_webgpu_buffer_context(webgpu_context ctx, wgpu::Buffer buf) :
         webgpu_ctx(std::move(ctx)),
@@ -180,10 +179,13 @@ static void ggml_webgpu_create_pipeline(wgpu::Device &
                                         const char *                             label,
                                         const std::vector<wgpu::ConstantEntry> & constants = {}) {
     WEBGPU_LOG_DEBUG("ggml_webgpu_create_pipeline()");
+
     wgpu::ShaderSourceWGSL shader_source;
     shader_source.code = shader_code;
+
     wgpu::ShaderModuleDescriptor shader_desc;
-    shader_desc.nextInChain          = &shader_source;
+    shader_desc.nextInChain = &shader_source;
+
     wgpu::ShaderModule shader_module = device.CreateShaderModule(&shader_desc);
 
     wgpu::ComputePipelineDescriptor pipeline_desc;
@@ -210,8 +212,9 @@ static void ggml_webgpu_create_buffer(wgpu::Device &    device,
     buffer_desc.usage            = usage;
     buffer_desc.label            = label;
     buffer_desc.mappedAtCreation = false;
+
     // TODO: error handling
-    buffer                       = device.CreateBuffer(&buffer_desc);
+    buffer = device.CreateBuffer(&buffer_desc);
 }
 
 /** End WebGPU object initializations */
@@ -231,8 +234,7 @@ static void ggml_backend_webgpu_wait_on_submission(webgpu_context & ctx) {
 }
 
 static void ggml_backend_webgpu_submit_queue(webgpu_context & ctx) {
-    std::lock_guard<std::recursive_mutex> lock(ctx->submit_mutex);
-
+    std::lock_guard<std::recursive_mutex> lock(ctx->mutex);
     ctx->queue.Submit(ctx->staged_command_bufs.size(), ctx->staged_command_bufs.data());
     ctx->staged_command_bufs.clear();
     std::vector<webgpu_param_bufs> staged_param_bufs = std::move(ctx->staged_param_bufs);
@@ -274,6 +276,8 @@ static void ggml_backend_webgpu_build_and_enqueue(webgpu_context &
                                                   bool                              submit_imm = false) {
     webgpu_param_bufs params_bufs = ctx->param_buf_pool.alloc_bufs();
 
+    std::lock_guard<std::recursive_mutex> lock(ctx->mutex);
+
     ggml_backend_webgpu_map_buffer(ctx, params_bufs.host_buf, wgpu::MapMode::Write, 0, params_bufs.host_buf.GetSize());
     uint32_t * _params = (uint32_t *) params_bufs.host_buf.GetMappedRange();
     for (size_t i = 0; i < params.size(); i++) {
@@ -315,7 +319,6 @@ static void ggml_backend_webgpu_build_and_enqueue(webgpu_context &
                                        });
     } else {
         // Enqueue commands and only submit if we have enough staged commands
-        std::lock_guard<std::recursive_mutex> lock(ctx->submit_mutex);
         ctx->staged_command_bufs.push_back(commands);
         ctx->staged_param_bufs.push_back(params_bufs);
         if (ctx->staged_command_bufs.size() == WEBGPU_COMMAND_SUBMIT_BATCH_SIZE) {
@@ -540,10 +543,12 @@ static void ggml_backend_webgpu_buffer_memset_tensor(ggml_backend_buffer_t buffe
     WEBGPU_LOG_DEBUG("ggml_backend_webgpu_buffer_memset_tensor(" << buffer << ", " << tensor << ", " << value << ", "
                                                                  << offset << ", " << size << ")");
 
-    ggml_backend_webgpu_buffer_context * buf_ctx      = (ggml_backend_webgpu_buffer_context *) buffer->context;
-    size_t                               total_offset = webgpu_tensor_offset(tensor) + tensor->view_offs + offset;
+    ggml_backend_webgpu_buffer_context * buf_ctx = (ggml_backend_webgpu_buffer_context *) buffer->context;
+
+    size_t total_offset = webgpu_tensor_offset(tensor) + tensor->view_offs + offset;
+
     // This is a trick to set all bytes of a u32 to the same 1 byte value.
-    uint32_t                             val32        = (uint32_t) value * 0x01010101;
+    uint32_t val32 = (uint32_t) value * 0x01010101;
     ggml_backend_webgpu_buffer_memset(buf_ctx->webgpu_ctx, buf_ctx->buffer, val32, total_offset, size);
 }
 
@@ -559,13 +564,16 @@ static void ggml_backend_webgpu_buffer_set_tensor(ggml_backend_buffer_t buffer,
 
     size_t total_offset = webgpu_tensor_offset(tensor) + tensor->view_offs + offset;
 
+    std::lock_guard<std::recursive_mutex> lock(webgpu_ctx->mutex);
     webgpu_ctx->queue.WriteBuffer(buf_ctx->buffer, total_offset, data, (size / 4) * 4);
 
     if (size % 4 != 0) {
         // If size is not a multiple of 4, we need to memset the remaining bytes
-        size_t   remaining_size = size % 4;
+        size_t remaining_size = size % 4;
+
         // pack the remaining bytes into a uint32_t
-        uint32_t val32          = 0;
+        uint32_t val32 = 0;
+
         for (size_t i = 0; i < remaining_size; i++) {
             ((uint8_t *) &val32)[i] = ((const uint8_t *) data)[size - remaining_size + i];
         }
@@ -613,8 +621,12 @@ static void ggml_backend_webgpu_buffer_get_tensor(ggml_backend_buffer_t buffer,
     wgpu::CommandEncoder encoder = device.CreateCommandEncoder();
     encoder.CopyBufferToBuffer(buf_ctx->buffer, total_offset, webgpu_ctx->get_tensor_staging_buf, 0, final_size);
     wgpu::CommandBuffer commands = encoder.Finish();
-    // Submit the command buffer to the queue
-    webgpu_ctx->queue.Submit(1, &commands);
+
+    {
+        std::lock_guard<std::recursive_mutex> submit_lock(webgpu_ctx->mutex);
+        // Submit the command buffer to the queue
+        webgpu_ctx->queue.Submit(1, &commands);
+    }
 
     // Map the staging buffer to read the data
     ggml_backend_webgpu_map_buffer(webgpu_ctx, webgpu_ctx->get_tensor_staging_buf, wgpu::MapMode::Read, 0, final_size);
@@ -628,7 +640,6 @@ static void ggml_backend_webgpu_buffer_get_tensor(ggml_backend_buffer_t buffer,
 
 static void ggml_backend_webgpu_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
     WEBGPU_LOG_DEBUG("ggml_backend_webgpu_buffer_clear(" << buffer << ", " << (uint32_t) value << ")");
-
     ggml_backend_webgpu_buffer_context * buf_ctx = (ggml_backend_webgpu_buffer_context *) buffer->context;
     ggml_backend_webgpu_buffer_memset(buf_ctx->webgpu_ctx, buf_ctx->buffer, value, 0, buffer->size);
 }
@@ -764,10 +775,11 @@ static ggml_backend_t ggml_backend_webgpu_device_init(ggml_backend_dev_t dev, co
     std::lock_guard<std::mutex> lock(webgpu_ctx->init_mutex);
     if (!webgpu_ctx->device_init) {
         // Initialize device
-        wgpu::DeviceDescriptor dev_desc;
+        std::vector<wgpu::FeatureName> required_features = { wgpu::FeatureName::ShaderF16 };
+        wgpu::DeviceDescriptor         dev_desc;
         dev_desc.requiredLimits       = &webgpu_ctx->limits;
-        dev_desc.requiredFeatures     = webgpu_ctx->features.features;
-        dev_desc.requiredFeatureCount = webgpu_ctx->features.featureCount;
+        dev_desc.requiredFeatures     = required_features.data();
+        dev_desc.requiredFeatureCount = required_features.size();
         dev_desc.SetDeviceLostCallback(
             wgpu::CallbackMode::AllowSpontaneous,
             [](const wgpu::Device & device, wgpu::DeviceLostReason reason, wgpu::StringView message) {
@@ -920,7 +932,6 @@ static ggml_backend_dev_t ggml_backend_webgpu_reg_get_device(ggml_backend_reg_t
     GGML_ASSERT(ctx->adapter != nullptr);
 
     ctx->adapter.GetLimits(&ctx->limits);
-    ctx->adapter.GetFeatures(&ctx->features);
 
     wgpu::AdapterInfo info{};
     ctx->adapter.GetInfo(&info);

From cddda7e73034d1e8594c4aef5ee56807b058b5d0 Mon Sep 17 00:00:00 2001
From: Reese Levine <reeselevine1@gmail.com>
Date: Thu, 31 Jul 2025 12:28:29 -0700
Subject: [PATCH 06/17] Use device implicit synchronization

---
 ggml/src/ggml-webgpu/ggml-webgpu.cpp | 14 +++++---------
 1 file changed, 5 insertions(+), 9 deletions(-)

diff --git a/ggml/src/ggml-webgpu/ggml-webgpu.cpp b/ggml/src/ggml-webgpu/ggml-webgpu.cpp
index 61f0a19f70..91411d9c00 100644
--- a/ggml/src/ggml-webgpu/ggml-webgpu.cpp
+++ b/ggml/src/ggml-webgpu/ggml-webgpu.cpp
@@ -276,8 +276,6 @@ static void ggml_backend_webgpu_build_and_enqueue(webgpu_context &
                                                   bool                              submit_imm = false) {
     webgpu_param_bufs params_bufs = ctx->param_buf_pool.alloc_bufs();
 
-    std::lock_guard<std::recursive_mutex> lock(ctx->mutex);
-
     ggml_backend_webgpu_map_buffer(ctx, params_bufs.host_buf, wgpu::MapMode::Write, 0, params_bufs.host_buf.GetSize());
     uint32_t * _params = (uint32_t *) params_bufs.host_buf.GetMappedRange();
     for (size_t i = 0; i < params.size(); i++) {
@@ -318,6 +316,8 @@ static void ggml_backend_webgpu_build_and_enqueue(webgpu_context &
                                            ctx->param_buf_pool.free_bufs({ params_bufs });
                                        });
     } else {
+        // Lock the context mutex when pushing to the staging vectors.
+        std::lock_guard<std::recursive_mutex> lock(ctx->mutex);
         // Enqueue commands and only submit if we have enough staged commands
         ctx->staged_command_bufs.push_back(commands);
         ctx->staged_param_bufs.push_back(params_bufs);
@@ -564,7 +564,6 @@ static void ggml_backend_webgpu_buffer_set_tensor(ggml_backend_buffer_t buffer,
 
     size_t total_offset = webgpu_tensor_offset(tensor) + tensor->view_offs + offset;
 
-    std::lock_guard<std::recursive_mutex> lock(webgpu_ctx->mutex);
     webgpu_ctx->queue.WriteBuffer(buf_ctx->buffer, total_offset, data, (size / 4) * 4);
 
     if (size % 4 != 0) {
@@ -622,11 +621,8 @@ static void ggml_backend_webgpu_buffer_get_tensor(ggml_backend_buffer_t buffer,
     encoder.CopyBufferToBuffer(buf_ctx->buffer, total_offset, webgpu_ctx->get_tensor_staging_buf, 0, final_size);
     wgpu::CommandBuffer commands = encoder.Finish();
 
-    {
-        std::lock_guard<std::recursive_mutex> submit_lock(webgpu_ctx->mutex);
-        // Submit the command buffer to the queue
-        webgpu_ctx->queue.Submit(1, &commands);
-    }
+    // Submit the command buffer to the queue
+    webgpu_ctx->queue.Submit(1, &commands);
 
     // Map the staging buffer to read the data
     ggml_backend_webgpu_map_buffer(webgpu_ctx, webgpu_ctx->get_tensor_staging_buf, wgpu::MapMode::Read, 0, final_size);
@@ -775,7 +771,7 @@ static ggml_backend_t ggml_backend_webgpu_device_init(ggml_backend_dev_t dev, co
     std::lock_guard<std::mutex> lock(webgpu_ctx->init_mutex);
     if (!webgpu_ctx->device_init) {
         // Initialize device
-        std::vector<wgpu::FeatureName> required_features = { wgpu::FeatureName::ShaderF16 };
+        std::vector<wgpu::FeatureName> required_features = { wgpu::FeatureName::ShaderF16, wgpu::FeatureName::ImplicitDeviceSynchronization };
         wgpu::DeviceDescriptor         dev_desc;
         dev_desc.requiredLimits       = &webgpu_ctx->limits;
         dev_desc.requiredFeatures     = required_features.data();

From 6a20e396dc54e7f89d6660a7f7525433634dc0e0 Mon Sep 17 00:00:00 2001
From: Reese Levine <reeselevine1@gmail.com>
Date: Fri, 1 Aug 2025 10:08:11 -0700
Subject: [PATCH 07/17] Update workflow to use custom release

---
 .github/workflows/build.yml | 65 ++++++++++---------------------------
 1 file changed, 17 insertions(+), 48 deletions(-)

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index c6d51fb0c2..0dfe75c637 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -5,6 +5,7 @@ on:
   push:
     branches:
       - master
+      - workflow-updates
     paths: [
       '.github/workflows/build.yml',
       '.github/workflows/build-linux-cross.yml',
@@ -159,31 +160,15 @@ jobs:
       - name: Dawn Dependency
         id: dawn-depends
         run: |
-          ARTIFACTS_JSON=$(curl -s -L \
-            -H "Accept: application/vnd.github+json" \
-            -H "Authorization: Bearer ${{ secrets.GITHUB_TOKEN }}" \
-            -H "X-GitHub-Api-Version: 2022-11-28" \
-            "https://api.github.com/repos/google/dawn/actions/artifacts")
-          echo "Finding latest macos-latest-Release artifact..."
-          DOWNLOAD_URL=$(echo "$ARTIFACTS_JSON" | jq -r '.artifacts
-            | sort_by(.created_at)
-            | reverse
-            | map(select(.name | test("macos-latest-Release$")))
-            | .[0].archive_download_url')
-          if [ "$DOWNLOAD_URL" = "null" ] || [ -z "$DOWNLOAD_URL" ]; then
-            echo "No suitable Dawn artifact found!"
-            exit 1
-          fi
-          echo "Downloading from: $DOWNLOAD_URL"
-          curl -L \
-            -H "Accept: application/vnd.github+json" \
-            -H "Authorization: Bearer ${{ secrets.GITHUB_TOKEN }}" \
-            -o artifact.zip "$DOWNLOAD_URL"
-          unzip artifact.zip
+          DAWN_VERSION="v1.0.0"
+          DAWN_OWNER="reeselevine"
+          DAWN_REPO="dawn"
+          DAWN_ASSET_NAME="Dawn-a1a6b45cced25a3b7f4fb491e0ae70796cc7f22b-macos-latest-Release.tar.gz"
+          echo "Fetching release asset from https://github.com/${DAWN_OWNER}/${DAWN_REPO}/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}"
+          curl -L -o artifact.tar.gz \
+            "https://github.com/${DAWN_OWNER}/${DAWN_REPO}/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}"
           mkdir dawn
-          tar_file=$(find . -name '*.tar.gz' | head -n 1)
-          echo "Extracting: $tar_file"
-          tar -xvf "$tar_file" -C dawn --strip-components=1
+          tar -xvf artifact.tar.gz -C dawn --strip-components=1
 
       - name: Build
         id: cmake_build
@@ -433,31 +418,15 @@ jobs:
         id: dawn-depends
         run: |
           sudo apt-get install -y libxrandr-dev libxinerama-dev libxcursor-dev mesa-common-dev libx11-xcb-dev libxi-dev
-          ARTIFACTS_JSON=$(curl -s -L \
-            -H "Accept: application/vnd.github+json" \
-            -H "Authorization: Bearer ${{ secrets.GITHUB_TOKEN }}" \
-            -H "X-GitHub-Api-Version: 2022-11-28" \
-            "https://api.github.com/repos/google/dawn/actions/artifacts")
-          echo "Finding latest ubuntu-latest-Release artifact..."
-          DOWNLOAD_URL=$(echo "$ARTIFACTS_JSON" | jq -r '.artifacts
-            | sort_by(.created_at)
-            | reverse
-            | map(select(.name | test("ubuntu-latest-Release$")))
-            | .[0].archive_download_url')
-          if [ "$DOWNLOAD_URL" = "null" ] || [ -z "$DOWNLOAD_URL" ]; then
-            echo "No suitable Dawn artifact found!"
-            exit 1
-          fi
-          echo "Downloading from: $DOWNLOAD_URL"
-          curl -L \
-            -H "Accept: application/vnd.github+json" \
-            -H "Authorization: Bearer ${{ secrets.GITHUB_TOKEN }}" \
-            -o artifact.zip "$DOWNLOAD_URL"
-          unzip artifact.zip
+          DAWN_VERSION="v1.0.0"
+          DAWN_OWNER="reeselevine"
+          DAWN_REPO="dawn"
+          DAWN_ASSET_NAME="Dawn-a1a6b45cced25a3b7f4fb491e0ae70796cc7f22b-ubuntu-latest-Release.tar.gz"
+          echo "Fetching release asset from https://github.com/${DAWN_OWNER}/${DAWN_REPO}/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}"
+          curl -L -o artifact.tar.gz \
+            "https://github.com/${DAWN_OWNER}/${DAWN_REPO}/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}"
           mkdir dawn
-          tar_file=$(find . -name '*.tar.gz' | head -n 1)
-          echo "Extracting: $tar_file"
-          tar -xvf "$tar_file" -C dawn --strip-components=1
+          tar -xvf artifact.tar.gz -C dawn --strip-components=1
 
       - name: Build
         id: cmake_build

From ea39068e393662c5002c7c6fab889b478028b3c3 Mon Sep 17 00:00:00 2001
From: Reese Levine <reeselevine1@gmail.com>
Date: Fri, 1 Aug 2025 11:00:07 -0700
Subject: [PATCH 08/17] Remove testing branch workflow

---
 .github/workflows/build.yml | 1 -
 1 file changed, 1 deletion(-)

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 0dfe75c637..3d4f837e24 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -5,7 +5,6 @@ on:
   push:
     branches:
       - master
-      - workflow-updates
     paths: [
       '.github/workflows/build.yml',
       '.github/workflows/build-linux-cross.yml',

From 96d107e5059de2b9fdedb472a2380a3380f88f19 Mon Sep 17 00:00:00 2001
From: Neha Abbas <nehaabbas@ReeseLevines-MacBook-Pro.local>
Date: Fri, 1 Aug 2025 14:35:20 -0500
Subject: [PATCH 09/17] some f32 tests passing

---
 ggml/src/ggml-webgpu/ggml-webgpu.cpp       | 109 +++++++++++++++++++++
 ggml/src/ggml-webgpu/wgsl-shaders/add.wgsl |  78 +++++++++++++++
 2 files changed, 187 insertions(+)
 create mode 100644 ggml/src/ggml-webgpu/wgsl-shaders/add.wgsl

diff --git a/ggml/src/ggml-webgpu/ggml-webgpu.cpp b/ggml/src/ggml-webgpu/ggml-webgpu.cpp
index 91411d9c00..f5baa7e21f 100644
--- a/ggml/src/ggml-webgpu/ggml-webgpu.cpp
+++ b/ggml/src/ggml-webgpu/ggml-webgpu.cpp
@@ -128,6 +128,7 @@ struct webgpu_context_struct {
     wgpu::ComputePipeline memset_pipeline;
     wgpu::ComputePipeline mul_mat_pipeline;
     wgpu::ComputePipeline cpy_pipeline;
+    wgpu::ComputePipeline add_pipeline;
 
     size_t memset_bytes_per_thread;
 
@@ -450,6 +451,95 @@ static void ggml_webgpu_mul_mat(webgpu_context & ctx, ggml_tensor * src0, ggml_t
     ggml_backend_webgpu_build_and_enqueue(ctx, ctx->mul_mat_pipeline, params, entries, wg_x);
 }
 
+// adds src0 and src1 and puts in dst
+static void ggml_webgpu_add(webgpu_context & ctx, ggml_tensor * src0, ggml_tensor * src1, ggml_tensor * dst) {
+    // each tensor in GGML is stored inside a buffer on the GPU
+        // but that buffer may contain more than one tensor’s data (or it might be a subregion of a larger buffer)
+        // offset = starting byte position inside that buffer where this tensor’s data actually begins
+    
+    size_t src0_offset = ggml_backend_webgpu_tensor_offset(src0);
+    // assumes power of 2 offset alignment
+    size_t src0_misalignment = src0_offset & (ctx->limits.minStorageBufferOffsetAlignment - 1);
+    // align to minimum offset alignment
+    src0_offset &= ~(ctx->limits.minStorageBufferOffsetAlignment - 1);
+
+    size_t src1_offset = ggml_backend_webgpu_tensor_offset(src1);
+    // assumes power of 2 offset alignment
+    size_t src1_misalignment = src1_offset & (ctx->limits.minStorageBufferOffsetAlignment - 1);
+    // align to minimum offset alignment
+    src1_offset &= ~(ctx->limits.minStorageBufferOffsetAlignment - 1);
+
+    size_t dst_offset = ggml_backend_webgpu_tensor_offset(dst);
+    size_t dst_misalignment = dst_offset & (ctx->limits.minStorageBufferOffsetAlignment - 1);
+    dst_offset &= ~(ctx->limits.minStorageBufferOffsetAlignment - 1);
+    
+    // set up parameters
+    std::vector<uint32_t> params = {
+        // number of elements-- determines how many threads to dispatch (one for each addition operation)
+        (uint32_t) ggml_nelements(dst),                             
+        
+        // even though tensors are 4d, the actual data is stored linearly
+        // stride = how many elements (or bytes) we must skip in memory to move from one value to another along a certain dimension
+            // i.e.
+            // nb[0] = 1                   // each element is next to the previous
+            // nb[1] = nb[0] * ne[0] = 5   // to move to next row, skip 5 elements
+            // nb[2] = nb[1] * ne[1] = 20  // to next matrix, skip 20 elements
+            // nb[3] = nb[2] * ne[2] = 60  // to next batch, skip 60 elements
+
+        // calculate element strides for each tensor
+        (uint32_t) (src0->nb[0] / ggml_type_size(src0->type)),
+        (uint32_t) (src0->nb[1] / ggml_type_size(src0->type)),
+        (uint32_t) (src0->nb[2] / ggml_type_size(src0->type)),
+        (uint32_t) (src0->nb[3] / ggml_type_size(src0->type)),
+
+        (uint32_t) (src1->nb[0] / ggml_type_size(src1->type)),
+        (uint32_t) (src1->nb[1] / ggml_type_size(src1->type)),
+        (uint32_t) (src1->nb[2] / ggml_type_size(src1->type)),
+        (uint32_t) (src1->nb[3] / ggml_type_size(src1->type)),
+
+        (uint32_t) (dst->nb[0] / ggml_type_size(dst->type)),
+        (uint32_t) (dst->nb[1] / ggml_type_size(dst->type)),
+        (uint32_t) (dst->nb[2] / ggml_type_size(dst->type)),
+        (uint32_t) (dst->nb[3] / ggml_type_size(dst->type)),
+
+        // number of elements in each dimension
+        (uint32_t) dst->ne[0],
+        (uint32_t) dst->ne[1],
+        (uint32_t) dst->ne[2],
+        (uint32_t) dst->ne[3],
+
+        // offsets in terms of elements instead of bytes
+        (uint32_t) (src0_misalignment / ggml_type_size(src0->type)),
+        (uint32_t) (src1_misalignment / ggml_type_size(src1->type)),
+        (uint32_t) (dst_misalignment / ggml_type_size(dst->type)),
+    };
+
+    // bind group = groups together several GPU resources that shaders will use (e.g., buffers holding tensor data)
+    // bind group entry describes one resource within the bind group (in this case, one tensor)
+        // offset + size: specify exactly where in the gpu buffer the shader should read/write
+
+    std::vector<wgpu::BindGroupEntry> entries = {
+        { .binding = 0,
+         .buffer  = ggml_backend_webgpu_tensor_buf(src0),
+         .offset  = src0_offset,
+         .size    = (ggml_nbytes(src0) + src0_misalignment + WEBGPU_STORAGE_BUF_BINDING_MULT - 1) & ~(WEBGPU_STORAGE_BUF_BINDING_MULT - 1) },
+        { .binding = 1,
+         .buffer  = ggml_backend_webgpu_tensor_buf(src1),
+         .offset  = src1_offset,
+         .size    = (ggml_nbytes(src1) + src1_misalignment + WEBGPU_STORAGE_BUF_BINDING_MULT - 1) & ~(WEBGPU_STORAGE_BUF_BINDING_MULT - 1) },
+        { .binding = 2,
+         .buffer  = ggml_backend_webgpu_tensor_buf(dst),
+         .offset  = dst_offset,
+         .size    = (ggml_nbytes(dst) + dst_misalignment + WEBGPU_STORAGE_BUF_BINDING_MULT - 1) & ~(WEBGPU_STORAGE_BUF_BINDING_MULT - 1) }
+    };
+
+    size_t max_wg_size = ctx->limits.maxComputeWorkgroupSizeX; // max threads in a single workgroup
+    uint32_t wg_x = (ggml_nelements(dst) + max_wg_size - 1) / max_wg_size; // number of workgroups to dispatch to cover all elements
+    ggml_backend_webgpu_build_and_enqueue(ctx, ctx->add_pipeline, params, entries, wg_x); // dispatch shader
+
+}
+
+
 // Returns true if node has enqueued work into the queue, false otherwise
 static bool ggml_webgpu_encode_node(webgpu_context ctx, ggml_tensor * node) {
     if (ggml_is_empty(node)) {
@@ -476,6 +566,11 @@ static bool ggml_webgpu_encode_node(webgpu_context ctx, ggml_tensor * node) {
                 ggml_webgpu_mul_mat(ctx, src0, src1, node);
                 break;
             }
+        case GGML_OP_ADD:
+            {
+                ggml_webgpu_add(ctx, src0, src1, node);
+                break;
+            }
         default:
             return false;
     }
@@ -759,6 +854,14 @@ static void ggml_webgpu_init_cpy_pipeline(webgpu_context & webgpu_ctx) {
     ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->cpy_pipeline, wgsl_cpy, "cpy", constants);
 }
 
+static void ggml_webgpu_init_add_pipeline(webgpu_context & webgpu_ctx) {
+    std::vector<wgpu::ConstantEntry> constants(1);
+    constants[0].key   = "wg_size";
+    constants[0].value = webgpu_ctx->limits.maxComputeWorkgroupSizeX;
+    ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->add_pipeline, wgsl_add, "add", constants);
+}
+
+
 static ggml_backend_t ggml_backend_webgpu_device_init(ggml_backend_dev_t dev, const char * params) {
     GGML_UNUSED(params);
 
@@ -812,6 +915,7 @@ static ggml_backend_t ggml_backend_webgpu_device_init(ggml_backend_dev_t dev, co
         ggml_webgpu_init_memset_pipeline(webgpu_ctx);
         ggml_webgpu_init_mul_mat_pipeline(webgpu_ctx);
         ggml_webgpu_init_cpy_pipeline(webgpu_ctx);
+        ggml_webgpu_init_add_pipeline(webgpu_ctx);
         webgpu_ctx->device_init = true;
     }
 
@@ -866,6 +970,11 @@ static bool ggml_backend_webgpu_device_supports_op(ggml_backend_dev_t dev, const
             return op->type == GGML_TYPE_F16 && op->src[0]->type == GGML_TYPE_F32;
         case GGML_OP_MUL_MAT:
             return op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32;
+        case GGML_OP_ADD: 
+            // return (op->type == GGML_TYPE_F16 || op->type == GGML_TYPE_F32) &&
+            //     (op->src[0]->type == op->type) &&
+            //     (op->src[1]->type == op->type);
+            return op->type == GGML_TYPE_F32;
         default:
             return false;
     }
diff --git a/ggml/src/ggml-webgpu/wgsl-shaders/add.wgsl b/ggml/src/ggml-webgpu/wgsl-shaders/add.wgsl
new file mode 100644
index 0000000000..9a23b18190
--- /dev/null
+++ b/ggml/src/ggml-webgpu/wgsl-shaders/add.wgsl
@@ -0,0 +1,78 @@
+enable f16;
+
+@group(0) @binding(0)
+var<storage, read_write> src0: array<f32>;
+
+@group(0) @binding(1)
+var<storage, read_write> src1: array<f32>;
+
+@group(0) @binding(2)
+var<storage, read_write> dst: array<f32>;
+
+struct Params {
+    ne: u32,             // total number of elements
+
+    stride_src0_0: u32,
+    stride_src0_1: u32,
+    stride_src0_2: u32,
+    stride_src0_3: u32,
+
+    stride_src1_0: u32,
+    stride_src1_1: u32,
+    stride_src1_2: u32,
+    stride_src1_3: u32,
+
+    stride_dst_0: u32,
+    stride_dst_1: u32,
+    stride_dst_2: u32,
+    stride_dst_3: u32,
+
+    ne0: u32,
+    ne1: u32,
+    ne2: u32,
+    ne3: u32,
+
+    // offsets in elements
+    offset_src0: u32,
+    offset_src1: u32,
+    offset_dst: u32,
+};
+
+@group(0) @binding(3)
+var<uniform> params: Params;
+
+override wg_size: u32;
+@compute @workgroup_size(wg_size)
+fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
+    if (gid.x >= params.ne) {
+        return;
+    }
+
+    var i = gid.x; // i = thread id
+
+    // compute indexes for each dimension of the tensor 
+    let i3 = i / (params.ne2 * params.ne1 * params.ne0);
+    i = i % (params.ne2 * params.ne1 * params.ne0);
+
+    let i2 = i / (params.ne1 * params.ne0);
+    i = i % (params.ne1 * params.ne0);
+
+    let i1 = i / params.ne0;
+
+    let i0 = i % params.ne0;
+
+    // compute indexes for position in each flat array
+    let src0_idx = i0 * params.stride_src0_0 + i1 * params.stride_src0_1 +
+                   i2 * params.stride_src0_2 + i3 * params.stride_src0_3;
+
+    let src1_idx = i0 * params.stride_src1_0 + i1 * params.stride_src1_1 +
+                   i2 * params.stride_src1_2 + i3 * params.stride_src1_3;
+
+    let dst_idx = i0 * params.stride_dst_0 + i1 * params.stride_dst_1 +
+                  i2 * params.stride_dst_2 + i3 * params.stride_dst_3;
+
+
+    // dst[dst_idx] = src0[src0_idx] + src1[src1_idx];
+
+    dst[params.offset_dst + dst_idx] = src0[params.offset_src0 + src0_idx] + src1[params.offset_src1 + src1_idx];
+}

From ae8edbfd11656cc2cac934ec84aea4d7fa5ea133 Mon Sep 17 00:00:00 2001
From: Reese Levine <reeselevine1@gmail.com>
Date: Mon, 4 Aug 2025 11:18:23 -0700
Subject: [PATCH 10/17] Disable set_rows until it's implemented

---
 .github/workflows/build.yml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 3d4f837e24..63e40c3586 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -179,6 +179,7 @@ jobs:
       - name: Test
         id: cmake_test
         run: |
+          export LLAMA_SET_ROWS=0
           cd build
           ctest -L main --verbose --timeout 900
 
@@ -437,6 +438,7 @@ jobs:
       - name: Test
         id: cmake_test
         run: |
+          export LLAMA_SET_ROWS=0
           cd build
           # This is using llvmpipe and runs slower than other backends
           ctest -L main --verbose --timeout 3600

From 39aa11d9a4ce8baf93c2c112b108b3bdb35aeae5 Mon Sep 17 00:00:00 2001
From: Neha Abbas <nehaabbas@ReeseLevines-MacBook-Pro.local>
Date: Mon, 4 Aug 2025 15:52:07 -0500
Subject: [PATCH 11/17] f32 add all tests passing

---
 ggml/src/ggml-webgpu/ggml-webgpu.cpp       | 36 +++++++++---
 ggml/src/ggml-webgpu/wgsl-shaders/add.wgsl | 65 +++++++++++++---------
 2 files changed, 67 insertions(+), 34 deletions(-)

diff --git a/ggml/src/ggml-webgpu/ggml-webgpu.cpp b/ggml/src/ggml-webgpu/ggml-webgpu.cpp
index f5baa7e21f..e4e5b1dbfa 100644
--- a/ggml/src/ggml-webgpu/ggml-webgpu.cpp
+++ b/ggml/src/ggml-webgpu/ggml-webgpu.cpp
@@ -451,6 +451,16 @@ static void ggml_webgpu_mul_mat(webgpu_context & ctx, ggml_tensor * src0, ggml_t
     ggml_backend_webgpu_build_and_enqueue(ctx, ctx->mul_mat_pipeline, params, entries, wg_x);
 }
 
+// sample test
+// ADD(type=f32, ne=[10,5,4,3], nr=[2,1,1,1], nf=1)
+    // ne: number of elements in each dimension of tensor b
+    // nr: number of repetitions in each dimension
+        // tensor b is the smaller tensor, and is broadcast (repeated) to match the size of a
+        // the broadcast shape is ne * nr
+            // 10*2, 5*1, 4*1, 3*1 = [20, 5, 4, 3] is the shape of dst and a
+            // essentially, if nr[x] is > 1, that dimension of b is repeated 
+    // nf: number of fused operations (1 means singular addition)
+
 // adds src0 and src1 and puts in dst
 static void ggml_webgpu_add(webgpu_context & ctx, ggml_tensor * src0, ggml_tensor * src1, ggml_tensor * dst) {
     // each tensor in GGML is stored inside a buffer on the GPU
@@ -464,15 +474,13 @@ static void ggml_webgpu_add(webgpu_context & ctx, ggml_tensor * src0, ggml_tenso
     src0_offset &= ~(ctx->limits.minStorageBufferOffsetAlignment - 1);
 
     size_t src1_offset = ggml_backend_webgpu_tensor_offset(src1);
-    // assumes power of 2 offset alignment
     size_t src1_misalignment = src1_offset & (ctx->limits.minStorageBufferOffsetAlignment - 1);
-    // align to minimum offset alignment
     src1_offset &= ~(ctx->limits.minStorageBufferOffsetAlignment - 1);
 
     size_t dst_offset = ggml_backend_webgpu_tensor_offset(dst);
     size_t dst_misalignment = dst_offset & (ctx->limits.minStorageBufferOffsetAlignment - 1);
     dst_offset &= ~(ctx->limits.minStorageBufferOffsetAlignment - 1);
-    
+
     // set up parameters
     std::vector<uint32_t> params = {
         // number of elements-- determines how many threads to dispatch (one for each addition operation)
@@ -480,11 +488,13 @@ static void ggml_webgpu_add(webgpu_context & ctx, ggml_tensor * src0, ggml_tenso
         
         // even though tensors are 4d, the actual data is stored linearly
         // stride = how many elements (or bytes) we must skip in memory to move from one value to another along a certain dimension
-            // i.e.
-            // nb[0] = 1                   // each element is next to the previous
-            // nb[1] = nb[0] * ne[0] = 5   // to move to next row, skip 5 elements
-            // nb[2] = nb[1] * ne[1] = 20  // to next matrix, skip 20 elements
-            // nb[3] = nb[2] * ne[2] = 60  // to next batch, skip 60 elements
+            // i.e. tensor: [5, 6, 3, 2], ggml_type_size: 4 (each number is 4 bytes) 
+                // (nb = number of bytes to skip for each element (stride))
+                // (ne = number of elements in that dimension)
+            // nb[0] = 4                                // each element is next to the previous, so only 4 bytes in between
+            // nb[1] = nb[0] * ne[0] = 4 * 5 = 20       // to move to next row, skip 20 bytes
+            // nb[2] = nb[1] * ne[1] = 20 * 6 = 120     // to next matrix, skip 120 bytes
+            // nb[3] = nb[2] * ne[2] = 120 * 3 = 360    // to next batch, skip 360 bytes
 
         // calculate element strides for each tensor
         (uint32_t) (src0->nb[0] / ggml_type_size(src0->type)),
@@ -502,16 +512,24 @@ static void ggml_webgpu_add(webgpu_context & ctx, ggml_tensor * src0, ggml_tenso
         (uint32_t) (dst->nb[2] / ggml_type_size(dst->type)),
         (uint32_t) (dst->nb[3] / ggml_type_size(dst->type)),
 
-        // number of elements in each dimension
+        // number of elements in each dimension of larger tensors (src0 and dst)
         (uint32_t) dst->ne[0],
         (uint32_t) dst->ne[1],
         (uint32_t) dst->ne[2],
         (uint32_t) dst->ne[3],
 
+        // number of elements in each dimension of smaller tensor to be broadcasted (src1)
+        (uint32_t) src1->ne[0],
+        (uint32_t) src1->ne[1],
+        (uint32_t) src1->ne[2],
+        (uint32_t) src1->ne[3],
+
         // offsets in terms of elements instead of bytes
         (uint32_t) (src0_misalignment / ggml_type_size(src0->type)),
         (uint32_t) (src1_misalignment / ggml_type_size(src1->type)),
         (uint32_t) (dst_misalignment / ggml_type_size(dst->type)),
+
+        
     };
 
     // bind group = groups together several GPU resources that shaders will use (e.g., buffers holding tensor data)
diff --git a/ggml/src/ggml-webgpu/wgsl-shaders/add.wgsl b/ggml/src/ggml-webgpu/wgsl-shaders/add.wgsl
index 9a23b18190..93adb0d562 100644
--- a/ggml/src/ggml-webgpu/wgsl-shaders/add.wgsl
+++ b/ggml/src/ggml-webgpu/wgsl-shaders/add.wgsl
@@ -10,7 +10,7 @@ var<storage, read_write> src1: array<f32>;
 var<storage, read_write> dst: array<f32>;
 
 struct Params {
-    ne: u32,             // total number of elements
+    ne: u32,
 
     stride_src0_0: u32,
     stride_src0_1: u32,
@@ -27,10 +27,15 @@ struct Params {
     stride_dst_2: u32,
     stride_dst_3: u32,
 
-    ne0: u32,
-    ne1: u32,
-    ne2: u32,
-    ne3: u32,
+    a_ne0: u32,
+    a_ne1: u32,
+    a_ne2: u32,
+    a_ne3: u32,
+
+    b_ne0: u32,
+    b_ne1: u32,
+    b_ne2: u32,
+    b_ne3: u32,
 
     // offsets in elements
     offset_src0: u32,
@@ -48,31 +53,41 @@ fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
         return;
     }
 
-    var i = gid.x; // i = thread id
+    // i = thread id, ranges from 0 --> total ne - 1 
+    // represents the position in the flat array a we are adding with array b
+    var i = gid.x;  
 
-    // compute indexes for each dimension of the tensor 
-    let i3 = i / (params.ne2 * params.ne1 * params.ne0);
-    i = i % (params.ne2 * params.ne1 * params.ne0);
+    // given the index of linear a, we want to compute the 4d index [a_i0, a_i1, a_i2, a_i3]
+    // we need this because tensor a and b are different shapes 
+    // so the same linear index won't work for b, and we can only compute b's linear index from the 4d index of a
+ 
+    let a_i3 = i / (params.a_ne2 * params.a_ne1 * params.a_ne0);
+    i = i % (params.a_ne2 * params.a_ne1 * params.a_ne0);
 
-    let i2 = i / (params.ne1 * params.ne0);
-    i = i % (params.ne1 * params.ne0);
+    let a_i2 = i / (params.a_ne1 * params.a_ne0);
+    i = i % (params.a_ne1 * params.a_ne0);
 
-    let i1 = i / params.ne0;
+    let a_i1 = i / params.a_ne0;
 
-    let i0 = i % params.ne0;
-
-    // compute indexes for position in each flat array
-    let src0_idx = i0 * params.stride_src0_0 + i1 * params.stride_src0_1 +
-                   i2 * params.stride_src0_2 + i3 * params.stride_src0_3;
-
-    let src1_idx = i0 * params.stride_src1_0 + i1 * params.stride_src1_1 +
-                   i2 * params.stride_src1_2 + i3 * params.stride_src1_3;
-
-    let dst_idx = i0 * params.stride_dst_0 + i1 * params.stride_dst_1 +
-                  i2 * params.stride_dst_2 + i3 * params.stride_dst_3;
+    let a_i0 = i % params.a_ne0;
 
 
-    // dst[dst_idx] = src0[src0_idx] + src1[src1_idx];
+    // handle repetition of b 
+        // index loops back to the beginning and repeats after elements are exhausted = modulo
+    let b_i0 = a_i0 % params.b_ne0;
+    let b_i1 = a_i1 % params.b_ne1;
+    let b_i2 = a_i2 % params.b_ne2;
+    let b_i3 = a_i3 % params.b_ne3;
 
-    dst[params.offset_dst + dst_idx] = src0[params.offset_src0 + src0_idx] + src1[params.offset_src1 + src1_idx];
+
+    // compute index for position in b's flat array
+    let src1_idx = b_i0 * params.stride_src1_0 +
+                b_i1 * params.stride_src1_1 +
+                b_i2 * params.stride_src1_2 +
+                b_i3 * params.stride_src1_3;
+
+    // actual addition operation, now that the indexes are all figured out
+    // ensuring that the offsets are included
+    // gid.x used for flat indexing into dst and a, since variable i was modified during calcs
+    dst[params.offset_dst + gid.x] = src0[params.offset_src0 + gid.x] + src1[params.offset_src1 + src1_idx];
 }

From 6a6135cc85f602b629ac513a1c24e7f122cae6e3 Mon Sep 17 00:00:00 2001
From: Reese Levine <reeselevine1@gmail.com>
Date: Tue, 5 Aug 2025 13:30:08 -0700
Subject: [PATCH 12/17] Begin work on set_rows

---
 ggml/src/ggml-webgpu/ggml-webgpu.cpp | 126 ++++++++++++++++++++++++++-
 tests/test-backend-ops.cpp           |  11 +--
 2 files changed, 131 insertions(+), 6 deletions(-)

diff --git a/ggml/src/ggml-webgpu/ggml-webgpu.cpp b/ggml/src/ggml-webgpu/ggml-webgpu.cpp
index 5009e26a20..8ba416f0eb 100644
--- a/ggml/src/ggml-webgpu/ggml-webgpu.cpp
+++ b/ggml/src/ggml-webgpu/ggml-webgpu.cpp
@@ -20,6 +20,7 @@
 
 #ifdef GGML_WEBGPU_DEBUG
 #    define WEBGPU_LOG_DEBUG(msg) std::cout << msg << std::endl
+#    define WEBGPU_DEBUG_BUF_ELEMS 32
 #else
 #    define WEBGPU_LOG_DEBUG(msg) ((void) 0)
 #endif  // GGML_WEBGPU_DEBUG
@@ -125,6 +126,7 @@ struct webgpu_context_struct {
 
     wgpu::ComputePipeline memset_pipeline;
     wgpu::ComputePipeline mul_mat_pipeline;
+    wgpu::ComputePipeline set_rows_pipeline;
     wgpu::ComputePipeline cpy_pipeline;
 
     size_t memset_bytes_per_thread;
@@ -139,6 +141,12 @@ struct webgpu_context_struct {
     std::vector<webgpu_param_bufs> staged_param_bufs;
 
     std::vector<wgpu::FutureWaitInfo> callback_futures;
+
+#ifdef GGML_WEBGPU_DEBUG
+    wgpu::Buffer debug_host_buf;
+    wgpu::Buffer debug_dev_buf;
+#endif
+
 };
 
 typedef std::shared_ptr<webgpu_context_struct> webgpu_context;
@@ -283,6 +291,27 @@ static void ggml_backend_webgpu_map_buffer(webgpu_context & ctx,
                           UINT64_MAX);
 }
 
+#ifdef GGML_WEBGPU_DEBUG
+// This function adds debugging information to shaders, as WebGPU does not support printing directly.
+// To use, add a bind group entry to the setup for the shader you are debugging, add the buffer and
+// debug statements in the shader, and then call this function after encoding the commands.
+static void ggml_backend_webgpu_debug(webgpu_context & ctx) {
+    wgpu::CommandEncoder encoder = ctx->device.CreateCommandEncoder();
+    encoder.CopyBufferToBuffer(ctx->debug_dev_buf, 0, ctx->debug_host_buf, 0, ctx->debug_host_buf.GetSize());
+    wgpu::CommandBuffer commands = encoder.Finish();
+    ctx->queue.Submit(1, &commands);
+
+    ggml_backend_webgpu_map_buffer(ctx, ctx->debug_host_buf, wgpu::MapMode::Read, 0, ctx->debug_host_buf.GetSize());
+    const uint32_t * debug_data = (const uint32_t *) ctx->debug_host_buf.GetConstMappedRange();
+    std::cout << "debug data:";
+    for (size_t i = 0; i < WEBGPU_DEBUG_BUF_ELEMS; i++) {
+        std::cout << "  " << i << ": " << debug_data[i];
+    }
+    std::cout << "\n";
+    ctx->debug_host_buf.Unmap();
+}
+#endif
+
 static void ggml_backend_webgpu_build_and_enqueue(webgpu_context &                  ctx,
                                                   wgpu::ComputePipeline &           pipeline,
                                                   std::vector<uint32_t>             params,
@@ -429,6 +458,74 @@ static void ggml_webgpu_cpy(webgpu_context & ctx, ggml_tensor * src, ggml_tensor
     ggml_backend_webgpu_build_and_enqueue(ctx, ctx->cpy_pipeline, params, entries, wg_x);
 }
 
+static void ggml_webgpu_set_rows(webgpu_context & ctx, ggml_tensor * src, ggml_tensor * idx, ggml_tensor * dst) {
+    // For set rows specifically, we need to check if src and idx are empty tensors.
+    if (ggml_is_empty(src) || ggml_is_empty(idx)) {
+        return;
+    }
+
+    size_t src_offset       = ggml_backend_webgpu_tensor_offset(src);
+    // assumes power of 2 offset alignment
+    size_t src_misalignment = src_offset & (ctx->limits.minStorageBufferOffsetAlignment - 1);
+    // align to minimum offset alignment
+    src_offset &= ~(ctx->limits.minStorageBufferOffsetAlignment - 1);
+    size_t idx_offset       = ggml_backend_webgpu_tensor_offset(idx);
+    size_t idx_misalignment = idx_offset & (ctx->limits.minStorageBufferOffsetAlignment - 1);
+    idx_offset &= ~(ctx->limits.minStorageBufferOffsetAlignment - 1);
+    size_t dst_offset       = ggml_backend_webgpu_tensor_offset(dst);
+    size_t dst_misalignment = dst_offset & (ctx->limits.minStorageBufferOffsetAlignment - 1);
+    dst_offset &= ~(ctx->limits.minStorageBufferOffsetAlignment - 1);
+
+    std::vector<uint32_t> params = {
+                                     (uint32_t) (src_misalignment / ggml_type_size(src->type)),
+                                     (uint32_t) (idx_misalignment / ggml_type_size(idx->type)),
+                                     (uint32_t) (dst_misalignment / ggml_type_size(dst->type)),
+                                     // Convert byte-strides to element-strides
+                                     (uint32_t) (src->nb[1] / ggml_type_size(src->type)),
+                                     (uint32_t) (src->nb[2] / ggml_type_size(src->type)),
+                                     (uint32_t) (src->nb[3] / ggml_type_size(src->type)),
+                                     (uint32_t) (idx->nb[0] / ggml_type_size(idx->type)),
+                                     (uint32_t) (idx->nb[1] / ggml_type_size(idx->type)),
+                                     (uint32_t) (idx->nb[2] / ggml_type_size(idx->type)),
+                                     (uint32_t) (dst->nb[1] / ggml_type_size(dst->type)),
+                                     (uint32_t) (dst->nb[2] / ggml_type_size(dst->type)),
+                                     (uint32_t) (dst->nb[3] / ggml_type_size(dst->type)),
+                                     // Shape of src
+                                     (uint32_t) src->ne[0],
+                                     (uint32_t) src->ne[1],
+                                     (uint32_t) src->ne[2],
+                                     (uint32_t) src->ne[3],
+                                     // broadcast shape of idx
+                                     (uint32_t) (src->ne[2] / idx->ne[1]),
+                                     (uint32_t) (src->ne[3] / idx->ne[2])
+    };
+
+    std::vector<wgpu::BindGroupEntry> entries = {
+        { .binding = 0,
+         .buffer  = ggml_backend_webgpu_tensor_buf(src),
+         .offset  = ggml_backend_webgpu_tensor_offset(src),
+         .size    = ggml_nbytes(src) },
+        { .binding = 1,
+         .buffer  = ggml_backend_webgpu_tensor_buf(idx),
+         .offset  = ggml_backend_webgpu_tensor_offset(idx),
+         .size    = ggml_nbytes(idx) },
+        { .binding = 2,
+         .buffer  = ggml_backend_webgpu_tensor_buf(dst),
+         .offset  = ggml_backend_webgpu_tensor_offset(dst),
+         .size    = ggml_nbytes(dst)  },
+        { .binding = 3,
+         .buffer  = ctx->debug_dev_buf,
+         .offset  = 0,
+         .size    = ctx->debug_dev_buf.GetSize() }
+    };
+
+    size_t   max_wg_size = ctx->limits.maxComputeWorkgroupSizeX;
+    uint32_t wg_x = (src->ne[1] * src->ne[2] * src->ne[3] + max_wg_size - 1) / max_wg_size;
+    ggml_backend_webgpu_build_and_enqueue(ctx, ctx->set_rows_pipeline, params, entries, wg_x);
+    ggml_backend_webgpu_submit_queue(ctx);
+    ggml_backend_webgpu_debug(ctx);
+}
+
 static void ggml_webgpu_mul_mat(webgpu_context & ctx, ggml_tensor * src0, ggml_tensor * src1, ggml_tensor * dst) {
     std::vector<uint32_t> params = {
         (uint32_t) dst->ne[1],                                  // number of rows in result (M)
@@ -487,6 +584,11 @@ static bool ggml_webgpu_encode_node(webgpu_context ctx, ggml_tensor * node) {
                 ggml_webgpu_cpy(ctx, src0, node);
                 break;
             }
+        case GGML_OP_SET_ROWS:
+            {
+                ggml_webgpu_set_rows(ctx, src0, src1, node);
+                break;
+            }
         case GGML_OP_MUL_MAT:
             {
                 ggml_webgpu_mul_mat(ctx, src0, src1, node);
@@ -771,6 +873,13 @@ static void ggml_webgpu_init_mul_mat_pipeline(webgpu_context & webgpu_ctx) {
     ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->mul_mat_pipeline, wgsl_mul_mat, "mul_mat");
 }
 
+static void ggml_webgpu_init_set_rows_pipeline(webgpu_context & webgpu_ctx) {
+    std::vector<wgpu::ConstantEntry> constants(1);
+    constants[0].key   = "wg_size";
+    constants[0].value = webgpu_ctx->limits.maxComputeWorkgroupSizeX;
+    ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->set_rows_pipeline, wgsl_set_rows, "set_rows", constants);
+}
+
 static void ggml_webgpu_init_cpy_pipeline(webgpu_context & webgpu_ctx) {
     std::vector<wgpu::ConstantEntry> constants(1);
     constants[0].key   = "wg_size";
@@ -831,7 +940,22 @@ static ggml_backend_t ggml_backend_webgpu_device_init(ggml_backend_dev_t dev, co
 
         ggml_webgpu_init_memset_pipeline(webgpu_ctx);
         ggml_webgpu_init_mul_mat_pipeline(webgpu_ctx);
+        ggml_webgpu_init_set_rows_pipeline(webgpu_ctx);
         ggml_webgpu_init_cpy_pipeline(webgpu_ctx);
+
+#ifdef GGML_WEBGPU_DEBUG
+        // Initialize debug buffers
+        ggml_webgpu_create_buffer(webgpu_ctx->device,
+                                  webgpu_ctx->debug_host_buf,
+                                  WEBGPU_DEBUG_BUF_ELEMS * sizeof(uint32_t),
+                                  wgpu::BufferUsage::CopyDst | wgpu::BufferUsage::MapRead,
+                                  "debug_host_buf");
+        ggml_webgpu_create_buffer(webgpu_ctx->device,
+                                  webgpu_ctx->debug_dev_buf,
+                                  WEBGPU_DEBUG_BUF_ELEMS * sizeof(uint32_t),
+                                  wgpu::BufferUsage::Storage | wgpu::BufferUsage::CopySrc,
+                                  "debug_dev_buf");
+#endif
         webgpu_ctx->device_init = true;
     }
 
@@ -882,7 +1006,7 @@ static bool ggml_backend_webgpu_device_supports_op(ggml_backend_dev_t dev, const
         case GGML_OP_VIEW:
         case GGML_OP_PERMUTE:
             return true;
-        case GGML_OP_CPY:
+        case GGML_OP_CPY | GGML_OP_SET_ROWS:
             return op->type == GGML_TYPE_F16 && op->src[0]->type == GGML_TYPE_F32;
         case GGML_OP_MUL_MAT:
             return op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32;
diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp
index d29779cd12..6eb240456e 100644
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@@ -1212,12 +1212,13 @@ struct test_case {
 
             double err = nmse(f1.data(), f2.data(), f1.size());
             if (err > ud->max_err) {
+                //printf("Backends %s and %s mismatch: ", bn1, bn2);
                 printf("[%s] NMSE = %.9f > %.9f ", ggml_op_desc(t1), err, ud->max_err);
-                //for (int i = 0; i < (int) f1.size(); i++) {
-                //    printf("%5d %9.6f %9.6f, diff = %9.6f\n", i, f1[i], f2[i], f1[i] - f2[i]);
-                //}
-                //printf("\n");
-                //exit(1);
+                for (int i = 0; i < (int) f1.size(); i++) {
+                    printf("%5d %9.6f %9.6f, diff = %9.6f\n", i, f1[i], f2[i], f1[i] - f2[i]);
+                }
+                printf("\n");
+                exit(1);
                 ud->ok = false;
             }
             return true;

From b2dbfcdcb143d6f572ec9ee5a36ee4494199f8a6 Mon Sep 17 00:00:00 2001
From: Reese Levine <reeselevine1@gmail.com>
Date: Tue, 5 Aug 2025 16:33:15 -0700
Subject: [PATCH 13/17] Work on set rows

---
 .github/workflows/build.yml                   |  2 -
 ggml/src/ggml-webgpu/ggml-webgpu.cpp          | 13 +---
 .../ggml-webgpu/wgsl-shaders/set_rows.wgsl    | 73 +++++++++++++++++++
 tests/test-backend-ops.cpp                    | 12 +--
 4 files changed, 83 insertions(+), 17 deletions(-)
 create mode 100644 ggml/src/ggml-webgpu/wgsl-shaders/set_rows.wgsl

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 63e40c3586..3d4f837e24 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -179,7 +179,6 @@ jobs:
       - name: Test
         id: cmake_test
         run: |
-          export LLAMA_SET_ROWS=0
           cd build
           ctest -L main --verbose --timeout 900
 
@@ -438,7 +437,6 @@ jobs:
       - name: Test
         id: cmake_test
         run: |
-          export LLAMA_SET_ROWS=0
           cd build
           # This is using llvmpipe and runs slower than other backends
           ctest -L main --verbose --timeout 3600
diff --git a/ggml/src/ggml-webgpu/ggml-webgpu.cpp b/ggml/src/ggml-webgpu/ggml-webgpu.cpp
index 8ba416f0eb..b4c3ea08dc 100644
--- a/ggml/src/ggml-webgpu/ggml-webgpu.cpp
+++ b/ggml/src/ggml-webgpu/ggml-webgpu.cpp
@@ -495,9 +495,9 @@ static void ggml_webgpu_set_rows(webgpu_context & ctx, ggml_tensor * src, ggml_t
                                      (uint32_t) src->ne[1],
                                      (uint32_t) src->ne[2],
                                      (uint32_t) src->ne[3],
-                                     // broadcast shape of idx
-                                     (uint32_t) (src->ne[2] / idx->ne[1]),
-                                     (uint32_t) (src->ne[3] / idx->ne[2])
+                                     // Shape of idx
+                                     (uint32_t) (idx->ne[1]),
+                                     (uint32_t) (idx->ne[2])
     };
 
     std::vector<wgpu::BindGroupEntry> entries = {
@@ -512,18 +512,13 @@ static void ggml_webgpu_set_rows(webgpu_context & ctx, ggml_tensor * src, ggml_t
         { .binding = 2,
          .buffer  = ggml_backend_webgpu_tensor_buf(dst),
          .offset  = ggml_backend_webgpu_tensor_offset(dst),
-         .size    = ggml_nbytes(dst)  },
-        { .binding = 3,
-         .buffer  = ctx->debug_dev_buf,
-         .offset  = 0,
-         .size    = ctx->debug_dev_buf.GetSize() }
+         .size    = ggml_nbytes(dst)  }
     };
 
     size_t   max_wg_size = ctx->limits.maxComputeWorkgroupSizeX;
     uint32_t wg_x = (src->ne[1] * src->ne[2] * src->ne[3] + max_wg_size - 1) / max_wg_size;
     ggml_backend_webgpu_build_and_enqueue(ctx, ctx->set_rows_pipeline, params, entries, wg_x);
     ggml_backend_webgpu_submit_queue(ctx);
-    ggml_backend_webgpu_debug(ctx);
 }
 
 static void ggml_webgpu_mul_mat(webgpu_context & ctx, ggml_tensor * src0, ggml_tensor * src1, ggml_tensor * dst) {
diff --git a/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.wgsl b/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.wgsl
new file mode 100644
index 0000000000..9b8c2634e4
--- /dev/null
+++ b/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.wgsl
@@ -0,0 +1,73 @@
+enable f16;
+
+@group(0) @binding(0)
+var<storage, read_write> src: array<f32>;
+
+@group(0) @binding(1)
+var<storage, read_write> idx: array<u32>;
+
+@group(0) @binding(2)
+var<storage, read_write> dst: array<f16>;
+
+struct Params {
+    offset_src: u32, // in elements
+    offset_idx: u32, // in elements
+    offset_dst: u32, // in elements
+
+    // Strides (in elements)
+    stride_src1: u32,
+    stride_src2: u32,
+    stride_src3: u32,
+
+    stride_idx0: u32,
+    stride_idx1: u32,
+    stride_idx2: u32,
+
+    stride_dst1: u32,
+    stride_dst2: u32,
+    stride_dst3: u32,
+
+    // Shape of src
+    ne0: u32,
+    n_rows: u32,
+    ne2: u32,
+    ne3: u32,
+
+    // Shape of idx
+    idx1: u32,
+    idx2: u32,
+};
+
+@group(0) @binding(3)
+var<uniform> params: Params;
+
+override wg_size: u32;
+@compute @workgroup_size(wg_size)
+fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
+    if (gid.x >= params.n_rows * params.ne2 * params.ne3) {
+        return;
+    }
+    var i = gid.x;
+    let i_src3 = i / (params.ne2 * params.n_rows);
+    let i_dst3 = i / (params.ne2 * 3);
+
+    i = i % (params.ne2 * params.n_rows);
+    let i_src2 = i / params.n_rows;
+    let i_src1 = i % params.n_rows;
+
+    let i_idx2 = i_src3 % params.idx2;
+    let i_idx1 = i_src2 % params.idx1;
+    let i_idx0 = i_src1;
+
+    let idx_high = (params.offset_idx + i_idx0 * params.stride_idx0 + i_idx1 * params.stride_idx1 + i_idx2 * params.stride_idx2) * 2;
+
+    let idx_high_val = idx[idx_high];
+    let idx_low_val = idx[idx_high + 1];
+
+    let i_dst_row = params.offset_dst + idx_high_val * params.stride_dst1 + i_src2 * params.stride_dst2 + i_src3 * params.stride_dst3;
+    let i_src_row = params.offset_src + i_src1 * params.stride_src1 + i_src2 * params.stride_src2 + i_src3 * params.stride_src3;
+
+    for (var i: u32 = 0; i < params.ne0; i++) {
+      dst[i_dst_row + i] = f16(src[i_src_row + i]);
+    }
+}
diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp
index 6eb240456e..5b9dc7f59b 100644
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@@ -1213,12 +1213,12 @@ struct test_case {
             double err = nmse(f1.data(), f2.data(), f1.size());
             if (err > ud->max_err) {
                 //printf("Backends %s and %s mismatch: ", bn1, bn2);
-                printf("[%s] NMSE = %.9f > %.9f ", ggml_op_desc(t1), err, ud->max_err);
-                for (int i = 0; i < (int) f1.size(); i++) {
-                    printf("%5d %9.6f %9.6f, diff = %9.6f\n", i, f1[i], f2[i], f1[i] - f2[i]);
-                }
-                printf("\n");
-                exit(1);
+                //printf("[%s] NMSE = %.9f > %.9f ", ggml_op_desc(t1), err, ud->max_err);
+                //for (int i = 0; i < (int) f1.size(); i++) {
+                //    printf("%5d %9.6f %9.6f, diff = %9.6f\n", i, f1[i], f2[i], f1[i] - f2[i]);
+                //}
+                //printf("\n");
+                //exit(1);
                 ud->ok = false;
             }
             return true;

From 248f7a512f3e0f0adf728369b3b01037dc2f636b Mon Sep 17 00:00:00 2001
From: Reese Levine <reeselevine1@gmail.com>
Date: Wed, 6 Aug 2025 10:15:50 -0700
Subject: [PATCH 14/17] Add error buffers for reporting unsupported SET_ROWS
 indices

---
 ggml/src/ggml-webgpu/ggml-webgpu.cpp          | 141 ++++++++++++------
 .../ggml-webgpu/wgsl-shaders/set_rows.wgsl    |  11 +-
 2 files changed, 107 insertions(+), 45 deletions(-)

diff --git a/ggml/src/ggml-webgpu/ggml-webgpu.cpp b/ggml/src/ggml-webgpu/ggml-webgpu.cpp
index b4c3ea08dc..1a223ed939 100644
--- a/ggml/src/ggml-webgpu/ggml-webgpu.cpp
+++ b/ggml/src/ggml-webgpu/ggml-webgpu.cpp
@@ -19,7 +19,7 @@
 #include <vector>
 
 #ifdef GGML_WEBGPU_DEBUG
-#    define WEBGPU_LOG_DEBUG(msg) std::cout << msg << std::endl
+#    define WEBGPU_LOG_DEBUG(msg)  std::cout << msg << std::endl
 #    define WEBGPU_DEBUG_BUF_ELEMS 32
 #else
 #    define WEBGPU_LOG_DEBUG(msg) ((void) 0)
@@ -27,11 +27,13 @@
 
 /* Constants */
 
-#define WEBGPU_COMMAND_SUBMIT_BATCH_SIZE 16
-#define WEBGPU_MUL_MAT_WG_SIZE           64
-#define WEBGPU_NUM_PARAM_BUFS            100
-#define WEBGPU_PARAMS_BUF_SIZE_BYTES     256
-#define WEBGPU_STORAGE_BUF_BINDING_MULT  4  // a storage buffer binding size must be a multiple of 4
+#define WEBGPU_COMMAND_SUBMIT_BATCH_SIZE     16
+#define WEBGPU_MUL_MAT_WG_SIZE               64
+#define WEBGPU_NUM_PARAM_BUFS                100
+#define WEBGPU_PARAMS_BUF_SIZE_BYTES         128  // enough for 32 parameters
+#define WEBGPU_NUM_SET_ROWS_ERROR_BUFS       32
+#define WEBGPU_SET_ROWS_ERROR_BUF_SIZE_BYTES 4
+#define WEBGPU_STORAGE_BUF_BINDING_MULT      4  // a storage buffer binding size must be a multiple of 4
 
 /* End Constants */
 
@@ -55,46 +57,42 @@ static void ggml_webgpu_create_buffer(wgpu::Device &    device,
                                       wgpu::BufferUsage usage,
                                       const char *      label);
 
-struct webgpu_param_bufs {
+struct webgpu_pool_bufs {
     wgpu::Buffer host_buf;
     wgpu::Buffer dev_buf;
 };
 
 // Holds a pool of parameter buffers for WebGPU operations
-struct webgpu_param_buf_pool {
-    std::vector<webgpu_param_bufs> free;
+struct webgpu_buf_pool {
+    std::vector<webgpu_pool_bufs> free;
 
     std::mutex mutex;
 
     std::condition_variable cv;
 
-    void init(wgpu::Device device) {
-        for (int i = 0; i < WEBGPU_NUM_PARAM_BUFS; i++) {
+    void init(wgpu::Device      device,
+              int               num_bufs,
+              size_t            buf_size,
+              wgpu::BufferUsage dev_buf_usage,
+              wgpu::BufferUsage host_buf_usage) {
+        for (int i = 0; i < num_bufs; i++) {
             wgpu::Buffer host_buf;
             wgpu::Buffer dev_buf;
-            ggml_webgpu_create_buffer(device,
-                                      host_buf,
-                                      WEBGPU_PARAMS_BUF_SIZE_BYTES,
-                                      wgpu::BufferUsage::CopySrc | wgpu::BufferUsage::MapWrite,
-                                      "ggml_webgpu_host_params_buf");
-            ggml_webgpu_create_buffer(device,
-                                      dev_buf,
-                                      WEBGPU_PARAMS_BUF_SIZE_BYTES,
-                                      wgpu::BufferUsage::CopyDst | wgpu::BufferUsage::Uniform,
-                                      "ggml_webgpu_dev_params_buf");
+            ggml_webgpu_create_buffer(device, host_buf, buf_size, host_buf_usage, "ggml_webgpu_host_pool_buf");
+            ggml_webgpu_create_buffer(device, dev_buf, buf_size, dev_buf_usage, "ggml_webgpu_dev_pool_buf");
             free.push_back({ host_buf, dev_buf });
         }
     }
 
-    webgpu_param_bufs alloc_bufs() {
+    webgpu_pool_bufs alloc_bufs() {
         std::unique_lock<std::mutex> lock(mutex);
         cv.wait(lock, [this] { return !free.empty(); });
-        webgpu_param_bufs bufs = free.back();
+        webgpu_pool_bufs bufs = free.back();
         free.pop_back();
         return bufs;
     }
 
-    void free_bufs(std::vector<webgpu_param_bufs> bufs) {
+    void free_bufs(std::vector<webgpu_pool_bufs> bufs) {
         std::lock_guard<std::mutex> lock(mutex);
         free.insert(free.end(), bufs.begin(), bufs.end());
         cv.notify_all();
@@ -122,7 +120,8 @@ struct webgpu_context_struct {
 
     bool device_init = false;
 
-    webgpu_param_buf_pool param_buf_pool;
+    webgpu_buf_pool param_buf_pool;
+    webgpu_buf_pool set_rows_error_buf_pool;
 
     wgpu::ComputePipeline memset_pipeline;
     wgpu::ComputePipeline mul_mat_pipeline;
@@ -138,7 +137,9 @@ struct webgpu_context_struct {
     std::vector<wgpu::CommandBuffer> staged_command_bufs;
 
     // Parameter buffers associated with the staged command buffers
-    std::vector<webgpu_param_bufs> staged_param_bufs;
+    std::vector<webgpu_pool_bufs> staged_param_bufs;
+    // Buffers associated with set_rows operations, used to store potential errors
+    std::vector<webgpu_pool_bufs> staged_set_row_error_bufs;
 
     std::vector<wgpu::FutureWaitInfo> callback_futures;
 
@@ -146,7 +147,6 @@ struct webgpu_context_struct {
     wgpu::Buffer debug_host_buf;
     wgpu::Buffer debug_dev_buf;
 #endif
-
 };
 
 typedef std::shared_ptr<webgpu_context_struct> webgpu_context;
@@ -257,20 +257,55 @@ static void ggml_backend_webgpu_submit_queue(webgpu_context & ctx) {
         return;
     }
     ctx->queue.Submit(ctx->staged_command_bufs.size(), ctx->staged_command_bufs.data());
+
+    // If there are SET_ROWS operations in this submission, copy their error buffers to the host.
+    if (ctx->staged_set_row_error_bufs.size() > 0) {
+        wgpu::CommandEncoder encoder = ctx->device.CreateCommandEncoder();
+        for (auto & error_bufs : ctx->staged_set_row_error_bufs) {
+            // Copy the error buffer to the host buffer
+            encoder.CopyBufferToBuffer(error_bufs.dev_buf, 0, error_bufs.host_buf, 0, error_bufs.host_buf.GetSize());
+        }
+        wgpu::CommandBuffer commands = encoder.Finish();
+        ctx->queue.Submit(1, &commands);
+    }
+
     ctx->staged_command_bufs.clear();
-    std::vector<webgpu_param_bufs> staged_param_bufs = std::move(ctx->staged_param_bufs);
+    std::vector<webgpu_pool_bufs> staged_param_bufs         = std::move(ctx->staged_param_bufs);
+    std::vector<webgpu_pool_bufs> staged_set_row_error_bufs = std::move(ctx->staged_set_row_error_bufs);
 
     // Free the staged parameter buffers once the submission completes
-    wgpu::Future f = ctx->queue.OnSubmittedWorkDone(
+    wgpu::Future p_f = ctx->queue.OnSubmittedWorkDone(
         wgpu::CallbackMode::AllowSpontaneous,
         [ctx, staged_param_bufs](wgpu::QueueWorkDoneStatus status, wgpu::StringView message) {
             if (status != wgpu::QueueWorkDoneStatus::Success) {
                 GGML_LOG_ERROR("ggml_webgpu: Failed to submit commands: %s\n", message.data);
             }
-            // Free the staged parameter buffers
+            // Free the staged buffers
             ctx->param_buf_pool.free_bufs(staged_param_bufs);
         });
-    ctx->callback_futures.push_back({ f });
+    ctx->callback_futures.push_back({ p_f });
+
+    // Check for errors in SET_ROWS operations
+    for (auto & error_bufs : staged_set_row_error_bufs) {
+        wgpu::Future f = error_bufs.host_buf.MapAsync(
+            wgpu::MapMode::Read,
+            0,
+            error_bufs.host_buf.GetSize(),
+            wgpu::CallbackMode::AllowSpontaneous,
+            [ctx, error_bufs](wgpu::MapAsyncStatus status, wgpu::StringView message) {
+                if (status != wgpu::MapAsyncStatus::Success) {
+                    GGML_LOG_ERROR("ggml_webgpu: Failed to map error buffer: %s\n", message.data);
+                } else {
+                    const uint32_t * error_data = (const uint32_t *) error_bufs.host_buf.GetConstMappedRange();
+                    if (*error_data) {
+                        GGML_ABORT("ggml_webgpu: SET_ROWS index > 2^32, unsupported.");
+                    }
+                    // We can't unmap in here due to WebGPU reentrancy limitations.
+                    ctx->set_rows_error_buf_pool.free_bufs({ error_bufs });
+                }
+            });
+        ctx->callback_futures.push_back({ f });
+    }
 }
 
 static void ggml_backend_webgpu_map_buffer(webgpu_context & ctx,
@@ -294,7 +329,7 @@ static void ggml_backend_webgpu_map_buffer(webgpu_context & ctx,
 #ifdef GGML_WEBGPU_DEBUG
 // This function adds debugging information to shaders, as WebGPU does not support printing directly.
 // To use, add a bind group entry to the setup for the shader you are debugging, add the buffer and
-// debug statements in the shader, and then call this function after encoding the commands.
+// debug statements in the shader, and then call this function after encoding the commands and submitting them.
 static void ggml_backend_webgpu_debug(webgpu_context & ctx) {
     wgpu::CommandEncoder encoder = ctx->device.CreateCommandEncoder();
     encoder.CopyBufferToBuffer(ctx->debug_dev_buf, 0, ctx->debug_host_buf, 0, ctx->debug_host_buf.GetSize());
@@ -318,7 +353,7 @@ static void ggml_backend_webgpu_build_and_enqueue(webgpu_context &
                                                   std::vector<wgpu::BindGroupEntry> bind_group_entries,
                                                   uint32_t                          wg_x,
                                                   bool                              submit_and_wait = false) {
-    webgpu_param_bufs params_bufs = ctx->param_buf_pool.alloc_bufs();
+    webgpu_pool_bufs params_bufs = ctx->param_buf_pool.alloc_bufs();
 
     ggml_backend_webgpu_map_buffer(ctx, params_bufs.host_buf, wgpu::MapMode::Write, 0, params_bufs.host_buf.GetSize());
     uint32_t * _params = (uint32_t *) params_bufs.host_buf.GetMappedRange();
@@ -464,6 +499,12 @@ static void ggml_webgpu_set_rows(webgpu_context & ctx, ggml_tensor * src, ggml_t
         return;
     }
 
+    // allocate error bufs
+    webgpu_pool_bufs error_bufs = ctx->set_rows_error_buf_pool.alloc_bufs();
+    if (error_bufs.host_buf.GetMapState() == wgpu::BufferMapState::Mapped) {
+        error_bufs.host_buf.Unmap();
+    }
+
     size_t src_offset       = ggml_backend_webgpu_tensor_offset(src);
     // assumes power of 2 offset alignment
     size_t src_misalignment = src_offset & (ctx->limits.minStorageBufferOffsetAlignment - 1);
@@ -476,8 +517,7 @@ static void ggml_webgpu_set_rows(webgpu_context & ctx, ggml_tensor * src, ggml_t
     size_t dst_misalignment = dst_offset & (ctx->limits.minStorageBufferOffsetAlignment - 1);
     dst_offset &= ~(ctx->limits.minStorageBufferOffsetAlignment - 1);
 
-    std::vector<uint32_t> params = {
-                                     (uint32_t) (src_misalignment / ggml_type_size(src->type)),
+    std::vector<uint32_t> params = { (uint32_t) (src_misalignment / ggml_type_size(src->type)),
                                      (uint32_t) (idx_misalignment / ggml_type_size(idx->type)),
                                      (uint32_t) (dst_misalignment / ggml_type_size(dst->type)),
                                      // Convert byte-strides to element-strides
@@ -497,28 +537,31 @@ static void ggml_webgpu_set_rows(webgpu_context & ctx, ggml_tensor * src, ggml_t
                                      (uint32_t) src->ne[3],
                                      // Shape of idx
                                      (uint32_t) (idx->ne[1]),
-                                     (uint32_t) (idx->ne[2])
-    };
+                                     (uint32_t) (idx->ne[2]) };
 
     std::vector<wgpu::BindGroupEntry> entries = {
         { .binding = 0,
          .buffer  = ggml_backend_webgpu_tensor_buf(src),
          .offset  = ggml_backend_webgpu_tensor_offset(src),
-         .size    = ggml_nbytes(src) },
+         .size    = ggml_nbytes(src)                                                                       },
         { .binding = 1,
          .buffer  = ggml_backend_webgpu_tensor_buf(idx),
          .offset  = ggml_backend_webgpu_tensor_offset(idx),
-         .size    = ggml_nbytes(idx) },
+         .size    = ggml_nbytes(idx)                                                                       },
         { .binding = 2,
          .buffer  = ggml_backend_webgpu_tensor_buf(dst),
          .offset  = ggml_backend_webgpu_tensor_offset(dst),
-         .size    = ggml_nbytes(dst)  }
+         .size    = ggml_nbytes(dst)                                                                       },
+        { .binding = 3, .buffer = error_bufs.dev_buf,    .offset = 0, .size = error_bufs.dev_buf.GetSize() }
     };
 
     size_t   max_wg_size = ctx->limits.maxComputeWorkgroupSizeX;
-    uint32_t wg_x = (src->ne[1] * src->ne[2] * src->ne[3] + max_wg_size - 1) / max_wg_size;
+    uint32_t wg_x        = (src->ne[1] * src->ne[2] * src->ne[3] + max_wg_size - 1) / max_wg_size;
+
+    std::lock_guard<std::recursive_mutex> lock(ctx->mutex);
+    ctx->staged_set_row_error_bufs.push_back(error_bufs);
+
     ggml_backend_webgpu_build_and_enqueue(ctx, ctx->set_rows_pipeline, params, entries, wg_x);
-    ggml_backend_webgpu_submit_queue(ctx);
 }
 
 static void ggml_webgpu_mul_mat(webgpu_context & ctx, ggml_tensor * src0, ggml_tensor * src1, ggml_tensor * dst) {
@@ -872,7 +915,8 @@ static void ggml_webgpu_init_set_rows_pipeline(webgpu_context & webgpu_ctx) {
     std::vector<wgpu::ConstantEntry> constants(1);
     constants[0].key   = "wg_size";
     constants[0].value = webgpu_ctx->limits.maxComputeWorkgroupSizeX;
-    ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->set_rows_pipeline, wgsl_set_rows, "set_rows", constants);
+    ggml_webgpu_create_pipeline(
+        webgpu_ctx->device, webgpu_ctx->set_rows_pipeline, wgsl_set_rows, "set_rows", constants);
 }
 
 static void ggml_webgpu_init_cpy_pipeline(webgpu_context & webgpu_ctx) {
@@ -931,7 +975,16 @@ static ggml_backend_t ggml_backend_webgpu_device_init(ggml_backend_dev_t dev, co
         webgpu_ctx->queue = webgpu_ctx->device.GetQueue();
 
         // Create buffer pool for shader parameters
-        webgpu_ctx->param_buf_pool.init(webgpu_ctx->device);
+        webgpu_ctx->param_buf_pool.init(webgpu_ctx->device,
+                                        WEBGPU_NUM_PARAM_BUFS,
+                                        WEBGPU_PARAMS_BUF_SIZE_BYTES,
+                                        wgpu::BufferUsage::CopyDst | wgpu::BufferUsage::Uniform,
+                                        wgpu::BufferUsage::CopySrc | wgpu::BufferUsage::MapWrite);
+        webgpu_ctx->set_rows_error_buf_pool.init(webgpu_ctx->device,
+                                                 WEBGPU_NUM_SET_ROWS_ERROR_BUFS,
+                                                 WEBGPU_SET_ROWS_ERROR_BUF_SIZE_BYTES,
+                                                 wgpu::BufferUsage::CopySrc | wgpu::BufferUsage::Storage,
+                                                 wgpu::BufferUsage::CopyDst | wgpu::BufferUsage::MapRead);
 
         ggml_webgpu_init_memset_pipeline(webgpu_ctx);
         ggml_webgpu_init_mul_mat_pipeline(webgpu_ctx);
diff --git a/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.wgsl b/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.wgsl
index 9b8c2634e4..4bd6f94a23 100644
--- a/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.wgsl
+++ b/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.wgsl
@@ -9,6 +9,9 @@ var<storage, read_write> idx: array<u32>;
 @group(0) @binding(2)
 var<storage, read_write> dst: array<f16>;
 
+@group(0) @binding(3)
+var<storage, read_write> error: atomic<u32>;
+
 struct Params {
     offset_src: u32, // in elements
     offset_idx: u32, // in elements
@@ -38,7 +41,7 @@ struct Params {
     idx2: u32,
 };
 
-@group(0) @binding(3)
+@group(0) @binding(4)
 var<uniform> params: Params;
 
 override wg_size: u32;
@@ -64,6 +67,12 @@ fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
     let idx_high_val = idx[idx_high];
     let idx_low_val = idx[idx_high + 1];
 
+    if (idx_low_val != 0) {
+        // Upper bits of index are not zero, output will be incorrect
+        atomicStore(&error, 1);
+        return;
+    }
+
     let i_dst_row = params.offset_dst + idx_high_val * params.stride_dst1 + i_src2 * params.stride_dst2 + i_src3 * params.stride_dst3;
     let i_src_row = params.offset_src + i_src1 * params.stride_src1 + i_src2 * params.stride_src2 + i_src3 * params.stride_src3;
 

From 4ad09861234eb0adba01490335d819f555847fb9 Mon Sep 17 00:00:00 2001
From: Reese Levine <reeselevine1@gmail.com>
Date: Wed, 6 Aug 2025 12:25:55 -0700
Subject: [PATCH 15/17] Remove extra comments, re-enable NMSE mismatch output

---
 ggml/src/ggml-webgpu/ggml-webgpu.cpp | 1 -
 tests/test-backend-ops.cpp           | 3 +--
 2 files changed, 1 insertion(+), 3 deletions(-)

diff --git a/ggml/src/ggml-webgpu/ggml-webgpu.cpp b/ggml/src/ggml-webgpu/ggml-webgpu.cpp
index 1a223ed939..ba1addc8d9 100644
--- a/ggml/src/ggml-webgpu/ggml-webgpu.cpp
+++ b/ggml/src/ggml-webgpu/ggml-webgpu.cpp
@@ -499,7 +499,6 @@ static void ggml_webgpu_set_rows(webgpu_context & ctx, ggml_tensor * src, ggml_t
         return;
     }
 
-    // allocate error bufs
     webgpu_pool_bufs error_bufs = ctx->set_rows_error_buf_pool.alloc_bufs();
     if (error_bufs.host_buf.GetMapState() == wgpu::BufferMapState::Mapped) {
         error_bufs.host_buf.Unmap();
diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp
index 5b9dc7f59b..d29779cd12 100644
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@@ -1212,8 +1212,7 @@ struct test_case {
 
             double err = nmse(f1.data(), f2.data(), f1.size());
             if (err > ud->max_err) {
-                //printf("Backends %s and %s mismatch: ", bn1, bn2);
-                //printf("[%s] NMSE = %.9f > %.9f ", ggml_op_desc(t1), err, ud->max_err);
+                printf("[%s] NMSE = %.9f > %.9f ", ggml_op_desc(t1), err, ud->max_err);
                 //for (int i = 0; i < (int) f1.size(); i++) {
                 //    printf("%5d %9.6f %9.6f, diff = %9.6f\n", i, f1[i], f2[i], f1[i] - f2[i]);
                 //}

From 7f9ee10e75d1644845926448feeb36410e6c18ed Mon Sep 17 00:00:00 2001
From: Reese Levine <reeselevine1@gmail.com>
Date: Thu, 4 Sep 2025 14:12:44 -0700
Subject: [PATCH 16/17] Add templated addition, clean up code

---
 ggml/src/ggml-webgpu/ggml-webgpu.cpp          | 379 ++++++------------
 .../wgsl-shaders/{add.wgsl => add.tmpl.wgsl}  |  63 +--
 .../ggml-webgpu/wgsl-shaders/embed_wgsl.py    |  19 +-
 3 files changed, 179 insertions(+), 282 deletions(-)
 rename ggml/src/ggml-webgpu/wgsl-shaders/{add.wgsl => add.tmpl.wgsl} (76%)

diff --git a/ggml/src/ggml-webgpu/ggml-webgpu.cpp b/ggml/src/ggml-webgpu/ggml-webgpu.cpp
index b759c33910..4bc011729e 100644
--- a/ggml/src/ggml-webgpu/ggml-webgpu.cpp
+++ b/ggml/src/ggml-webgpu/ggml-webgpu.cpp
@@ -125,7 +125,7 @@ struct webgpu_context_struct {
     wgpu::ComputePipeline mul_mat_pipeline[30][2];
     wgpu::ComputePipeline set_rows_pipeline;
     wgpu::ComputePipeline cpy_pipeline;
-    wgpu::ComputePipeline add_pipeline;
+    wgpu::ComputePipeline add_pipeline[2];
 
     size_t memset_bytes_per_thread;
 
@@ -233,14 +233,15 @@ static void ggml_backend_webgpu_wait_on_submission(webgpu_context & ctx) {
     std::lock_guard<std::recursive_mutex> lock(ctx->mutex);
     if (ctx->callback_futures.empty()) {
         // no existing callbacks, wait on queue submission
-        ctx->instance.WaitAny(ctx->queue.OnSubmittedWorkDone(
-                                  wgpu::CallbackMode::AllowSpontaneous,
-                                  [](wgpu::QueueWorkDoneStatus status, wgpu::StringView message) {
-                                      if (status != wgpu::QueueWorkDoneStatus::Success) {
-                                          GGML_LOG_ERROR("ggml_webgpu: Failed to submit commands: %s\n", std::string(message).c_str());
-                                      }
-                                  }),
-                              UINT64_MAX);
+        ctx->instance.WaitAny(
+            ctx->queue.OnSubmittedWorkDone(wgpu::CallbackMode::AllowSpontaneous,
+                                           [](wgpu::QueueWorkDoneStatus status, wgpu::StringView message) {
+                                               if (status != wgpu::QueueWorkDoneStatus::Success) {
+                                                   GGML_LOG_ERROR("ggml_webgpu: Failed to submit commands: %s\n",
+                                                                  std::string(message).c_str());
+                                               }
+                                           }),
+            UINT64_MAX);
     } else {
         // existing callbacks, wait on them
         ctx->instance.WaitAny(ctx->callback_futures.size(), ctx->callback_futures.data(), UINT64_MAX);
@@ -287,10 +288,7 @@ static void ggml_backend_webgpu_submit_queue(webgpu_context & ctx) {
     // Check for errrors in SET_ROWS operations
     for (auto & error_bufs : staged_set_row_error_bufs) {
         wgpu::Future f = error_bufs.host_buf.MapAsync(
-            wgpu::MapMode::Read,
-            0,
-            error_bufs.host_buf.GetSize(),
-            wgpu::CallbackMode::AllowSpontaneous,
+            wgpu::MapMode::Read, 0, error_bufs.host_buf.GetSize(), wgpu::CallbackMode::AllowSpontaneous,
             [ctx, error_bufs](wgpu::MapAsyncStatus status, wgpu::StringView message) {
                 if (status != wgpu::MapAsyncStatus::Success) {
                     GGML_LOG_ERROR("ggml_webgpu: Failed to map error buffer: %s\n", std::string(message).c_str());
@@ -312,10 +310,7 @@ static void ggml_backend_webgpu_map_buffer(webgpu_context & ctx,
                                            wgpu::MapMode    mode,
                                            size_t           offset,
                                            size_t           size) {
-    ctx->instance.WaitAny(buffer.MapAsync(mode,
-                                          offset,
-                                          size,
-                                          wgpu::CallbackMode::AllowSpontaneous,
+    ctx->instance.WaitAny(buffer.MapAsync(mode, offset, size, wgpu::CallbackMode::AllowSpontaneous,
                                           [](wgpu::MapAsyncStatus status, wgpu::StringView message) {
                                               if (status != wgpu::MapAsyncStatus::Success) {
                                                   GGML_LOG_ERROR("ggml_webgpu: Failed to map buffer: %s\n",
@@ -465,23 +460,17 @@ static size_t ggml_webgpu_tensor_binding_size(webgpu_context & ctx, ggml_tensor
 static void ggml_webgpu_cpy(webgpu_context & ctx, ggml_tensor * src, ggml_tensor * dst) {
     uint32_t ne = (uint32_t) ggml_nelements(dst);
 
-    std::vector<uint32_t> params = { ne,
-                                     (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, src) / ggml_type_size(src->type)),
-                                     (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, dst) / ggml_type_size(dst->type)),
-                                     // Convert byte-strides to element-strides
-                                     (uint32_t) (src->nb[0] / ggml_type_size(src->type)),
-                                     (uint32_t) (src->nb[1] / ggml_type_size(src->type)),
-                                     (uint32_t) (src->nb[2] / ggml_type_size(src->type)),
-                                     (uint32_t) (src->nb[3] / ggml_type_size(src->type)),
-                                     (uint32_t) (dst->nb[0] / ggml_type_size(dst->type)),
-                                     (uint32_t) (dst->nb[1] / ggml_type_size(dst->type)),
-                                     (uint32_t) (dst->nb[2] / ggml_type_size(dst->type)),
-                                     (uint32_t) (dst->nb[3] / ggml_type_size(dst->type)),
-                                     // Logical shape — same for both tensors even if permuted
-                                     (uint32_t) src->ne[0],
-                                     (uint32_t) src->ne[1],
-                                     (uint32_t) src->ne[2],
-                                     (uint32_t) src->ne[3] };
+    std::vector<uint32_t> params = {
+        ne, (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, src) / ggml_type_size(src->type)),
+        (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, dst) / ggml_type_size(dst->type)),
+        // Convert byte-strides to element-strides
+        (uint32_t) (src->nb[0] / ggml_type_size(src->type)), (uint32_t) (src->nb[1] / ggml_type_size(src->type)),
+        (uint32_t) (src->nb[2] / ggml_type_size(src->type)), (uint32_t) (src->nb[3] / ggml_type_size(src->type)),
+        (uint32_t) (dst->nb[0] / ggml_type_size(dst->type)), (uint32_t) (dst->nb[1] / ggml_type_size(dst->type)),
+        (uint32_t) (dst->nb[2] / ggml_type_size(dst->type)), (uint32_t) (dst->nb[3] / ggml_type_size(dst->type)),
+        // Logical shape — same for both tensors even if permuted
+        (uint32_t) src->ne[0], (uint32_t) src->ne[1], (uint32_t) src->ne[2], (uint32_t) src->ne[3]
+    };
 
     std::vector<wgpu::BindGroupEntry> entries = {
         { .binding = 0,
@@ -510,27 +499,21 @@ static void ggml_webgpu_set_rows(webgpu_context & ctx, ggml_tensor * src, ggml_t
         error_bufs.host_buf.Unmap();
     }
 
-    std::vector<uint32_t> params = { (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, src) / ggml_type_size(src->type)),
-                                     (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, idx) / ggml_type_size(idx->type)),
-                                     (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, dst) / ggml_type_size(dst->type)),
-                                     // Convert byte-strides to element-strides
-                                     (uint32_t) (src->nb[1] / ggml_type_size(src->type)),
-                                     (uint32_t) (src->nb[2] / ggml_type_size(src->type)),
-                                     (uint32_t) (src->nb[3] / ggml_type_size(src->type)),
-                                     (uint32_t) (idx->nb[0] / ggml_type_size(idx->type)),
-                                     (uint32_t) (idx->nb[1] / ggml_type_size(idx->type)),
-                                     (uint32_t) (idx->nb[2] / ggml_type_size(idx->type)),
-                                     (uint32_t) (dst->nb[1] / ggml_type_size(dst->type)),
-                                     (uint32_t) (dst->nb[2] / ggml_type_size(dst->type)),
-                                     (uint32_t) (dst->nb[3] / ggml_type_size(dst->type)),
-                                     // Shape of src
-                                     (uint32_t) src->ne[0],
-                                     (uint32_t) src->ne[1],
-                                     (uint32_t) src->ne[2],
-                                     (uint32_t) src->ne[3],
-                                     // Shape of idx
-                                     (uint32_t) (idx->ne[1]),
-                                     (uint32_t) (idx->ne[2]) };
+    std::vector<uint32_t> params = {
+        (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, src) / ggml_type_size(src->type)),
+        (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, idx) / ggml_type_size(idx->type)),
+        (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, dst) / ggml_type_size(dst->type)),
+        // Convert byte-strides to element-strides
+        (uint32_t) (src->nb[1] / ggml_type_size(src->type)), (uint32_t) (src->nb[2] / ggml_type_size(src->type)),
+        (uint32_t) (src->nb[3] / ggml_type_size(src->type)), (uint32_t) (idx->nb[0] / ggml_type_size(idx->type)),
+        (uint32_t) (idx->nb[1] / ggml_type_size(idx->type)), (uint32_t) (idx->nb[2] / ggml_type_size(idx->type)),
+        (uint32_t) (dst->nb[1] / ggml_type_size(dst->type)), (uint32_t) (dst->nb[2] / ggml_type_size(dst->type)),
+        (uint32_t) (dst->nb[3] / ggml_type_size(dst->type)),
+        // Shape of src
+        (uint32_t) src->ne[0], (uint32_t) src->ne[1], (uint32_t) src->ne[2], (uint32_t) src->ne[3],
+        // Shape of idx
+        (uint32_t) (idx->ne[1]), (uint32_t) (idx->ne[2])
+    };
 
     std::vector<wgpu::BindGroupEntry> entries = {
         { .binding = 0,
@@ -598,83 +581,44 @@ static void ggml_webgpu_mul_mat(webgpu_context & ctx, ggml_tensor * src0, ggml_t
 }
 
 static void ggml_webgpu_add(webgpu_context & ctx, ggml_tensor * src0, ggml_tensor * src1, ggml_tensor * dst) {
-
-    size_t src0_offset = ggml_webgpu_tensor_offset(src0);
-    // assumes power of 2 offset alignment
-    size_t src0_misalignment = src0_offset & (ctx->limits.minStorageBufferOffsetAlignment - 1);
-    // align to minimum offset alignment
-    src0_offset &= ~(ctx->limits.minStorageBufferOffsetAlignment - 1);
-
-    size_t src1_offset = ggml_webgpu_tensor_offset(src1);
-    size_t src1_misalignment = src1_offset & (ctx->limits.minStorageBufferOffsetAlignment - 1);
-    src1_offset &= ~(ctx->limits.minStorageBufferOffsetAlignment - 1);
-
-    size_t dst_offset = ggml_webgpu_tensor_offset(dst);
-    size_t dst_misalignment = dst_offset & (ctx->limits.minStorageBufferOffsetAlignment - 1);
-    dst_offset &= ~(ctx->limits.minStorageBufferOffsetAlignment - 1);
-
-    // set up parameters
     std::vector<uint32_t> params = {
-        // number of elements-- determines how many threads to dispatch (one for each addition operation)
         (uint32_t) ggml_nelements(dst),
-
-        // calculate element strides for each tensor
-        (uint32_t) (src0->nb[0] / ggml_type_size(src0->type)),
-        (uint32_t) (src0->nb[1] / ggml_type_size(src0->type)),
-        (uint32_t) (src0->nb[2] / ggml_type_size(src0->type)),
-        (uint32_t) (src0->nb[3] / ggml_type_size(src0->type)),
-
+        (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, src0) / ggml_type_size(src0->type)),
+        (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, src1) / ggml_type_size(src1->type)),
+        (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, dst) / ggml_type_size(dst->type)),
         (uint32_t) (src1->nb[0] / ggml_type_size(src1->type)),
         (uint32_t) (src1->nb[1] / ggml_type_size(src1->type)),
         (uint32_t) (src1->nb[2] / ggml_type_size(src1->type)),
         (uint32_t) (src1->nb[3] / ggml_type_size(src1->type)),
-
-        (uint32_t) (dst->nb[0] / ggml_type_size(dst->type)),
-        (uint32_t) (dst->nb[1] / ggml_type_size(dst->type)),
-        (uint32_t) (dst->nb[2] / ggml_type_size(dst->type)),
-        (uint32_t) (dst->nb[3] / ggml_type_size(dst->type)),
-
-        // number of elements in each dimension of larger tensors (src0 and dst)
-        (uint32_t) dst->ne[0],
-        (uint32_t) dst->ne[1],
-        (uint32_t) dst->ne[2],
-        (uint32_t) dst->ne[3],
-
-        // number of elements in each dimension of smaller tensor to be broadcasted (src1)
+        (uint32_t) src0->ne[0],
+        (uint32_t) src0->ne[1],
+        (uint32_t) src0->ne[2],
         (uint32_t) src1->ne[0],
         (uint32_t) src1->ne[1],
         (uint32_t) src1->ne[2],
         (uint32_t) src1->ne[3],
-
-        // offsets in terms of elements instead of bytes
-        (uint32_t) (src0_misalignment / ggml_type_size(src0->type)),
-        (uint32_t) (src1_misalignment / ggml_type_size(src1->type)),
-        (uint32_t) (dst_misalignment / ggml_type_size(dst->type)),
-
     };
 
     std::vector<wgpu::BindGroupEntry> entries = {
         { .binding = 0,
          .buffer  = ggml_webgpu_tensor_buf(src0),
-         .offset  = src0_offset,
-         .size    = (ggml_nbytes(src0) + src0_misalignment + WEBGPU_STORAGE_BUF_BINDING_MULT - 1) & ~(WEBGPU_STORAGE_BUF_BINDING_MULT - 1) },
+         .offset  = ggml_webgpu_tensor_align_offset(ctx, src0),
+         .size    = ggml_webgpu_tensor_binding_size(ctx, src0) },
         { .binding = 1,
          .buffer  = ggml_webgpu_tensor_buf(src1),
-         .offset  = src1_offset,
-         .size    = (ggml_nbytes(src1) + src1_misalignment + WEBGPU_STORAGE_BUF_BINDING_MULT - 1) & ~(WEBGPU_STORAGE_BUF_BINDING_MULT - 1) },
+         .offset  = ggml_webgpu_tensor_align_offset(ctx, src1),
+         .size    = ggml_webgpu_tensor_binding_size(ctx, src1) },
         { .binding = 2,
          .buffer  = ggml_webgpu_tensor_buf(dst),
-         .offset  = dst_offset,
-         .size    = (ggml_nbytes(dst) + dst_misalignment + WEBGPU_STORAGE_BUF_BINDING_MULT - 1) & ~(WEBGPU_STORAGE_BUF_BINDING_MULT - 1) }
+         .offset  = ggml_webgpu_tensor_align_offset(ctx, dst),
+         .size    = ggml_webgpu_tensor_binding_size(ctx, dst)  }
     };
 
-    size_t max_wg_size = ctx->limits.maxComputeWorkgroupSizeX; // max threads in a single workgroup
-    uint32_t wg_x = (ggml_nelements(dst) + max_wg_size - 1) / max_wg_size; // number of workgroups to dispatch to cover all elements
-    ggml_backend_webgpu_build_and_enqueue(ctx, ctx->add_pipeline, params, entries, wg_x); // dispatch shader
-
+    size_t   max_wg_size = ctx->limits.maxComputeWorkgroupSizeX;
+    uint32_t wg_x        = (ggml_nelements(dst) + max_wg_size - 1) / max_wg_size;
+    ggml_backend_webgpu_build_and_enqueue(ctx, ctx->add_pipeline[dst->type], params, entries, wg_x);
 }
 
-
 // Returns true if node has enqueued work into the queue, false otherwise
 static bool ggml_webgpu_encode_node(webgpu_context ctx, ggml_tensor * node) {
     if (ggml_is_empty(node)) {
@@ -814,8 +758,8 @@ static void ggml_backend_webgpu_buffer_set_tensor(ggml_backend_buffer_t buffer,
             ((uint8_t *) &val32)[i] = ((const uint8_t *) data)[size - remaining_size + i];
         }
         // memset the remaining bytes
-        ggml_backend_webgpu_buffer_memset(
-            webgpu_ctx, buf_ctx->buffer, val32, total_offset + (size - remaining_size), remaining_size);
+        ggml_backend_webgpu_buffer_memset(webgpu_ctx, buf_ctx->buffer, val32, total_offset + (size - remaining_size),
+                                          remaining_size);
     } else {
         // wait for WriteBuffer to complete
         ggml_backend_webgpu_wait_on_submission(webgpu_ctx);
@@ -849,11 +793,8 @@ static void ggml_backend_webgpu_buffer_get_tensor(ggml_backend_buffer_t buffer,
         if (webgpu_ctx->get_tensor_staging_buf) {
             webgpu_ctx->get_tensor_staging_buf.Destroy();
         }
-        ggml_webgpu_create_buffer(device,
-                                  webgpu_ctx->get_tensor_staging_buf,
-                                  final_size,
-                                  wgpu::BufferUsage::CopyDst | wgpu::BufferUsage::MapRead,
-                                  "get_tensor_staging_buf");
+        ggml_webgpu_create_buffer(device, webgpu_ctx->get_tensor_staging_buf, final_size,
+                                  wgpu::BufferUsage::CopyDst | wgpu::BufferUsage::MapRead, "get_tensor_staging_buf");
     }
 
     // Copy the data from the buffer to the staging buffer
@@ -907,8 +848,7 @@ static ggml_backend_buffer_t ggml_backend_webgpu_buffer_type_alloc_buffer(ggml_b
     ggml_backend_webgpu_device_context * ctx = static_cast<ggml_backend_webgpu_device_context *>(buft->device->context);
 
     wgpu::Buffer buf;
-    ggml_webgpu_create_buffer(ctx->webgpu_ctx->device,
-                              buf,
+    ggml_webgpu_create_buffer(ctx->webgpu_ctx->device, buf,
                               (size + WEBGPU_STORAGE_BUF_BINDING_MULT - 1) & ~(WEBGPU_STORAGE_BUF_BINDING_MULT - 1),
                               wgpu::BufferUsage::Storage | wgpu::BufferUsage::CopySrc | wgpu::BufferUsage::CopyDst,
                               "allocated_buffer");
@@ -989,102 +929,58 @@ static void ggml_webgpu_init_memset_pipeline(webgpu_context & webgpu_ctx) {
 }
 
 static void ggml_webgpu_init_mul_mat_pipeline(webgpu_context & webgpu_ctx) {
-    ggml_webgpu_create_pipeline(webgpu_ctx->device,
-                                webgpu_ctx->mul_mat_pipeline[GGML_TYPE_F32][GGML_TYPE_F32],
-                                wgsl_mul_mat_f32_f32,
-                                "mul_mat_f32_f32");
-    ggml_webgpu_create_pipeline(webgpu_ctx->device,
-                                webgpu_ctx->mul_mat_pipeline[GGML_TYPE_F16][GGML_TYPE_F16],
-                                wgsl_mul_mat_f16_f16,
-                                "mul_mat_f16_f16");
-    ggml_webgpu_create_pipeline(webgpu_ctx->device,
-                                webgpu_ctx->mul_mat_pipeline[GGML_TYPE_F16][GGML_TYPE_F32],
-                                wgsl_mul_mat_f16_f32,
-                                "mul_mat_f16_f32");
-    ggml_webgpu_create_pipeline(webgpu_ctx->device,
-                                webgpu_ctx->mul_mat_pipeline[GGML_TYPE_Q4_0][GGML_TYPE_F32],
-                                wgsl_mul_mat_q4_0_f32,
-                                "mul_mat_q4_0_f32");
-    ggml_webgpu_create_pipeline(webgpu_ctx->device,
-                                webgpu_ctx->mul_mat_pipeline[GGML_TYPE_Q4_1][GGML_TYPE_F32],
-                                wgsl_mul_mat_q4_1_f32,
-                                "mul_mat_q4_1_f32");
-    ggml_webgpu_create_pipeline(webgpu_ctx->device,
-                                webgpu_ctx->mul_mat_pipeline[GGML_TYPE_Q5_0][GGML_TYPE_F32],
-                                wgsl_mul_mat_q5_0_f32,
-                                "mul_mat_q5_0_f32");
-    ggml_webgpu_create_pipeline(webgpu_ctx->device,
-                                webgpu_ctx->mul_mat_pipeline[GGML_TYPE_Q5_1][GGML_TYPE_F32],
-                                wgsl_mul_mat_q5_1_f32,
-                                "mul_mat_q5_1_f32");
-    ggml_webgpu_create_pipeline(webgpu_ctx->device,
-                                webgpu_ctx->mul_mat_pipeline[GGML_TYPE_Q8_0][GGML_TYPE_F32],
-                                wgsl_mul_mat_q8_0_f32,
-                                "mul_mat_q8_0_f32");
-    ggml_webgpu_create_pipeline(webgpu_ctx->device,
-                                webgpu_ctx->mul_mat_pipeline[GGML_TYPE_Q2_K][GGML_TYPE_F32],
-                                wgsl_mul_mat_q2_k_f32,
-                                "mul_mat_q2_k_f32");
-    ggml_webgpu_create_pipeline(webgpu_ctx->device,
-                                webgpu_ctx->mul_mat_pipeline[GGML_TYPE_Q3_K][GGML_TYPE_F32],
-                                wgsl_mul_mat_q3_k_f32,
-                                "mul_mat_q3_k_f32");
-    ggml_webgpu_create_pipeline(webgpu_ctx->device,
-                                webgpu_ctx->mul_mat_pipeline[GGML_TYPE_Q4_K][GGML_TYPE_F32],
-                                wgsl_mul_mat_q4_k_f32,
-                                "mul_mat_q4_k_f32");
-    ggml_webgpu_create_pipeline(webgpu_ctx->device,
-                                webgpu_ctx->mul_mat_pipeline[GGML_TYPE_Q5_K][GGML_TYPE_F32],
-                                wgsl_mul_mat_q5_k_f32,
-                                "mul_mat_q5_k_f32");
-    ggml_webgpu_create_pipeline(webgpu_ctx->device,
-                                webgpu_ctx->mul_mat_pipeline[GGML_TYPE_Q6_K][GGML_TYPE_F32],
-                                wgsl_mul_mat_q6_k_f32,
-                                "mul_mat_q6_k_f32");
-    ggml_webgpu_create_pipeline(webgpu_ctx->device,
-                                webgpu_ctx->mul_mat_pipeline[GGML_TYPE_IQ2_XXS][GGML_TYPE_F32],
-                                wgsl_mul_mat_iq2_xxs_f32,
-                                "mul_mat_iq2_xxs_f32");
-    ggml_webgpu_create_pipeline(webgpu_ctx->device,
-                                webgpu_ctx->mul_mat_pipeline[GGML_TYPE_IQ2_XS][GGML_TYPE_F32],
-                                wgsl_mul_mat_iq2_xs_f32,
-                                "mul_mat_iq2_xs_f32");
-    ggml_webgpu_create_pipeline(webgpu_ctx->device,
-                                webgpu_ctx->mul_mat_pipeline[GGML_TYPE_IQ2_S][GGML_TYPE_F32],
-                                wgsl_mul_mat_iq2_s_f32,
-                                "mul_mat_iq2_s_f32");
-    ggml_webgpu_create_pipeline(webgpu_ctx->device,
-                                webgpu_ctx->mul_mat_pipeline[GGML_TYPE_IQ3_XXS][GGML_TYPE_F32],
-                                wgsl_mul_mat_iq3_xxs_f32,
-                                "mul_mat_iq3_xxs_f32");
-    ggml_webgpu_create_pipeline(webgpu_ctx->device,
-                                webgpu_ctx->mul_mat_pipeline[GGML_TYPE_IQ3_S][GGML_TYPE_F32],
-                                wgsl_mul_mat_iq3_s_f32,
-                                "mul_mat_iq3_s_f32");
-    ggml_webgpu_create_pipeline(webgpu_ctx->device,
-                                webgpu_ctx->mul_mat_pipeline[GGML_TYPE_IQ1_S][GGML_TYPE_F32],
-                                wgsl_mul_mat_iq1_s_f32,
-                                "mul_mat_iq1_s_f32");
-    ggml_webgpu_create_pipeline(webgpu_ctx->device,
-                                webgpu_ctx->mul_mat_pipeline[GGML_TYPE_IQ1_M][GGML_TYPE_F32],
-                                wgsl_mul_mat_iq1_m_f32,
-                                "mul_mat_iq1_m_f32");
-    ggml_webgpu_create_pipeline(webgpu_ctx->device,
-                                webgpu_ctx->mul_mat_pipeline[GGML_TYPE_IQ4_NL][GGML_TYPE_F32],
-                                wgsl_mul_mat_iq4_nl_f32,
-                                "mul_mat_iq4_nl_f32");
-    ggml_webgpu_create_pipeline(webgpu_ctx->device,
-                                webgpu_ctx->mul_mat_pipeline[GGML_TYPE_IQ4_XS][GGML_TYPE_F32],
-                                wgsl_mul_mat_iq4_xs_f32,
-                                "mul_mat_iq4_xs_f32");
+    ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->mul_mat_pipeline[GGML_TYPE_F32][GGML_TYPE_F32],
+                                wgsl_mul_mat_f32_f32, "mul_mat_f32_f32");
+    ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->mul_mat_pipeline[GGML_TYPE_F16][GGML_TYPE_F16],
+                                wgsl_mul_mat_f16_f16, "mul_mat_f16_f16");
+    ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->mul_mat_pipeline[GGML_TYPE_F16][GGML_TYPE_F32],
+                                wgsl_mul_mat_f16_f32, "mul_mat_f16_f32");
+    ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->mul_mat_pipeline[GGML_TYPE_Q4_0][GGML_TYPE_F32],
+                                wgsl_mul_mat_q4_0_f32, "mul_mat_q4_0_f32");
+    ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->mul_mat_pipeline[GGML_TYPE_Q4_1][GGML_TYPE_F32],
+                                wgsl_mul_mat_q4_1_f32, "mul_mat_q4_1_f32");
+    ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->mul_mat_pipeline[GGML_TYPE_Q5_0][GGML_TYPE_F32],
+                                wgsl_mul_mat_q5_0_f32, "mul_mat_q5_0_f32");
+    ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->mul_mat_pipeline[GGML_TYPE_Q5_1][GGML_TYPE_F32],
+                                wgsl_mul_mat_q5_1_f32, "mul_mat_q5_1_f32");
+    ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->mul_mat_pipeline[GGML_TYPE_Q8_0][GGML_TYPE_F32],
+                                wgsl_mul_mat_q8_0_f32, "mul_mat_q8_0_f32");
+    ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->mul_mat_pipeline[GGML_TYPE_Q2_K][GGML_TYPE_F32],
+                                wgsl_mul_mat_q2_k_f32, "mul_mat_q2_k_f32");
+    ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->mul_mat_pipeline[GGML_TYPE_Q3_K][GGML_TYPE_F32],
+                                wgsl_mul_mat_q3_k_f32, "mul_mat_q3_k_f32");
+    ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->mul_mat_pipeline[GGML_TYPE_Q4_K][GGML_TYPE_F32],
+                                wgsl_mul_mat_q4_k_f32, "mul_mat_q4_k_f32");
+    ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->mul_mat_pipeline[GGML_TYPE_Q5_K][GGML_TYPE_F32],
+                                wgsl_mul_mat_q5_k_f32, "mul_mat_q5_k_f32");
+    ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->mul_mat_pipeline[GGML_TYPE_Q6_K][GGML_TYPE_F32],
+                                wgsl_mul_mat_q6_k_f32, "mul_mat_q6_k_f32");
+    ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->mul_mat_pipeline[GGML_TYPE_IQ2_XXS][GGML_TYPE_F32],
+                                wgsl_mul_mat_iq2_xxs_f32, "mul_mat_iq2_xxs_f32");
+    ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->mul_mat_pipeline[GGML_TYPE_IQ2_XS][GGML_TYPE_F32],
+                                wgsl_mul_mat_iq2_xs_f32, "mul_mat_iq2_xs_f32");
+    ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->mul_mat_pipeline[GGML_TYPE_IQ2_S][GGML_TYPE_F32],
+                                wgsl_mul_mat_iq2_s_f32, "mul_mat_iq2_s_f32");
+    ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->mul_mat_pipeline[GGML_TYPE_IQ3_XXS][GGML_TYPE_F32],
+                                wgsl_mul_mat_iq3_xxs_f32, "mul_mat_iq3_xxs_f32");
+    ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->mul_mat_pipeline[GGML_TYPE_IQ3_S][GGML_TYPE_F32],
+                                wgsl_mul_mat_iq3_s_f32, "mul_mat_iq3_s_f32");
+    ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->mul_mat_pipeline[GGML_TYPE_IQ1_S][GGML_TYPE_F32],
+                                wgsl_mul_mat_iq1_s_f32, "mul_mat_iq1_s_f32");
+    ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->mul_mat_pipeline[GGML_TYPE_IQ1_M][GGML_TYPE_F32],
+                                wgsl_mul_mat_iq1_m_f32, "mul_mat_iq1_m_f32");
+    ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->mul_mat_pipeline[GGML_TYPE_IQ4_NL][GGML_TYPE_F32],
+                                wgsl_mul_mat_iq4_nl_f32, "mul_mat_iq4_nl_f32");
+    ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->mul_mat_pipeline[GGML_TYPE_IQ4_XS][GGML_TYPE_F32],
+                                wgsl_mul_mat_iq4_xs_f32, "mul_mat_iq4_xs_f32");
 }
 
 static void ggml_webgpu_init_set_rows_pipeline(webgpu_context & webgpu_ctx) {
     std::vector<wgpu::ConstantEntry> constants(1);
     constants[0].key   = "wg_size";
     constants[0].value = webgpu_ctx->limits.maxComputeWorkgroupSizeX;
-    ggml_webgpu_create_pipeline(
-        webgpu_ctx->device, webgpu_ctx->set_rows_pipeline, wgsl_set_rows, "set_rows", constants);
+    ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->set_rows_pipeline, wgsl_set_rows, "set_rows",
+                                constants);
 }
 
 static void ggml_webgpu_init_cpy_pipeline(webgpu_context & webgpu_ctx) {
@@ -1098,10 +994,10 @@ static void ggml_webgpu_init_add_pipeline(webgpu_context & webgpu_ctx) {
     std::vector<wgpu::ConstantEntry> constants(1);
     constants[0].key   = "wg_size";
     constants[0].value = webgpu_ctx->limits.maxComputeWorkgroupSizeX;
-    ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->add_pipeline, wgsl_add, "add", constants);
+    ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->add_pipeline[GGML_TYPE_F32], wgsl_add_f32, "add_f32", constants);
+    ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->add_pipeline[GGML_TYPE_F16], wgsl_add_f16, "add_f16", constants);
 }
 
-
 static ggml_backend_t ggml_backend_webgpu_device_init(ggml_backend_dev_t dev, const char * params) {
     GGML_UNUSED(params);
 
@@ -1158,9 +1054,8 @@ static bool ggml_backend_webgpu_device_supports_op(ggml_backend_dev_t dev, const
         case GGML_OP_PERMUTE:
         case GGML_OP_TRANSPOSE:
         case GGML_OP_RESHAPE:
-            return true;
         case GGML_OP_ADD:
-            return op->type == GGML_TYPE_F32;
+            return true;
         case GGML_OP_CPY:
         case GGML_OP_SET_ROWS:
             return op->type == GGML_TYPE_F16 && op->src[0]->type == GGML_TYPE_F32;
@@ -1248,14 +1143,14 @@ static ggml_backend_dev_t ggml_backend_webgpu_reg_get_device(ggml_backend_reg_t
     webgpu_context ctx = reg_ctx->webgpu_ctx;
 
     wgpu::RequestAdapterOptions options = {};
-    auto                        callback =
-        [](wgpu::RequestAdapterStatus status, wgpu::Adapter adapter, const char * message, void * userdata) {
-            if (status != wgpu::RequestAdapterStatus::Success) {
-                GGML_LOG_ERROR("ggml_webgpu: Failed to get an adapter: %s\n", message);
-                return;
-            }
-            *static_cast<wgpu::Adapter *>(userdata) = std::move(adapter);
-        };
+    auto callback = [](wgpu::RequestAdapterStatus status, wgpu::Adapter adapter, const char * message,
+                       void * userdata) {
+        if (status != wgpu::RequestAdapterStatus::Success) {
+            GGML_LOG_ERROR("ggml_webgpu: Failed to get an adapter: %s\n", message);
+            return;
+        }
+        *static_cast<wgpu::Adapter *>(userdata) = std::move(adapter);
+    };
     void * userdata = &ctx->adapter;
     ctx->instance.WaitAny(
         ctx->instance.RequestAdapter(&options, wgpu::CallbackMode::AllowSpontaneous, callback, userdata), UINT64_MAX);
@@ -1277,21 +1172,21 @@ static ggml_backend_dev_t ggml_backend_webgpu_reg_get_device(ggml_backend_reg_t
         wgpu::CallbackMode::AllowSpontaneous,
         [](const wgpu::Device & device, wgpu::DeviceLostReason reason, wgpu::StringView message) {
             GGML_UNUSED(device);
-            GGML_LOG_ERROR(
-                "ggml_webgpu: Device lost! Reason: %d, Message: %s\n", static_cast<int>(reason), std::string(message).c_str());
+            GGML_LOG_ERROR("ggml_webgpu: Device lost! Reason: %d, Message: %s\n", static_cast<int>(reason),
+                           std::string(message).c_str());
         });
     dev_desc.SetUncapturedErrorCallback(
         [](const wgpu::Device & device, wgpu::ErrorType reason, wgpu::StringView message) {
             GGML_UNUSED(device);
-            GGML_LOG_ERROR(
-                "ggml_webgpu: Device error! Reason: %d, Message: %s\n", static_cast<int>(reason), std::string(message).c_str());
+            GGML_LOG_ERROR("ggml_webgpu: Device error! Reason: %d, Message: %s\n", static_cast<int>(reason),
+                           std::string(message).c_str());
         });
     ctx->instance.WaitAny(ctx->adapter.RequestDevice(
-                              &dev_desc,
-                              wgpu::CallbackMode::AllowSpontaneous,
+                              &dev_desc, wgpu::CallbackMode::AllowSpontaneous,
                               [ctx](wgpu::RequestDeviceStatus status, wgpu::Device device, wgpu::StringView message) {
                                   if (status != wgpu::RequestDeviceStatus::Success) {
-                                      GGML_LOG_ERROR("ggml_webgpu: Failed to get a device: %s\n", std::string(message).c_str());
+                                      GGML_LOG_ERROR("ggml_webgpu: Failed to get a device: %s\n",
+                                                     std::string(message).c_str());
                                       return;
                                   }
                                   ctx->device = std::move(device);
@@ -1303,14 +1198,10 @@ static ggml_backend_dev_t ggml_backend_webgpu_reg_get_device(ggml_backend_reg_t
     ctx->queue = ctx->device.GetQueue();
 
     // Create buffer pool for shader parameters
-    ctx->param_buf_pool.init(ctx->device,
-                             WEBGPU_NUM_PARAM_BUFS,
-                             WEBGPU_PARAMS_BUF_SIZE_BYTES,
+    ctx->param_buf_pool.init(ctx->device, WEBGPU_NUM_PARAM_BUFS, WEBGPU_PARAMS_BUF_SIZE_BYTES,
                              wgpu::BufferUsage::CopyDst | wgpu::BufferUsage::Uniform,
                              wgpu::BufferUsage::CopySrc | wgpu::BufferUsage::MapWrite);
-    ctx->set_rows_error_buf_pool.init(ctx->device,
-                                      WEBGPU_NUM_SET_ROWS_ERROR_BUFS,
-                                      WEBGPU_SET_ROWS_ERROR_BUF_SIZE_BYTES,
+    ctx->set_rows_error_buf_pool.init(ctx->device, WEBGPU_NUM_SET_ROWS_ERROR_BUFS, WEBGPU_SET_ROWS_ERROR_BUF_SIZE_BYTES,
                                       wgpu::BufferUsage::CopySrc | wgpu::BufferUsage::Storage,
                                       wgpu::BufferUsage::CopyDst | wgpu::BufferUsage::MapRead);
 
@@ -1322,16 +1213,10 @@ static ggml_backend_dev_t ggml_backend_webgpu_reg_get_device(ggml_backend_reg_t
 
 #ifdef GGML_WEBGPU_DEBUG
     // Initialize debug buffers
-    ggml_webgpu_create_buffer(ctx->device,
-                              ctx->debug_host_buf,
-                              WEBGPU_DEBUG_BUF_ELEMS * sizeof(uint32_t),
-                              wgpu::BufferUsage::CopyDst | wgpu::BufferUsage::MapRead,
-                              "debug_host_buf");
-    ggml_webgpu_create_buffer(ctx->device,
-                              ctx->debug_dev_buf,
-                              WEBGPU_DEBUG_BUF_ELEMS * sizeof(uint32_t),
-                              wgpu::BufferUsage::Storage | wgpu::BufferUsage::CopySrc,
-                              "debug_dev_buf");
+    ggml_webgpu_create_buffer(ctx->device, ctx->debug_host_buf, WEBGPU_DEBUG_BUF_ELEMS * sizeof(uint32_t),
+                              wgpu::BufferUsage::CopyDst | wgpu::BufferUsage::MapRead, "debug_host_buf");
+    ggml_webgpu_create_buffer(ctx->device, ctx->debug_dev_buf, WEBGPU_DEBUG_BUF_ELEMS * sizeof(uint32_t),
+                              wgpu::BufferUsage::Storage | wgpu::BufferUsage::CopySrc, "debug_dev_buf");
 #endif
 
     static ggml_backend_webgpu_device_context device_ctx;
@@ -1342,12 +1227,8 @@ static ggml_backend_dev_t ggml_backend_webgpu_reg_get_device(ggml_backend_reg_t
     GGML_LOG_INFO(
         "ggml_webgpu: adapter_info: vendor_id: %u | vendor: %s | architecture: %s | device_id: %u | name: %s | "
         "device_desc: %s\n",
-        info.vendorID,
-        std::string(info.vendor).c_str(),
-        std::string(info.architecture).c_str(),
-        info.deviceID,
-        std::string(info.device).c_str(),
-        std::string(info.description).c_str());
+        info.vendorID, std::string(info.vendor).c_str(), std::string(info.architecture).c_str(), info.deviceID,
+        std::string(info.device).c_str(), std::string(info.description).c_str());
 
     // See GGML Backend Device Interface section
     static ggml_backend_device device = {
diff --git a/ggml/src/ggml-webgpu/wgsl-shaders/add.wgsl b/ggml/src/ggml-webgpu/wgsl-shaders/add.tmpl.wgsl
similarity index 76%
rename from ggml/src/ggml-webgpu/wgsl-shaders/add.wgsl
rename to ggml/src/ggml-webgpu/wgsl-shaders/add.tmpl.wgsl
index 93adb0d562..b888c3d10b 100644
--- a/ggml/src/ggml-webgpu/wgsl-shaders/add.wgsl
+++ b/ggml/src/ggml-webgpu/wgsl-shaders/add.tmpl.wgsl
@@ -1,46 +1,54 @@
+#define(VARIANTS)
+
+[
+  {
+    "REPLS": {
+      "TYPE" : "f32",
+    }
+  },
+  {
+    "REPLS": {
+      "TYPE" : "f16",
+    }
+  }
+]
+
+#end(VARIANTS)
+
+#define(SHADER)
+
 enable f16;
 
 @group(0) @binding(0)
-var<storage, read_write> src0: array<f32>;
+var<storage, read_write> src0: array<{{TYPE}}>;
 
 @group(0) @binding(1)
-var<storage, read_write> src1: array<f32>;
+var<storage, read_write> src1: array<{{TYPE}}>;
 
 @group(0) @binding(2)
-var<storage, read_write> dst: array<f32>;
+var<storage, read_write> dst: array<{{TYPE}}>;
 
 struct Params {
     ne: u32,
 
-    stride_src0_0: u32,
-    stride_src0_1: u32,
-    stride_src0_2: u32,
-    stride_src0_3: u32,
+    // offsets in elements
+    offset_src0: u32,
+    offset_src1: u32,
+    offset_dst: u32,
 
     stride_src1_0: u32,
     stride_src1_1: u32,
     stride_src1_2: u32,
     stride_src1_3: u32,
 
-    stride_dst_0: u32,
-    stride_dst_1: u32,
-    stride_dst_2: u32,
-    stride_dst_3: u32,
-
     a_ne0: u32,
     a_ne1: u32,
     a_ne2: u32,
-    a_ne3: u32,
 
     b_ne0: u32,
     b_ne1: u32,
     b_ne2: u32,
     b_ne3: u32,
-
-    // offsets in elements
-    offset_src0: u32,
-    offset_src1: u32,
-    offset_dst: u32,
 };
 
 @group(0) @binding(3)
@@ -53,33 +61,28 @@ fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
         return;
     }
 
-    // i = thread id, ranges from 0 --> total ne - 1 
+    // i = thread id, ranges from 0 --> total ne - 1
     // represents the position in the flat array a we are adding with array b
-    var i = gid.x;  
+    var i = gid.x;
 
     // given the index of linear a, we want to compute the 4d index [a_i0, a_i1, a_i2, a_i3]
-    // we need this because tensor a and b are different shapes 
+    // we need this because tensor a and b are different shapes
     // so the same linear index won't work for b, and we can only compute b's linear index from the 4d index of a
- 
+
     let a_i3 = i / (params.a_ne2 * params.a_ne1 * params.a_ne0);
     i = i % (params.a_ne2 * params.a_ne1 * params.a_ne0);
-
     let a_i2 = i / (params.a_ne1 * params.a_ne0);
     i = i % (params.a_ne1 * params.a_ne0);
-
     let a_i1 = i / params.a_ne0;
-
     let a_i0 = i % params.a_ne0;
 
-
-    // handle repetition of b 
-        // index loops back to the beginning and repeats after elements are exhausted = modulo
+    // handle repetition of b
+    // index loops back to the beginning and repeats after elements are exhausted = modulo
     let b_i0 = a_i0 % params.b_ne0;
     let b_i1 = a_i1 % params.b_ne1;
     let b_i2 = a_i2 % params.b_ne2;
     let b_i3 = a_i3 % params.b_ne3;
 
-
     // compute index for position in b's flat array
     let src1_idx = b_i0 * params.stride_src1_0 +
                 b_i1 * params.stride_src1_1 +
@@ -91,3 +94,5 @@ fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
     // gid.x used for flat indexing into dst and a, since variable i was modified during calcs
     dst[params.offset_dst + gid.x] = src0[params.offset_src0 + gid.x] + src1[params.offset_src1 + src1_idx];
 }
+
+#end(SHADER)
diff --git a/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py b/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py
index cc8def7f13..1e518ec118 100755
--- a/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py
+++ b/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py
@@ -46,11 +46,17 @@ def generate_variants(shader_path, output_dir, outfile):
     except ValueError:
         write_shader(shader_base_name, text, output_dir, outfile)
     else:
-        decls_map = parse_decls(extract_block(text, "DECLS"))
-        shader_template = extract_block(text, "SHADER")
+        try:
+            decls_map = parse_decls(extract_block(text, "DECLS"))
+        except ValueError:
+            decls_map = {}
 
+        shader_template = extract_block(text, "SHADER")
         for variant in variants:
-            decls = variant["DECLS"]
+            if "DECLS" in variant:
+                decls = variant["DECLS"]
+            else:
+                decls = []
             decls_code = ""
             for key in decls:
                 if key not in decls_map:
@@ -60,7 +66,12 @@ def generate_variants(shader_path, output_dir, outfile):
             shader_variant = replace_placeholders(shader_template, variant["REPLS"])
             final_shader = re.sub(r'\bDECLS\b', decls_code, shader_variant)
 
-            output_name = f"{shader_base_name}_" + "_".join([variant["REPLS"]["SRC0_TYPE"], variant["REPLS"]["SRC1_TYPE"]])
+            if "SRC0_TYPE" in variant["REPLS"] and "SRC1_TYPE" in variant["REPLS"]:
+                output_name = f"{shader_base_name}_" + "_".join([variant["REPLS"]["SRC0_TYPE"], variant["REPLS"]["SRC1_TYPE"]])
+            elif "TYPE" in variant["REPLS"]:
+                output_name = f"{shader_base_name}_" + variant["REPLS"]["TYPE"]
+            else:
+                output_name = shader_base_name
             write_shader(output_name, final_shader, output_dir, outfile)
 
 

From c10219705defef07c8c49622b17ea279f8cbbbb6 Mon Sep 17 00:00:00 2001
From: Reese Levine <reeselevine1@gmail.com>
Date: Mon, 8 Sep 2025 10:15:21 -0700
Subject: [PATCH 17/17] Get addition and multiplication working

---
 ggml/src/ggml-webgpu/ggml-webgpu.cpp          | 116 ++++++++++++++----
 .../ggml-webgpu/wgsl-shaders/add.tmpl.wgsl    |  62 +---------
 .../wgsl-shaders/add_in_place.tmpl.wgsl       |  41 +++++++
 .../ggml-webgpu/wgsl-shaders/binary_head.tmpl |  45 +++++++
 .../ggml-webgpu/wgsl-shaders/embed_wgsl.py    |  26 +++-
 .../ggml-webgpu/wgsl-shaders/mul.tmpl.wgsl    |  44 +++++++
 .../wgsl-shaders/mul_in_place.tmpl.wgsl       |  41 +++++++
 7 files changed, 292 insertions(+), 83 deletions(-)
 create mode 100644 ggml/src/ggml-webgpu/wgsl-shaders/add_in_place.tmpl.wgsl
 create mode 100644 ggml/src/ggml-webgpu/wgsl-shaders/binary_head.tmpl
 create mode 100644 ggml/src/ggml-webgpu/wgsl-shaders/mul.tmpl.wgsl
 create mode 100644 ggml/src/ggml-webgpu/wgsl-shaders/mul_in_place.tmpl.wgsl

diff --git a/ggml/src/ggml-webgpu/ggml-webgpu.cpp b/ggml/src/ggml-webgpu/ggml-webgpu.cpp
index 4bc011729e..2669a3f95c 100644
--- a/ggml/src/ggml-webgpu/ggml-webgpu.cpp
+++ b/ggml/src/ggml-webgpu/ggml-webgpu.cpp
@@ -126,6 +126,9 @@ struct webgpu_context_struct {
     wgpu::ComputePipeline set_rows_pipeline;
     wgpu::ComputePipeline cpy_pipeline;
     wgpu::ComputePipeline add_pipeline[2];
+    wgpu::ComputePipeline add_ip_pipeline[2];
+    wgpu::ComputePipeline mul_pipeline[2];
+    wgpu::ComputePipeline mul_ip_pipeline[2];
 
     size_t memset_bytes_per_thread;
 
@@ -347,7 +350,8 @@ static void ggml_backend_webgpu_build_and_enqueue(webgpu_context &
                                                   std::vector<uint32_t>             params,
                                                   std::vector<wgpu::BindGroupEntry> bind_group_entries,
                                                   uint32_t                          wg_x,
-                                                  bool                              submit_and_wait = false) {
+                                                  const char *                      bind_group_label = nullptr,
+                                                  bool                              submit_and_wait  = false) {
     webgpu_pool_bufs params_bufs = ctx->param_buf_pool.alloc_bufs();
 
     ggml_backend_webgpu_map_buffer(ctx, params_bufs.host_buf, wgpu::MapMode::Write, 0, params_bufs.host_buf.GetSize());
@@ -368,6 +372,9 @@ static void ggml_backend_webgpu_build_and_enqueue(webgpu_context &
     bind_group_desc.layout     = pipeline.GetBindGroupLayout(0);
     bind_group_desc.entryCount = bind_group_entries.size();
     bind_group_desc.entries    = bind_group_entries.data();
+    if (bind_group_label) {
+        bind_group_desc.label = bind_group_label;
+    }
     wgpu::BindGroup bind_group = ctx->device.CreateBindGroup(&bind_group_desc);
 
     wgpu::CommandEncoder encoder = ctx->device.CreateCommandEncoder();
@@ -413,7 +420,7 @@ static void ggml_backend_webgpu_buffer_memset(webgpu_context & ctx,
     };
     size_t   bytes_per_wg = ctx->limits.maxComputeWorkgroupSizeX * ctx->memset_bytes_per_thread;
     uint32_t wg_x         = ((size + 3) + bytes_per_wg - 1) / bytes_per_wg;
-    ggml_backend_webgpu_build_and_enqueue(ctx, ctx->memset_pipeline, params, entries, wg_x, true);
+    ggml_backend_webgpu_build_and_enqueue(ctx, ctx->memset_pipeline, params, entries, wg_x, "MEMSET", true);
 }
 
 /** End WebGPU Actions */
@@ -457,6 +464,12 @@ static size_t ggml_webgpu_tensor_binding_size(webgpu_context & ctx, ggml_tensor
            ~(WEBGPU_STORAGE_BUF_BINDING_MULT - 1);
 }
 
+// Returns true if a and b share the same buffer (compared via Get()) and tensor offset, i.e. the op can run in place
+static bool ggml_webgpu_tensor_equal(ggml_tensor * a, ggml_tensor * b) {
+    return (ggml_webgpu_tensor_buf(a).Get() == ggml_webgpu_tensor_buf(b).Get()) &&
+           (ggml_webgpu_tensor_offset(a) == ggml_webgpu_tensor_offset(b));
+}
+
 static void ggml_webgpu_cpy(webgpu_context & ctx, ggml_tensor * src, ggml_tensor * dst) {
     uint32_t ne = (uint32_t) ggml_nelements(dst);
 
@@ -485,7 +498,7 @@ static void ggml_webgpu_cpy(webgpu_context & ctx, ggml_tensor * src, ggml_tensor
 
     size_t   max_wg_size = ctx->limits.maxComputeWorkgroupSizeX;
     uint32_t wg_x        = (ne + max_wg_size - 1) / max_wg_size;
-    ggml_backend_webgpu_build_and_enqueue(ctx, ctx->cpy_pipeline, params, entries, wg_x);
+    ggml_backend_webgpu_build_and_enqueue(ctx, ctx->cpy_pipeline, params, entries, wg_x, ggml_op_name(dst->op));
 }
 
 static void ggml_webgpu_set_rows(webgpu_context & ctx, ggml_tensor * src, ggml_tensor * idx, ggml_tensor * dst) {
@@ -537,7 +550,7 @@ static void ggml_webgpu_set_rows(webgpu_context & ctx, ggml_tensor * src, ggml_t
     std::lock_guard<std::recursive_mutex> lock(ctx->mutex);
     ctx->staged_set_row_error_bufs.push_back(error_bufs);
 
-    ggml_backend_webgpu_build_and_enqueue(ctx, ctx->set_rows_pipeline, params, entries, wg_x);
+    ggml_backend_webgpu_build_and_enqueue(ctx, ctx->set_rows_pipeline, params, entries, wg_x, ggml_op_name(dst->op));
 }
 
 static void ggml_webgpu_mul_mat(webgpu_context & ctx, ggml_tensor * src0, ggml_tensor * src1, ggml_tensor * dst) {
@@ -577,10 +590,16 @@ static void ggml_webgpu_mul_mat(webgpu_context & ctx, ggml_tensor * src0, ggml_t
 
     uint32_t wg_x =
         (dst->ne[0] * dst->ne[1] * dst->ne[2] * dst->ne[3] + WEBGPU_MUL_MAT_WG_SIZE - 1) / WEBGPU_MUL_MAT_WG_SIZE;
-    ggml_backend_webgpu_build_and_enqueue(ctx, ctx->mul_mat_pipeline[src0->type][src1->type], params, entries, wg_x);
+    ggml_backend_webgpu_build_and_enqueue(ctx, ctx->mul_mat_pipeline[src0->type][src1->type], params, entries, wg_x,
+                                          ggml_op_name(dst->op));
 }
 
-static void ggml_webgpu_add(webgpu_context & ctx, ggml_tensor * src0, ggml_tensor * src1, ggml_tensor * dst) {
+static void ggml_webgpu_binary_op(webgpu_context &        ctx,
+                                  ggml_tensor *           src0,
+                                  ggml_tensor *           src1,
+                                  ggml_tensor *           dst,
+                                  wgpu::ComputePipeline & pipeline,
+                                  bool                    in_place) {
     std::vector<uint32_t> params = {
         (uint32_t) ggml_nelements(dst),
         (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, src0) / ggml_type_size(src0->type)),
@@ -607,16 +626,18 @@ static void ggml_webgpu_add(webgpu_context & ctx, ggml_tensor * src0, ggml_tenso
         { .binding = 1,
          .buffer  = ggml_webgpu_tensor_buf(src1),
          .offset  = ggml_webgpu_tensor_align_offset(ctx, src1),
-         .size    = ggml_webgpu_tensor_binding_size(ctx, src1) },
-        { .binding = 2,
-         .buffer  = ggml_webgpu_tensor_buf(dst),
-         .offset  = ggml_webgpu_tensor_align_offset(ctx, dst),
-         .size    = ggml_webgpu_tensor_binding_size(ctx, dst)  }
+         .size    = ggml_webgpu_tensor_binding_size(ctx, src1) }
     };
+    if (!in_place) {
+        entries.push_back({ .binding = 2,
+                            .buffer  = ggml_webgpu_tensor_buf(dst),
+                            .offset  = ggml_webgpu_tensor_align_offset(ctx, dst),
+                            .size    = ggml_webgpu_tensor_binding_size(ctx, dst) });
+    }
 
     size_t   max_wg_size = ctx->limits.maxComputeWorkgroupSizeX;
     uint32_t wg_x        = (ggml_nelements(dst) + max_wg_size - 1) / max_wg_size;
-    ggml_backend_webgpu_build_and_enqueue(ctx, ctx->add_pipeline[dst->type], params, entries, wg_x);
+    ggml_backend_webgpu_build_and_enqueue(ctx, pipeline, params, entries, wg_x, ggml_op_name(dst->op));
 }
 
 // Returns true if node has enqueued work into the queue, false otherwise
@@ -654,7 +675,20 @@ static bool ggml_webgpu_encode_node(webgpu_context ctx, ggml_tensor * node) {
             }
         case GGML_OP_ADD:
             {
-                ggml_webgpu_add(ctx, src0, src1, node);
+                if (ggml_webgpu_tensor_equal(src0, node)) {
+                    ggml_webgpu_binary_op(ctx, src0, src1, node, ctx->add_ip_pipeline[node->type], true);
+                } else {
+                    ggml_webgpu_binary_op(ctx, src0, src1, node, ctx->add_pipeline[node->type], false);
+                }
+                break;
+            }
+        case GGML_OP_MUL:
+            {
+                if (ggml_webgpu_tensor_equal(src0, node)) {
+                    ggml_webgpu_binary_op(ctx, src0, src1, node, ctx->mul_ip_pipeline[node->type], true);
+                } else {
+                    ggml_webgpu_binary_op(ctx, src0, src1, node, ctx->mul_pipeline[node->type], false);
+                }
                 break;
             }
         default:
@@ -994,8 +1028,28 @@ static void ggml_webgpu_init_add_pipeline(webgpu_context & webgpu_ctx) {
     std::vector<wgpu::ConstantEntry> constants(1);
     constants[0].key   = "wg_size";
     constants[0].value = webgpu_ctx->limits.maxComputeWorkgroupSizeX;
-    ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->add_pipeline[GGML_TYPE_F32], wgsl_add_f32, "add_f32", constants);
-    ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->add_pipeline[GGML_TYPE_F16], wgsl_add_f16, "add_f16", constants);
+    ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->add_pipeline[GGML_TYPE_F32], wgsl_add_f32, "add_f32",
+                                constants);
+    ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->add_pipeline[GGML_TYPE_F16], wgsl_add_f16, "add_f16",
+                                constants);
+    ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->add_ip_pipeline[GGML_TYPE_F32], wgsl_add_in_place_f32,
+                                "add_in_place_f32", constants);
+    ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->add_ip_pipeline[GGML_TYPE_F16], wgsl_add_in_place_f16,
+                                "add_in_place_f16", constants);
+}
+
+static void ggml_webgpu_init_mul_pipeline(webgpu_context & webgpu_ctx) {
+    std::vector<wgpu::ConstantEntry> constants(1);
+    constants[0].key   = "wg_size";
+    constants[0].value = webgpu_ctx->limits.maxComputeWorkgroupSizeX;
+    ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->mul_pipeline[GGML_TYPE_F32], wgsl_mul_f32, "mul_f32",
+                                constants);
+    ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->mul_pipeline[GGML_TYPE_F16], wgsl_mul_f16, "mul_f16",
+                                constants);
+    ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->mul_ip_pipeline[GGML_TYPE_F32], wgsl_mul_in_place_f32,
+                                "mul_in_place_f32", constants);
+    ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->mul_ip_pipeline[GGML_TYPE_F16], wgsl_mul_in_place_f16,
+                                "mul_in_place_f16", constants);
 }
 
 static ggml_backend_t ggml_backend_webgpu_device_init(ggml_backend_dev_t dev, const char * params) {
@@ -1048,22 +1102,30 @@ static bool ggml_backend_webgpu_device_supports_buft(ggml_backend_dev_t dev, ggm
 static bool ggml_backend_webgpu_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
     GGML_UNUSED(dev);
 
+    bool supports_op = false;
     switch (op->op) {
         case GGML_OP_NONE:
         case GGML_OP_VIEW:
         case GGML_OP_PERMUTE:
         case GGML_OP_TRANSPOSE:
         case GGML_OP_RESHAPE:
+            supports_op = true;
+            break;
         case GGML_OP_ADD:
-            return true;
+        case GGML_OP_MUL:
+            supports_op = (op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_F16) && (op->src[0]->type == op->type) &&
+                          (op->src[1]->type == op->type);
+            break;
         case GGML_OP_CPY:
         case GGML_OP_SET_ROWS:
-            return op->type == GGML_TYPE_F16 && op->src[0]->type == GGML_TYPE_F32;
+            supports_op = (op->type == GGML_TYPE_F16 && op->src[0]->type == GGML_TYPE_F32);
+            break;
         case GGML_OP_MUL_MAT:
             {
                 switch (op->src[1]->type) {
                     case GGML_TYPE_F16:
-                        return op->src[0]->type == GGML_TYPE_F16;
+                        supports_op = (op->src[0]->type == GGML_TYPE_F16);
+                        break;
                     case GGML_TYPE_F32:
                         switch (op->src[0]->type) {
                             case GGML_TYPE_F32:
@@ -1087,17 +1149,26 @@ static bool ggml_backend_webgpu_device_supports_op(ggml_backend_dev_t dev, const
                             case GGML_TYPE_IQ1_M:
                             case GGML_TYPE_IQ4_NL:
                             case GGML_TYPE_IQ4_XS:
-                                return true;
+                                supports_op = true;
+                                break;
                             default:
-                                return false;
+                                break;
                         }
                     default:
-                        return false;
+                        break;
                 }
             }
         default:
-            return false;
+            break;
     }
+#ifdef GGML_WEBGPU_DEBUG
+    if (!supports_op) {
+        WEBGPU_LOG_DEBUG("not supported: " << ggml_op_name(op->op) << " with types dst: " << ggml_type_name(op->type)
+                                           << ", src0: " << (op->src[0] ? ggml_type_name(op->src[0]->type) : "null")
+                                           << ", src1: " << (op->src[1] ? ggml_type_name(op->src[1]->type) : "null"));
+    }
+#endif
+    return supports_op;
 }
 
 static struct ggml_backend_device_i ggml_backend_webgpu_device_i = {
@@ -1210,6 +1281,7 @@ static ggml_backend_dev_t ggml_backend_webgpu_reg_get_device(ggml_backend_reg_t
     ggml_webgpu_init_set_rows_pipeline(ctx);
     ggml_webgpu_init_cpy_pipeline(ctx);
     ggml_webgpu_init_add_pipeline(ctx);
+    ggml_webgpu_init_mul_pipeline(ctx);
 
 #ifdef GGML_WEBGPU_DEBUG
     // Initialize debug buffers
diff --git a/ggml/src/ggml-webgpu/wgsl-shaders/add.tmpl.wgsl b/ggml/src/ggml-webgpu/wgsl-shaders/add.tmpl.wgsl
index b888c3d10b..f261cbb553 100644
--- a/ggml/src/ggml-webgpu/wgsl-shaders/add.tmpl.wgsl
+++ b/ggml/src/ggml-webgpu/wgsl-shaders/add.tmpl.wgsl
@@ -19,6 +19,8 @@
 
 enable f16;
 
+#include "binary_head.tmpl"
+
 @group(0) @binding(0)
 var<storage, read_write> src0: array<{{TYPE}}>;
 
@@ -28,71 +30,15 @@ var<storage, read_write> src1: array<{{TYPE}}>;
 @group(0) @binding(2)
 var<storage, read_write> dst: array<{{TYPE}}>;
 
-struct Params {
-    ne: u32,
-
-    // offsets in elements
-    offset_src0: u32,
-    offset_src1: u32,
-    offset_dst: u32,
-
-    stride_src1_0: u32,
-    stride_src1_1: u32,
-    stride_src1_2: u32,
-    stride_src1_3: u32,
-
-    a_ne0: u32,
-    a_ne1: u32,
-    a_ne2: u32,
-
-    b_ne0: u32,
-    b_ne1: u32,
-    b_ne2: u32,
-    b_ne3: u32,
-};
-
 @group(0) @binding(3)
 var<uniform> params: Params;
 
 override wg_size: u32;
 @compute @workgroup_size(wg_size)
 fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
-    if (gid.x >= params.ne) {
-        return;
+    if (gid.x < params.ne) {
+        dst[params.offset_dst + gid.x] = src0[params.offset_src0 + gid.x] + src1[params.offset_src1 + src1_index(gid.x)];
     }
-
-    // i = thread id, ranges from 0 --> total ne - 1
-    // represents the position in the flat array a we are adding with array b
-    var i = gid.x;
-
-    // given the index of linear a, we want to compute the 4d index [a_i0, a_i1, a_i2, a_i3]
-    // we need this because tensor a and b are different shapes
-    // so the same linear index won't work for b, and we can only compute b's linear index from the 4d index of a
-
-    let a_i3 = i / (params.a_ne2 * params.a_ne1 * params.a_ne0);
-    i = i % (params.a_ne2 * params.a_ne1 * params.a_ne0);
-    let a_i2 = i / (params.a_ne1 * params.a_ne0);
-    i = i % (params.a_ne1 * params.a_ne0);
-    let a_i1 = i / params.a_ne0;
-    let a_i0 = i % params.a_ne0;
-
-    // handle repetition of b
-    // index loops back to the beginning and repeats after elements are exhausted = modulo
-    let b_i0 = a_i0 % params.b_ne0;
-    let b_i1 = a_i1 % params.b_ne1;
-    let b_i2 = a_i2 % params.b_ne2;
-    let b_i3 = a_i3 % params.b_ne3;
-
-    // compute index for position in b's flat array
-    let src1_idx = b_i0 * params.stride_src1_0 +
-                b_i1 * params.stride_src1_1 +
-                b_i2 * params.stride_src1_2 +
-                b_i3 * params.stride_src1_3;
-
-    // actual addition operation, now that the indexes are all figured out
-    // ensuring that the offsets are included
-    // gid.x used for flat indexing into dst and a, since variable i was modified during calcs
-    dst[params.offset_dst + gid.x] = src0[params.offset_src0 + gid.x] + src1[params.offset_src1 + src1_idx];
 }
 
 #end(SHADER)
diff --git a/ggml/src/ggml-webgpu/wgsl-shaders/add_in_place.tmpl.wgsl b/ggml/src/ggml-webgpu/wgsl-shaders/add_in_place.tmpl.wgsl
new file mode 100644
index 0000000000..903f7bdbcc
--- /dev/null
+++ b/ggml/src/ggml-webgpu/wgsl-shaders/add_in_place.tmpl.wgsl
@@ -0,0 +1,41 @@
+#define(VARIANTS)
+
+[
+  {
+    "REPLS": {
+      "TYPE" : "f32",
+    }
+  },
+  {
+    "REPLS": {
+      "TYPE" : "f16",
+    }
+  }
+]
+
+#end(VARIANTS)
+
+#define(SHADER)
+
+enable f16;
+
+#include "binary_head.tmpl"
+
+@group(0) @binding(0)
+var<storage, read_write> src0: array<{{TYPE}}>;
+
+@group(0) @binding(1)
+var<storage, read_write> src1: array<{{TYPE}}>;
+
+@group(0) @binding(2)
+var<uniform> params: Params;
+
+override wg_size: u32; // workgroup size, supplied at pipeline creation as the "wg_size" constant
+@compute @workgroup_size(wg_size)
+fn main(@builtin(global_invocation_id) gid: vec3<u32>) { // in-place add: the sum is written back into src0
+    if (gid.x < params.ne) {
+        src0[params.offset_dst + gid.x] = src0[params.offset_src0 + gid.x] + src1[params.offset_src1 + src1_index(gid.x)];
+    }
+}
+
+#end(SHADER)
diff --git a/ggml/src/ggml-webgpu/wgsl-shaders/binary_head.tmpl b/ggml/src/ggml-webgpu/wgsl-shaders/binary_head.tmpl
new file mode 100644
index 0000000000..4b254f468d
--- /dev/null
+++ b/ggml/src/ggml-webgpu/wgsl-shaders/binary_head.tmpl
@@ -0,0 +1,45 @@
+struct Params { // parameter block read by the binary-op shaders that include this file
+    ne: u32, // total element count; the shaders only process threads with gid.x < ne
+
+    // offsets in elements
+    offset_src0: u32,
+    offset_src1: u32,
+    offset_dst: u32,
+
+    stride_src1_0: u32, // strides of b (src1), multiplied by b's coordinates in src1_index
+    stride_src1_1: u32,
+    stride_src1_2: u32,
+    stride_src1_3: u32,
+
+    a_ne0: u32, // extents of a; there is no a_ne3 because src1_index never divides or wraps by it
+    a_ne1: u32,
+    a_ne2: u32,
+
+    b_ne0: u32, // extents of b, used by src1_index to wrap a's coordinates
+    b_ne1: u32,
+    b_ne2: u32,
+    b_ne3: u32,
+};
+
+fn src1_index(_i: u32) -> u32 { // flat index into a -> flat index into b; reads `params` declared by the includer
+    var i = _i; // mutable working copy, reduced as each dimension is peeled off
+    let a_i3 = i / (params.a_ne2 * params.a_ne1 * params.a_ne0); // unflatten i into a's 4D index, outermost first
+    i = i % (params.a_ne2 * params.a_ne1 * params.a_ne0);
+    let a_i2 = i / (params.a_ne1 * params.a_ne0);
+    i = i % (params.a_ne1 * params.a_ne0);
+    let a_i1 = i / params.a_ne0;
+    let a_i0 = i % params.a_ne0;
+
+    // broadcast b along a: each coordinate of a wraps around b's extent in that dimension,
+    // so b's elements repeat once its extent is exhausted (hence the modulo)
+    let b_i0 = a_i0 % params.b_ne0;
+    let b_i1 = a_i1 % params.b_ne1;
+    let b_i2 = a_i2 % params.b_ne2;
+    let b_i3 = a_i3 % params.b_ne3;
+
+    // combine the wrapped coordinates with b's strides to get the position in b's flat array
+    return b_i0 * params.stride_src1_0 +
+           b_i1 * params.stride_src1_1 +
+           b_i2 * params.stride_src1_2 +
+           b_i3 * params.stride_src1_3;
+}
diff --git a/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py b/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py
index 1e518ec118..a9e73ed295 100755
--- a/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py
+++ b/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py
@@ -26,6 +26,24 @@ def replace_placeholders(shader_text, replacements):
         shader_text = re.sub(pattern, str(val), shader_text)
     return shader_text
 
+def expand_includes(shader, input_dir, _stack=()):
+    """
+    Replace #include "file" lines in the text with the contents of that file (looked up relative to
+    input_dir), expanding nested includes recursively. _stack is internal: the chain of files being expanded.
+    """
+    include_pattern = re.compile(r'^\s*#include\s+"([^"]+)"\s*$', re.MULTILINE)
+
+    def replacer(match):
+        file_path = os.path.join(input_dir, match.group(1))
+        if file_path in _stack:  # a file that includes itself (directly or not) would never finish expanding
+            raise RecursionError(f"Circular #include detected: {file_path}")
+        if not os.path.exists(file_path):
+            raise FileNotFoundError(f"Included file not found: {file_path}")
+        with open(file_path, "r", encoding="utf-8") as f:
+            included_code = f.read()
+        return expand_includes(included_code, input_dir, _stack + (file_path,))
+
+    return include_pattern.sub(replacer, shader)
 
 def write_shader(shader_name, shader_code, output_dir, outfile):
     if output_dir:
@@ -35,8 +53,9 @@ def write_shader(shader_name, shader_code, output_dir, outfile):
     outfile.write(f'const char* wgsl_{shader_name} = R"({shader_code})";\n\n')
 
 
-def generate_variants(shader_path, output_dir, outfile):
-    shader_base_name = shader_path.split("/")[-1].split(".")[0]
+def generate_variants(fname, input_dir, output_dir, outfile):
+    shader_path = os.path.join(input_dir, fname)
+    shader_base_name = fname.split(".")[0]
 
     with open(shader_path, "r", encoding="utf-8") as f:
         text = f.read()
@@ -52,6 +71,7 @@ def generate_variants(shader_path, output_dir, outfile):
             decls_map = {}
 
         shader_template = extract_block(text, "SHADER")
+        shader_template = expand_includes(shader_template, input_dir)
         for variant in variants:
             if "DECLS" in variant:
                 decls = variant["DECLS"]
@@ -89,7 +109,7 @@ def main():
         out.write("// Auto-generated shader embedding\n\n")
         for fname in sorted(os.listdir(args.input_dir)):
             if fname.endswith(".wgsl"):
-                generate_variants(os.path.join(args.input_dir, fname), args.output_dir, out)
+                generate_variants(fname, args.input_dir, args.output_dir, out)
 
 
 if __name__ == "__main__":
diff --git a/ggml/src/ggml-webgpu/wgsl-shaders/mul.tmpl.wgsl b/ggml/src/ggml-webgpu/wgsl-shaders/mul.tmpl.wgsl
new file mode 100644
index 0000000000..12506e1420
--- /dev/null
+++ b/ggml/src/ggml-webgpu/wgsl-shaders/mul.tmpl.wgsl
@@ -0,0 +1,44 @@
+#define(VARIANTS)
+
+[
+  {
+    "REPLS": {
+      "TYPE" : "f32",
+    }
+  },
+  {
+    "REPLS": {
+      "TYPE" : "f16",
+    }
+  }
+]
+
+#end(VARIANTS)
+
+#define(SHADER)
+
+enable f16;
+
+#include "binary_head.tmpl"
+
+@group(0) @binding(0) // first operand, only read by main; NOTE(review): read_write presumably mirrors the host-side layout - confirm
+var<storage, read_write> src0: array<{{TYPE}}>;
+
+@group(0) @binding(1) // second operand, only read by main, at offset_src1 + src1_index(gid.x)
+var<storage, read_write> src1: array<{{TYPE}}>;
+
+@group(0) @binding(2) // output: main writes the product at offset_dst + gid.x
+var<storage, read_write> dst: array<{{TYPE}}>;
+
+@group(0) @binding(3) // Params comes from binary_head.tmpl, included above
+var<uniform> params: Params;
+
+override wg_size: u32;
+@compute @workgroup_size(wg_size)
+fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
+    let i = gid.x; // flat element index handled by this invocation
+    if (i >= params.ne) { return; } // invocations past the last element have nothing to do
+    dst[params.offset_dst + i] = src0[params.offset_src0 + i] * src1[params.offset_src1 + src1_index(i)];
+}
+
+#end(SHADER)
diff --git a/ggml/src/ggml-webgpu/wgsl-shaders/mul_in_place.tmpl.wgsl b/ggml/src/ggml-webgpu/wgsl-shaders/mul_in_place.tmpl.wgsl
new file mode 100644
index 0000000000..e467e59edb
--- /dev/null
+++ b/ggml/src/ggml-webgpu/wgsl-shaders/mul_in_place.tmpl.wgsl
@@ -0,0 +1,41 @@
+#define(VARIANTS)
+
+[
+  {
+    "REPLS": {
+      "TYPE" : "f32",
+    }
+  },
+  {
+    "REPLS": {
+      "TYPE" : "f16",
+    }
+  }
+]
+
+#end(VARIANTS)
+
+#define(SHADER)
+
+enable f16;
+
+#include "binary_head.tmpl"
+
+@group(0) @binding(0) // first operand and output: main reads it at offset_src0 + gid.x and writes it at offset_dst + gid.x
+var<storage, read_write> src0: array<{{TYPE}}>;
+
+@group(0) @binding(1) // second operand, only read by main, at offset_src1 + src1_index(gid.x)
+var<storage, read_write> src1: array<{{TYPE}}>;
+
+@group(0) @binding(2) // Params comes from binary_head.tmpl, included above
+var<uniform> params: Params;
+
+override wg_size: u32;
+@compute @workgroup_size(wg_size)
+fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
+    let i = gid.x; // flat element index handled by this invocation
+    if (i >= params.ne) { return; } // invocations past the last element have nothing to do
+    src0[params.offset_dst + i] = src0[params.offset_src0 + i] * src1[params.offset_src1 + src1_index(i)];
+}
+
+#end(SHADER)