ggml-webgpu: parameterize submission size and add iOS specific limits (#21533)

* Work towards removing bitcast

* Move rest of existing types over

* Add timeout back to wait and remove synchronous set_tensor/memset_tensor

* move to unpackf16 for wider compatibility

* cleanup

* Remove deadlock condition in free_bufs

* Start work on removing parameter buffer pools

* Simplify and optimize further

* simplify profile futures

* Fix stride

* Try using a single command buffer per batch

* formatting

* Add parameters for different browsers in-flight submissions

* Update handling of batch size too

* Throttle iOS as much as possible

* Increase timeout for llvm-pipe testing
This commit is contained in:
Reese Levine 2026-04-07 10:30:01 -07:00 committed by GitHub
parent de1aa6fa73
commit 957d717ce5
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
1 changed file with 113 additions and 35 deletions

View File

@ -16,7 +16,6 @@
#include <webgpu/webgpu_cpp.h>
#include <atomic>
#include <condition_variable>
#include <cstdint>
#include <cstring>
#ifdef GGML_WEBGPU_GPU_PROFILE
@ -25,7 +24,6 @@
#if defined(GGML_WEBGPU_DEBUG) || defined(GGML_WEBGPU_CPU_PROFILE) || defined(GGML_WEBGPU_GPU_PROFILE)
# include <iostream>
#endif
#include <map>
#include <memory>
#include <mutex>
#include <optional>
@ -81,13 +79,13 @@ static inline void compute_2d_workgroups(uint32_t total_wg, uint32_t max_per_dim
/* Constants */
#define WEBGPU_COMMAND_SUBMIT_BATCH_SIZE 32u
#define WEBGPU_NUM_PARAM_SLOTS \
(WEBGPU_COMMAND_SUBMIT_BATCH_SIZE + 10) // a few extra for safety, since some operations may need multiple slots
#define WEBGPU_WAIT_ANY_TIMEOUT_MS 100
#define WEBGPU_PARAMS_BUF_SIZE_BYTES 128 // enough for 32 parameters
#define WEBGPU_SET_ROWS_ERROR_BUF_SIZE_BYTES 4
#define WEBGPU_STORAGE_BUF_BINDING_MULT 4 // a storage buffer binding size must be a multiple of 4
#define WEBGPU_DEFAULT_COMMAND_SUBMIT_BATCH_SIZE 32u
#define WEBGPU_NUM_PARAM_SLOT_SAFETY_MARGIN 10u
#define WEBGPU_RUNTIME_WAIT_TIMEOUT_MS 30000u
#define WEBGPU_RUNTIME_WAIT_TIMEOUT_NS (WEBGPU_RUNTIME_WAIT_TIMEOUT_MS * 1e6)
#define WEBGPU_PARAMS_BUF_SIZE_BYTES 128 // enough for 32 parameters
#define WEBGPU_SET_ROWS_ERROR_BUF_SIZE_BYTES 4
#define WEBGPU_STORAGE_BUF_BINDING_MULT 4 // a storage buffer binding size must be a multiple of 4
// For operations which process a row in parallel, this seems like a reasonable
// default
@ -252,6 +250,8 @@ struct webgpu_global_context_struct {
wgpu::Adapter adapter;
wgpu::Device device;
wgpu::Queue queue;
uint32_t command_submit_batch_size = WEBGPU_DEFAULT_COMMAND_SUBMIT_BATCH_SIZE;
uint32_t max_inflight_batches = UINT32_MAX;
webgpu_capabilities capabilities;
// Shared buffer to move data from device to host
@ -417,16 +417,72 @@ static void ggml_backend_webgpu_wait_profile_futures(webgpu_global_context &
}
#endif
// Validates the outcome of an instance.WaitAny() call together with the
// status value captured by the associated async callback, aborting with a
// descriptive message on any failure.
//   wait_status      - result returned by WaitAny (TimedOut / Error / ok)
//   callback_status  - status value the callback recorded
//   success_status   - value of callback_status that means success
//   wait_name        - label used in the wait-failure messages
//   failure_name     - label used in the callback-failure message
//   callback_message - message string the callback recorded
template <typename T>
static void ggml_backend_webgpu_check_wait_status(wgpu::WaitStatus wait_status,
                                                  T                callback_status,
                                                  T                success_status,
                                                  const char *     wait_name,
                                                  const char *     failure_name,
                                                  const char *     callback_message) {
    switch (wait_status) {
        case wgpu::WaitStatus::TimedOut:
            GGML_ABORT("ggml_webgpu: %s timed out after %u ms\n", wait_name, WEBGPU_RUNTIME_WAIT_TIMEOUT_MS);
        case wgpu::WaitStatus::Error:
            GGML_ABORT("ggml_webgpu: %s failed\n", wait_name);
        default:
            // Wait itself succeeded; fall through to check the callback result.
            break;
    }
    if (callback_status != success_status) {
        GGML_ABORT("ggml_webgpu: %s failed with status %d: %s\n", failure_name, static_cast<int>(callback_status),
                   callback_message);
    }
}
#ifdef __EMSCRIPTEN__
// iOS browsers seem to have very strict limits on the number of in-flight GPU commands, so we need to throttle to avoid failures.
// Returns 1 when the user agent string identifies an iPhone or iPad, 0 otherwise.
// NOTE(review): iPadOS 13+ in "request desktop site" mode reports a Macintosh
// user agent, so such iPads would not be detected here — confirm whether that
// case needs handling.
EM_JS(int, ggml_webgpu_is_ios_browser, (), {
const ua = navigator.userAgent;
return (ua.includes('iPhone') || ua.includes('iPad')) ? 1 : 0;
});
#endif
// Decide how many command-buffer batches may be in flight on the queue before
// we block and wait for completion. iOS browsers tolerate very few in-flight
// submissions, so they are throttled to one batch at a time; everywhere else
// the count is effectively unlimited.
static uint32_t ggml_backend_webgpu_get_max_inflight_batches(const wgpu::AdapterInfo & info) {
#ifdef __EMSCRIPTEN__
    return ggml_webgpu_is_ios_browser() ? 1u : UINT32_MAX;
#else
    GGML_UNUSED(info);
    return UINT32_MAX;
#endif
}
// Decide how many kernels to accumulate into a single command-buffer
// submission. iOS browsers get a smaller batch (16) to keep individual
// submissions lightweight; all other targets use the compile-time default.
static uint32_t ggml_backend_webgpu_get_command_submit_batch_size(const wgpu::AdapterInfo & info) {
#ifdef __EMSCRIPTEN__
    return ggml_webgpu_is_ios_browser() ? 16u : WEBGPU_DEFAULT_COMMAND_SUBMIT_BATCH_SIZE;
#else
    GGML_UNUSED(info);
    return WEBGPU_DEFAULT_COMMAND_SUBMIT_BATCH_SIZE;
#endif
}
// Blocks until all work previously submitted to the device queue has
// completed, then validates both the WaitAny result (bounded by
// WEBGPU_RUNTIME_WAIT_TIMEOUT_NS) and the queue status reported by the
// completion callback, aborting on timeout or failure.
//
// Fix: the pre-refactor implementation (an unconditional WaitAny with an
// UINT64_MAX timeout and a log-only callback) was left in the body alongside
// the new bounded/validated wait — diff/merge residue that made every caller
// wait on the queue twice, the first time with no timeout at all. The stale
// copy is removed; only the bounded, status-checked wait remains.
static void ggml_backend_webgpu_wait_queue(webgpu_global_context & ctx) {
    wgpu::QueueWorkDoneStatus callback_status = wgpu::QueueWorkDoneStatus::Error;
    std::string               callback_message;

    const wgpu::WaitStatus wait_status = ctx->instance.WaitAny(
        ctx->queue.OnSubmittedWorkDone(
            wgpu::CallbackMode::AllowSpontaneous,
            [&callback_status, &callback_message](wgpu::QueueWorkDoneStatus status, wgpu::StringView message) {
                callback_status  = status;
                callback_message = std::string(message);
            }),
        WEBGPU_RUNTIME_WAIT_TIMEOUT_NS);

    ggml_backend_webgpu_check_wait_status(wait_status, callback_status, wgpu::QueueWorkDoneStatus::Success,
                                          "Queue wait", "Queue work", callback_message.c_str());
}
static void ggml_backend_webgpu_map_buffer(webgpu_global_context & ctx,
@ -434,14 +490,31 @@ static void ggml_backend_webgpu_map_buffer(webgpu_global_context & ctx,
wgpu::MapMode mode,
size_t offset,
size_t size) {
ctx->instance.WaitAny(buffer.MapAsync(mode, offset, size, wgpu::CallbackMode::AllowSpontaneous,
[](wgpu::MapAsyncStatus status, wgpu::StringView message) {
if (status != wgpu::MapAsyncStatus::Success) {
GGML_LOG_ERROR("ggml_webgpu: Failed to map buffer: %s\n",
message.data);
}
}),
UINT64_MAX);
wgpu::MapAsyncStatus callback_status = wgpu::MapAsyncStatus::Error;
std::string callback_message;
const wgpu::WaitStatus wait_status = ctx->instance.WaitAny(
buffer.MapAsync(mode, offset, size, wgpu::CallbackMode::AllowSpontaneous,
[&callback_status, &callback_message](wgpu::MapAsyncStatus status, wgpu::StringView message) {
callback_status = status;
callback_message = std::string(message);
}),
WEBGPU_RUNTIME_WAIT_TIMEOUT_NS);
ggml_backend_webgpu_check_wait_status(wait_status, callback_status, wgpu::MapAsyncStatus::Success,
"Buffer map wait", "Buffer map", callback_message.c_str());
}
// Submits one command buffer to the device queue while respecting the
// per-device cap on in-flight batches: when the caller's counter has reached
// the cap, the queue is drained first and the counter reset, then the new
// batch is submitted and counted.
//   ctx                  - backend context holding the global device/queue state
//   commands             - finished command buffer to submit
//   num_inflight_batches - caller-owned count of batches submitted since the
//                          last queue drain (updated in place)
static void ggml_backend_webgpu_submit_commands(webgpu_context & ctx,
                                                const wgpu::CommandBuffer commands,
                                                uint32_t & num_inflight_batches) {
    auto & global = ctx->global_ctx;

    if (num_inflight_batches >= global->max_inflight_batches) {
        // Throttle: block until all outstanding work completes before
        // submitting more (required on targets like iOS browsers).
        ggml_backend_webgpu_wait_queue(global);
        num_inflight_batches = 0;
    }

    global->queue.Submit(1, &commands);
    ++num_inflight_batches;
}
#ifdef GGML_WEBGPU_DEBUG
@ -2871,9 +2944,10 @@ static ggml_status ggml_backend_webgpu_graph_compute(ggml_backend_t backend, str
#ifdef GGML_WEBGPU_GPU_PROFILE
std::vector<wgpu::FutureWaitInfo> profile_futures;
#endif
uint32_t num_batched_kernels = 0;
bool contains_set_rows = false;
wgpu::CommandEncoder batch_encoder = ctx->global_ctx->device.CreateCommandEncoder();
uint32_t num_batched_kernels = 0;
uint32_t num_inflight_batches = 0;
bool contains_set_rows = false;
wgpu::CommandEncoder batch_encoder = ctx->global_ctx->device.CreateCommandEncoder();
for (int i = 0; i < cgraph->n_nodes; i++) {
if (cgraph->nodes[i]->op == GGML_OP_SET_ROWS) {
@ -2884,10 +2958,10 @@ static ggml_status ggml_backend_webgpu_graph_compute(ggml_backend_t backend, str
num_batched_kernels += cmd.value().num_kernels;
}
if (num_batched_kernels >= WEBGPU_COMMAND_SUBMIT_BATCH_SIZE) {
if (num_batched_kernels >= ctx->global_ctx->command_submit_batch_size) {
num_batched_kernels = 0;
wgpu::CommandBuffer batch_commands = batch_encoder.Finish();
ctx->global_ctx->queue.Submit(1, &batch_commands);
ggml_backend_webgpu_submit_commands(ctx, batch_commands, num_inflight_batches);
#ifdef GGML_WEBGPU_GPU_PROFILE
ggml_backend_webgpu_collect_profile_futures(ctx->global_ctx, commands, profile_futures);
#endif
@ -2898,7 +2972,7 @@ static ggml_status ggml_backend_webgpu_graph_compute(ggml_backend_t backend, str
}
if (!commands.empty()) {
wgpu::CommandBuffer batch_commands = batch_encoder.Finish();
ctx->global_ctx->queue.Submit(1, &batch_commands);
ggml_backend_webgpu_submit_commands(ctx, batch_commands, num_inflight_batches);
#ifdef GGML_WEBGPU_GPU_PROFILE
ggml_backend_webgpu_collect_profile_futures(ctx->global_ctx, commands, profile_futures);
#endif
@ -2912,7 +2986,7 @@ static ggml_status ggml_backend_webgpu_graph_compute(ggml_backend_t backend, str
encoder.CopyBufferToBuffer(ctx->set_rows_dev_error_buf, 0, ctx->set_rows_host_error_buf, 0,
ctx->set_rows_host_error_buf.GetSize());
wgpu::CommandBuffer set_rows_commands = encoder.Finish();
ctx->global_ctx->queue.Submit(1, &set_rows_commands);
ggml_backend_webgpu_submit_commands(ctx, set_rows_commands, num_inflight_batches);
}
ggml_backend_webgpu_wait_queue(ctx->global_ctx);
@ -3363,6 +3437,8 @@ static bool create_webgpu_device(ggml_backend_webgpu_reg_context * ctx) {
}
#endif
ctx->webgpu_global_ctx->adapter.GetInfo(&info);
ctx->webgpu_global_ctx->command_submit_batch_size = ggml_backend_webgpu_get_command_submit_batch_size(info);
ctx->webgpu_global_ctx->max_inflight_batches = ggml_backend_webgpu_get_max_inflight_batches(info);
wgpu::SupportedFeatures features;
ctx->webgpu_global_ctx->adapter.GetFeatures(&features);
// we require f16 support
@ -3483,8 +3559,10 @@ static webgpu_context initialize_webgpu_context(ggml_backend_dev_t dev) {
webgpu_context webgpu_ctx = std::make_shared<webgpu_context_struct>();
webgpu_ctx->global_ctx = dev_ctx->webgpu_global_ctx;
webgpu_ctx->shader_lib = std::make_unique<ggml_webgpu_shader_lib>(dev_ctx->webgpu_global_ctx->device);
webgpu_ctx->param_arena.init(webgpu_ctx->global_ctx->device, WEBGPU_PARAMS_BUF_SIZE_BYTES, WEBGPU_NUM_PARAM_SLOTS,
webgpu_ctx->global_ctx->capabilities.limits.minUniformBufferOffsetAlignment);
webgpu_ctx->param_arena.init(
webgpu_ctx->global_ctx->device, WEBGPU_PARAMS_BUF_SIZE_BYTES,
webgpu_ctx->global_ctx->command_submit_batch_size + WEBGPU_NUM_PARAM_SLOT_SAFETY_MARGIN,
webgpu_ctx->global_ctx->capabilities.limits.minUniformBufferOffsetAlignment);
ggml_webgpu_create_buffer(webgpu_ctx->global_ctx->device, webgpu_ctx->set_rows_dev_error_buf,
WEBGPU_SET_ROWS_ERROR_BUF_SIZE_BYTES,
wgpu::BufferUsage::Storage | wgpu::BufferUsage::CopySrc, "set_rows_dev_error_buf");