ggml-webgpu: parameterize submission size and add iOS specific limits (#21533)
* Work towards removing bitcast
* Move rest of existing types over
* Add timeout back to wait and remove synchronous set_tensor/memset_tensor
* Move to unpackf16 for wider compatibility
* Cleanup
* Remove deadlock condition in free_bufs
* Start work on removing parameter buffer pools
* Simplify and optimize further
* Simplify profile futures
* Fix stride
* Try using a single command buffer per batch
* Formatting
* Add parameters for different browsers' in-flight submissions
* Update handling of batch size too
* Throttle iOS as much as possible
* Increase timeout for llvmpipe testing
This commit is contained in:
parent
de1aa6fa73
commit
957d717ce5
|
|
@ -16,7 +16,6 @@
|
|||
#include <webgpu/webgpu_cpp.h>
|
||||
|
||||
#include <atomic>
|
||||
#include <condition_variable>
|
||||
#include <cstdint>
|
||||
#include <cstring>
|
||||
#ifdef GGML_WEBGPU_GPU_PROFILE
|
||||
|
|
@ -25,7 +24,6 @@
|
|||
#if defined(GGML_WEBGPU_DEBUG) || defined(GGML_WEBGPU_CPU_PROFILE) || defined(GGML_WEBGPU_GPU_PROFILE)
|
||||
# include <iostream>
|
||||
#endif
|
||||
#include <map>
|
||||
#include <memory>
|
||||
#include <mutex>
|
||||
#include <optional>
|
||||
|
|
@ -81,13 +79,13 @@ static inline void compute_2d_workgroups(uint32_t total_wg, uint32_t max_per_dim
|
|||
|
||||
/* Constants */
|
||||
|
||||
#define WEBGPU_COMMAND_SUBMIT_BATCH_SIZE 32u
|
||||
#define WEBGPU_NUM_PARAM_SLOTS \
|
||||
(WEBGPU_COMMAND_SUBMIT_BATCH_SIZE + 10) // a few extra for safety, since some operations may need multiple slots
|
||||
#define WEBGPU_WAIT_ANY_TIMEOUT_MS 100
|
||||
#define WEBGPU_PARAMS_BUF_SIZE_BYTES 128 // enough for 32 parameters
|
||||
#define WEBGPU_SET_ROWS_ERROR_BUF_SIZE_BYTES 4
|
||||
#define WEBGPU_STORAGE_BUF_BINDING_MULT 4 // a storage buffer binding size must be a multiple of 4
|
||||
#define WEBGPU_DEFAULT_COMMAND_SUBMIT_BATCH_SIZE 32u
|
||||
#define WEBGPU_NUM_PARAM_SLOT_SAFETY_MARGIN 10u
|
||||
#define WEBGPU_RUNTIME_WAIT_TIMEOUT_MS 30000u
|
||||
#define WEBGPU_RUNTIME_WAIT_TIMEOUT_NS (WEBGPU_RUNTIME_WAIT_TIMEOUT_MS * 1e6)
|
||||
#define WEBGPU_PARAMS_BUF_SIZE_BYTES 128 // enough for 32 parameters
|
||||
#define WEBGPU_SET_ROWS_ERROR_BUF_SIZE_BYTES 4
|
||||
#define WEBGPU_STORAGE_BUF_BINDING_MULT 4 // a storage buffer binding size must be a multiple of 4
|
||||
|
||||
// For operations which process a row in parallel, this seems like a reasonable
|
||||
// default
|
||||
|
|
@ -252,6 +250,8 @@ struct webgpu_global_context_struct {
|
|||
wgpu::Adapter adapter;
|
||||
wgpu::Device device;
|
||||
wgpu::Queue queue;
|
||||
uint32_t command_submit_batch_size = WEBGPU_DEFAULT_COMMAND_SUBMIT_BATCH_SIZE;
|
||||
uint32_t max_inflight_batches = UINT32_MAX;
|
||||
|
||||
webgpu_capabilities capabilities;
|
||||
// Shared buffer to move data from device to host
|
||||
|
|
@ -417,16 +417,72 @@ static void ggml_backend_webgpu_wait_profile_futures(webgpu_global_context &
|
|||
}
|
||||
#endif
|
||||
|
||||
template <typename T>
|
||||
static void ggml_backend_webgpu_check_wait_status(wgpu::WaitStatus wait_status,
|
||||
T callback_status,
|
||||
T success_status,
|
||||
const char * wait_name,
|
||||
const char * failure_name,
|
||||
const char * callback_message) {
|
||||
if (wait_status == wgpu::WaitStatus::TimedOut) {
|
||||
GGML_ABORT("ggml_webgpu: %s timed out after %u ms\n", wait_name, WEBGPU_RUNTIME_WAIT_TIMEOUT_MS);
|
||||
}
|
||||
if (wait_status == wgpu::WaitStatus::Error) {
|
||||
GGML_ABORT("ggml_webgpu: %s failed\n", wait_name);
|
||||
}
|
||||
if (callback_status != success_status) {
|
||||
GGML_ABORT("ggml_webgpu: %s failed with status %d: %s\n", failure_name, static_cast<int>(callback_status),
|
||||
callback_message);
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef __EMSCRIPTEN__
|
||||
// iOS browsers seem to have very strict limits on the number of in-flight GPU commands, so we need to throttle to avoid failures.
|
||||
EM_JS(int, ggml_webgpu_is_ios_browser, (), {
|
||||
const ua = navigator.userAgent;
|
||||
return (ua.includes('iPhone') || ua.includes('iPad')) ? 1 : 0;
|
||||
});
|
||||
#endif
|
||||
|
||||
static uint32_t ggml_backend_webgpu_get_max_inflight_batches(const wgpu::AdapterInfo & info) {
|
||||
#ifdef __EMSCRIPTEN__
|
||||
if (ggml_webgpu_is_ios_browser()) {
|
||||
return 1;
|
||||
}
|
||||
#else
|
||||
GGML_UNUSED(info);
|
||||
#endif
|
||||
|
||||
return UINT32_MAX;
|
||||
}
|
||||
|
||||
static uint32_t ggml_backend_webgpu_get_command_submit_batch_size(const wgpu::AdapterInfo & info) {
|
||||
#ifdef __EMSCRIPTEN__
|
||||
if (ggml_webgpu_is_ios_browser()) {
|
||||
return 16;
|
||||
}
|
||||
#else
|
||||
GGML_UNUSED(info);
|
||||
#endif
|
||||
|
||||
return WEBGPU_DEFAULT_COMMAND_SUBMIT_BATCH_SIZE;
|
||||
}
|
||||
|
||||
static void ggml_backend_webgpu_wait_queue(webgpu_global_context & ctx) {
|
||||
ctx->instance.WaitAny(
|
||||
ctx->queue.OnSubmittedWorkDone(wgpu::CallbackMode::AllowSpontaneous,
|
||||
[](wgpu::QueueWorkDoneStatus status, wgpu::StringView message) {
|
||||
if (status != wgpu::QueueWorkDoneStatus::Success) {
|
||||
GGML_LOG_ERROR("ggml_webgpu: Failed to submit commands: %s\n",
|
||||
std::string(message).c_str());
|
||||
}
|
||||
}),
|
||||
UINT64_MAX);
|
||||
wgpu::QueueWorkDoneStatus callback_status = wgpu::QueueWorkDoneStatus::Error;
|
||||
std::string callback_message;
|
||||
|
||||
const wgpu::WaitStatus wait_status = ctx->instance.WaitAny(
|
||||
ctx->queue.OnSubmittedWorkDone(
|
||||
wgpu::CallbackMode::AllowSpontaneous,
|
||||
[&callback_status, &callback_message](wgpu::QueueWorkDoneStatus status, wgpu::StringView message) {
|
||||
callback_status = status;
|
||||
callback_message = std::string(message);
|
||||
}),
|
||||
WEBGPU_RUNTIME_WAIT_TIMEOUT_NS);
|
||||
|
||||
ggml_backend_webgpu_check_wait_status(wait_status, callback_status, wgpu::QueueWorkDoneStatus::Success,
|
||||
"Queue wait", "Queue work", callback_message.c_str());
|
||||
}
|
||||
|
||||
static void ggml_backend_webgpu_map_buffer(webgpu_global_context & ctx,
|
||||
|
|
@ -434,14 +490,31 @@ static void ggml_backend_webgpu_map_buffer(webgpu_global_context & ctx,
|
|||
wgpu::MapMode mode,
|
||||
size_t offset,
|
||||
size_t size) {
|
||||
ctx->instance.WaitAny(buffer.MapAsync(mode, offset, size, wgpu::CallbackMode::AllowSpontaneous,
|
||||
[](wgpu::MapAsyncStatus status, wgpu::StringView message) {
|
||||
if (status != wgpu::MapAsyncStatus::Success) {
|
||||
GGML_LOG_ERROR("ggml_webgpu: Failed to map buffer: %s\n",
|
||||
message.data);
|
||||
}
|
||||
}),
|
||||
UINT64_MAX);
|
||||
wgpu::MapAsyncStatus callback_status = wgpu::MapAsyncStatus::Error;
|
||||
std::string callback_message;
|
||||
|
||||
const wgpu::WaitStatus wait_status = ctx->instance.WaitAny(
|
||||
buffer.MapAsync(mode, offset, size, wgpu::CallbackMode::AllowSpontaneous,
|
||||
[&callback_status, &callback_message](wgpu::MapAsyncStatus status, wgpu::StringView message) {
|
||||
callback_status = status;
|
||||
callback_message = std::string(message);
|
||||
}),
|
||||
WEBGPU_RUNTIME_WAIT_TIMEOUT_NS);
|
||||
|
||||
ggml_backend_webgpu_check_wait_status(wait_status, callback_status, wgpu::MapAsyncStatus::Success,
|
||||
"Buffer map wait", "Buffer map", callback_message.c_str());
|
||||
}
|
||||
|
||||
static void ggml_backend_webgpu_submit_commands(webgpu_context & ctx,
|
||||
const wgpu::CommandBuffer commands,
|
||||
uint32_t & num_inflight_batches) {
|
||||
if (num_inflight_batches >= ctx->global_ctx->max_inflight_batches) {
|
||||
ggml_backend_webgpu_wait_queue(ctx->global_ctx);
|
||||
num_inflight_batches = 0;
|
||||
}
|
||||
|
||||
ctx->global_ctx->queue.Submit(1, &commands);
|
||||
num_inflight_batches++;
|
||||
}
|
||||
|
||||
#ifdef GGML_WEBGPU_DEBUG
|
||||
|
|
@ -2871,9 +2944,10 @@ static ggml_status ggml_backend_webgpu_graph_compute(ggml_backend_t backend, str
|
|||
#ifdef GGML_WEBGPU_GPU_PROFILE
|
||||
std::vector<wgpu::FutureWaitInfo> profile_futures;
|
||||
#endif
|
||||
uint32_t num_batched_kernels = 0;
|
||||
bool contains_set_rows = false;
|
||||
wgpu::CommandEncoder batch_encoder = ctx->global_ctx->device.CreateCommandEncoder();
|
||||
uint32_t num_batched_kernels = 0;
|
||||
uint32_t num_inflight_batches = 0;
|
||||
bool contains_set_rows = false;
|
||||
wgpu::CommandEncoder batch_encoder = ctx->global_ctx->device.CreateCommandEncoder();
|
||||
|
||||
for (int i = 0; i < cgraph->n_nodes; i++) {
|
||||
if (cgraph->nodes[i]->op == GGML_OP_SET_ROWS) {
|
||||
|
|
@ -2884,10 +2958,10 @@ static ggml_status ggml_backend_webgpu_graph_compute(ggml_backend_t backend, str
|
|||
num_batched_kernels += cmd.value().num_kernels;
|
||||
}
|
||||
|
||||
if (num_batched_kernels >= WEBGPU_COMMAND_SUBMIT_BATCH_SIZE) {
|
||||
if (num_batched_kernels >= ctx->global_ctx->command_submit_batch_size) {
|
||||
num_batched_kernels = 0;
|
||||
wgpu::CommandBuffer batch_commands = batch_encoder.Finish();
|
||||
ctx->global_ctx->queue.Submit(1, &batch_commands);
|
||||
ggml_backend_webgpu_submit_commands(ctx, batch_commands, num_inflight_batches);
|
||||
#ifdef GGML_WEBGPU_GPU_PROFILE
|
||||
ggml_backend_webgpu_collect_profile_futures(ctx->global_ctx, commands, profile_futures);
|
||||
#endif
|
||||
|
|
@ -2898,7 +2972,7 @@ static ggml_status ggml_backend_webgpu_graph_compute(ggml_backend_t backend, str
|
|||
}
|
||||
if (!commands.empty()) {
|
||||
wgpu::CommandBuffer batch_commands = batch_encoder.Finish();
|
||||
ctx->global_ctx->queue.Submit(1, &batch_commands);
|
||||
ggml_backend_webgpu_submit_commands(ctx, batch_commands, num_inflight_batches);
|
||||
#ifdef GGML_WEBGPU_GPU_PROFILE
|
||||
ggml_backend_webgpu_collect_profile_futures(ctx->global_ctx, commands, profile_futures);
|
||||
#endif
|
||||
|
|
@ -2912,7 +2986,7 @@ static ggml_status ggml_backend_webgpu_graph_compute(ggml_backend_t backend, str
|
|||
encoder.CopyBufferToBuffer(ctx->set_rows_dev_error_buf, 0, ctx->set_rows_host_error_buf, 0,
|
||||
ctx->set_rows_host_error_buf.GetSize());
|
||||
wgpu::CommandBuffer set_rows_commands = encoder.Finish();
|
||||
ctx->global_ctx->queue.Submit(1, &set_rows_commands);
|
||||
ggml_backend_webgpu_submit_commands(ctx, set_rows_commands, num_inflight_batches);
|
||||
}
|
||||
|
||||
ggml_backend_webgpu_wait_queue(ctx->global_ctx);
|
||||
|
|
@ -3363,6 +3437,8 @@ static bool create_webgpu_device(ggml_backend_webgpu_reg_context * ctx) {
|
|||
}
|
||||
#endif
|
||||
ctx->webgpu_global_ctx->adapter.GetInfo(&info);
|
||||
ctx->webgpu_global_ctx->command_submit_batch_size = ggml_backend_webgpu_get_command_submit_batch_size(info);
|
||||
ctx->webgpu_global_ctx->max_inflight_batches = ggml_backend_webgpu_get_max_inflight_batches(info);
|
||||
wgpu::SupportedFeatures features;
|
||||
ctx->webgpu_global_ctx->adapter.GetFeatures(&features);
|
||||
// we require f16 support
|
||||
|
|
@ -3483,8 +3559,10 @@ static webgpu_context initialize_webgpu_context(ggml_backend_dev_t dev) {
|
|||
webgpu_context webgpu_ctx = std::make_shared<webgpu_context_struct>();
|
||||
webgpu_ctx->global_ctx = dev_ctx->webgpu_global_ctx;
|
||||
webgpu_ctx->shader_lib = std::make_unique<ggml_webgpu_shader_lib>(dev_ctx->webgpu_global_ctx->device);
|
||||
webgpu_ctx->param_arena.init(webgpu_ctx->global_ctx->device, WEBGPU_PARAMS_BUF_SIZE_BYTES, WEBGPU_NUM_PARAM_SLOTS,
|
||||
webgpu_ctx->global_ctx->capabilities.limits.minUniformBufferOffsetAlignment);
|
||||
webgpu_ctx->param_arena.init(
|
||||
webgpu_ctx->global_ctx->device, WEBGPU_PARAMS_BUF_SIZE_BYTES,
|
||||
webgpu_ctx->global_ctx->command_submit_batch_size + WEBGPU_NUM_PARAM_SLOT_SAFETY_MARGIN,
|
||||
webgpu_ctx->global_ctx->capabilities.limits.minUniformBufferOffsetAlignment);
|
||||
ggml_webgpu_create_buffer(webgpu_ctx->global_ctx->device, webgpu_ctx->set_rows_dev_error_buf,
|
||||
WEBGPU_SET_ROWS_ERROR_BUF_SIZE_BYTES,
|
||||
wgpu::BufferUsage::Storage | wgpu::BufferUsage::CopySrc, "set_rows_dev_error_buf");
|
||||
|
|
|
|||
Loading…
Reference in New Issue