diff --git a/ggml/src/ggml-webgpu/ggml-webgpu.cpp b/ggml/src/ggml-webgpu/ggml-webgpu.cpp index 666bfbe183..61f0a19f70 100644 --- a/ggml/src/ggml-webgpu/ggml-webgpu.cpp +++ b/ggml/src/ggml-webgpu/ggml-webgpu.cpp @@ -13,7 +13,9 @@ #include #include +#include #include +#include #include #ifdef GGML_WEBGPU_DEBUG @@ -61,7 +63,8 @@ struct webgpu_param_bufs { struct webgpu_param_buf_pool { std::vector free; - std::mutex mutex; + std::mutex mutex; + std::condition_variable cv; void init(wgpu::Device device) { @@ -108,19 +111,18 @@ struct webgpu_param_buf_pool { // All the base objects needed to run operations on a WebGPU device struct webgpu_context_struct { - wgpu::Instance instance; - wgpu::Adapter adapter; - wgpu::Device device; - wgpu::Queue queue; - wgpu::Limits limits; - wgpu::SupportedFeatures features; + wgpu::Instance instance; + wgpu::Adapter adapter; + wgpu::Device device; + wgpu::Queue queue; + wgpu::Limits limits; - std::recursive_mutex submit_mutex; + std::recursive_mutex mutex; std::mutex get_tensor_mutex; std::mutex init_mutex; - bool device_init = false; - // Parameter buffer pool + bool device_init = false; + webgpu_param_buf_pool param_buf_pool; wgpu::ComputePipeline memset_pipeline; @@ -134,36 +136,33 @@ struct webgpu_context_struct { // Command buffers which need to be submitted std::vector staged_command_bufs; + // Parameter buffers associated with the staged command buffers - std::vector staged_param_bufs; + std::vector staged_param_bufs; }; typedef std::shared_ptr webgpu_context; struct ggml_backend_webgpu_reg_context { webgpu_context webgpu_ctx; - - size_t device_count; - const char * name; + size_t device_count; + const char * name; }; struct ggml_backend_webgpu_device_context { webgpu_context webgpu_ctx; - - std::string device_name; - std::string device_desc; + std::string device_name; + std::string device_desc; }; struct ggml_backend_webgpu_context { webgpu_context webgpu_ctx; - - std::string name; + std::string name; }; struct ggml_backend_webgpu_buffer_context { webgpu_context webgpu_ctx; - - wgpu::Buffer buffer; + wgpu::Buffer buffer; ggml_backend_webgpu_buffer_context(webgpu_context ctx, wgpu::Buffer buf) : webgpu_ctx(std::move(ctx)), @@ -180,10 +179,13 @@ static void ggml_webgpu_create_pipeline(wgpu::Device & const char * label, const std::vector & constants = {}) { WEBGPU_LOG_DEBUG("ggml_webgpu_create_pipeline()"); + wgpu::ShaderSourceWGSL shader_source; shader_source.code = shader_code; + wgpu::ShaderModuleDescriptor shader_desc; - shader_desc.nextInChain = &shader_source; + shader_desc.nextInChain = &shader_source; + wgpu::ShaderModule shader_module = device.CreateShaderModule(&shader_desc); wgpu::ComputePipelineDescriptor pipeline_desc; @@ -210,8 +212,9 @@ static void ggml_webgpu_create_buffer(wgpu::Device & device, buffer_desc.usage = usage; buffer_desc.label = label; buffer_desc.mappedAtCreation = false; + // TODO: error handling - buffer = device.CreateBuffer(&buffer_desc); + buffer = device.CreateBuffer(&buffer_desc); } /** End WebGPU object initializations */ @@ -231,8 +234,7 @@ static void ggml_backend_webgpu_wait_on_submission(webgpu_context & ctx) { } static void ggml_backend_webgpu_submit_queue(webgpu_context & ctx) { - std::lock_guard lock(ctx->submit_mutex); - + std::lock_guard lock(ctx->mutex); ctx->queue.Submit(ctx->staged_command_bufs.size(), ctx->staged_command_bufs.data()); ctx->staged_command_bufs.clear(); std::vector staged_param_bufs = std::move(ctx->staged_param_bufs); @@ -274,6 +276,8 @@ static void ggml_backend_webgpu_build_and_enqueue(webgpu_context & bool submit_imm = false) { webgpu_param_bufs params_bufs = ctx->param_buf_pool.alloc_bufs(); + std::lock_guard lock(ctx->mutex); + ggml_backend_webgpu_map_buffer(ctx, params_bufs.host_buf, wgpu::MapMode::Write, 0, params_bufs.host_buf.GetSize()); uint32_t * _params = (uint32_t *) params_bufs.host_buf.GetMappedRange(); for (size_t i = 0; i < params.size(); i++) { @@ -315,7 +319,6 @@ static void ggml_backend_webgpu_build_and_enqueue(webgpu_context & }); } else { // Enqueue commands and only submit if we have enough staged commands - std::lock_guard lock(ctx->submit_mutex); ctx->staged_command_bufs.push_back(commands); ctx->staged_param_bufs.push_back(params_bufs); if (ctx->staged_command_bufs.size() == WEBGPU_COMMAND_SUBMIT_BATCH_SIZE) { @@ -540,10 +543,12 @@ static void ggml_backend_webgpu_buffer_memset_tensor(ggml_backend_buffer_t buffe WEBGPU_LOG_DEBUG("ggml_backend_webgpu_buffer_memset_tensor(" << buffer << ", " << tensor << ", " << value << ", " << offset << ", " << size << ")"); - ggml_backend_webgpu_buffer_context * buf_ctx = (ggml_backend_webgpu_buffer_context *) buffer->context; - size_t total_offset = webgpu_tensor_offset(tensor) + tensor->view_offs + offset; + ggml_backend_webgpu_buffer_context * buf_ctx = (ggml_backend_webgpu_buffer_context *) buffer->context; + + size_t total_offset = webgpu_tensor_offset(tensor) + tensor->view_offs + offset; + // This is a trick to set all bytes of a u32 to the same 1 byte value. - uint32_t val32 = (uint32_t) value * 0x01010101; + uint32_t val32 = (uint32_t) value * 0x01010101; ggml_backend_webgpu_buffer_memset(buf_ctx->webgpu_ctx, buf_ctx->buffer, val32, total_offset, size); } @@ -559,13 +564,16 @@ static void ggml_backend_webgpu_buffer_set_tensor(ggml_backend_buffer_t buffer, size_t total_offset = webgpu_tensor_offset(tensor) + tensor->view_offs + offset; + std::lock_guard lock(webgpu_ctx->mutex); webgpu_ctx->queue.WriteBuffer(buf_ctx->buffer, total_offset, data, (size / 4) * 4); if (size % 4 != 0) { // If size is not a multiple of 4, we need to memset the remaining bytes - size_t remaining_size = size % 4; + size_t remaining_size = size % 4; + // pack the remaining bytes into a uint32_t - uint32_t val32 = 0; + uint32_t val32 = 0; + for (size_t i = 0; i < remaining_size; i++) { ((uint8_t *) &val32)[i] = ((const uint8_t *) data)[size - remaining_size + i]; } @@ -613,8 +621,12 @@ static void ggml_backend_webgpu_buffer_get_tensor(ggml_backend_buffer_t buffer, wgpu::CommandEncoder encoder = device.CreateCommandEncoder(); encoder.CopyBufferToBuffer(buf_ctx->buffer, total_offset, webgpu_ctx->get_tensor_staging_buf, 0, final_size); wgpu::CommandBuffer commands = encoder.Finish(); - // Submit the command buffer to the queue - webgpu_ctx->queue.Submit(1, &commands); + + { + std::lock_guard submit_lock(webgpu_ctx->mutex); + // Submit the command buffer to the queue + webgpu_ctx->queue.Submit(1, &commands); + } // Map the staging buffer to read the data ggml_backend_webgpu_map_buffer(webgpu_ctx, webgpu_ctx->get_tensor_staging_buf, wgpu::MapMode::Read, 0, final_size); @@ -628,7 +640,6 @@ static void ggml_backend_webgpu_buffer_get_tensor(ggml_backend_buffer_t buffer, static void ggml_backend_webgpu_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { WEBGPU_LOG_DEBUG("ggml_backend_webgpu_buffer_clear(" << buffer << ", " << (uint32_t) value << ")"); - ggml_backend_webgpu_buffer_context * buf_ctx = (ggml_backend_webgpu_buffer_context *) buffer->context; ggml_backend_webgpu_buffer_memset(buf_ctx->webgpu_ctx, buf_ctx->buffer, value, 0, buffer->size); } @@ -764,10 +775,11 @@ static ggml_backend_t ggml_backend_webgpu_device_init(ggml_backend_dev_t dev, co std::lock_guard lock(webgpu_ctx->init_mutex); if (!webgpu_ctx->device_init) { // Initialize device - wgpu::DeviceDescriptor dev_desc; + std::vector required_features = { wgpu::FeatureName::ShaderF16 }; + wgpu::DeviceDescriptor dev_desc; dev_desc.requiredLimits = &webgpu_ctx->limits; - dev_desc.requiredFeatures = webgpu_ctx->features.features; - dev_desc.requiredFeatureCount = webgpu_ctx->features.featureCount; + dev_desc.requiredFeatures = required_features.data(); + dev_desc.requiredFeatureCount = required_features.size(); dev_desc.SetDeviceLostCallback( wgpu::CallbackMode::AllowSpontaneous, [](const wgpu::Device & device, wgpu::DeviceLostReason reason, wgpu::StringView message) { @@ -920,7 +932,6 @@ static ggml_backend_dev_t ggml_backend_webgpu_reg_get_device(ggml_backend_reg_t GGML_ASSERT(ctx->adapter != nullptr); ctx->adapter.GetLimits(&ctx->limits); - ctx->adapter.GetFeatures(&ctx->features); wgpu::AdapterInfo info{}; ctx->adapter.GetInfo(&info);