diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
index 61d112c50a..8807c3e2b6 100644
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -27,6 +27,7 @@ DispatchLoaderDynamic & ggml_vk_default_dispatcher();
 #include
 #include
 #include
+#include <deque>
 #include
 #include
 #include
@@ -188,6 +189,11 @@ struct ggml_backend_vk_buffer_type_context {
 
 struct vk_queue;
 
+struct vk_command_buffer {
+    vk::CommandBuffer buf;
+    bool in_use = false;
+};
+
 // Stores command pool/buffers. There's an instance of this
 // for each (context,queue) pair and for each (device,queue) pair.
 struct vk_command_pool {
@@ -195,10 +201,16 @@ struct vk_command_pool {
     void destroy(vk::Device& device);
 
     vk::CommandPool pool;
-    uint32_t cmd_buffer_idx;
-    std::vector<vk::CommandBuffer> cmd_buffers;
+    // Use a deque so that pointers to command buffers
+    // remain valid even when more are added
+    std::deque<vk_command_buffer> cmd_buffers;
 
     vk_queue *q;
+
+    size_t buffers_in_use() const {
+        return std::count_if(cmd_buffers.begin(), cmd_buffers.end(),
+                             [](const auto& cb) { return cb.in_use; });
+    }
 };
 
 // Prevent simultaneous submissions to the same queue.
@@ -878,10 +890,12 @@ struct vk_device_struct {
 };
 
 void vk_command_pool::init(vk_device& device, vk_queue *q_) {
-    cmd_buffer_idx = 0;
+    cmd_buffers.clear();
     q = q_;
 
-    vk::CommandPoolCreateInfo command_pool_create_info(vk::CommandPoolCreateFlags(VK_COMMAND_POOL_CREATE_TRANSIENT_BIT), q->queue_family_index);
+    vk::CommandPoolCreateInfo command_pool_create_info(
+        vk::CommandPoolCreateFlags(VK_COMMAND_POOL_CREATE_TRANSIENT_BIT | VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT),
+        q->queue_family_index);
     pool = device->device.createCommandPool(command_pool_create_info);
 }
 
@@ -929,6 +943,7 @@ struct vk_subbuffer {
 struct vk_event {
     vk::Event event;
     vk::Fence fence;
+    vk_command_buffer* cmd_buffer = nullptr;
 };
 
 struct vk_semaphore {
@@ -937,7 +952,7 @@
 };
 
 struct vk_submission {
-    vk::CommandBuffer buffer;
+    vk_command_buffer* buffer = nullptr;
     std::vector<vk_semaphore> wait_semaphores;
     std::vector<vk_semaphore> signal_semaphores;
 };
@@ -2283,25 +2298,15 @@ static void ggml_pipeline_allocate_descriptor_sets(ggml_backend_vk_context * ctx
     }
 }
 
-static vk::CommandBuffer ggml_vk_create_cmd_buffer(vk_device& device, vk_command_pool& p) {
+static vk_command_buffer* ggml_vk_create_cmd_buffer(vk_device& device, vk_command_pool& p) {
    VK_LOG_DEBUG("ggml_vk_create_cmd_buffer()");
-
-    if (p.cmd_buffers.size() > p.cmd_buffer_idx) {
-        // Reuse command buffer
-        return p.cmd_buffers[p.cmd_buffer_idx++];
-    }
-
     vk::CommandBufferAllocateInfo command_buffer_alloc_info(
         p.pool,
         vk::CommandBufferLevel::ePrimary,
         1);
     const std::vector<vk::CommandBuffer> cmd_buffers = device->device.allocateCommandBuffers(command_buffer_alloc_info);
-    auto buf = cmd_buffers.front();
-
-    p.cmd_buffers.push_back(buf);
-    p.cmd_buffer_idx++;
-
-    return buf;
+    p.cmd_buffers.push_back({ cmd_buffers.front(), true });
+    return &p.cmd_buffers[p.cmd_buffers.size()-1];
 }
 
 static void ggml_vk_submit(vk_context& ctx, vk::Fence fence) {
@@ -2368,7 +2373,7 @@
             tl_wait_semaphores[idx].data(),
             stage_flags[idx].data(),
             1,
-            &submission.buffer,
+            &submission.buffer->buf,
             (uint32_t) submission.signal_semaphores.size(),
             tl_signal_semaphores[idx].data(),
         };
@@ -2492,7 +2497,11 @@ static void ggml_vk_command_pool_cleanup(vk_device& device, vk_command_pool& p)
     // Requires command buffers to be done
 
     device->device.resetCommandPool(p.pool);
-    p.cmd_buffer_idx = 0;
+    // Don't clear the command buffers; just mark them as not in use.
+    // This allows us to reuse them later
+    for (auto& cmd_buffer : p.cmd_buffers) {
+        cmd_buffer.in_use = false;
+    }
 }
 
 static void ggml_vk_queue_command_pools_cleanup(vk_device& device) {
@@ -2501,10 +2510,10 @@
     // Arbitrary frequency to cleanup/reuse command buffers
     static constexpr uint32_t cleanup_frequency = 10;
 
-    if (device->compute_queue.cmd_pool.cmd_buffer_idx >= cleanup_frequency) {
+    if (device->compute_queue.cmd_pool.buffers_in_use() >= cleanup_frequency) {
         ggml_vk_command_pool_cleanup(device, device->compute_queue.cmd_pool);
     }
-    if (device->transfer_queue.cmd_pool.cmd_buffer_idx >= cleanup_frequency) {
+    if (device->transfer_queue.cmd_pool.buffers_in_use() >= cleanup_frequency) {
         ggml_vk_command_pool_cleanup(device, device->transfer_queue.cmd_pool);
     }
 }
@@ -2752,7 +2761,7 @@ static void ggml_vk_sync_buffers(ggml_backend_vk_context* ctx, vk_context& subct
         ctx->prealloc_x_need_sync = ctx->prealloc_y_need_sync = ctx->prealloc_split_k_need_sync = false;
     }
 
-    subctx->s->buffer.pipelineBarrier(
+    subctx->s->buffer->buf.pipelineBarrier(
         subctx->p->q->stage_flags,
         subctx->p->q->stage_flags,
         {},
@@ -2768,7 +2777,7 @@ static void ggml_vk_sync_buffers(ggml_backend_vk_context* ctx, vk_context& subct
 
 static void ggml_vk_set_event(vk_context& ctx, vk::Event& event) {
     VK_LOG_DEBUG("ggml_vk_set_event()");
-    ctx->s->buffer.setEvent(
+    ctx->s->buffer->buf.setEvent(
         event,
         ctx->p->q->stage_flags
     );
@@ -2780,7 +2789,7 @@ static void ggml_vk_wait_events(vk_context& ctx, std::vector<vk::Event>&& events
         return;
     }
 
-    ctx->s->buffer.waitEvents(
+    ctx->s->buffer->buf.waitEvents(
         events,
         ctx->p->q->stage_flags,
         ctx->p->q->stage_flags,
@@ -6348,13 +6357,24 @@ static vk_subbuffer ggml_vk_tensor_subbuffer(
     return vk_subbuffer{buffer, offset, size};
 }
 
+// Get a command buffer from the pool; create a new one if no reusable buffer is available
+static vk_command_buffer* ggml_vk_get_or_create_cmd_buffer(vk_device& device, vk_command_pool& pool) {
+    for (auto& cmd_buffer : pool.cmd_buffers) {
+        if (!cmd_buffer.in_use) {
+            cmd_buffer.in_use = true;
+            return &cmd_buffer;
+        }
+    }
+    return ggml_vk_create_cmd_buffer(device, pool);
+}
+
 static vk_submission ggml_vk_begin_submission(vk_device& device, vk_command_pool& p, bool one_time = true) {
     vk_submission s;
-    s.buffer = ggml_vk_create_cmd_buffer(device, p);
+    s.buffer = ggml_vk_get_or_create_cmd_buffer(device, p);
     if (one_time) {
-        s.buffer.begin({ vk::CommandBufferUsageFlagBits::eOneTimeSubmit });
+        s.buffer->buf.begin({ vk::CommandBufferUsageFlagBits::eOneTimeSubmit });
    } else {
-        s.buffer.begin({ vk::CommandBufferUsageFlags{} });
+        s.buffer->buf.begin({ vk::CommandBufferUsageFlags{} });
    }
 
     return s;
@@ -6407,18 +6427,18 @@ static void ggml_vk_dispatch_pipeline(ggml_backend_vk_context* ctx, vk_context&
     vk::WriteDescriptorSet write_descriptor_set{ descriptor_set, 0, 0, pipeline->parameter_count, vk::DescriptorType::eStorageBuffer, nullptr, descriptor_buffer_infos.begin() };
     ctx->device->device.updateDescriptorSets({ write_descriptor_set }, {});
 
-    subctx->s->buffer.pushConstants(pipeline->layout, vk::ShaderStageFlagBits::eCompute, 0, push_constant_size(push_constants), push_constant_data(push_constants));
-    subctx->s->buffer.bindPipeline(vk::PipelineBindPoint::eCompute, pipeline->pipeline);
-    subctx->s->buffer.bindDescriptorSets(vk::PipelineBindPoint::eCompute,
+    subctx->s->buffer->buf.pushConstants(pipeline->layout, vk::ShaderStageFlagBits::eCompute, 0, push_constant_size(push_constants), push_constant_data(push_constants));
+    subctx->s->buffer->buf.bindPipeline(vk::PipelineBindPoint::eCompute, pipeline->pipeline);
+    subctx->s->buffer->buf.bindDescriptorSets(vk::PipelineBindPoint::eCompute,
         pipeline->layout,
         0,
         { descriptor_set },
         {});
-    subctx->s->buffer.dispatch(wg0, wg1, wg2);
+    subctx->s->buffer->buf.dispatch(wg0, wg1, wg2);
 }
 
 static void ggml_vk_end_submission(vk_submission& s, std::vector<vk_semaphore> wait_semaphores, std::vector<vk_semaphore> signal_semaphores) {
-    s.buffer.end();
+    s.buffer->buf.end();
 
     s.wait_semaphores = std::move(wait_semaphores);
     s.signal_semaphores = std::move(signal_semaphores);
@@ -6430,7 +6450,7 @@ static void ggml_vk_ctx_end(vk_context& ctx) {
         return;
     }
 
-    ctx->s->buffer.end();
+    ctx->s->buffer->buf.end();
 
     ctx->s = nullptr;
 }
@@ -6584,7 +6604,7 @@ static void ggml_vk_buffer_write_nc_async(ggml_backend_vk_context * ctx, vk_cont
         }
 
         ggml_vk_sync_buffers(ctx, subctx);
-        subctx->s->buffer.copyBuffer(buf->buffer, dst->buffer, slices);
+        subctx->s->buffer->buf.copyBuffer(buf->buffer, dst->buffer, slices);
         return;
     }
 
@@ -6599,7 +6619,7 @@ static void ggml_vk_buffer_write_nc_async(ggml_backend_vk_context * ctx, vk_cont
     VkBufferCopy buf_copy{ 0, offset, copy_size };
 
     ggml_vk_sync_buffers(ctx, subctx);
-    vkCmdCopyBuffer(subctx->s->buffer, (VkBuffer)staging->buffer, (VkBuffer)dst->buffer, 1, &buf_copy);
+    vkCmdCopyBuffer(subctx->s->buffer->buf, (VkBuffer)staging->buffer, (VkBuffer)dst->buffer, 1, &buf_copy);
 
     for (uint64_t i3 = 0; i3 < ne3; i3++) {
         for (uint64_t i2 = 0; i2 < ne2; i2++) {
@@ -6648,7 +6668,7 @@ static bool ggml_vk_buffer_write_2d_async(vk_context subctx, vk_buffer& dst, siz
         }
 
         ggml_vk_sync_buffers(nullptr, subctx);
-        subctx->s->buffer.copyBuffer(buf->buffer, dst->buffer, slices);
+        subctx->s->buffer->buf.copyBuffer(buf->buffer, dst->buffer, slices);
         return true;
     }
     VK_LOG_DEBUG("STAGING");
@@ -6670,7 +6690,7 @@ static bool ggml_vk_buffer_write_2d_async(vk_context subctx, vk_buffer& dst, siz
         copy_size};
 
     ggml_vk_sync_buffers(nullptr, subctx);
-    vkCmdCopyBuffer(subctx->s->buffer, (VkBuffer)staging_buffer->buffer, (VkBuffer)dst->buffer, 1, &buf_copy);
+    vkCmdCopyBuffer(subctx->s->buffer->buf, (VkBuffer)staging_buffer->buffer, (VkBuffer)dst->buffer, 1, &buf_copy);
 
     if (width == spitch) {
         deferred_memcpy((uint8_t *)staging_buffer->ptr, src, width * height, &subctx->in_memcpys);
@@ -6756,7 +6776,7 @@ static bool ggml_vk_buffer_read_2d_async(vk_context subctx, vk_buffer& src, size
     if (buf != nullptr) {
         // Memory is pinned, use as staging buffer
         ggml_vk_sync_buffers(nullptr, subctx);
-        subctx->s->buffer.copyBuffer(src->buffer, buf->buffer, slices);
+        subctx->s->buffer->buf.copyBuffer(src->buffer, buf->buffer, slices);
 
         return true;
     }
@@ -6774,7 +6794,7 @@ static bool ggml_vk_buffer_read_2d_async(vk_context subctx, vk_buffer& src, size
     vk_buffer& staging_buffer = src->device->sync_staging;
 
     ggml_vk_sync_buffers(nullptr, subctx);
-    subctx->s->buffer.copyBuffer(src->buffer, staging_buffer->buffer, slices);
+    subctx->s->buffer->buf.copyBuffer(src->buffer, staging_buffer->buffer, slices);
 
     deferred_memcpy(dst, staging_buffer->ptr, copy_size, &subctx->out_memcpys);
     return true;
@@ -6821,7 +6841,7 @@ static void ggml_vk_buffer_copy_async(vk_context& ctx, vk_buffer& dst, size_t ds
 
     VkBufferCopy bc{ src_offset, dst_offset, size };
 
-    vkCmdCopyBuffer(ctx->s->buffer, (VkBuffer)src->buffer, (VkBuffer)dst->buffer, 1, &bc);
+    vkCmdCopyBuffer(ctx->s->buffer->buf, (VkBuffer)src->buffer, (VkBuffer)dst->buffer, 1, &bc);
 }
 
 static void ggml_vk_buffer_copy(vk_buffer& dst, size_t dst_offset, vk_buffer& src, size_t src_offset, size_t size) {
@@ -6859,7 +6879,7 @@ static void ggml_vk_buffer_memset_async(vk_context& ctx, vk_buffer& dst, size_t
     }
 
     // Fall back to GPU fillBuffer for non-UMA or non-host-visible buffers
-    ctx->s->buffer.fillBuffer(dst->buffer, offset, size, c);
+    ctx->s->buffer->buf.fillBuffer(dst->buffer, offset, size, c);
 }
 
 static void ggml_vk_buffer_memset(vk_buffer& dst, size_t offset, uint32_t c, size_t size) {
@@ -6874,7 +6894,7 @@ static void ggml_vk_buffer_memset(vk_buffer& dst, size_t offset, uint32_t c, siz
     std::lock_guard guard(dst->device->mutex);
     vk_context subctx = ggml_vk_create_temporary_context(dst->device->transfer_queue.cmd_pool);
     ggml_vk_ctx_begin(dst->device, subctx);
-    subctx->s->buffer.fillBuffer(dst->buffer, offset, size, c);
+    subctx->s->buffer->buf.fillBuffer(dst->buffer, offset, size, c);
     ggml_vk_ctx_end(subctx);
 
     ggml_vk_submit(subctx, dst->device->fence);
@@ -12682,7 +12702,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr
 
         if (vk_perf_logger_enabled && vk_perf_logger_concurrent) {
             ctx->query_node_idx[ctx->query_idx] = node_idx;
-            compute_ctx->s->buffer.writeTimestamp(vk::PipelineStageFlagBits::eAllCommands, ctx->query_pool, ctx->query_idx++);
+            compute_ctx->s->buffer->buf.writeTimestamp(vk::PipelineStageFlagBits::eAllCommands, ctx->query_pool, ctx->query_idx++);
         }
     }
 
     // Add all fused nodes to the unsynchronized lists.
@@ -13521,7 +13541,7 @@ static void ggml_backend_vk_set_tensor_async(ggml_backend_t backend, ggml_tensor
         buffer_cpy.dstOffset = dst_offset;
         buffer_cpy.size = size;
 
-        cpy_ctx->s->buffer.copyBuffer(ctx->sync_staging->buffer, buf->buffer, { buffer_cpy });
+        cpy_ctx->s->buffer->buf.copyBuffer(ctx->sync_staging->buffer, buf->buffer, { buffer_cpy });
         deferred_memcpy(ctx->sync_staging->ptr, data, size, &cpy_ctx->in_memcpys);
         ggml_vk_synchronize(ctx);
     }
@@ -13555,7 +13575,7 @@ static void ggml_backend_vk_get_tensor_async(ggml_backend_t backend, const ggml_
         buffer_cpy.dstOffset = 0;
         buffer_cpy.size = size;
 
-        compute_ctx->s->buffer.copyBuffer(buf->buffer, ctx->sync_staging->buffer, { buffer_cpy });
+        compute_ctx->s->buffer->buf.copyBuffer(buf->buffer, ctx->sync_staging->buffer, { buffer_cpy });
         deferred_memcpy(data, ctx->sync_staging->ptr, size, &compute_ctx->out_memcpys);
         ggml_vk_synchronize(ctx);
     }
@@ -13633,8 +13653,12 @@ static void ggml_vk_synchronize(ggml_backend_vk_context * ctx) {
     }
 
     vk_context compute_ctx;
+    vk_command_buffer* cmd_buf = nullptr;
    if (do_transfer) {
         compute_ctx = ctx->compute_ctx.lock();
+        if (compute_ctx->s) {
+            cmd_buf = compute_ctx->s->buffer;
+        }
 
         ggml_vk_ctx_end(compute_ctx);
 
@@ -13668,6 +13692,9 @@ static void ggml_vk_synchronize(ggml_backend_vk_context * ctx) {
         }
         ggml_vk_wait_for_fence(ctx);
         ctx->submit_pending = false;
+        if (cmd_buf) {
+            cmd_buf->in_use = false;
+        }
     }
 
     if (do_transfer) {
@@ -14157,7 +14184,7 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
         GGML_ASSERT(ctx->compute_ctx.expired());
         compute_ctx = ggml_vk_get_compute_ctx(ctx);
         ctx->query_idx = 0;
-        compute_ctx->s->buffer.writeTimestamp(vk::PipelineStageFlagBits::eAllCommands, ctx->query_pool, ctx->query_idx++);
+        compute_ctx->s->buffer->buf.writeTimestamp(vk::PipelineStageFlagBits::eAllCommands, ctx->query_pool, ctx->query_idx++);
     }
 
     ctx->prealloc_y_last_pipeline_used = nullptr;
@@ -14393,7 +14420,7 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
                 // track a single node/fusion for the current query
                 ctx->query_nodes[ctx->query_idx] = cgraph->nodes[i];
                 ctx->query_fusion_names[ctx->query_idx] = fusion_string;
-                compute_ctx->s->buffer.writeTimestamp(vk::PipelineStageFlagBits::eAllCommands, ctx->query_pool, ctx->query_idx++);
+                compute_ctx->s->buffer->buf.writeTimestamp(vk::PipelineStageFlagBits::eAllCommands, ctx->query_pool, ctx->query_idx++);
             } else {
                 // track a fusion string and number of fused ops for the current node_idx
                 ctx->query_fusion_names[i] = fusion_string;
@@ -14726,6 +14753,7 @@ static void ggml_backend_vk_event_record(ggml_backend_t backend, ggml_backend_ev
     ggml_vk_submit_transfer_ctx(ctx);
 
     vk_context compute_ctx = ggml_vk_get_compute_ctx(ctx);
+    auto* cmd_buf = compute_ctx->s->buffer; // retrieve the pointer before the context is reset
 
     // the backend interface doesn't have an explicit reset, so reset it here
     // before we record the command to set it
@@ -14738,6 +14766,7 @@ static void ggml_backend_vk_event_record(ggml_backend_t backend, ggml_backend_ev
 
     ggml_vk_submit(compute_ctx, {vkev->fence});
     ctx->submit_pending = true;
+    vkev->cmd_buffer = cmd_buf;
 
     ctx->compute_ctx.reset();
 }
@@ -15557,6 +15586,10 @@ static void ggml_backend_vk_device_event_synchronize(ggml_backend_dev_t dev, ggm
     vk_event *vkev = (vk_event *)event->context;
 
     VK_CHECK(device->device.waitForFences({ vkev->fence }, true, UINT64_MAX), "event_synchronize");
+    // Finished with the current command buffer, so flag it for reuse
+    if (vkev->cmd_buffer) {
+        vkev->cmd_buffer->in_use = false;
+    }
 }
 
 static vk_buffer ggml_vk_buffer_from_host_ptr(vk_device & device, void * ptr, size_t size) {
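Note for reviewers: the core of this change is a small per-pool free-list of command buffers. Below is a minimal, standalone sketch of that reuse pattern with Vulkan stripped out so it compiles on its own: buffers live in a `std::deque` so pointers to them stay valid as the pool grows, each buffer carries an `in_use` flag, acquisition prefers a free buffer and only allocates when none is available, and completion merely clears the flag. The `Handle`, `acquire`, and `release` names are illustrative, not part of ggml.

```cpp
#include <cstdio>
#include <deque>

// Stand-in for vk::CommandBuffer; purely illustrative.
struct Handle { int id; };

struct CommandBuffer {
    Handle buf;
    bool   in_use = false;
};

struct CommandPool {
    // A deque, unlike a vector, never relocates existing elements on
    // push_back, so pointers handed out by acquire() stay valid.
    std::deque<CommandBuffer> cmd_buffers;
    int next_id = 0;

    // Mirrors ggml_vk_get_or_create_cmd_buffer: prefer a free buffer,
    // otherwise allocate a new one; either way mark it in use.
    CommandBuffer * acquire() {
        for (auto & cb : cmd_buffers) {
            if (!cb.in_use) {
                cb.in_use = true;
                return &cb;
            }
        }
        cmd_buffers.push_back({ Handle{ next_id++ }, true });
        return &cmd_buffers.back();
    }

    // Mirrors the fence-wait paths: once the GPU is done with a buffer,
    // it is only flagged for reuse, never freed.
    static void release(CommandBuffer * cb) { cb->in_use = false; }
};

int main() {
    CommandPool pool;
    CommandBuffer * a = pool.acquire();  // allocates id 0
    CommandBuffer * b = pool.acquire();  // allocates id 1
    CommandPool::release(a);             // fence signaled -> a is reusable
    CommandBuffer * c = pool.acquire();  // reuses id 0, no new allocation
    std::printf("a=%d b=%d c=%d pool_size=%zu\n",
                a->buf.id, b->buf.id, c->buf.id, pool.cmd_buffers.size());
    return 0;
}
```

In the diff itself, release happens on the fence-wait paths (`ggml_vk_synchronize` and `ggml_backend_vk_device_event_synchronize`), and the pool is now created with `VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT`, which allows `begin()` on a reused command buffer to implicitly reset it instead of requiring a pool-wide reset first.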