vulkan: Implement set_tensor_async and the event interfaces

The goal is to enable the async loading code paths in
llama_model_loader::load_all_data, originally from #7896. This works and the
loads themselves are faster, but with host-visible vidmem I think the cost of
allocating/mapping vidmem just moves around and becomes more expensive, so I
don't see a benefit by default. With GGML_VK_DISABLE_HOST_VISIBLE_VIDMEM=1,
however, I do see a significant improvement in model loading time.
Jeff Bolz 2025-12-14 18:27:17 -06:00
parent 5c8a717128
commit 6a31e8d40e
1 changed file with 138 additions and 19 deletions
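
For context, a minimal caller-side sketch of the pattern llama_model_loader::load_all_data
relies on with these interfaces, assuming the current ggml-backend async/event API; the
upload_tensor_async() helper and the host_staging buffer are illustrative names, not part
of this change:

#include "ggml-backend.h"

// Queue an async host->device copy and record an event after it, so the caller can
// safely reuse the host staging slot once ggml_backend_event_synchronize() returns.
static void upload_tensor_async(ggml_backend_t backend, ggml_backend_event_t event,
                                struct ggml_tensor * tensor, const void * host_staging, size_t size) {
    ggml_backend_tensor_set_async(backend, tensor, host_staging, 0, size);
    ggml_backend_event_record(event, backend);
}

// Caller, per staging slot:
//   ggml_backend_event_synchronize(event); // wait for the previous upload to finish
//   ... refill host_staging with the next tensor's data ...
//   upload_tensor_async(backend, event, tensor, host_staging, size);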


@@ -855,6 +855,15 @@ struct vk_subbuffer {
     }
 };
 
+// vk_event is used for the event-related backend interfaces. It uses 'event' for
+// event_wait and 'fence' for event_synchronize. Polling on an event for
+// event_synchronize wouldn't be sufficient to wait for command buffers to complete,
+// and would lead to validation errors.
+struct vk_event {
+    vk::Event event;
+    vk::Fence fence;
+};
+
 struct vk_semaphore {
     vk::Semaphore s;
     uint64_t value;
@@ -2514,6 +2523,15 @@ static void ggml_vk_sync_buffers(ggml_backend_vk_context* ctx, vk_context& subct
     );
 }
 
+static void ggml_vk_set_event(vk_context& ctx, vk::Event& event) {
+    VK_LOG_DEBUG("ggml_vk_set_event()");
+
+    ctx->s->buffer.setEvent(
+        event,
+        ctx->p->q->stage_flags
+    );
+}
+
 static void ggml_vk_wait_events(vk_context& ctx, std::vector<vk::Event>&& events) {
     VK_LOG_DEBUG("ggml_vk_wait_events()");
     if (events.empty()) {
@@ -6053,13 +6071,8 @@ static void ggml_vk_buffer_write_nc_async(ggml_backend_vk_context * ctx, vk_cont
     }
 }
 
-static void ggml_vk_buffer_write_2d_async(vk_context subctx, vk_buffer& dst, size_t offset, const void * src, size_t spitch, size_t width, size_t height, bool sync_staging = false) {
+static bool ggml_vk_buffer_write_2d_async(vk_context subctx, vk_buffer& dst, size_t offset, const void * src, size_t spitch, size_t width, size_t height, bool sync_staging = false) {
     VK_LOG_DEBUG("ggml_vk_buffer_write_2d_async(" << width << ", " << height << ")");
-    // Buffer is already mapped
-    if(dst->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible) {
-        std::cerr << "ggml_vulkan: buffer_write_async dst buffer is host_visible. Use synchronous write." << std::endl;
-        GGML_ABORT("fatal error");
-    }
     // Check if src is pinned memory
     vk_buffer buf = nullptr;
     size_t buf_offset = 0;
@@ -6084,12 +6097,13 @@ static void ggml_vk_buffer_write_2d_async(vk_context subctx, vk_buffer& dst, siz
         ggml_vk_sync_buffers(nullptr, subctx);
         subctx->s->buffer.copyBuffer(buf->buffer, dst->buffer, slices);
-        return;
+        return true;
     }
 
     VK_LOG_DEBUG("STAGING");
 
     if (!sync_staging) {
-        GGML_ABORT("Asynchronous write to non-pinned memory not supported");
+        // copy was not handled caller needs to fall back
+        return false;
     }
 
     // Staging buffer required
@@ -6113,9 +6127,10 @@ static void ggml_vk_buffer_write_2d_async(vk_context subctx, vk_buffer& dst, siz
             deferred_memcpy((uint8_t *)staging_buffer->ptr + i * width, (const uint8_t *) src + i * spitch, width, &subctx->in_memcpys);
         }
     }
+    return true;
 }
 
-static void ggml_vk_buffer_write_async(vk_context subctx, vk_buffer& dst, size_t offset, const void * src, size_t size, bool sync_staging = false) {
+static bool ggml_vk_buffer_write_async(vk_context subctx, vk_buffer& dst, size_t offset, const void * src, size_t size, bool sync_staging = false) {
     VK_LOG_DEBUG("ggml_vk_buffer_write_async(" << size << ")");
     return ggml_vk_buffer_write_2d_async(subctx, dst, offset, src, size, size, 1, sync_staging);
 }
@@ -6134,7 +6149,8 @@ static void ggml_vk_buffer_write_2d(vk_buffer& dst, size_t offset, const void *
         vk_context subctx = ggml_vk_create_temporary_context(dst->device->transfer_queue.cmd_pool);
         ggml_vk_ctx_begin(dst->device, subctx);
-        ggml_vk_buffer_write_2d_async(subctx, dst, offset, src, spitch, width, height, true);
+        bool ret = ggml_vk_buffer_write_2d_async(subctx, dst, offset, src, spitch, width, height, true);
+        GGML_ASSERT(ret);
         ggml_vk_ctx_end(subctx);
 
         for (auto& cpy : subctx->in_memcpys) {
@@ -12612,7 +12628,23 @@ static void ggml_backend_vk_set_tensor_async(ggml_backend_t backend, ggml_tensor
     vk_buffer buf = buf_ctx->dev_buffer;
 
-    ggml_vk_buffer_write_async(transfer_ctx, buf, vk_tensor_offset(tensor) + tensor->view_offs + offset, data, size);
+    auto dst_offset = vk_tensor_offset(tensor) + tensor->view_offs + offset;
+    bool ret = ggml_vk_buffer_write_async(transfer_ctx, buf, dst_offset, data, size);
+    if (!ret) {
+        ggml_vk_ensure_sync_staging_buffer(ctx, size);
+
+        ggml_vk_sync_buffers(nullptr, transfer_ctx);
+        vk::BufferCopy buffer_cpy;
+        buffer_cpy.srcOffset = 0;
+        buffer_cpy.dstOffset = dst_offset;
+        buffer_cpy.size = size;
+
+        transfer_ctx->s->buffer.copyBuffer(ctx->sync_staging->buffer, buf->buffer, { buffer_cpy });
+        deferred_memcpy(ctx->sync_staging->ptr, data, size, &transfer_ctx->in_memcpys);
+        ggml_vk_synchronize(ctx);
+    }
 }
 
 static void ggml_backend_vk_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
@@ -13568,11 +13600,58 @@ static void ggml_vk_graph_optimize(ggml_backend_t backend, struct ggml_cgraph *
     }
 }
 
+static void ggml_backend_vk_event_record(ggml_backend_t backend, ggml_backend_event_t event) {
+    ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
+    vk_event *vkev = (vk_event *)event->context;
+
+    vk_context transfer_ctx;
+
+    if (ctx->transfer_ctx.expired()) {
+        // Initialize new transfer context
+        transfer_ctx = ggml_vk_create_context(ctx, ctx->compute_cmd_pool);
+        ctx->transfer_ctx = transfer_ctx;
+        ggml_vk_ctx_begin(ctx->device, transfer_ctx);
+    } else {
+        transfer_ctx = ctx->transfer_ctx.lock();
+    }
+
+    // the backend interface doesn't have an explicit reset, so reset it here
+    // before we record the command to set it
+    ctx->device->device.resetEvent(vkev->event);
+    ctx->device->device.resetFences({ vkev->fence });
+
+    ggml_vk_set_event(transfer_ctx, vkev->event);
+    ggml_vk_ctx_end(transfer_ctx);
+
+    ggml_vk_submit(transfer_ctx, {vkev->fence});
+    ctx->submit_pending = true;
+    ctx->transfer_ctx.reset();
+}
+
+static void ggml_backend_vk_event_wait(ggml_backend_t backend, ggml_backend_event_t event) {
+    ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
+    vk_event *vkev = (vk_event *)event->context;
+
+    vk_context transfer_ctx;
+
+    if (ctx->transfer_ctx.expired()) {
+        // Initialize new transfer context
+        transfer_ctx = ggml_vk_create_context(ctx, ctx->compute_cmd_pool);
+        ctx->transfer_ctx = transfer_ctx;
+        ggml_vk_ctx_begin(ctx->device, transfer_ctx);
+    } else {
+        transfer_ctx = ctx->transfer_ctx.lock();
+    }
+
+    ggml_vk_wait_events(transfer_ctx, {vkev->event});
+}
+
 // TODO: enable async and synchronize
 static ggml_backend_i ggml_backend_vk_interface = {
     /* .get_name = */ ggml_backend_vk_name,
     /* .free = */ ggml_backend_vk_free,
-    /* .set_tensor_async = */ NULL, // ggml_backend_vk_set_tensor_async,
+    /* .set_tensor_async = */ ggml_backend_vk_set_tensor_async,
     /* .get_tensor_async = */ ggml_backend_vk_get_tensor_async,
     /* .cpy_tensor_async = */ NULL, // ggml_backend_vk_cpy_tensor_async,
     /* .synchronize = */ ggml_backend_vk_synchronize,
@@ -13581,8 +13660,8 @@ static ggml_backend_i ggml_backend_vk_interface = {
     /* .graph_plan_update = */ NULL,
     /* .graph_plan_compute = */ NULL,
     /* .graph_compute = */ ggml_backend_vk_graph_compute,
-    /* .event_record = */ NULL,
-    /* .event_wait = */ NULL,
+    /* .event_record = */ ggml_backend_vk_event_record,
+    /* .event_wait = */ ggml_backend_vk_event_wait,
     /* .graph_optimize = */ ggml_vk_graph_optimize,
 };
@@ -13757,10 +13836,10 @@ static void ggml_backend_vk_device_get_props(ggml_backend_dev_t dev, struct ggml
     props->device_id = ctx->pci_bus_id.empty() ? nullptr : ctx->pci_bus_id.c_str();
     ggml_backend_vk_device_get_memory(dev, &props->memory_free, &props->memory_total);
     props->caps = {
-        /* .async = */ false,
+        /* .async = */ true,
         /* .host_buffer = */ true,
         /* .buffer_from_host_ptr = */ false,
-        /* .events = */ false,
+        /* .events = */ true,
     };
 }
@@ -14291,6 +14370,46 @@ static bool ggml_backend_vk_device_offload_op(ggml_backend_dev_t dev, const ggml
     UNUSED(dev);
 }
 
+static ggml_backend_event_t ggml_backend_vk_device_event_new(ggml_backend_dev_t dev) {
+    ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context;
+    auto device = ggml_vk_get_device(ctx->device);
+
+    vk_event *vkev = new vk_event;
+    if (!vkev) {
+        return nullptr;
+    }
+
+    // The event/fence is expected to initially be in the signaled state.
+    vkev->event = device->device.createEvent({});
+    vkev->fence = device->device.createFence({vk::FenceCreateFlagBits::eSignaled});
+    device->device.setEvent(vkev->event);
+
+    return new ggml_backend_event {
+        /* .device = */ dev,
+        /* .context = */ vkev,
+    };
+}
+
+static void ggml_backend_vk_device_event_free(ggml_backend_dev_t dev, ggml_backend_event_t event) {
+    ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context;
+    auto device = ggml_vk_get_device(ctx->device);
+    vk_event *vkev = (vk_event *)event->context;
+
+    device->device.destroyFence(vkev->fence);
+    device->device.destroyEvent(vkev->event);
+    delete vkev;
+    delete event;
+}
+
+static void ggml_backend_vk_device_event_synchronize(ggml_backend_dev_t dev, ggml_backend_event_t event) {
+    ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context;
+    auto device = ggml_vk_get_device(ctx->device);
+    vk_event *vkev = (vk_event *)event->context;
+
+    VK_CHECK(device->device.waitForFences({ vkev->fence }, true, UINT64_MAX), "event_synchronize");
+}
+
 static const struct ggml_backend_device_i ggml_backend_vk_device_i = {
     /* .get_name = */ ggml_backend_vk_device_get_name,
     /* .get_description = */ ggml_backend_vk_device_get_description,
@@ -14304,9 +14423,9 @@ static const struct ggml_backend_device_i ggml_backend_vk_device_i = {
     /* .supports_op = */ ggml_backend_vk_device_supports_op,
     /* .supports_buft = */ ggml_backend_vk_device_supports_buft,
     /* .offload_op = */ ggml_backend_vk_device_offload_op,
-    /* .event_new = */ NULL,
-    /* .event_free = */ NULL,
-    /* .event_synchronize = */ NULL,
+    /* .event_new = */ ggml_backend_vk_device_event_new,
+    /* .event_free = */ ggml_backend_vk_device_event_free,
+    /* .event_synchronize = */ ggml_backend_vk_device_event_synchronize,
 };
 
 static const char * ggml_backend_vk_reg_get_name(ggml_backend_reg_t reg) {
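
Since the device caps now advertise async and events, generic ggml-backend code can
gate the async path on them; a hedged caller-side sketch (the helper name is
illustrative, not part of this change):

#include "ggml-backend.h"

// Returns true when the device reports both async tensor transfers and event support,
// which this commit enables for the Vulkan device.
static bool device_supports_async_events(ggml_backend_dev_t dev) {
    struct ggml_backend_dev_props props;
    ggml_backend_dev_get_props(dev, &props);
    return props.caps.async && props.caps.events;
}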