vulkan: improve partial offloading performance on AMD (#19976)
* vulkan: fix and enable cpy_tensor_async function * use transfer_queue for async transfers on AMD, synchronize with timeline semaphore * update offload_op logic * fix missing transfer submission * disable async transfer queue on AMD GCN * revert op batch size change * fix cpy_tensor_async checks
This commit is contained in:
parent
66d65ec29b
commit
319146247e
|
|
@ -590,6 +590,7 @@ struct vk_device_struct {
|
|||
vk_queue transfer_queue;
|
||||
bool single_queue;
|
||||
bool support_async;
|
||||
bool async_use_transfer_queue;
|
||||
uint32_t subgroup_size;
|
||||
uint32_t subgroup_size_log2;
|
||||
uint32_t shader_core_count;
|
||||
|
|
@ -1858,6 +1859,10 @@ struct ggml_backend_vk_context {
|
|||
|
||||
vk_context_ref compute_ctx;
|
||||
|
||||
vk_context_ref transfer_ctx;
|
||||
vk_semaphore transfer_semaphore;
|
||||
uint64_t transfer_semaphore_last_submitted {};
|
||||
|
||||
std::vector<vk_context_ref> tensor_ctxs;
|
||||
|
||||
std::vector<vk::DescriptorPool> descriptor_pools;
|
||||
|
|
@ -1866,6 +1871,7 @@ struct ggml_backend_vk_context {
|
|||
uint32_t pipeline_descriptor_set_requirements {};
|
||||
|
||||
vk_command_pool compute_cmd_pool;
|
||||
vk_command_pool transfer_cmd_pool;
|
||||
|
||||
// number of additional consecutive nodes that are being fused with the
|
||||
// node currently being processed
|
||||
|
|
@ -5391,13 +5397,19 @@ static vk_device ggml_vk_get_device(size_t idx) {
|
|||
|
||||
ggml_vk_load_shaders(device);
|
||||
|
||||
const bool prefers_transfer_queue = device->vendor_id == VK_VENDOR_ID_AMD && device->architecture != AMD_GCN;
|
||||
|
||||
if (!device->single_queue) {
|
||||
const uint32_t transfer_queue_index = compute_queue_family_index == transfer_queue_family_index ? 1 : 0;
|
||||
ggml_vk_create_queue(device, device->transfer_queue, transfer_queue_family_index, transfer_queue_index, { vk::PipelineStageFlagBits::eTransfer }, true);
|
||||
|
||||
device->async_use_transfer_queue = prefers_transfer_queue || (getenv("GGML_VK_ASYNC_USE_TRANSFER_QUEUE") != nullptr);
|
||||
} else {
|
||||
// TODO: Use pointer or reference to avoid copy
|
||||
device->transfer_queue.copyFrom(device->compute_queue);
|
||||
device->transfer_queue.cmd_pool.init(device, &device->transfer_queue);
|
||||
|
||||
device->async_use_transfer_queue = false;
|
||||
}
|
||||
|
||||
device->buffer_type = {
|
||||
|
|
@ -5871,6 +5883,15 @@ static void ggml_vk_init(ggml_backend_vk_context * ctx, size_t idx) {
|
|||
ctx->almost_ready_fence = ctx->device->device.createFence({});
|
||||
|
||||
ctx->compute_cmd_pool.init(ctx->device, &ctx->device->compute_queue);
|
||||
if (ctx->device->async_use_transfer_queue) {
|
||||
vk::SemaphoreTypeCreateInfo tci{ vk::SemaphoreType::eTimeline, 0 };
|
||||
vk::SemaphoreCreateInfo ci{};
|
||||
ci.setPNext(&tci);
|
||||
ctx->transfer_semaphore.s = ctx->device->device.createSemaphore(ci);
|
||||
ctx->transfer_semaphore.value = 0;
|
||||
|
||||
ctx->transfer_cmd_pool.init(ctx->device, &ctx->device->transfer_queue);
|
||||
}
|
||||
|
||||
if (vk_perf_logger_enabled) {
|
||||
ctx->perf_logger = std::unique_ptr<vk_perf_logger>(new vk_perf_logger());
|
||||
|
|
@ -6419,6 +6440,47 @@ static void ggml_vk_ctx_begin(vk_device& device, vk_context& subctx) {
|
|||
subctx->s = subctx->seqs[subctx->seqs.size() - 1].data();
|
||||
}
|
||||
|
||||
static vk_context ggml_vk_get_compute_ctx(ggml_backend_vk_context * ctx) {
|
||||
if (!ctx->compute_ctx.expired()) {
|
||||
return ctx->compute_ctx.lock();
|
||||
}
|
||||
|
||||
vk_context result = ggml_vk_create_context(ctx, ctx->compute_cmd_pool);
|
||||
|
||||
ctx->compute_ctx = result;
|
||||
ggml_vk_ctx_begin(ctx->device, result);
|
||||
|
||||
if (ctx->device->async_use_transfer_queue && ctx->transfer_semaphore_last_submitted < ctx->transfer_semaphore.value) {
|
||||
result->s->wait_semaphores.push_back(ctx->transfer_semaphore);
|
||||
ctx->transfer_semaphore_last_submitted = ctx->transfer_semaphore.value;
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
// Submit any pending transfer queue work and signal the transfer semaphore.
|
||||
// The next compute context created via ggml_vk_get_compute_ctx will wait on this semaphore.
|
||||
// Returns true if work was submitted.
|
||||
static bool ggml_vk_submit_transfer_ctx(ggml_backend_vk_context * ctx) {
|
||||
if (!ctx->device->async_use_transfer_queue || ctx->transfer_ctx.expired()) {
|
||||
return false;
|
||||
}
|
||||
|
||||
vk_context cpy_ctx = ctx->transfer_ctx.lock();
|
||||
ggml_vk_ctx_end(cpy_ctx);
|
||||
|
||||
for (auto& cpy : cpy_ctx->in_memcpys) {
|
||||
memcpy(cpy.dst, cpy.src, cpy.n);
|
||||
}
|
||||
|
||||
ctx->transfer_semaphore.value++;
|
||||
cpy_ctx->seqs.back().back().signal_semaphores.push_back(ctx->transfer_semaphore);
|
||||
|
||||
ggml_vk_submit(cpy_ctx, {});
|
||||
ctx->transfer_ctx.reset();
|
||||
return true;
|
||||
}
|
||||
|
||||
static size_t ggml_vk_align_size(size_t width, size_t align) {
|
||||
VK_LOG_DEBUG("ggml_vk_align_size(" << width << ", " << align << ")");
|
||||
return CEIL_DIV(width, align) * align;
|
||||
|
|
@ -12529,15 +12591,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr
|
|||
}
|
||||
}
|
||||
|
||||
vk_context compute_ctx;
|
||||
|
||||
if (ctx->compute_ctx.expired()) {
|
||||
compute_ctx = ggml_vk_create_context(ctx, ctx->compute_cmd_pool);
|
||||
ctx->compute_ctx = compute_ctx;
|
||||
ggml_vk_ctx_begin(ctx->device, compute_ctx);
|
||||
} else {
|
||||
compute_ctx = ctx->compute_ctx.lock();
|
||||
}
|
||||
vk_context compute_ctx = ggml_vk_get_compute_ctx(ctx);
|
||||
|
||||
{
|
||||
// This logic detects dependencies between modes in the graph and calls ggml_vk_sync_buffers
|
||||
|
|
@ -13055,6 +13109,9 @@ static void ggml_vk_graph_cleanup(ggml_backend_vk_context * ctx) {
|
|||
ctx->prealloc_x_need_sync = ctx->prealloc_y_need_sync = ctx->prealloc_split_k_need_sync = false;
|
||||
|
||||
ggml_vk_command_pool_cleanup(ctx->device, ctx->compute_cmd_pool);
|
||||
if (ctx->device->async_use_transfer_queue) {
|
||||
ggml_vk_command_pool_cleanup(ctx->device, ctx->transfer_cmd_pool);
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < ctx->gc.semaphores.size(); i++) {
|
||||
ctx->device->device.destroySemaphore({ ctx->gc.semaphores[i].s });
|
||||
|
|
@ -13116,6 +13173,11 @@ static void ggml_vk_cleanup(ggml_backend_vk_context * ctx) {
|
|||
ctx->descriptor_sets.clear();
|
||||
|
||||
ctx->compute_cmd_pool.destroy(ctx->device->device);
|
||||
if (ctx->device->async_use_transfer_queue) {
|
||||
ctx->device->device.destroySemaphore(ctx->transfer_semaphore.s);
|
||||
|
||||
ctx->transfer_cmd_pool.destroy(ctx->device->device);
|
||||
}
|
||||
if (vk_perf_logger_enabled) {
|
||||
ctx->perf_logger->print_timings(true);
|
||||
}
|
||||
|
|
@ -13387,34 +13449,38 @@ static void ggml_backend_vk_set_tensor_async(ggml_backend_t backend, ggml_tensor
|
|||
|
||||
ggml_backend_vk_buffer_context * buf_ctx = (ggml_backend_vk_buffer_context *)tensor->buffer->context;
|
||||
|
||||
vk_context compute_ctx;
|
||||
vk_context cpy_ctx;
|
||||
|
||||
if (ctx->compute_ctx.expired()) {
|
||||
// Initialize new transfer context
|
||||
compute_ctx = ggml_vk_create_context(ctx, ctx->compute_cmd_pool);
|
||||
ctx->compute_ctx = compute_ctx;
|
||||
ggml_vk_ctx_begin(ctx->device, compute_ctx);
|
||||
if (ctx->device->async_use_transfer_queue) {
|
||||
if (ctx->transfer_ctx.expired()) {
|
||||
// Initialize new transfer context
|
||||
cpy_ctx = ggml_vk_create_context(ctx, ctx->transfer_cmd_pool);
|
||||
ctx->transfer_ctx = cpy_ctx;
|
||||
ggml_vk_ctx_begin(ctx->device, cpy_ctx);
|
||||
} else {
|
||||
cpy_ctx = ctx->transfer_ctx.lock();
|
||||
}
|
||||
} else {
|
||||
compute_ctx = ctx->compute_ctx.lock();
|
||||
cpy_ctx = ggml_vk_get_compute_ctx(ctx);
|
||||
}
|
||||
|
||||
vk_buffer buf = buf_ctx->dev_buffer;
|
||||
|
||||
auto dst_offset = vk_tensor_offset(tensor) + tensor->view_offs + offset;
|
||||
|
||||
bool ret = ggml_vk_buffer_write_async(compute_ctx, buf, dst_offset, data, size);
|
||||
bool ret = ggml_vk_buffer_write_async(cpy_ctx, buf, dst_offset, data, size);
|
||||
|
||||
if (!ret) {
|
||||
ggml_vk_ensure_sync_staging_buffer(ctx, size);
|
||||
ggml_vk_sync_buffers(nullptr, compute_ctx);
|
||||
ggml_vk_sync_buffers(nullptr, cpy_ctx);
|
||||
|
||||
vk::BufferCopy buffer_cpy;
|
||||
buffer_cpy.srcOffset = 0;
|
||||
buffer_cpy.dstOffset = dst_offset;
|
||||
buffer_cpy.size = size;
|
||||
|
||||
compute_ctx->s->buffer.copyBuffer(ctx->sync_staging->buffer, buf->buffer, { buffer_cpy });
|
||||
deferred_memcpy(ctx->sync_staging->ptr, data, size, &compute_ctx->in_memcpys);
|
||||
cpy_ctx->s->buffer.copyBuffer(ctx->sync_staging->buffer, buf->buffer, { buffer_cpy });
|
||||
deferred_memcpy(ctx->sync_staging->ptr, data, size, &cpy_ctx->in_memcpys);
|
||||
ggml_vk_synchronize(ctx);
|
||||
}
|
||||
}
|
||||
|
|
@ -13426,16 +13492,7 @@ static void ggml_backend_vk_get_tensor_async(ggml_backend_t backend, const ggml_
|
|||
|
||||
ggml_backend_vk_buffer_context * buf_ctx = (ggml_backend_vk_buffer_context *)tensor->buffer->context;
|
||||
|
||||
vk_context compute_ctx;
|
||||
|
||||
if (ctx->compute_ctx.expired()) {
|
||||
// Initialize new transfer context
|
||||
compute_ctx = ggml_vk_create_context(ctx, ctx->compute_cmd_pool);
|
||||
ctx->compute_ctx = compute_ctx;
|
||||
ggml_vk_ctx_begin(ctx->device, compute_ctx);
|
||||
} else {
|
||||
compute_ctx = ctx->compute_ctx.lock();
|
||||
}
|
||||
vk_context compute_ctx = ggml_vk_get_compute_ctx(ctx);
|
||||
|
||||
vk_buffer buf = buf_ctx->dev_buffer;
|
||||
|
||||
|
|
@ -13458,31 +13515,60 @@ static void ggml_backend_vk_get_tensor_async(ggml_backend_t backend, const ggml_
|
|||
}
|
||||
}
|
||||
|
||||
static bool ggml_backend_vk_cpy_tensor_async(ggml_backend_t backend, const ggml_tensor * src, ggml_tensor * dst) {
|
||||
static bool ggml_backend_vk_cpy_tensor_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, const ggml_tensor * src, ggml_tensor * dst) {
|
||||
VK_LOG_DEBUG("ggml_backend_vk_cpy_tensor_async()");
|
||||
ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
|
||||
if ((dst->buffer->buft == ggml_backend_vk_get_default_buffer_type(backend) || dst->buffer->buft == ggml_backend_vk_host_buffer_type()) && ggml_backend_buffer_is_vk(src->buffer)) {
|
||||
ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend_dst->context;
|
||||
|
||||
if (dst->buffer->buft != ggml_backend_vk_get_default_buffer_type(backend_dst)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
ggml_backend_vk_buffer_context * dst_buf_ctx = (ggml_backend_vk_buffer_context *)dst->buffer->context;
|
||||
vk_buffer dst_buf = dst_buf_ctx->dev_buffer;
|
||||
|
||||
if (ggml_backend_buffer_is_vk(src->buffer)) {
|
||||
ggml_backend_vk_buffer_context * src_buf_ctx = (ggml_backend_vk_buffer_context *)src->buffer->context;
|
||||
ggml_backend_vk_buffer_context * dst_buf_ctx = (ggml_backend_vk_buffer_context *)dst->buffer->context;
|
||||
|
||||
vk_context compute_ctx;
|
||||
|
||||
if (ctx->compute_ctx.expired()) {
|
||||
// Initialize new transfer context
|
||||
compute_ctx = ggml_vk_create_context(ctx, ctx->compute_cmd_pool);
|
||||
ctx->compute_ctx = compute_ctx;
|
||||
ggml_vk_ctx_begin(ctx->device, compute_ctx);
|
||||
} else {
|
||||
compute_ctx = ctx->compute_ctx.lock();
|
||||
// Async copy only works within the same device
|
||||
if (src_buf_ctx->dev_buffer->device != dst_buf->device) {
|
||||
return false;
|
||||
}
|
||||
|
||||
vk_buffer src_buf = src_buf_ctx->dev_buffer;
|
||||
vk_buffer dst_buf = dst_buf_ctx->dev_buffer;
|
||||
vk_context compute_ctx = ggml_vk_get_compute_ctx(ctx);
|
||||
|
||||
ggml_vk_buffer_copy_async(compute_ctx, dst_buf, vk_tensor_offset(dst) + dst->view_offs, src_buf, vk_tensor_offset(src) + src->view_offs, ggml_nbytes(src));
|
||||
ggml_vk_buffer_copy_async(compute_ctx, dst_buf, vk_tensor_offset(dst) + dst->view_offs,
|
||||
src_buf_ctx->dev_buffer, vk_tensor_offset(src) + src->view_offs,
|
||||
ggml_nbytes(src));
|
||||
return true;
|
||||
}
|
||||
|
||||
if (ggml_backend_buffer_is_host(src->buffer)) {
|
||||
vk_buffer pinned_buf = nullptr;
|
||||
size_t pinned_offset = 0;
|
||||
ggml_vk_host_get(ctx->device, src->data, pinned_buf, pinned_offset);
|
||||
if (pinned_buf == nullptr) {
|
||||
return false;
|
||||
}
|
||||
|
||||
vk_context cpy_ctx;
|
||||
if (ctx->device->async_use_transfer_queue) {
|
||||
if (ctx->transfer_ctx.expired()) {
|
||||
cpy_ctx = ggml_vk_create_context(ctx, ctx->transfer_cmd_pool);
|
||||
ctx->transfer_ctx = cpy_ctx;
|
||||
ggml_vk_ctx_begin(ctx->device, cpy_ctx);
|
||||
} else {
|
||||
cpy_ctx = ctx->transfer_ctx.lock();
|
||||
}
|
||||
} else {
|
||||
cpy_ctx = ggml_vk_get_compute_ctx(ctx);
|
||||
}
|
||||
|
||||
return ggml_vk_buffer_write_async(cpy_ctx, dst_buf,
|
||||
vk_tensor_offset(dst) + dst->view_offs,
|
||||
src->data, ggml_nbytes(src));
|
||||
}
|
||||
|
||||
GGML_UNUSED(backend_src);
|
||||
return false;
|
||||
}
|
||||
|
||||
|
|
@ -13491,6 +13577,10 @@ static void ggml_vk_synchronize(ggml_backend_vk_context * ctx) {
|
|||
|
||||
bool do_transfer = !ctx->compute_ctx.expired();
|
||||
|
||||
if (ggml_vk_submit_transfer_ctx(ctx)) {
|
||||
ctx->submit_pending = true;
|
||||
}
|
||||
|
||||
vk_context compute_ctx;
|
||||
if (do_transfer) {
|
||||
compute_ctx = ctx->compute_ctx.lock();
|
||||
|
|
@ -13506,7 +13596,22 @@ static void ggml_vk_synchronize(ggml_backend_vk_context * ctx) {
|
|||
}
|
||||
|
||||
if (ctx->submit_pending) {
|
||||
{
|
||||
if (ctx->device->async_use_transfer_queue && ctx->transfer_semaphore_last_submitted < ctx->transfer_semaphore.value) {
|
||||
vk::TimelineSemaphoreSubmitInfo tl_info{
|
||||
1, &ctx->transfer_semaphore.value,
|
||||
0, nullptr,
|
||||
};
|
||||
vk::PipelineStageFlags stage = ctx->device->transfer_queue.stage_flags;
|
||||
vk::SubmitInfo si{
|
||||
1, &ctx->transfer_semaphore.s, &stage,
|
||||
0, nullptr,
|
||||
0, nullptr,
|
||||
};
|
||||
si.setPNext(&tl_info);
|
||||
std::lock_guard<std::mutex> guard(queue_mutex);
|
||||
ctx->device->compute_queue.queue.submit({ si }, ctx->fence);
|
||||
ctx->transfer_semaphore_last_submitted = ctx->transfer_semaphore.value;
|
||||
} else {
|
||||
std::lock_guard<std::mutex> guard(queue_mutex);
|
||||
ctx->device->compute_queue.queue.submit({}, ctx->fence);
|
||||
}
|
||||
|
|
@ -13972,6 +14077,8 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
|
|||
bool first_node_in_batch = true; // true if next node will be first node in a batch
|
||||
int submit_node_idx = 0; // index to first node in a batch
|
||||
|
||||
ggml_vk_submit_transfer_ctx(ctx);
|
||||
|
||||
vk_context compute_ctx;
|
||||
if (vk_perf_logger_enabled) {
|
||||
// allocate/resize the query pool
|
||||
|
|
@ -13997,9 +14104,7 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
|
|||
std::fill(ctx->query_node_idx.begin(), ctx->query_node_idx.end(), 0);
|
||||
|
||||
GGML_ASSERT(ctx->compute_ctx.expired());
|
||||
compute_ctx = ggml_vk_create_context(ctx, ctx->compute_cmd_pool);
|
||||
ctx->compute_ctx = compute_ctx;
|
||||
ggml_vk_ctx_begin(ctx->device, compute_ctx);
|
||||
compute_ctx = ggml_vk_get_compute_ctx(ctx);
|
||||
ctx->query_idx = 0;
|
||||
compute_ctx->s->buffer.writeTimestamp(vk::PipelineStageFlagBits::eAllCommands, ctx->query_pool, ctx->query_idx++);
|
||||
}
|
||||
|
|
@ -14009,13 +14114,7 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
|
|||
|
||||
if (ctx->prealloc_size_add_rms_partials) {
|
||||
ggml_vk_preallocate_buffers(ctx, nullptr);
|
||||
if (ctx->compute_ctx.expired()) {
|
||||
compute_ctx = ggml_vk_create_context(ctx, ctx->compute_cmd_pool);
|
||||
ctx->compute_ctx = compute_ctx;
|
||||
ggml_vk_ctx_begin(ctx->device, compute_ctx);
|
||||
} else {
|
||||
compute_ctx = ctx->compute_ctx.lock();
|
||||
}
|
||||
compute_ctx = ggml_vk_get_compute_ctx(ctx);
|
||||
// initialize partial sums to zero.
|
||||
ggml_vk_buffer_memset_async(compute_ctx, ctx->prealloc_add_rms_partials, 0, 0, ctx->prealloc_size_add_rms_partials);
|
||||
ggml_vk_sync_buffers(ctx, compute_ctx);
|
||||
|
|
@ -14238,13 +14337,7 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
|
|||
bool enqueued = ggml_vk_build_graph(ctx, cgraph, i, cgraph->nodes[submit_node_idx], submit_node_idx, i + ctx->num_additional_fused_ops >= last_node, almost_ready, submit);
|
||||
|
||||
if (vk_perf_logger_enabled && enqueued) {
|
||||
if (ctx->compute_ctx.expired()) {
|
||||
compute_ctx = ggml_vk_create_context(ctx, ctx->compute_cmd_pool);
|
||||
ctx->compute_ctx = compute_ctx;
|
||||
ggml_vk_ctx_begin(ctx->device, compute_ctx);
|
||||
} else {
|
||||
compute_ctx = ctx->compute_ctx.lock();
|
||||
}
|
||||
compute_ctx = ggml_vk_get_compute_ctx(ctx);
|
||||
if (!vk_perf_logger_concurrent) {
|
||||
// track a single node/fusion for the current query
|
||||
ctx->query_nodes[ctx->query_idx] = cgraph->nodes[i];
|
||||
|
|
@ -14579,16 +14672,9 @@ static void ggml_backend_vk_event_record(ggml_backend_t backend, ggml_backend_ev
|
|||
ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
|
||||
vk_event *vkev = (vk_event *)event->context;
|
||||
|
||||
vk_context compute_ctx;
|
||||
ggml_vk_submit_transfer_ctx(ctx);
|
||||
|
||||
if (ctx->compute_ctx.expired()) {
|
||||
// Initialize new transfer context
|
||||
compute_ctx = ggml_vk_create_context(ctx, ctx->compute_cmd_pool);
|
||||
ctx->compute_ctx = compute_ctx;
|
||||
ggml_vk_ctx_begin(ctx->device, compute_ctx);
|
||||
} else {
|
||||
compute_ctx = ctx->compute_ctx.lock();
|
||||
}
|
||||
vk_context compute_ctx = ggml_vk_get_compute_ctx(ctx);
|
||||
|
||||
// the backend interface doesn't have an explicit reset, so reset it here
|
||||
// before we record the command to set it
|
||||
|
|
@ -14609,16 +14695,7 @@ static void ggml_backend_vk_event_wait(ggml_backend_t backend, ggml_backend_even
|
|||
ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
|
||||
vk_event *vkev = (vk_event *)event->context;
|
||||
|
||||
vk_context compute_ctx;
|
||||
|
||||
if (ctx->compute_ctx.expired()) {
|
||||
// Initialize new transfer context
|
||||
compute_ctx = ggml_vk_create_context(ctx, ctx->compute_cmd_pool);
|
||||
ctx->compute_ctx = compute_ctx;
|
||||
ggml_vk_ctx_begin(ctx->device, compute_ctx);
|
||||
} else {
|
||||
compute_ctx = ctx->compute_ctx.lock();
|
||||
}
|
||||
vk_context compute_ctx = ggml_vk_get_compute_ctx(ctx);
|
||||
|
||||
ggml_vk_wait_events(compute_ctx, {vkev->event});
|
||||
ggml_vk_ctx_end(compute_ctx);
|
||||
|
|
@ -14631,7 +14708,7 @@ static ggml_backend_i ggml_backend_vk_interface = {
|
|||
/* .free = */ ggml_backend_vk_free,
|
||||
/* .set_tensor_async = */ ggml_backend_vk_set_tensor_async,
|
||||
/* .get_tensor_async = */ ggml_backend_vk_get_tensor_async,
|
||||
/* .cpy_tensor_async = */ NULL, // ggml_backend_vk_cpy_tensor_async,
|
||||
/* .cpy_tensor_async = */ ggml_backend_vk_cpy_tensor_async,
|
||||
/* .synchronize = */ ggml_backend_vk_synchronize,
|
||||
/* .graph_plan_create = */ NULL,
|
||||
/* .graph_plan_free = */ NULL,
|
||||
|
|
@ -15367,11 +15444,25 @@ static bool ggml_backend_vk_device_supports_buft(ggml_backend_dev_t dev, ggml_ba
|
|||
return buft_ctx->device->idx == ctx->device;
|
||||
}
|
||||
|
||||
static int64_t ggml_vk_get_op_batch_size(const ggml_tensor * op) {
|
||||
switch (op->op) {
|
||||
case GGML_OP_GET_ROWS:
|
||||
return 0;
|
||||
case GGML_OP_MUL_MAT:
|
||||
return op->ne[1];
|
||||
case GGML_OP_MUL_MAT_ID:
|
||||
case GGML_OP_ROPE:
|
||||
case GGML_OP_ROPE_BACK:
|
||||
return op->ne[2];
|
||||
default:
|
||||
return ggml_nrows(op);
|
||||
}
|
||||
}
|
||||
|
||||
static bool ggml_backend_vk_device_offload_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
|
||||
ggml_backend_vk_device_context * dev_ctx = (ggml_backend_vk_device_context *)dev->context;
|
||||
|
||||
return (op->ne[1] >= dev_ctx->op_offload_min_batch_size && op->op != GGML_OP_GET_ROWS) ||
|
||||
(op->ne[2] >= dev_ctx->op_offload_min_batch_size && op->op == GGML_OP_MUL_MAT_ID);
|
||||
return ggml_vk_get_op_batch_size(op) >= dev_ctx->op_offload_min_batch_size;
|
||||
}
|
||||
|
||||
static ggml_backend_event_t ggml_backend_vk_device_event_new(ggml_backend_dev_t dev) {
|
||||
|
|
|
|||
Loading…
Reference in New Issue