diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h
index 4ed5f35774..0d5bf469fd 100644
--- a/ggml/include/ggml-backend.h
+++ b/ggml/include/ggml-backend.h
@@ -93,8 +93,11 @@ extern "C" {
     GGML_API void ggml_backend_synchronize(ggml_backend_t backend);
+    GGML_API bool ggml_backend_supports_graph_plan(ggml_backend_t backend);
+    GGML_API bool ggml_backend_supports_graph_plan_update(ggml_backend_t backend);
     GGML_API ggml_backend_graph_plan_t ggml_backend_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph);
     GGML_API void ggml_backend_graph_plan_free (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
+    GGML_API void ggml_backend_graph_plan_update(ggml_backend_t backend, ggml_backend_graph_plan_t plan, const struct ggml_cgraph * cgraph);
     GGML_API enum ggml_status ggml_backend_graph_plan_compute (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
     GGML_API enum ggml_status ggml_backend_graph_compute (ggml_backend_t backend, struct ggml_cgraph * cgraph);
diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
index 8547ecc849..6d60e09d33 100644
--- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp
@@ -332,6 +332,18 @@ void ggml_backend_synchronize(ggml_backend_t backend) {
     backend->iface.synchronize(backend);
 }
 
+bool ggml_backend_supports_graph_plan(ggml_backend_t backend) {
+    GGML_ASSERT(backend);
+
+    return (bool) backend->iface.graph_plan_create;
+}
+
+bool ggml_backend_supports_graph_plan_update(ggml_backend_t backend) {
+    GGML_ASSERT(backend);
+
+    return (bool) backend->iface.graph_plan_update;
+}
+
 ggml_backend_graph_plan_t ggml_backend_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
     GGML_ASSERT(backend);
     GGML_ASSERT(backend->iface.graph_plan_create != NULL);
@@ -346,6 +358,13 @@ void ggml_backend_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_pla
     backend->iface.graph_plan_free(backend, plan);
 }
 
+void ggml_backend_graph_plan_update(ggml_backend_t backend, ggml_backend_graph_plan_t plan, const struct ggml_cgraph* cgraph) {
+    GGML_ASSERT(backend);
+    GGML_ASSERT(backend->iface.graph_plan_update != NULL);
+
+    backend->iface.graph_plan_update(backend, plan, cgraph);
+}
+
 enum ggml_status ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
     GGML_ASSERT(backend);
     GGML_ASSERT(backend->iface.graph_plan_compute != NULL);
@@ -680,6 +699,11 @@ struct ggml_backend_sched_split {
     struct ggml_cgraph graph;
 };
 
+struct ggml_backend_sched_plan {
+    int backend_id;
+    ggml_backend_graph_plan_t plan;
+};
+
 struct ggml_backend_sched {
     bool is_reset; // true if the scheduler has been reset since the last graph split
     bool is_alloc;
@@ -709,6 +733,12 @@ struct ggml_backend_sched {
     int n_splits;
     int splits_capacity;
 
+    // graph plans
+    struct ggml_backend_sched_plan * plans;
+    int n_plans;
+    int plans_capacity;
+    bool plan_needs_update;
+
     // pipeline parallelism support
     int n_copies;
     int cur_copy;
@@ -919,6 +949,16 @@ static void ggml_backend_sched_set_if_supported(ggml_backend_sched_t sched, stru
     }
 }
 
+static void ggml_backend_sched_free_plans(ggml_backend_sched_t sched) {
+    for (int i = 0; i < sched->n_plans; i++) {
+        ggml_backend_t backend = sched->backends[sched->plans[i].backend_id];
+        if (ggml_backend_supports_graph_plan(backend)) {
+            ggml_backend_graph_plan_free(backend, sched->plans[i].plan);
+        }
+    }
+    sched->n_plans = 0;
+}
+
 // assigns backends to ops and splits the graph into subgraphs that can be computed on the same backend
 void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
     // reset splits
@@ -1386,6 +1426,7 @@ void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgra
         assert(graph_copy->size > graph_copy->n_leafs);
         graph_copy->leafs[graph_copy->n_leafs++] = leaf;
     }
+    sched->plan_needs_update = true;
 }
 
 static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) {
@@ -1440,6 +1481,62 @@ static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) {
     return true;
 }
 
+static void ggml_backend_sched_update_plans(ggml_backend_sched_t sched) {
+    // create graph plans
+    if (sched->plan_needs_update) {
+        bool create_new_plans;
+        if (sched->n_plans == sched->n_splits) {
+            create_new_plans = false;
+            for (int i = 0; i < sched->n_splits; i++) {
+                if (sched->splits[i].backend_id != sched->plans[i].backend_id) {
+                    create_new_plans = true;
+                    break;
+                }
+            }
+        } else {
+            create_new_plans = true;
+        }
+        if (create_new_plans) {
+            // free previous and recreate new plans
+            ggml_backend_sched_free_plans(sched);
+            if (sched->plans_capacity < sched->n_splits) {
+                while (sched->plans_capacity < sched->n_splits) {
+                    sched->plans_capacity *= 2;
+                }
+                sched->plans = (ggml_backend_sched_plan *) realloc(
+                    sched->plans, sched->plans_capacity * sizeof(struct ggml_backend_sched_plan));
+                GGML_ASSERT(sched->plans);
+            }
+            sched->n_plans = sched->n_splits;
+            for (int i = 0; i < sched->n_splits; i++) {
+                ggml_backend_t backend = sched->backends[sched->splits[i].backend_id];
+                sched->plans[i].backend_id = sched->splits[i].backend_id;
+                if (ggml_backend_supports_graph_plan(backend)) {
+                    sched->plans[i].plan = ggml_backend_graph_plan_create(backend, &sched->splits[i].graph);
+                } else {
+                    sched->plans[i].plan = nullptr;
+                }
+            }
+        } else {
+            // update existing plans
+            for (int i = 0; i < sched->n_splits; i++) {
+                ggml_backend_t backend = sched->backends[sched->splits[i].backend_id];
+                if (ggml_backend_supports_graph_plan(backend)) {
+                    if (ggml_backend_supports_graph_plan_update(backend)) {
+                        ggml_backend_graph_plan_update(backend, sched->plans[i].plan, &sched->splits[i].graph);
+                    } else {
+                        ggml_backend_graph_plan_free(backend, sched->plans[i].plan);
+                        sched->plans[i].plan = ggml_backend_graph_plan_create(backend, &sched->splits[i].graph);
+                    }
+                } else {
+                    sched->plans[i].plan = nullptr;
+                }
+            }
+        }
+        sched->plan_needs_update = false;
+    }
+}
+
 static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t sched) {
     GGML_ASSERT(sched);
     struct ggml_backend_sched_split * splits = sched->splits;
@@ -1448,6 +1545,8 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
     std::vector ids;
     std::vector used_ids;
 
+    ggml_backend_sched_update_plans(sched);
+
     for (int split_id = 0; split_id < sched->n_splits; split_id++) {
         struct ggml_backend_sched_split * split = &splits[split_id];
         int split_backend_id = split->backend_id;
@@ -1577,7 +1676,12 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
         }
 
         if (!sched->callback_eval) {
-            enum ggml_status ec = ggml_backend_graph_compute_async(split_backend, &split->graph);
+            enum ggml_status ec;
+            if (ggml_backend_supports_graph_plan(split_backend) && sched->plans[split_id].plan) {
+                ec = ggml_backend_graph_plan_compute(split_backend, sched->plans[split_id].plan);
+            } else {
+                ec = ggml_backend_graph_compute_async(split_backend, &split->graph);
+            }
             if (ec != GGML_STATUS_SUCCESS) {
                 return ec;
             }
@@ -1675,6 +1779,10 @@ ggml_backend_sched_t ggml_backend_sched_new(
     sched->splits = (ggml_backend_sched_split *) calloc(initial_splits_capacity, sizeof(sched->splits[0]));
     sched->splits_capacity = initial_splits_capacity;
 
+    const int initial_plans_capacity = 16;
+    sched->plans = (ggml_backend_sched_plan *) calloc(initial_plans_capacity, sizeof(sched->plans[0]));
+    sched->plans_capacity = initial_plans_capacity;
+
     for (int b = 0; b < n_backends; b++) {
         sched->backends[b] = backends[b];
         sched->bufts[b] = bufts ? bufts[b] : ggml_backend_get_default_buffer_type(backends[b]);
@@ -1708,6 +1816,8 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) {
     ggml_free(sched->ctx);
     ggml_hash_set_free(&sched->hash_set);
     free(sched->splits);
+    ggml_backend_sched_free_plans(sched);
+    free(sched->plans);
     free(sched->hv_tensor_backend_ids);
     free(sched->hv_tensor_copies);
     free(sched->node_backend_ids);
diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh
index 9fcb2f9fd2..641fda2a55 100644
--- a/ggml/src/ggml-cuda/common.cuh
+++ b/ggml/src/ggml-cuda/common.cuh
@@ -1021,13 +1021,12 @@ struct ggml_cuda_graph {
     }
     cudaGraph_t graph = nullptr;
     cudaGraphExec_t instance = nullptr;
+    const ggml_cgraph * cgraph;
     size_t num_nodes = 0;
     std::vector<cudaGraphNode_t> nodes;
     std::vector<cudaKernelNodeParams> params;
-    bool disable_due_to_gpu_arch = false;
-    bool disable_due_to_too_many_updates = false;
-    bool disable_due_to_failed_graph_capture = false;
     int number_consecutive_updates = 0;
+    int number_consecutive_computes = 0;
     std::vector<ggml_graph_node_properties> ggml_graph_properties;
 #endif
 };
@@ -1191,7 +1190,12 @@ struct ggml_backend_cuda_context {
     cudaStream_t streams[GGML_CUDA_MAX_DEVICES][GGML_CUDA_MAX_STREAMS] = { { nullptr } };
     cublasHandle_t cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};
 
-    std::unique_ptr<ggml_cuda_graph> cuda_graph;
+#ifdef USE_CUDA_GRAPH
+    bool cuda_graph_initialized = false;
+    bool disable_graph_due_to_env = false;
+    bool disable_graph_due_to_gpu_arch = false;
+    bool disable_graph_due_to_too_many_updates = false;
+#endif
 
     int curr_stream_no = 0;
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index ab0f6fe9ce..6d836d4f16 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -2412,7 +2412,7 @@ static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor *
                           nb1, nb2, nb3, stream);
 }
 
-static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct ggml_tensor * dst) {
+static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, ggml_cuda_graph * cuda_graph, struct ggml_tensor * dst) {
     // why is this here instead of mul_mat?
if (dst->src[0] != nullptr && ggml_backend_buft_is_cuda_split(dst->src[0]->buffer->buft)) { ggml_cuda_set_peer_access(dst->src[1]->ne[1], ctx.device); @@ -2691,7 +2691,7 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg ggml_cuda_op_sum_rows(ctx, dst); break; case GGML_OP_MEAN: - ggml_cuda_op_mean(ctx, dst); + ggml_cuda_op_mean(ctx, cuda_graph, dst); break; case GGML_OP_SSM_CONV: ggml_cuda_op_ssm_conv(ctx, dst); @@ -2850,8 +2850,8 @@ static void ggml_backend_cuda_synchronize(ggml_backend_t backend) { } #ifdef USE_CUDA_GRAPH -static bool check_node_graph_compatibility(ggml_cgraph * cgraph, - bool use_cuda_graph) { +static bool check_node_graph_compatibility(const ggml_cgraph * cgraph) { + bool use_cuda_graph = true; // Loop over nodes in GGML graph to obtain info needed for CUDA graph @@ -2961,45 +2961,49 @@ static bool ggml_graph_node_has_matching_properties(ggml_tensor * node, ggml_gra return true; } -static bool is_cuda_graph_update_required(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph * cgraph) { +static void update_cuda_graph_properties(ggml_cuda_graph * cuda_graph, const ggml_cgraph * cgraph) { + cuda_graph->ggml_graph_properties.resize(cgraph->n_nodes); + for (int i = 0; i < cgraph->n_nodes; i++) { + set_ggml_graph_node_properties(cgraph->nodes[i], &cuda_graph->ggml_graph_properties[i]); + } +} +static bool is_cuda_graph_update_required(ggml_cuda_graph * cuda_graph, const ggml_cgraph * cgraph) { bool cuda_graph_update_required = false; - if (cuda_ctx->cuda_graph->instance == nullptr) { + if (cuda_graph->instance == nullptr) { cuda_graph_update_required = true; } // Check if the graph size has changed - if (cuda_ctx->cuda_graph->ggml_graph_properties.size() != (size_t)cgraph->n_nodes) { + if (cuda_graph->ggml_graph_properties.size() != (size_t)cgraph->n_nodes) { cuda_graph_update_required = true; - cuda_ctx->cuda_graph->ggml_graph_properties.resize(cgraph->n_nodes); + cuda_graph->ggml_graph_properties.resize(cgraph->n_nodes); } // Loop over nodes in GGML graph to determine if CUDA graph update is required - // and store properties to allow this comparison for the next token for (int i = 0; i < cgraph->n_nodes; i++) { bool has_matching_properties = true; if (!cuda_graph_update_required) { - has_matching_properties = ggml_graph_node_has_matching_properties(cgraph->nodes[i], &cuda_ctx->cuda_graph->ggml_graph_properties[i]); + has_matching_properties = ggml_graph_node_has_matching_properties(cgraph->nodes[i], &cuda_graph->ggml_graph_properties[i]); } if (!has_matching_properties) { cuda_graph_update_required = true; } - set_ggml_graph_node_properties(cgraph->nodes[i], &cuda_ctx->cuda_graph->ggml_graph_properties[i]); } return cuda_graph_update_required; } -static void update_cuda_graph_executable(ggml_backend_cuda_context * cuda_ctx) { +static void update_cuda_graph_executable(ggml_cuda_graph * cuda_graph) { #if CUDART_VERSION >= 12000 cudaGraphExecUpdateResultInfo result_info; - cudaError_t stat = cudaGraphExecUpdate(cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, &result_info); + cudaError_t stat = cudaGraphExecUpdate(cuda_graph->instance, cuda_graph->graph, &result_info); #else cudaGraphNode_t errorNode; cudaGraphExecUpdateResult result_info; - cudaError_t stat = cudaGraphExecUpdate(cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, &errorNode, &result_info); + cudaError_t stat = cudaGraphExecUpdate(cuda_graph->instance, cuda_graph->graph, &errorNode, &result_info); #endif // CUDART_VERSION >= 12000 if (stat == 
cudaErrorGraphExecUpdateFailure) { @@ -3010,9 +3014,9 @@ static void update_cuda_graph_executable(ggml_backend_cuda_context * cuda_ctx) { // The pre-existing graph exec cannot be updated due to violated constraints // so instead clear error and re-instantiate (void)cudaGetLastError(); - CUDA_CHECK(cudaGraphExecDestroy(cuda_ctx->cuda_graph->instance)); - cuda_ctx->cuda_graph->instance = nullptr; - CUDA_CHECK(cudaGraphInstantiate(&cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, NULL, NULL, 0)); + CUDA_CHECK(cudaGraphExecDestroy(cuda_graph->instance)); + cuda_graph->instance = nullptr; + CUDA_CHECK(cudaGraphInstantiate(&cuda_graph->instance, cuda_graph->graph, NULL, NULL, 0)); } else { GGML_ASSERT(stat == cudaSuccess); } @@ -3212,8 +3216,7 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, int node_idx, return false; } -static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph * cgraph, - bool & graph_evaluated_or_captured, bool & use_cuda_graph, bool & cuda_graph_update_required) { +static void evaluate_cuda_graph(ggml_backend_cuda_context * cuda_ctx, ggml_cuda_graph * cuda_graph, const ggml_cgraph * cgraph) { // flag used to determine whether it is an integrated_gpu const bool integrated = ggml_cuda_info().devices[cuda_ctx->device].integrated; @@ -3241,530 +3244,571 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx } }; - while (!graph_evaluated_or_captured) { - // Only perform the graph execution if CUDA graphs are not enabled, or we are capturing the graph. - // With the use of CUDA graphs, the execution will be performed by the graph launch. - if (!use_cuda_graph || cuda_graph_update_required) { - [[maybe_unused]] int prev_i = 0; + [[maybe_unused]] int prev_i = 0; - if (stream_ctx.concurrent_events.size() > 0) { - should_launch_concurrent_events = true; - for (const auto & [tensor, event] : stream_ctx.concurrent_events) { - should_launch_concurrent_events = should_launch_concurrent_events && event.is_valid(); - } - } - if (should_launch_concurrent_events) { - // Restore original node order within each concurrent region to enable fusion within streams + if (stream_ctx.concurrent_events.size() > 0) { + should_launch_concurrent_events = true; + for (const auto & [tensor, event] : stream_ctx.concurrent_events) { + should_launch_concurrent_events = should_launch_concurrent_events && event.is_valid(); + } + } + if (should_launch_concurrent_events) { + // Restore original node order within each concurrent region to enable fusion within streams - std::unordered_map node_to_idx; - node_to_idx.reserve(cgraph->n_nodes); - for (int i = 0; i < cgraph->n_nodes; ++i) { - node_to_idx[cgraph->nodes[i]] = i; - } + std::unordered_map node_to_idx; + node_to_idx.reserve(cgraph->n_nodes); + for (int i = 0; i < cgraph->n_nodes; ++i) { + node_to_idx[cgraph->nodes[i]] = i; + } - for (auto & [fork_node, event] : stream_ctx.concurrent_events) { - // Find positions of all nodes from this event in the current graph - std::vector positions; - positions.reserve(event.original_order.size()); + for (auto & [fork_node, event] : stream_ctx.concurrent_events) { + // Find positions of all nodes from this event in the current graph + std::vector positions; + positions.reserve(event.original_order.size()); - bool all_found = true; - for (const ggml_tensor * orig_node : event.original_order) { - auto it = node_to_idx.find(orig_node); - if (it != node_to_idx.end()) { - positions.push_back(it->second); - } else { - all_found = 
false; - break; - } - } - - if (!all_found || positions.size() != event.original_order.size()) { - continue; - } - - // Sort positions to get contiguous range - std::vector sorted_positions = positions; - std::sort(sorted_positions.begin(), sorted_positions.end()); - - bool is_contiguous = true; - for (size_t i = 1; i < sorted_positions.size(); ++i) { - if (sorted_positions[i] != sorted_positions[i-1] + 1) { - is_contiguous = false; - break; - } - } - - if (!is_contiguous) { - continue; - } - - // Restore original order at the sorted positions - int start_pos = sorted_positions[0]; - for (size_t i = 0; i < event.original_order.size(); ++i) { - cgraph->nodes[start_pos + i] = const_cast(event.original_order[i]); - } + bool all_found = true; + for (const ggml_tensor * orig_node : event.original_order) { + auto it = node_to_idx.find(orig_node); + if (it != node_to_idx.end()) { + positions.push_back(it->second); + } else { + all_found = false; + break; } } - for (int i = 0; i < cgraph->n_nodes; i++) { - ggml_tensor * node = cgraph->nodes[i]; - if (is_concurrent_event_active) { - GGML_ASSERT(concurrent_event); + if (!all_found || positions.size() != event.original_order.size()) { + continue; + } - if (node == concurrent_event->join_node) { - cuda_ctx->curr_stream_no = 0; - for (int i = 1; i <= concurrent_event->n_streams; ++i) { - // Wait on join events of forked streams in the main stream - CUDA_CHECK(cudaEventRecord(concurrent_event->join_events[i - 1], - cuda_ctx->stream(cuda_ctx->device, i))); - CUDA_CHECK(cudaStreamWaitEvent(cuda_ctx->stream(), concurrent_event->join_events[i - 1])); - } + // Sort positions to get contiguous range + std::vector sorted_positions = positions; + std::sort(sorted_positions.begin(), sorted_positions.end()); - is_concurrent_event_active = false; - concurrent_event = nullptr; - } else { - GGML_ASSERT (concurrent_event->stream_mapping.find(node) != concurrent_event->stream_mapping.end()); - cuda_ctx->curr_stream_no = concurrent_event->stream_mapping[node]; - GGML_LOG_DEBUG("Setting stream no to %d for node %s\n", cuda_ctx->curr_stream_no, node->name); - } - } else if (i - prev_i > 1) { - //the previous node was fused - const ggml_tensor * prev_node = cgraph->nodes[i - 1]; - try_launch_concurrent_event(prev_node); - - if (is_concurrent_event_active) { - cuda_ctx->curr_stream_no = concurrent_event->stream_mapping[node]; - GGML_LOG_DEBUG("Setting stream no to %d for node %s\n", cuda_ctx->curr_stream_no, node->name); - } + bool is_contiguous = true; + for (size_t i = 1; i < sorted_positions.size(); ++i) { + if (sorted_positions[i] != sorted_positions[i-1] + 1) { + is_contiguous = false; + break; } + } + + if (!is_contiguous) { + continue; + } + + // Restore original order at the sorted positions + int start_pos = sorted_positions[0]; + for (size_t i = 0; i < event.original_order.size(); ++i) { + cgraph->nodes[start_pos + i] = const_cast(event.original_order[i]); + } + } + } + + for (int i = 0; i < cgraph->n_nodes; i++) { + ggml_tensor * node = cgraph->nodes[i]; + if (is_concurrent_event_active) { + GGML_ASSERT(concurrent_event); + + if (node == concurrent_event->join_node) { + cuda_ctx->curr_stream_no = 0; + for (int i = 1; i <= concurrent_event->n_streams; ++i) { + // Wait on join events of forked streams in the main stream + CUDA_CHECK(cudaEventRecord(concurrent_event->join_events[i - 1], + cuda_ctx->stream(cuda_ctx->device, i))); + CUDA_CHECK(cudaStreamWaitEvent(cuda_ctx->stream(), concurrent_event->join_events[i - 1])); + } + + is_concurrent_event_active = false; + 
concurrent_event = nullptr; + } else { + GGML_ASSERT (concurrent_event->stream_mapping.find(node) != concurrent_event->stream_mapping.end()); + cuda_ctx->curr_stream_no = concurrent_event->stream_mapping[node]; + GGML_LOG_DEBUG("Setting stream no to %d for node %s\n", cuda_ctx->curr_stream_no, node->name); + } + } else if (i - prev_i > 1) { + //the previous node was fused + const ggml_tensor * prev_node = cgraph->nodes[i - 1]; + try_launch_concurrent_event(prev_node); + + if (is_concurrent_event_active) { + cuda_ctx->curr_stream_no = concurrent_event->stream_mapping[node]; + GGML_LOG_DEBUG("Setting stream no to %d for node %s\n", cuda_ctx->curr_stream_no, node->name); + } + } #ifdef GGML_CUDA_DEBUG - const int nodes_fused = i - prev_i - 1; - if (nodes_fused > 0) { - GGML_LOG_INFO("nodes_fused: %d\n", nodes_fused); - } + const int nodes_fused = i - prev_i - 1; + if (nodes_fused > 0) { + GGML_LOG_INFO("nodes_fused: %d\n", nodes_fused); + } #endif - prev_i = i; + prev_i = i; - if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) { + if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) { + continue; + } + + + // start of fusion operations + static bool disable_fusion = (getenv("GGML_CUDA_DISABLE_FUSION") != nullptr); + if (!disable_fusion) { + + if (ggml_cuda_can_fuse(cgraph, i, ggml_cuda_topk_moe_ops(/*with norm*/ true), {})) { + ggml_tensor * weights = cgraph->nodes[i + 9]; + ggml_tensor * selected_experts = cgraph->nodes[i + 3]; + ggml_tensor * clamp = cgraph->nodes[i + 7]; + ggml_cuda_op_topk_moe(*cuda_ctx, node->src[0], weights, selected_experts, /*with norm*/ true, + /*delayed softmax*/ false, clamp); + i += 9; + continue; + } + + if (ggml_cuda_can_fuse(cgraph, i, ggml_cuda_topk_moe_ops(/*with norm*/ false), {})) { + ggml_tensor * weights = cgraph->nodes[i + 4]; + ggml_tensor * selected_experts = cgraph->nodes[i + 3]; + ggml_cuda_op_topk_moe(*cuda_ctx, node->src[0], weights, selected_experts, /*with norm*/ false, + /*delayed softmax*/ false); + i += 4; + continue; + } + + if (ggml_cuda_can_fuse(cgraph, i, + ggml_cuda_topk_moe_ops(/*with norm*/ false, /*delayed softmax*/ true), {})) { + ggml_tensor * weights = cgraph->nodes[i + 5]; + ggml_tensor * ids = cgraph->nodes[i + 1]; + + ggml_cuda_op_topk_moe(*cuda_ctx, node->src[0], weights, ids, /*with norm*/ false, + /*delayed_softmax*/ true); + i += 5; + continue; + } + + if (ggml_cuda_can_fuse(cgraph, i, { GGML_OP_ROPE, GGML_OP_VIEW, GGML_OP_SET_ROWS }, {})) { + ggml_tensor * rope = cgraph->nodes[i]; + ggml_tensor * set_rows = cgraph->nodes[i + 2]; + + ggml_cuda_op_rope_fused(*cuda_ctx, rope, set_rows); + i += 2; + continue; + } + + if (node->op == GGML_OP_ADD) { + int n_fuse = 0; + ggml_op ops[8]; + std::fill(ops, ops + 8, GGML_OP_ADD); + + for (; n_fuse <= 6; ++n_fuse){ + if (!ggml_can_fuse(cgraph, i + n_fuse, ops + n_fuse, 2)) { + break; + } + if (cgraph->nodes[i + n_fuse] != cgraph->nodes[i + n_fuse + 1]->src[0]) { + break; + } + if (!ggml_are_same_layout(cgraph->nodes[i + n_fuse]->src[1], cgraph->nodes[i + n_fuse + 1]->src[1])) { + break; + } + } + + n_fuse++; + + if (n_fuse > 1) { + for (int j = 0; j < n_fuse - 1; ++j) { + node->src[j + 2] = cgraph->nodes[i + j + 1]->src[1]; + } + cgraph->nodes[i + n_fuse - 1]->data = node->data; + ggml_cuda_op_fused_add(*cuda_ctx, node, n_fuse); + i += n_fuse - 
1; + + continue; + } + } + + bool fused_mul_mat_vec = false; + int fused_node_count = 0; + + for (ggml_op op : { GGML_OP_MUL_MAT, GGML_OP_MUL_MAT_ID }) { + const ggml_op bias_op = op == GGML_OP_MUL_MAT ? GGML_OP_ADD : GGML_OP_ADD_ID; + + if (ggml_cuda_can_fuse(cgraph, i, { op, bias_op, op, bias_op, GGML_OP_GLU }, {})) { + ggml_tensor * glu = cgraph->nodes[i + 4]; + ggml_tensor * gate_bias_n = glu->src[0]; + ggml_tensor * up_bias_n = glu->src[1]; + + //we don't assume the order for {gate, up}. Instead infer it from the bias tensor + ggml_tensor * gate_n = nullptr; + ggml_tensor * up_n = nullptr; + + if (gate_bias_n->src[0] == cgraph->nodes[i] || gate_bias_n->src[1] == cgraph->nodes[i]) { + gate_n = cgraph->nodes[i]; + up_n = cgraph->nodes[i + 2]; + } else if (gate_bias_n->src[0] == cgraph->nodes[i + 2] || gate_bias_n->src[1] == cgraph->nodes[i + 2]) { + gate_n = cgraph->nodes[i + 2]; + up_n = cgraph->nodes[i]; + } else { + continue; + } + + auto get_bias_tensor = [](const ggml_tensor * bias_node, const ggml_tensor * mul_node, ggml_op op_bias) { + if (op_bias == GGML_OP_ADD) { + if (bias_node->src[0] == mul_node) { + return bias_node->src[1]; + } + if (bias_node->src[1] == mul_node) { + return bias_node->src[0]; + } + return (ggml_tensor *) nullptr; + } + GGML_ASSERT(op_bias == GGML_OP_ADD_ID); + GGML_ASSERT(bias_node->src[0] == mul_node); + return bias_node->src[1]; + }; + + ggml_tensor * up_bias_tensor = get_bias_tensor(up_bias_n, up_n, bias_op); + ggml_tensor * gate_bias_tensor = get_bias_tensor(gate_bias_n, gate_n, bias_op); + + if (!up_bias_tensor || !gate_bias_tensor) { + continue; + } + + // we don't support repeating adds + if (bias_op == GGML_OP_ADD && + (!ggml_are_same_shape(gate_bias_n->src[0], gate_bias_n->src[1]) || + !ggml_are_same_shape(up_bias_n->src[0], up_bias_n->src[1]))) { + continue; + } + + const ggml_tensor * src0 = up_n->src[0]; + const ggml_tensor * src1 = up_n->src[1]; + const ggml_tensor * ids = up_n->src[2]; + + if (ggml_cuda_should_fuse_mul_mat_vec_f(up_n)) { + ggml_cuda_mm_fusion_args_host fusion_data{}; + fusion_data.gate = gate_n->src[0]; + fusion_data.x_bias = up_bias_tensor; + fusion_data.gate_bias = gate_bias_tensor; + fusion_data.glu_op = ggml_get_glu_op(glu); + + ggml_cuda_mul_mat_vec_f(*cuda_ctx, src0, src1, ids, glu, &fusion_data); + fused_mul_mat_vec = true; + fused_node_count = 5; + break; + } + + if (ggml_cuda_should_fuse_mul_mat_vec_q(up_n)) { + ggml_cuda_mm_fusion_args_host fusion_data{}; + fusion_data.gate = gate_n->src[0]; + fusion_data.x_bias = up_bias_tensor; + fusion_data.gate_bias = gate_bias_tensor; + fusion_data.glu_op = ggml_get_glu_op(glu); + + ggml_cuda_mul_mat_vec_q(*cuda_ctx, src0, src1, ids, glu, &fusion_data); + fused_mul_mat_vec = true; + fused_node_count = 5; + break; + } + } else if (ggml_cuda_can_fuse(cgraph, i, { op, op, GGML_OP_GLU }, {})) { + ggml_tensor * glu = cgraph->nodes[i + 2]; + ggml_tensor * gate = glu->src[0]; + ggml_tensor * up = glu->src[1]; + + bool ok = (gate == cgraph->nodes[i] && up == cgraph->nodes[i + 1]) + || (gate == cgraph->nodes[i + 1] && up == cgraph->nodes[i]); + + if (!ok) continue; + + const ggml_tensor * src0 = up->src[0]; + const ggml_tensor * src1 = up->src[1]; + const ggml_tensor * ids = up->src[2]; + + if (ggml_cuda_should_fuse_mul_mat_vec_f(up)) { + ggml_cuda_mm_fusion_args_host fusion_data{}; + fusion_data.gate = gate->src[0]; + fusion_data.glu_op = ggml_get_glu_op(glu); + + ggml_cuda_mul_mat_vec_f(*cuda_ctx, src0, src1, ids, glu, &fusion_data); + fused_mul_mat_vec = true; + fused_node_count = 
3; + break; + } + + if (ggml_cuda_should_fuse_mul_mat_vec_q(up)) { + ggml_cuda_mm_fusion_args_host fusion_data{}; + fusion_data.gate = gate->src[0]; + fusion_data.glu_op = ggml_get_glu_op(glu); + + ggml_cuda_mul_mat_vec_q(*cuda_ctx, src0, src1, ids, glu, &fusion_data); + fused_mul_mat_vec = true; + fused_node_count = 3; + break; + } + } + } + + if (fused_mul_mat_vec) { + i += fused_node_count - 1; + continue; + } + + fused_mul_mat_vec = false; + fused_node_count = 0; + + for (ggml_op op : { GGML_OP_MUL_MAT, GGML_OP_MUL_MAT_ID }) { + const ggml_op bias_op = op == GGML_OP_MUL_MAT ? GGML_OP_ADD : GGML_OP_ADD_ID; + + if (!ggml_can_fuse(cgraph, i, { op, bias_op })) { continue; } + ggml_tensor * mm_node = cgraph->nodes[i]; + ggml_tensor * bias_node = cgraph->nodes[i + 1]; - // start of fusion operations - static bool disable_fusion = (getenv("GGML_CUDA_DISABLE_FUSION") != nullptr); - if (!disable_fusion) { - - if (ggml_cuda_can_fuse(cgraph, i, ggml_cuda_topk_moe_ops(/*with norm*/ true), {})) { - ggml_tensor * weights = cgraph->nodes[i + 9]; - ggml_tensor * selected_experts = cgraph->nodes[i + 3]; - ggml_tensor * clamp = cgraph->nodes[i + 7]; - ggml_cuda_op_topk_moe(*cuda_ctx, node->src[0], weights, selected_experts, /*with norm*/ true, - /*delayed softmax*/ false, clamp); - i += 9; + ggml_tensor * bias_tensor = nullptr; + if (bias_op == GGML_OP_ADD) { + if (bias_node->src[0] == mm_node) { + bias_tensor = bias_node->src[1]; + } else if (bias_node->src[1] == mm_node) { + bias_tensor = bias_node->src[0]; + } else { continue; } - - if (ggml_cuda_can_fuse(cgraph, i, ggml_cuda_topk_moe_ops(/*with norm*/ false), {})) { - ggml_tensor * weights = cgraph->nodes[i + 4]; - ggml_tensor * selected_experts = cgraph->nodes[i + 3]; - ggml_cuda_op_topk_moe(*cuda_ctx, node->src[0], weights, selected_experts, /*with norm*/ false, - /*delayed softmax*/ false); - i += 4; - continue; - } - - if (ggml_cuda_can_fuse(cgraph, i, - ggml_cuda_topk_moe_ops(/*with norm*/ false, /*delayed softmax*/ true), {})) { - ggml_tensor * weights = cgraph->nodes[i + 5]; - ggml_tensor * ids = cgraph->nodes[i + 1]; - - ggml_cuda_op_topk_moe(*cuda_ctx, node->src[0], weights, ids, /*with norm*/ false, - /*delayed_softmax*/ true); - i += 5; - continue; - } - - if (ggml_cuda_can_fuse(cgraph, i, { GGML_OP_ROPE, GGML_OP_VIEW, GGML_OP_SET_ROWS }, {})) { - ggml_tensor * rope = cgraph->nodes[i]; - ggml_tensor * set_rows = cgraph->nodes[i + 2]; - - ggml_cuda_op_rope_fused(*cuda_ctx, rope, set_rows); - i += 2; - continue; - } - - if (node->op == GGML_OP_ADD) { - int n_fuse = 0; - ggml_op ops[8]; - std::fill(ops, ops + 8, GGML_OP_ADD); - - for (; n_fuse <= 6; ++n_fuse){ - if (!ggml_can_fuse(cgraph, i + n_fuse, ops + n_fuse, 2)) { - break; - } - if (cgraph->nodes[i + n_fuse] != cgraph->nodes[i + n_fuse + 1]->src[0]) { - break; - } - if (!ggml_are_same_layout(cgraph->nodes[i + n_fuse]->src[1], cgraph->nodes[i + n_fuse + 1]->src[1])) { - break; - } - } - - n_fuse++; - - if (n_fuse > 1) { - for (int j = 0; j < n_fuse - 1; ++j) { - node->src[j + 2] = cgraph->nodes[i + j + 1]->src[1]; - } - cgraph->nodes[i + n_fuse - 1]->data = node->data; - ggml_cuda_op_fused_add(*cuda_ctx, node, n_fuse); - i += n_fuse - 1; - - continue; - } - } - - bool fused_mul_mat_vec = false; - int fused_node_count = 0; - - for (ggml_op op : { GGML_OP_MUL_MAT, GGML_OP_MUL_MAT_ID }) { - const ggml_op bias_op = op == GGML_OP_MUL_MAT ? 
GGML_OP_ADD : GGML_OP_ADD_ID; - - if (ggml_cuda_can_fuse(cgraph, i, { op, bias_op, op, bias_op, GGML_OP_GLU }, {})) { - ggml_tensor * glu = cgraph->nodes[i + 4]; - ggml_tensor * gate_bias_n = glu->src[0]; - ggml_tensor * up_bias_n = glu->src[1]; - - //we don't assume the order for {gate, up}. Instead infer it from the bias tensor - ggml_tensor * gate_n = nullptr; - ggml_tensor * up_n = nullptr; - - if (gate_bias_n->src[0] == cgraph->nodes[i] || gate_bias_n->src[1] == cgraph->nodes[i]) { - gate_n = cgraph->nodes[i]; - up_n = cgraph->nodes[i + 2]; - } else if (gate_bias_n->src[0] == cgraph->nodes[i + 2] || gate_bias_n->src[1] == cgraph->nodes[i + 2]) { - gate_n = cgraph->nodes[i + 2]; - up_n = cgraph->nodes[i]; - } else { - continue; - } - - auto get_bias_tensor = [](const ggml_tensor * bias_node, const ggml_tensor * mul_node, ggml_op op_bias) { - if (op_bias == GGML_OP_ADD) { - if (bias_node->src[0] == mul_node) { - return bias_node->src[1]; - } - if (bias_node->src[1] == mul_node) { - return bias_node->src[0]; - } - return (ggml_tensor *) nullptr; - } - GGML_ASSERT(op_bias == GGML_OP_ADD_ID); - GGML_ASSERT(bias_node->src[0] == mul_node); - return bias_node->src[1]; - }; - - ggml_tensor * up_bias_tensor = get_bias_tensor(up_bias_n, up_n, bias_op); - ggml_tensor * gate_bias_tensor = get_bias_tensor(gate_bias_n, gate_n, bias_op); - - if (!up_bias_tensor || !gate_bias_tensor) { - continue; - } - - // we don't support repeating adds - if (bias_op == GGML_OP_ADD && - (!ggml_are_same_shape(gate_bias_n->src[0], gate_bias_n->src[1]) || - !ggml_are_same_shape(up_bias_n->src[0], up_bias_n->src[1]))) { - continue; - } - - const ggml_tensor * src0 = up_n->src[0]; - const ggml_tensor * src1 = up_n->src[1]; - const ggml_tensor * ids = up_n->src[2]; - - if (ggml_cuda_should_fuse_mul_mat_vec_f(up_n)) { - ggml_cuda_mm_fusion_args_host fusion_data{}; - fusion_data.gate = gate_n->src[0]; - fusion_data.x_bias = up_bias_tensor; - fusion_data.gate_bias = gate_bias_tensor; - fusion_data.glu_op = ggml_get_glu_op(glu); - - ggml_cuda_mul_mat_vec_f(*cuda_ctx, src0, src1, ids, glu, &fusion_data); - fused_mul_mat_vec = true; - fused_node_count = 5; - break; - } - - if (ggml_cuda_should_fuse_mul_mat_vec_q(up_n)) { - ggml_cuda_mm_fusion_args_host fusion_data{}; - fusion_data.gate = gate_n->src[0]; - fusion_data.x_bias = up_bias_tensor; - fusion_data.gate_bias = gate_bias_tensor; - fusion_data.glu_op = ggml_get_glu_op(glu); - - ggml_cuda_mul_mat_vec_q(*cuda_ctx, src0, src1, ids, glu, &fusion_data); - fused_mul_mat_vec = true; - fused_node_count = 5; - break; - } - } else if (ggml_cuda_can_fuse(cgraph, i, { op, op, GGML_OP_GLU }, {})) { - ggml_tensor * glu = cgraph->nodes[i + 2]; - ggml_tensor * gate = glu->src[0]; - ggml_tensor * up = glu->src[1]; - - bool ok = (gate == cgraph->nodes[i] && up == cgraph->nodes[i + 1]) - || (gate == cgraph->nodes[i + 1] && up == cgraph->nodes[i]); - - if (!ok) continue; - - const ggml_tensor * src0 = up->src[0]; - const ggml_tensor * src1 = up->src[1]; - const ggml_tensor * ids = up->src[2]; - - if (ggml_cuda_should_fuse_mul_mat_vec_f(up)) { - ggml_cuda_mm_fusion_args_host fusion_data{}; - fusion_data.gate = gate->src[0]; - fusion_data.glu_op = ggml_get_glu_op(glu); - - ggml_cuda_mul_mat_vec_f(*cuda_ctx, src0, src1, ids, glu, &fusion_data); - fused_mul_mat_vec = true; - fused_node_count = 3; - break; - } - - if (ggml_cuda_should_fuse_mul_mat_vec_q(up)) { - ggml_cuda_mm_fusion_args_host fusion_data{}; - fusion_data.gate = gate->src[0]; - fusion_data.glu_op = ggml_get_glu_op(glu); - - 
ggml_cuda_mul_mat_vec_q(*cuda_ctx, src0, src1, ids, glu, &fusion_data); - fused_mul_mat_vec = true; - fused_node_count = 3; - break; - } - } - } - - if (fused_mul_mat_vec) { - i += fused_node_count - 1; - continue; - } - - fused_mul_mat_vec = false; - fused_node_count = 0; - - for (ggml_op op : { GGML_OP_MUL_MAT, GGML_OP_MUL_MAT_ID }) { - const ggml_op bias_op = op == GGML_OP_MUL_MAT ? GGML_OP_ADD : GGML_OP_ADD_ID; - - if (!ggml_can_fuse(cgraph, i, { op, bias_op })) { - continue; - } - - ggml_tensor * mm_node = cgraph->nodes[i]; - ggml_tensor * bias_node = cgraph->nodes[i + 1]; - - ggml_tensor * bias_tensor = nullptr; - if (bias_op == GGML_OP_ADD) { - if (bias_node->src[0] == mm_node) { - bias_tensor = bias_node->src[1]; - } else if (bias_node->src[1] == mm_node) { - bias_tensor = bias_node->src[0]; - } else { - continue; - } - } else { - if (bias_node->src[0] != mm_node) { - continue; - } - bias_tensor = bias_node->src[1]; - } - - const ggml_tensor * src0 = mm_node->src[0]; - const ggml_tensor * src1 = mm_node->src[1]; - const ggml_tensor * ids = mm_node->src[2]; - - if (bias_op == GGML_OP_ADD_ID && bias_node->src[2] != ids) { - continue; - } - - if (bias_op == GGML_OP_ADD && !ggml_are_same_shape(bias_node->src[0], bias_node->src[1])) { - continue; - } - - ggml_cuda_mm_fusion_args_host fusion_data{}; - fusion_data.x_bias = bias_tensor; - - if (ggml_cuda_should_fuse_mul_mat_vec_f(mm_node)) { - ggml_cuda_mul_mat_vec_f(*cuda_ctx, src0, src1, ids, bias_node, &fusion_data); - fused_mul_mat_vec = true; - fused_node_count = 2; - break; - } - - if (ggml_cuda_should_fuse_mul_mat_vec_q(mm_node)) { - ggml_cuda_mul_mat_vec_q(*cuda_ctx, src0, src1, ids, bias_node, &fusion_data); - fused_mul_mat_vec = true; - fused_node_count = 2; - break; - } - } - - if (fused_mul_mat_vec) { - i += fused_node_count - 1; - continue; - } - - if (ggml_cuda_can_fuse(cgraph, i, { GGML_OP_RMS_NORM, GGML_OP_MUL, GGML_OP_ADD}, {})) { - ggml_cuda_op_rms_norm_fused_add(*cuda_ctx, node, cgraph->nodes[i+1], cgraph->nodes[i+2]); - i += 2; - continue; - } - - if (ggml_cuda_can_fuse(cgraph, i, { GGML_OP_RMS_NORM, GGML_OP_MUL}, {})) { - ggml_cuda_op_rms_norm_fused(*cuda_ctx, node, cgraph->nodes[i+1]); - i++; - continue; - } - - if (ggml_cuda_can_fuse(cgraph, i, { GGML_OP_SCALE, GGML_OP_UNARY, GGML_OP_SCALE }, { GGML_UNARY_OP_TANH })) { - i += 2; - ggml_cuda_op_softcap(*cuda_ctx, cgraph->nodes[i], node); + } else { + if (bias_node->src[0] != mm_node) { continue; } + bias_tensor = bias_node->src[1]; } + + const ggml_tensor * src0 = mm_node->src[0]; + const ggml_tensor * src1 = mm_node->src[1]; + const ggml_tensor * ids = mm_node->src[2]; + + if (bias_op == GGML_OP_ADD_ID && bias_node->src[2] != ids) { + continue; + } + + if (bias_op == GGML_OP_ADD && !ggml_are_same_shape(bias_node->src[0], bias_node->src[1])) { + continue; + } + + ggml_cuda_mm_fusion_args_host fusion_data{}; + fusion_data.x_bias = bias_tensor; + + if (ggml_cuda_should_fuse_mul_mat_vec_f(mm_node)) { + ggml_cuda_mul_mat_vec_f(*cuda_ctx, src0, src1, ids, bias_node, &fusion_data); + fused_mul_mat_vec = true; + fused_node_count = 2; + break; + } + + if (ggml_cuda_should_fuse_mul_mat_vec_q(mm_node)) { + ggml_cuda_mul_mat_vec_q(*cuda_ctx, src0, src1, ids, bias_node, &fusion_data); + fused_mul_mat_vec = true; + fused_node_count = 2; + break; + } + } + + if (fused_mul_mat_vec) { + i += fused_node_count - 1; + continue; + } + + if (ggml_cuda_can_fuse(cgraph, i, { GGML_OP_RMS_NORM, GGML_OP_MUL, GGML_OP_ADD}, {})) { + ggml_cuda_op_rms_norm_fused_add(*cuda_ctx, node, 
cgraph->nodes[i+1], cgraph->nodes[i+2]); + i += 2; + continue; + } + + if (ggml_cuda_can_fuse(cgraph, i, { GGML_OP_RMS_NORM, GGML_OP_MUL}, {})) { + ggml_cuda_op_rms_norm_fused(*cuda_ctx, node, cgraph->nodes[i+1]); + i++; + continue; + } + + if (ggml_cuda_can_fuse(cgraph, i, { GGML_OP_SCALE, GGML_OP_UNARY, GGML_OP_SCALE }, { GGML_UNARY_OP_TANH })) { + i += 2; + ggml_cuda_op_softcap(*cuda_ctx, cgraph->nodes[i], node); + continue; + } + } #ifndef NDEBUG - assert(node->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device)); - for (int j = 0; j < GGML_MAX_SRC; j++) { - if (node->src[j] != nullptr) { - assert(node->src[j]->buffer); - assert(node->src[j]->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) || - ggml_backend_buft_is_cuda_split(node->src[j]->buffer->buft) || (integrated && ggml_backend_buft_is_cuda_host(node->src[j]->buffer->buft))); - } - } + assert(node->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device)); + for (int j = 0; j < GGML_MAX_SRC; j++) { + if (node->src[j] != nullptr) { + assert(node->src[j]->buffer); + assert(node->src[j]->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) || + ggml_backend_buft_is_cuda_split(node->src[j]->buffer->buft) || (integrated && ggml_backend_buft_is_cuda_host(node->src[j]->buffer->buft))); + } + } #else - GGML_UNUSED(integrated); + GGML_UNUSED(integrated); #endif // NDEBUG - bool ok = ggml_cuda_compute_forward(*cuda_ctx, node); - if (!ok) { - GGML_LOG_ERROR("%s: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op)); - } - GGML_ASSERT(ok); - - if (!is_concurrent_event_active) { - try_launch_concurrent_event(node); - } - } + bool ok = ggml_cuda_compute_forward(*cuda_ctx, cuda_graph, node); + if (!ok) { + GGML_LOG_ERROR("%s: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op)); } + GGML_ASSERT(ok); -#ifdef USE_CUDA_GRAPH - if (use_cuda_graph && cuda_graph_update_required) { // End CUDA graph capture - if (cuda_ctx->cuda_graph->graph != nullptr) { - CUDA_CHECK(cudaGraphDestroy(cuda_ctx->cuda_graph->graph)); - cuda_ctx->cuda_graph->graph = nullptr; - } - - CUDA_CHECK(cudaStreamEndCapture(cuda_ctx->stream(), &cuda_ctx->cuda_graph->graph)); - graph_evaluated_or_captured = true; // CUDA graph has been captured - - std::lock_guard lock(ggml_cuda_lock); - if (ggml_cuda_lock_counter.fetch_sub(1, std::memory_order_relaxed) == 1) { - ggml_cuda_lock_cv.notify_all(); - } - } else { - graph_evaluated_or_captured = true; // ggml graph has been directly evaluated + if (!is_concurrent_event_active) { + try_launch_concurrent_event(node); } } - - if (use_cuda_graph) { - if (cuda_ctx->cuda_graph->instance == nullptr) { // Create executable graph from captured graph. 
- CUDA_CHECK(cudaGraphInstantiate(&cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, NULL, NULL, 0)); - } - if (cuda_graph_update_required) { // Update graph executable - update_cuda_graph_executable(cuda_ctx); - } - // Launch graph - CUDA_CHECK(cudaGraphLaunch(cuda_ctx->cuda_graph->instance, cuda_ctx->stream())); -#else - graph_evaluated_or_captured = true; -#endif // USE_CUDA_GRAPH - } } -static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { - ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context; - - ggml_cuda_set_device(cuda_ctx->device); - #ifdef USE_CUDA_GRAPH - static const bool disable_cuda_graphs_due_to_env = (getenv("GGML_CUDA_DISABLE_GRAPHS") != nullptr); - - // Objects required for CUDA Graph - if (cuda_ctx->cuda_graph == nullptr) { - cuda_ctx->cuda_graph.reset(new ggml_cuda_graph()); +static void capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx, ggml_cuda_graph * cuda_graph, const ggml_cgraph * cgraph) { + // Start CUDA graph capture + { + std::lock_guard lock(ggml_cuda_lock); + ggml_cuda_lock_counter.fetch_add(1, std::memory_order_relaxed); } - bool use_cuda_graph = true; - bool cuda_graph_update_required = false; + CUDA_CHECK(cudaStreamBeginCapture(cuda_ctx->stream(), cudaStreamCaptureModeRelaxed)); + + evaluate_cuda_graph(cuda_ctx, cuda_graph, cgraph); + + if (cuda_graph->graph != nullptr) { + CUDA_CHECK(cudaGraphDestroy(cuda_graph->graph)); + cuda_graph->graph = nullptr; + } + + CUDA_CHECK(cudaStreamEndCapture(cuda_ctx->stream(), &cuda_graph->graph)); + + if (cuda_graph->instance == nullptr) { + CUDA_CHECK(cudaGraphInstantiate(&cuda_graph->instance, cuda_graph->graph, NULL, NULL, 0)); + } else { + update_cuda_graph_executable(cuda_graph); + } + + std::lock_guard lock(ggml_cuda_lock); + if (ggml_cuda_lock_counter.fetch_sub(1, std::memory_order_relaxed) == 1) { + ggml_cuda_lock_cv.notify_all(); + } +} + +static bool should_use_cuda_graph(ggml_backend_cuda_context * cuda_ctx, const struct ggml_cgraph * cgraph) { + bool use_cuda_graph = true; + + if (!cuda_ctx->cuda_graph_initialized) { + cuda_ctx->disable_graph_due_to_env = (getenv("GGML_CUDA_DISABLE_GRAPHS") != nullptr); - if (cuda_ctx->cuda_graph->graph == nullptr) { if (ggml_cuda_info().devices[cuda_ctx->device].cc < GGML_CUDA_CC_AMPERE) { - cuda_ctx->cuda_graph->disable_due_to_gpu_arch = true; + cuda_ctx->disable_graph_due_to_gpu_arch = true; #ifndef NDEBUG GGML_LOG_DEBUG("%s: disabling CUDA graphs due to GPU architecture\n", __func__); #endif } + cuda_ctx->cuda_graph_initialized = true; } // Disable CUDA graphs in presence of env var, old GPU, use-case which is changing too rapidly, // or previous graph capture failure. // Also disable for multi-gpu for now. TO DO investigate - if (disable_cuda_graphs_due_to_env - || cuda_ctx->cuda_graph->disable_due_to_gpu_arch - || cuda_ctx->cuda_graph->disable_due_to_too_many_updates - || cuda_ctx->cuda_graph->disable_due_to_failed_graph_capture) { + if (cuda_ctx->disable_graph_due_to_env || cuda_ctx->disable_graph_due_to_gpu_arch || + cuda_ctx->disable_graph_due_to_too_many_updates) { use_cuda_graph = false; } if (use_cuda_graph) { - cuda_graph_update_required = is_cuda_graph_update_required(cuda_ctx, cgraph); + use_cuda_graph = check_node_graph_compatibility(cgraph); + } - use_cuda_graph = check_node_graph_compatibility(cgraph, use_cuda_graph); + return use_cuda_graph; +} - // Disable CUDA graphs (from the next token) if the use-case is demanding too many consecutive graph updates. 
- if (use_cuda_graph && cuda_graph_update_required) { - cuda_ctx->cuda_graph->number_consecutive_updates++; +static ggml_backend_graph_plan_t ggml_backend_cuda_graph_plan_create(ggml_backend_t backend, const struct ggml_cgraph * cgraph) { + ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context; + + ggml_cuda_set_device(cuda_ctx->device); + + ggml_cuda_graph * cuda_graph = new ggml_cuda_graph(); + + cuda_graph->cgraph = cgraph; + + if (should_use_cuda_graph(cuda_ctx, cgraph)) { + capture_cuda_graph(cuda_ctx, cuda_graph, cgraph); + update_cuda_graph_properties(cuda_graph, cgraph); + } + + return cuda_graph; +} + +static void ggml_backend_cuda_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) { + delete (ggml_cuda_graph *) plan; + + GGML_UNUSED(backend); +} + +static void ggml_backend_cuda_graph_plan_update(ggml_backend_t backend, ggml_backend_graph_plan_t plan, const ggml_cgraph * cgraph) { + ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *) backend->context; + + ggml_cuda_set_device(cuda_ctx->device); + + ggml_cuda_graph * cuda_graph = (ggml_cuda_graph *) plan; + + cuda_graph->cgraph = cgraph; + + bool use_cuda_graph = should_use_cuda_graph(cuda_ctx, cgraph); + bool cuda_graph_update_required = false; + + // check if we are doing a graph update + if (cuda_graph->instance == nullptr && use_cuda_graph // no graph -> graph + || cuda_graph->instance != nullptr && !use_cuda_graph // graph -> no graph + || use_cuda_graph && is_cuda_graph_update_required(cuda_graph, cgraph)) { // graph property mismatch + cuda_graph->number_consecutive_updates++; + if (cuda_graph->number_consecutive_updates >= 4) { + cuda_ctx->disable_graph_due_to_too_many_updates = true; + use_cuda_graph = false; } else { - cuda_ctx->cuda_graph->number_consecutive_updates = 0; - } - - if (cuda_ctx->cuda_graph->number_consecutive_updates >= 4) { - cuda_ctx->cuda_graph->disable_due_to_too_many_updates = true; -#ifndef NDEBUG - GGML_LOG_DEBUG("%s: disabling CUDA graphs due to too many consecutive updates\n", __func__); -#endif + cuda_graph_update_required = true; } + cuda_graph->number_consecutive_computes = 0; } if (use_cuda_graph && cuda_graph_update_required) { - // Start CUDA graph capture - { - std::lock_guard lock(ggml_cuda_lock); - ggml_cuda_lock_counter.fetch_add(1, std::memory_order_relaxed); + capture_cuda_graph(cuda_ctx, cuda_graph, cgraph); + update_cuda_graph_properties(cuda_graph, cgraph); + } else if (!use_cuda_graph) { + if (cuda_graph->instance != nullptr) { + CUDA_CHECK(cudaGraphExecDestroy(cuda_graph->instance)); } + if (cuda_graph->graph != nullptr) { + CUDA_CHECK(cudaGraphDestroy(cuda_graph->graph)); + } + cuda_graph->instance = nullptr; + cuda_graph->graph = nullptr; + } +} - CUDA_CHECK(cudaStreamBeginCapture(cuda_ctx->stream(), cudaStreamCaptureModeRelaxed)); +static enum ggml_status ggml_backend_cuda_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) { + ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *) backend->context; + + ggml_cuda_graph * cuda_graph = (ggml_cuda_graph *) plan; + + ggml_cuda_set_device(cuda_ctx->device); + + cuda_graph->number_consecutive_computes++; + if (cuda_graph->number_consecutive_computes > 1) { + cuda_graph->number_consecutive_updates = 0; } -#else - bool use_cuda_graph = false; - bool cuda_graph_update_required = false; -#endif // USE_CUDA_GRAPH + if (cuda_graph->instance) { + CUDA_CHECK(cudaGraphLaunch(cuda_graph->instance, cuda_ctx->stream())); + } else { + 
evaluate_cuda_graph(cuda_ctx, cuda_graph, cuda_graph->cgraph); + } + return GGML_STATUS_SUCCESS; +} +#endif - bool graph_evaluated_or_captured = false; +static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { + ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *) backend->context; - evaluate_and_capture_cuda_graph(cuda_ctx, cgraph, graph_evaluated_or_captured, use_cuda_graph, cuda_graph_update_required); + ggml_cuda_set_device(cuda_ctx->device); + + evaluate_cuda_graph(cuda_ctx, nullptr, cgraph); return GGML_STATUS_SUCCESS; } @@ -4029,10 +4073,17 @@ static const ggml_backend_i ggml_backend_cuda_interface = { /* .get_tensor_async = */ ggml_backend_cuda_get_tensor_async, /* .cpy_tensor_async = */ ggml_backend_cuda_cpy_tensor_async, /* .synchronize = */ ggml_backend_cuda_synchronize, +#ifdef USE_CUDA_GRAPH + /* .graph_plan_create = */ ggml_backend_cuda_graph_plan_create, + /* .graph_plan_free = */ ggml_backend_cuda_graph_plan_free, + /* .graph_plan_update = */ ggml_backend_cuda_graph_plan_update, + /* .graph_plan_compute = */ ggml_backend_cuda_graph_plan_compute, +#else /* .graph_plan_create = */ NULL, /* .graph_plan_free = */ NULL, /* .graph_plan_update = */ NULL, /* .graph_plan_compute = */ NULL, +#endif /* .graph_compute = */ ggml_backend_cuda_graph_compute, /* .event_record = */ ggml_backend_cuda_event_record, /* .event_wait = */ ggml_backend_cuda_event_wait, diff --git a/ggml/src/ggml-cuda/mean.cu b/ggml/src/ggml-cuda/mean.cu index 347abc1866..6e91f236b9 100644 --- a/ggml/src/ggml-cuda/mean.cu +++ b/ggml/src/ggml-cuda/mean.cu @@ -10,7 +10,7 @@ template __global__ void divide_by_count(T * result, size_t count) *result /= static_cast(count); } -void ggml_cuda_op_mean(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { +void ggml_cuda_op_mean(ggml_backend_cuda_context & ctx, ggml_cuda_graph * cuda_graph, ggml_tensor * dst) { const ggml_tensor * src0 = dst->src[0]; const float * src0_d = (const float *) src0->data; float * dst_d = (float *) dst->data; @@ -33,14 +33,12 @@ void ggml_cuda_op_mean(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { #ifdef USE_CUDA_GRAPH // CUDA_GRAPHS_DISABLED ((ncols > 65536) && - ((ctx.cuda_graph->instance == nullptr) && (iscapturing == cudaStreamCaptureStatusNone) || - ctx.cuda_graph->disable_due_to_gpu_arch || ctx.cuda_graph->disable_due_to_too_many_updates || - ctx.cuda_graph->disable_due_to_failed_graph_capture)) || + ((cuda_graph && cuda_graph->instance == nullptr) && (iscapturing == cudaStreamCaptureStatusNone) || + ctx.disable_graph_due_to_env || ctx.disable_graph_due_to_gpu_arch || ctx.disable_graph_due_to_too_many_updates)) || // CUDA_GRAPHS ENABLED ((ncols > 32768) && - !((ctx.cuda_graph->instance == nullptr) && (iscapturing == cudaStreamCaptureStatusNone) || - ctx.cuda_graph->disable_due_to_gpu_arch || ctx.cuda_graph->disable_due_to_too_many_updates || - ctx.cuda_graph->disable_due_to_failed_graph_capture))) { + !((cuda_graph && cuda_graph->instance == nullptr) && (iscapturing == cudaStreamCaptureStatusNone) || + ctx.disable_graph_due_to_env || ctx.disable_graph_due_to_gpu_arch || ctx.disable_graph_due_to_too_many_updates))) { #else (ncols > 65536)) { #endif // USE_CUDA_GRAPH diff --git a/ggml/src/ggml-cuda/mean.cuh b/ggml/src/ggml-cuda/mean.cuh index 2b9b104334..14dca24736 100644 --- a/ggml/src/ggml-cuda/mean.cuh +++ b/ggml/src/ggml-cuda/mean.cuh @@ -1,3 +1,3 @@ #include "common.cuh" -void ggml_cuda_op_mean(ggml_backend_cuda_context & ctx, ggml_tensor * dst); +void 
ggml_cuda_op_mean(ggml_backend_cuda_context & ctx, ggml_cuda_graph * cuda_graph, ggml_tensor * dst);
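
Note: below is a minimal usage sketch of the graph-plan API introduced in this patch, for reference. It assumes only the declarations added to ggml-backend.h above (plus the pre-existing ggml_backend_graph_compute()); the helper name run_with_plan and the surrounding setup are hypothetical and not part of the patch.

#include "ggml-backend.h"

// Sketch: execute a cgraph through a backend graph plan when the backend supports it.
// If the graph topology changes, refresh the plan in place when graph_plan_update is
// implemented, otherwise free and recreate it (mirroring ggml_backend_sched_update_plans()).
static enum ggml_status run_with_plan(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
    if (!ggml_backend_supports_graph_plan(backend)) {
        // no plan support: fall back to plain graph execution
        return ggml_backend_graph_compute(backend, cgraph);
    }

    ggml_backend_graph_plan_t plan = ggml_backend_graph_plan_create(backend, cgraph);

    enum ggml_status status = ggml_backend_graph_plan_compute(backend, plan);
    if (status != GGML_STATUS_SUCCESS) {
        ggml_backend_graph_plan_free(backend, plan);
        return status;
    }

    // ... cgraph is rebuilt for the next iteration with the same topology ...

    if (ggml_backend_supports_graph_plan_update(backend)) {
        ggml_backend_graph_plan_update(backend, plan, cgraph);
    } else {
        ggml_backend_graph_plan_free(backend, plan);
        plan = ggml_backend_graph_plan_create(backend, cgraph);
    }
    status = ggml_backend_graph_plan_compute(backend, plan);

    ggml_backend_graph_plan_free(backend, plan);
    return status;
}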