From 25ae798615d6ba78e822cccca30765023619c5f5 Mon Sep 17 00:00:00 2001 From: Aman Gupta Date: Tue, 30 Dec 2025 18:20:13 +0100 Subject: [PATCH 1/3] ggml-cuda: enable concurrent streams by default This PR enables concurrent streams introduced in #16991 by default. To disable a new env flag `GGML_CUDA_DISABLE_GRAPH_OPT` is introduced --- ggml/src/ggml-cuda/common.cuh | 1 + ggml/src/ggml-cuda/ggml-cuda.cu | 65 ++++++++++++++++++++++++--------- 2 files changed, 48 insertions(+), 18 deletions(-) diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh index 62e618850b..302065ce9f 100644 --- a/ggml/src/ggml-cuda/common.cuh +++ b/ggml/src/ggml-cuda/common.cuh @@ -1063,6 +1063,7 @@ struct ggml_cuda_graph { bool disable_due_to_too_many_updates = false; bool disable_due_to_failed_graph_capture = false; int number_consecutive_updates = 0; + bool cuda_graphs_enabled = false; std::vector ggml_graph_properties; #endif }; diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index 55e1c20c96..6d10f93c07 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -3263,6 +3263,7 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx should_launch_concurrent_events = should_launch_concurrent_events && event.is_valid(); } } + if (should_launch_concurrent_events) { // Restore original node order within each concurrent region to enable fusion within streams @@ -3314,6 +3315,8 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx cgraph->nodes[start_pos + i] = const_cast(event.original_order[i]); } } + } else { + stream_ctx.concurrent_events.clear(); } for (int i = 0; i < cgraph->n_nodes; i++) { @@ -3702,11 +3705,7 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx } } -static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { - ggml_backend_cuda_context * cuda_ctx = 
(ggml_backend_cuda_context *)backend->context; - - ggml_cuda_set_device(cuda_ctx->device); - +static bool ggml_cuda_set_cuda_graph_enabled(ggml_backend_cuda_context * cuda_ctx) { #ifdef USE_CUDA_GRAPH static const bool disable_cuda_graphs_due_to_env = (getenv("GGML_CUDA_DISABLE_GRAPHS") != nullptr); @@ -3716,7 +3715,6 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, } bool use_cuda_graph = true; - bool cuda_graph_update_required = false; if (cuda_ctx->cuda_graph->graph == nullptr) { if (ggml_cuda_info().devices[cuda_ctx->device].cc < GGML_CUDA_CC_AMPERE) { @@ -3737,6 +3735,29 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, use_cuda_graph = false; } + cuda_ctx->cuda_graph->cuda_graphs_enabled = use_cuda_graph; +#else + bool use_cuda_graph = false; +#endif // USE_CUDA_GRAPH + + return use_cuda_graph; +} + +static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { + ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *) backend->context; + + bool use_cuda_graph = false; + bool cuda_graph_update_required = false; + + // graph_optimize calls set_cuda_graph_enabled; in case it is not called (i.e. graph_compute is called directly) + // we call it here instead. 
+#ifdef USE_CUDA_GRAPH + if (!cuda_ctx->cuda_graph) { + use_cuda_graph = ggml_cuda_set_cuda_graph_enabled(cuda_ctx); + } else { + use_cuda_graph = cuda_ctx->cuda_graph && cuda_ctx->cuda_graph->cuda_graphs_enabled; + } + if (use_cuda_graph) { cuda_graph_update_required = is_cuda_graph_update_required(cuda_ctx, cgraph); @@ -3756,6 +3777,7 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, #endif } } +#endif // USE_CUDA_GRAPH if (use_cuda_graph && cuda_graph_update_required) { // Start CUDA graph capture @@ -3767,11 +3789,6 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, CUDA_CHECK(cudaStreamBeginCapture(cuda_ctx->stream(), cudaStreamCaptureModeRelaxed)); } -#else - bool use_cuda_graph = false; - bool cuda_graph_update_required = false; -#endif // USE_CUDA_GRAPH - bool graph_evaluated_or_captured = false; evaluate_and_capture_cuda_graph(cuda_ctx, cgraph, graph_evaluated_or_captured, use_cuda_graph, cuda_graph_update_required); @@ -3807,21 +3824,28 @@ static void ggml_backend_cuda_event_wait(ggml_backend_t backend, ggml_backend_ev static void ggml_backend_cuda_graph_optimize(ggml_backend_t backend, ggml_cgraph * cgraph) { ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *) backend->context; - static bool enable_graph_optimization = [] { - const char * env = getenv("GGML_CUDA_GRAPH_OPT"); - return env != nullptr && atoi(env) == 1; + const bool use_cuda_graph = ggml_cuda_set_cuda_graph_enabled(cuda_ctx); + + static bool disable_graph_optimization = [] { + const char * env = getenv("GGML_CUDA_DISABLE_GRAPH_OPT"); + bool disable = env != nullptr && atoi(env) == 1; + + env = getenv("GGML_CUDA_GRAPH_OPT"); + GGML_ASSERT(env == nullptr && "GGML_CUDA_GRAPH_OPT is deprecated, use GGML_CUDA_DISABLE_GRAPH_OPT instead"); + return disable; }(); - if (!enable_graph_optimization) { + if (disable_graph_optimization) { return; } - GGML_ASSERT(ggml_backend_cuda_get_device_count() == 1 && "compute graph 
optimization is only supported on single GPU in the CUDA backend"); - GGML_LOG_DEBUG("Optimizing CUDA graph %p with %d nodes\n", cgraph->nodes, cgraph->n_nodes); - ggml_cuda_stream_context & stream_context = cuda_ctx->stream_context(); stream_context.reset(); + if (!use_cuda_graph || ggml_backend_cuda_get_device_count() != 1) { + return; + } + // number of out-degrees for a particular node std::unordered_map fan_out; // reverse mapping of node to index in the cgraph @@ -3882,6 +3906,11 @@ static void ggml_backend_cuda_graph_optimize(ggml_backend_t backend, ggml_cgraph if (count >= min_fan_out && count <= max_fan_out) { const int root_node_idx = node_indices[root_node]; + // only optimize for attn_norm + if (!strstr(root_node->name, "attn_norm")) { + continue; + } + bool is_part_of_event = false; for (const auto & [start, end] : concurrent_node_ranges) { if (root_node_idx >= start && root_node_idx <= end) { From 93cfa8d156ac0b26b0e354480ccc6696a280dc2a Mon Sep 17 00:00:00 2001 From: Aman Gupta Date: Wed, 31 Dec 2025 14:47:41 +0100 Subject: [PATCH 2/3] make flag opt-in --- ggml/src/ggml-cuda/ggml-cuda.cu | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index 6d10f93c07..30ee84c5f4 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -3826,16 +3826,12 @@ static void ggml_backend_cuda_graph_optimize(ggml_backend_t backend, ggml_cgraph const bool use_cuda_graph = ggml_cuda_set_cuda_graph_enabled(cuda_ctx); - static bool disable_graph_optimization = [] { - const char * env = getenv("GGML_CUDA_DISABLE_GRAPH_OPT"); - bool disable = env != nullptr && atoi(env) == 1; - - env = getenv("GGML_CUDA_GRAPH_OPT"); - GGML_ASSERT(env == nullptr && "GGML_CUDA_GRAPH_OPT is deprecated, use GGML_CUDA_DISABLE_GRAPH_OPT instead"); - return disable; + static bool enable_graph_optimization = [] { + const char * env = getenv("GGML_CUDA_GRAPH_OPT"); + return env != 
nullptr && atoi(env) == 1; }(); - if (disable_graph_optimization) { + if (!enable_graph_optimization) { return; } From d405fa1c16b337f22bade685b425aa5e269199cd Mon Sep 17 00:00:00 2001 From: Aman Gupta Date: Fri, 2 Jan 2026 07:52:15 +0100 Subject: [PATCH 3/3] add todo about special casing --- ggml/src/ggml-cuda/ggml-cuda.cu | 1 + 1 file changed, 1 insertion(+) diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index 30ee84c5f4..3b079eb161 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -3903,6 +3903,7 @@ static void ggml_backend_cuda_graph_optimize(ggml_backend_t backend, ggml_cgraph const int root_node_idx = node_indices[root_node]; // only optimize for attn_norm + // TODO: make this generic if (!strstr(root_node->name, "attn_norm")) { continue; }