Improve CUDA graph capture (#19754)

* Improve CUDA graph capture

Currently, CUDA graphs are eagerly enabled on the first call to ggml_backend_cuda_graph_compute. If the graph properties keep changing (4+ consecutive updates), the graph is permanently disabled. This is suboptimal because:

- The first call always incurs CUDA graph capture overhead even if the graph is unstable
- Once permanently disabled, CUDA graphs never re-enable even after the graph stabilizes (e.g., switching from prompt processing to decode)

The new approach delays CUDA graph activation until warmup completes: the same cgraph must be called at least twice with matching properties before CUDA graph capture begins. This avoids wasted capture overhead on volatile graphs and allows graphs to become eligible once they stabilize.
This also fixes issues such as https://github.com/ggml-org/llama.cpp/discussions/19708

* Update ggml/src/ggml-cuda/ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* Remove EM dashes

* Update ggml/src/ggml-cuda/ggml-cuda.cu

Co-authored-by: Aman Gupta <amangupta052@gmail.com>

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
Co-authored-by: Aman Gupta <amangupta052@gmail.com>
This commit is contained in:
Gaurav Garg 2026-02-21 15:09:36 +05:30 committed by GitHub
parent 07968d53e4
commit a0c91e8f9f
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 27 additions and 23 deletions

View File

@ -1149,8 +1149,7 @@ struct ggml_cuda_graph {
size_t num_nodes = 0;
std::vector<cudaGraphNode_t> nodes;
bool disable_due_to_gpu_arch = false;
bool disable_due_to_too_many_updates = false;
int number_consecutive_updates = 0;
bool warmup_complete = false;
std::vector<ggml_cuda_graph_node_properties> props;
// these are extra tensors (inputs) that participate in the ggml graph but are not nodes
@ -1159,21 +1158,9 @@ struct ggml_cuda_graph {
// ref: https://github.com/ggml-org/llama.cpp/pull/19165
std::vector<ggml_cuda_graph_node_properties> extra;
void record_update(bool use_graph, bool update_required) {
if (use_graph && update_required) {
number_consecutive_updates++;
} else {
number_consecutive_updates = 0;
}
if (number_consecutive_updates >= 4) {
GGML_LOG_DEBUG("%s: disabling CUDA graphs due to too many consecutive updates\n", __func__);
disable_due_to_too_many_updates = true;
}
}
bool is_enabled() const {
static const bool disable_cuda_graphs_due_to_env = (getenv("GGML_CUDA_DISABLE_GRAPHS") != nullptr);
return !(disable_due_to_gpu_arch || disable_cuda_graphs_due_to_env || disable_due_to_too_many_updates);
return !(disable_due_to_gpu_arch || disable_cuda_graphs_due_to_env);
}
#endif
};

View File

@ -2979,10 +2979,6 @@ static bool ggml_cuda_graph_update_required(ggml_backend_cuda_context * cuda_ctx
const void * graph_key = ggml_cuda_graph_get_key(cgraph);
ggml_cuda_graph * graph = cuda_ctx->cuda_graph(graph_key);
if (graph->instance == nullptr) {
res = true;
}
// Check if the graph size has changed
if (graph->props.size() != (size_t)cgraph->n_nodes) {
res = true;
@ -3931,14 +3927,35 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
#ifdef USE_CUDA_GRAPH
graph_key = ggml_cuda_graph_get_key(cgraph);
use_cuda_graph = ggml_cuda_graph_set_enabled(cuda_ctx, graph_key);
ggml_cuda_graph_set_enabled(cuda_ctx, graph_key);
ggml_cuda_graph * graph = cuda_ctx->cuda_graph(graph_key);
if (graph->is_enabled()) {
cuda_graph_update_required = ggml_cuda_graph_update_required(cuda_ctx, cgraph);
use_cuda_graph = ggml_cuda_graph_check_compability(cgraph);
const bool graph_compatible = ggml_cuda_graph_check_compability(cgraph);
if (graph_compatible) {
const bool properties_changed = ggml_cuda_graph_update_required(cuda_ctx, cgraph);
graph->record_update(use_cuda_graph, cuda_graph_update_required);
if (!graph->warmup_complete) {
// Warmup: need at least 2 calls with no property change on the 2nd call
if (!properties_changed) {
graph->warmup_complete = true;
GGML_LOG_DEBUG("%s: CUDA graph warmup complete\n", __func__);
use_cuda_graph = true;
cuda_graph_update_required = true;
}
// else: properties changed or first call - execute directly (use_cuda_graph stays false)
} else {
// Post-warmup: normal CUDA graph operation
if (properties_changed) {
// Properties changed - reset warmup, execute directly until stable again
graph->warmup_complete = false;
GGML_LOG_DEBUG("%s: CUDA graph warmup reset\n", __func__);
} else {
use_cuda_graph = true;
cuda_graph_update_required = graph->instance == nullptr;
}
}
}
}
#endif // USE_CUDA_GRAPH