diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh
index 56a67f1edc..8a4246223b 100644
--- a/ggml/src/ggml-cuda/common.cuh
+++ b/ggml/src/ggml-cuda/common.cuh
@@ -1185,7 +1185,9 @@ struct ggml_cuda_graph {
     bool warmup_complete = false;

    struct node_properties {
        ggml_tensor node;
-        void * node_src_data_ptrs[GGML_MAX_SRC];
+        void *  node_src_data_ptrs[GGML_MAX_SRC];
+        int64_t node_src_ne[GGML_MAX_SRC][GGML_MAX_DIMS];
+        size_t  node_src_nb[GGML_MAX_SRC][GGML_MAX_DIMS];
    };
    std::vector<node_properties> node_props;
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index 8613d20b9f..3113de017f 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -3070,16 +3070,18 @@ static bool ggml_cuda_graph_update_required(ggml_backend_cuda_context * cuda_ctx
        ggml_cuda_graph::node_properties prop = {};
        memcpy(&prop.node, cgraph->nodes[i], sizeof(ggml_tensor));

-        // if the backend scheduler is making copies of CPU tensors, the src pointers can be the same but with different data, see:
-        // https://github.com/ggml-org/llama.cpp/pull/21472#discussion_r3052235188
        for (int j = 0; j < GGML_MAX_SRC; ++j) {
-            prop.node_src_data_ptrs[j] = cgraph->nodes[i]->src[j] ? cgraph->nodes[i]->src[j]->data : nullptr;
+            if (cgraph->nodes[i]->src[j]) {
+                prop.node_src_data_ptrs[j] = cgraph->nodes[i]->src[j]->data;
+                memcpy(prop.node_src_ne[j], cgraph->nodes[i]->src[j]->ne, sizeof(prop.node_src_ne[j]));
+                memcpy(prop.node_src_nb[j], cgraph->nodes[i]->src[j]->nb, sizeof(prop.node_src_nb[j]));
+            }
        }

-        if (!res && memcmp(&graph->node_props[i], &prop, sizeof(prop)) != 0) {
+        if (res || memcmp(&graph->node_props[i], &prop, sizeof(prop)) != 0) {
+            graph->node_props[i] = prop;
            res = true;
        }
-        graph->node_props[i] = prop;
    }

    return res;