ggml-backend: re-enable graph reuse with pipeline parallelism (#20927)
This commit is contained in:
parent
29771a0a4c
commit
3fc6f1aed1
|
|
@@ -342,14 +342,6 @@ llama_context::llama_context(
|
|||
|
||||
if (cparams.pipeline_parallel) {
|
||||
LLAMA_LOG_INFO("%s: pipeline parallelism enabled\n", __func__);
|
||||
|
||||
if (!graph_reuse_disable) {
|
||||
// TODO: figure out a way to make graph reuse work with pipeline parallelism
|
||||
// ref: https://github.com/ggml-org/llama.cpp/pull/20463
|
||||
LLAMA_LOG_WARN("%s: graph reuse is currently not compatible with pipeline parallelism - disabling\n", __func__);
|
||||
|
||||
graph_reuse_disable = true;
|
||||
}
|
||||
}
|
||||
|
||||
sched_reserve();
|
||||
|
|
@@ -1189,6 +1181,13 @@ llm_graph_result * llama_context::process_ubatch(const llama_ubatch & ubatch, ll
|
|||
if (!graph_reuse_disable && res->can_reuse(gparams)) {
|
||||
//LLAMA_LOG_DEBUG("%s: reusing previous graph\n", __func__);
|
||||
|
||||
// with pipeline parallelism, the previous graph_compute_async may still be running
|
||||
// on the GPU. we must synchronize before set_inputs to avoid overwriting input tensors
|
||||
// that the previous compute is still reading.
|
||||
if (cparams.pipeline_parallel) {
|
||||
ggml_backend_sched_synchronize(sched.get());
|
||||
}
|
||||
|
||||
n_reused++;
|
||||
} else {
|
||||
res->reset();
|
||||
|
|
|
|||
Loading…
Reference in New Issue