diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index e505ef40c2..16d81c95be 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -351,6 +351,8 @@ llama_context::~llama_context() {
 void llama_context::reserve() {
     LLAMA_LOG_INFO("%s: reserving ...\n", __func__);
 
+    synchronize();
+
     const uint32_t n_seqs   = cparams.n_seq_max;
     const uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch);
 
@@ -497,6 +499,10 @@ void llama_context::reserve() {
 }
 
 void llama_context::synchronize() {
+    if (!sched) {
+        return;
+    }
+
     ggml_backend_sched_synchronize(sched.get());
 
     // FIXME: if multiple single tokens are evaluated without a synchronization,
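
For context, here is a minimal, self-contained sketch of the guard pattern the second hunk introduces (the `backend_sched` type and its member function are hypothetical stand-ins, not the actual ggml/llama.cpp API, whose scheduler calls are C-style like `ggml_backend_sched_synchronize(sched.get())`): `synchronize()` becomes a no-op while the backend scheduler has not been created yet, which is what lets `reserve()` call it unconditionally at the top.

```cpp
#include <memory>

// Hypothetical stand-in for ggml_backend_sched_t, simplified to a C++ class.
struct backend_sched {
    void synchronize() { /* wait for any in-flight backend work */ }
};

struct context {
    std::unique_ptr<backend_sched> sched; // may still be null during setup

    void synchronize() {
        if (!sched) {
            return; // no scheduler yet -> nothing to wait on
        }
        sched->synchronize();
    }

    void reserve() {
        synchronize(); // safe even before sched exists, as in the first hunk
        // ... measure graphs and (re)allocate compute buffers ...
    }
};

int main() {
    context ctx;
    ctx.reserve();     // early-returns instead of dereferencing a null sched
    ctx.sched = std::make_unique<backend_sched>();
    ctx.synchronize(); // now forwards to the scheduler
}
```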