diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
index fa73e8216b..686da3dbd1 100644
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@@ -629,7 +629,6 @@ extern "C" {
         GGML_TENSOR_FLAG_OUTPUT = 2, // ...is an output for the GGML compute graph
         GGML_TENSOR_FLAG_PARAM = 4, // ...contains trainable parameters
         GGML_TENSOR_FLAG_LOSS = 8, // ...defines loss for numerical optimization (multiple loss tensors add up)
-        GGML_TENSOR_FLAG_SYNC = 16, // ...forces a new split/sync point in the scheduler (e.g. for EAGLE3 decoder)
     };
 
     enum ggml_tri_type {
@@ -854,7 +853,6 @@ extern "C" {
     GGML_API void ggml_set_output(struct ggml_tensor * tensor);
     GGML_API void ggml_set_param(struct ggml_tensor * tensor);
     GGML_API void ggml_set_loss(struct ggml_tensor * tensor);
-    GGML_API void ggml_set_sync(struct ggml_tensor * tensor); // force sync point in scheduler
 
     //
     // operations on tensors with backpropagation
diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
index 8e30d48ccc..08681f35e3 100644
--- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp
@@ -1202,11 +1202,6 @@ void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgra
             }
         }
 
-        // check if this node requires a sync point (e.g. for EAGLE3 parallel path fix)
-        if (node->flags & GGML_TENSOR_FLAG_SYNC) {
-            need_new_split = true;
-        }
-
         if (node_backend_id != cur_backend_id || need_new_split) {
             split->i_end = i;
             i_split++;
@@ -1581,15 +1576,6 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
             if (ec != GGML_STATUS_SUCCESS) {
                 return ec;
             }
-
-            // If any node in this split has SYNC flag, synchronize after compute
-            // This ensures the sync node is complete before next split (e.g. for EAGLE3 parallel path sync fix)
-            for (int j = 0; j < split->graph.n_nodes; j++) {
-                if (split->graph.nodes[j]->flags & GGML_TENSOR_FLAG_SYNC) {
-                    ggml_backend_synchronize(split_backend);
-                    break;
-                }
-            }
         } else {
             // similar to ggml_backend_compare_graph_backend
             for (int j0 = 0; j0 < split->graph.n_nodes; j0++) {
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index 4625c3bd77..f0913cd359 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -7451,10 +7451,6 @@ void ggml_set_loss(struct ggml_tensor * tensor) {
     tensor->flags |= GGML_TENSOR_FLAG_LOSS;
 }
 
-void ggml_set_sync(struct ggml_tensor * tensor) {
-    tensor->flags |= GGML_TENSOR_FLAG_SYNC;
-}
-
 ////////////////////////////////////////////////////////////////////////////////
 
 void ggml_quantize_init(enum ggml_type type) {
diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index ea6dfaea3c..3506edd92b 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -1261,7 +1261,8 @@ int llama_context::decode(const llama_batch & batch_inp) {
         // Read only the last token's draft logits
         eagle3_draft_logits.resize(draft_vocab_size);
         const size_t last_offset = last_idx * draft_vocab_size * sizeof(float);
-        ggml_backend_tensor_get(t_logits, eagle3_draft_logits.data(), last_offset, draft_vocab_size * sizeof(float));
+        ggml_backend_tensor_get_async(backend_res, t_logits, eagle3_draft_logits.data(), last_offset, draft_vocab_size * sizeof(float));
+        synchronize();
 
         // Map only the last token's draft logits to target vocab
diff --git a/src/models/eagle3.cpp b/src/models/eagle3.cpp
index 8987a0c581..dea887bdd3 100644
--- a/src/models/eagle3.cpp
+++ b/src/models/eagle3.cpp
@@ -63,10 +63,6 @@ llm_build_eagle3_decode::llm_build_eagle3_decode(const llama_model & model, cons
                              LLM_NORM_RMS, 0);
     cb(input_embeds_normed, "input_layernorm", -1);
 
-    // Force a sync point between the two parallel RMS_NORM paths
-    // This prevents buffer reuse issues on GPU (EAGLE3 GPU fix)
-    ggml_set_sync(input_embeds_normed);
-
     // Apply hidden_norm to g_embeddings
     ggml_tensor * g_embeddings_normed = build_norm(g_embeddings, model.layers[0].eagle3_hidden_norm, NULL,
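
Note on the replacement pattern in src/llama-context.cpp: instead of forcing an extra scheduler split via a per-tensor flag, the host read of the draft logits is now an explicit asynchronous copy followed by a synchronize. Below is a minimal sketch of that pattern against the public ggml-backend API; the read_last_row helper and its arguments are illustrative only and not part of this patch.

    // Sketch only: read one row of a 2-D f32 tensor from a backend, then wait
    // for the copy to finish before the host buffer is used. `backend` and
    // `t_logits` are assumed to come from an already-computed graph.
    #include <vector>
    #include "ggml-backend.h"

    static std::vector<float> read_last_row(ggml_backend_t backend,
                                            const struct ggml_tensor * t_logits,
                                            int64_t row, int64_t n_cols) {
        std::vector<float> out(n_cols);
        const size_t offset = (size_t) row * n_cols * sizeof(float);

        // queue the device -> host copy on the backend's stream/queue
        ggml_backend_tensor_get_async(backend, t_logits, out.data(), offset, n_cols * sizeof(float));

        // block until the async copy has completed
        ggml_backend_synchronize(backend);
        return out;
    }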