fix eagle3 logits sync bug & remove ggml_set_sync()
commit ac5667dcc6
parent 8fac4b1cc8
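In short, this commit drops the scheduler-level GGML_TENSOR_FLAG_SYNC flag and its ggml_set_sync() setter, and instead fixes the EAGLE3 draft-logits readback in llama_context::decode by switching to an asynchronous tensor get followed by an explicit synchronize, as the hunks below show.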
@@ -629,7 +629,6 @@ extern "C" {
         GGML_TENSOR_FLAG_OUTPUT = 2,  // ...is an output for the GGML compute graph
         GGML_TENSOR_FLAG_PARAM  = 4,  // ...contains trainable parameters
         GGML_TENSOR_FLAG_LOSS   = 8,  // ...defines loss for numerical optimization (multiple loss tensors add up)
-        GGML_TENSOR_FLAG_SYNC   = 16, // ...forces a new split/sync point in the scheduler (e.g. for EAGLE3 decoder)
     };
 
     enum ggml_tri_type {
@@ -854,7 +853,6 @@ extern "C" {
     GGML_API void ggml_set_output(struct ggml_tensor * tensor);
     GGML_API void ggml_set_param(struct ggml_tensor * tensor);
     GGML_API void ggml_set_loss(struct ggml_tensor * tensor);
-    GGML_API void ggml_set_sync(struct ggml_tensor * tensor); // force sync point in scheduler
 
     //
     // operations on tensors with backpropagation
@@ -1202,11 +1202,6 @@ void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgra
             }
         }
 
-        // check if this node requires a sync point (e.g. for EAGLE3 parallel path fix)
-        if (node->flags & GGML_TENSOR_FLAG_SYNC) {
-            need_new_split = true;
-        }
-
         if (node_backend_id != cur_backend_id || need_new_split) {
             split->i_end = i;
             i_split++;
@@ -1581,15 +1576,6 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
             if (ec != GGML_STATUS_SUCCESS) {
                 return ec;
             }
-
-            // If any node in this split has SYNC flag, synchronize after compute
-            // This ensures the sync node is complete before next split (e.g. for EAGLE3 parallel path sync fix)
-            for (int j = 0; j < split->graph.n_nodes; j++) {
-                if (split->graph.nodes[j]->flags & GGML_TENSOR_FLAG_SYNC) {
-                    ggml_backend_synchronize(split_backend);
-                    break;
-                }
-            }
         } else {
             // similar to ggml_backend_compare_graph_backend
             for (int j0 = 0; j0 < split->graph.n_nodes; j0++) {
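For illustration, with the per-node flag gone, a host-side consumer waits on the scheduler explicitly after queuing the graph instead of relying on a sync point baked into a node. A minimal sketch, assuming an existing sched and graph; the helper name compute_and_wait is hypothetical and not part of this patch:

    #include "ggml-backend.h"

    // Hypothetical helper, not from this commit: queue every split through the
    // scheduler, then block until all backends involved have finished.
    static enum ggml_status compute_and_wait(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
        enum ggml_status st = ggml_backend_sched_graph_compute_async(sched, graph); // queue all splits
        if (st == GGML_STATUS_SUCCESS) {
            ggml_backend_sched_synchronize(sched); // waits for every backend used by the schedule
        }
        return st;
    }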
@@ -7451,10 +7451,6 @@ void ggml_set_loss(struct ggml_tensor * tensor) {
     tensor->flags |= GGML_TENSOR_FLAG_LOSS;
 }
 
-void ggml_set_sync(struct ggml_tensor * tensor) {
-    tensor->flags |= GGML_TENSOR_FLAG_SYNC;
-}
-
 ////////////////////////////////////////////////////////////////////////////////
 
 void ggml_quantize_init(enum ggml_type type) {
@@ -1261,7 +1261,8 @@ int llama_context::decode(const llama_batch & batch_inp) {
             // Read only the last token's draft logits
             eagle3_draft_logits.resize(draft_vocab_size);
             const size_t last_offset = last_idx * draft_vocab_size * sizeof(float);
-            ggml_backend_tensor_get(t_logits, eagle3_draft_logits.data(), last_offset, draft_vocab_size * sizeof(float));
+            ggml_backend_tensor_get_async(backend_res, t_logits, eagle3_draft_logits.data(), last_offset, draft_vocab_size * sizeof(float));
+            synchronize();
 
 
             // Map only the last token's draft logits to target vocab
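For context, the replacement read in the hunk above follows the usual ggml pattern of queuing an asynchronous device-to-host copy and then blocking on the producing backend. A minimal sketch under that assumption; the helper name and variables are illustrative, not code from this patch:

    #include <cstddef>
    #include <cstdint>
    #include <vector>

    #include "ggml-backend.h"

    // Illustrative helper: copy row `row` of a row-major float tensor (row length
    // `n_cols`) back to the host. ggml_backend_tensor_get_async() only queues the
    // copy; ggml_backend_synchronize() blocks until it, and any compute queued
    // before it on that backend, has completed.
    static void read_row(ggml_backend_t backend, const struct ggml_tensor * t,
                         std::vector<float> & out, int64_t row, int64_t n_cols) {
        out.resize(n_cols);
        const size_t offset = (size_t) row * (size_t) n_cols * sizeof(float);
        ggml_backend_tensor_get_async(backend, t, out.data(), offset, n_cols * sizeof(float));
        ggml_backend_synchronize(backend);
    }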
@@ -63,10 +63,6 @@ llm_build_eagle3_decode::llm_build_eagle3_decode(const llama_model & model, cons
             LLM_NORM_RMS, 0);
     cb(input_embeds_normed, "input_layernorm", -1);
 
-    // Force a sync point between the two parallel RMS_NORM paths
-    // This prevents buffer reuse issues on GPU (EAGLE3 GPU fix)
-    ggml_set_sync(input_embeds_normed);
-
     // Apply hidden_norm to g_embeddings
     ggml_tensor * g_embeddings_normed = build_norm(g_embeddings,
             model.layers[0].eagle3_hidden_norm, NULL,
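Presumably the forced split between the two parallel RMS_NORM paths is no longer needed once the draft-logits readback synchronizes explicitly; this appears to be the only ggml_set_sync() call site removed alongside the API itself.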