llama-graph: avoid expand_forward for fusion (#17633)

This commit is contained in:
Aman Gupta 2025-12-01 17:12:48 +08:00 committed by GitHub
parent ff90508d68
commit 6eea666912
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 1 addition and 7 deletions

View File

@@ -3274,7 +3274,6 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
                 GGML_LOG_DEBUG("Setting stream no to %d for node %s\n", cuda_ctx->curr_stream_no, node->name);
             }
         }
-        prev_i = i;
 
 #ifdef GGML_CUDA_DEBUG
         const int nodes_fused = i - prev_i - 1;
@@ -3282,6 +3281,7 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
             GGML_LOG_INFO("nodes_fused: %d\n", nodes_fused);
         }
 #endif
+        prev_i = i;
         if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
             continue;

View File

@@ -810,9 +810,6 @@ ggml_tensor * llm_graph_context::build_ffn(
                 GGML_ABORT("fatal error");
         }
 
-    //expand here so that we can fuse ffn gate
-    ggml_build_forward_expand(gf, cur);
-
     if (gate && type_gate == LLM_FFN_PAR) {
         cur = ggml_mul(ctx0, cur, tmp);
         cb(cur, "ffn_gate_par", il);
@@ -1093,9 +1090,6 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
                 GGML_ABORT("fatal error");
         }
 
-    //expand here so that we can fuse ffn gate
-    ggml_build_forward_expand(gf, cur);
-
     experts = build_lora_mm_id(down_exps, cur, selected_experts); // [n_embd, n_expert_used, n_tokens]
     cb(experts, "ffn_moe_down", il);