refactor(moe): use ggml_moe_sum for efficient expert aggregation

Replace the per-expert loop of ggml_add operations with a single
ggml_moe_sum node when the experts tensor is contiguous. Collapsing
n_expert_used - 1 add nodes into one fused sum shrinks the compute graph
and cuts kernel launches, which helps GPU backends in particular.

- Fast path: use ggml_moe_sum for a contiguous experts tensor when n_expert_used > 1
- Fallback: keep the ggml_add loop for non-contiguous tensors or a single
  expert (a reference sketch of the aggregation both paths compute follows
  below)
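
For reference, both paths compute the same thing: for every token, the
elementwise sum of the n_expert_used weighted expert outputs. Below is a
minimal CPU sketch of that semantics, assuming an f32 buffer laid out as
[n_embd, n_expert_used, n_tokens] (the layout implied by the ggml_view_2d
strides in the diff); moe_sum_ref is a hypothetical name, not part of the
change:

    #include <cstdint>

    // Reference for the aggregation: out[t*n_embd + e] is the sum over the
    // n_expert_used expert rows of token t. Layout assumption: expert i of
    // token t starts at (t*n_expert_used + i)*n_embd.
    static void moe_sum_ref(const float * experts, float * out,
                            int64_t n_embd, int64_t n_expert_used, int64_t n_tokens) {
        for (int64_t t = 0; t < n_tokens; ++t) {
            for (int64_t e = 0; e < n_embd; ++e) {
                float sum = 0.0f;
                for (int64_t i = 0; i < n_expert_used; ++i) {
                    sum += experts[(t*n_expert_used + i)*n_embd + e];
                }
                out[t*n_embd + e] = sum;
            }
        }
    }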
shaobo.xie 2026-02-05 17:07:09 +08:00
parent 58aff30b0c
commit 64e965c7b9
1 changed file with 29 additions and 18 deletions

@@ -1328,30 +1328,41 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
             cb(cur, "ffn_moe_weighted", il);
         }
 
-        ggml_tensor * cur_experts[LLAMA_MAX_EXPERTS] = { nullptr };
-
-        assert(n_expert_used > 0);
-
-        // order the views before the adds
-        for (uint32_t i = 0; i < hparams.n_expert_used; ++i) {
-            cur_experts[i] = ggml_view_2d(ctx0, experts, n_embd, n_tokens, experts->nb[2], i*experts->nb[1]);
-
-            ggml_build_forward_expand(gf, cur_experts[i]);
-        }
-
         // aggregate experts
         // note: here we explicitly use hparams.n_expert_used instead of n_expert_used
         //       to avoid potentially a large number of add nodes during warmup
         // ref: https://github.com/ggml-org/llama.cpp/pull/14753
-        ggml_tensor * moe_out = cur_experts[0];
-
-        for (uint32_t i = 1; i < hparams.n_expert_used; ++i) {
-            moe_out = ggml_add(ctx0, moe_out, cur_experts[i]);
-        }
-
-        if (hparams.n_expert_used == 1) {
-            // avoid returning a non-contiguous tensor
-            moe_out = ggml_cont(ctx0, moe_out);
+        //
+        // Use ggml_moe_sum when experts tensor is contiguous and we have multiple experts
+        // This is more efficient than the ggml_add loop, especially for GPU kernels
+        ggml_tensor * moe_out = nullptr;
+
+        if (n_expert_used > 1 && ggml_is_contiguous(experts)) {
+            // Fast path: use ggml_moe_sum for contiguous tensors
+            moe_out = ggml_moe_sum(ctx0, experts, n_expert_used);
+        } else {
+            // Fallback to ggml_add loop for non-contiguous tensors or single expert
+            ggml_tensor * cur_experts[LLAMA_MAX_EXPERTS] = { nullptr };
+
+            assert(n_expert_used > 0);
+
+            // order the views before the adds
+            for (uint32_t i = 0; i < hparams.n_expert_used; ++i) {
+                cur_experts[i] = ggml_view_2d(ctx0, experts, n_embd, n_tokens, experts->nb[2], i*experts->nb[1]);
+
+                ggml_build_forward_expand(gf, cur_experts[i]);
+            }
+
+            moe_out = cur_experts[0];
+
+            for (uint32_t i = 1; i < hparams.n_expert_used; ++i) {
+                moe_out = ggml_add(ctx0, moe_out, cur_experts[i]);
+            }
+
+            if (hparams.n_expert_used == 1) {
+                // avoid returning a non-contiguous tensor
+                moe_out = ggml_cont(ctx0, moe_out);
+            }
         }
 
         cb(moe_out, "ffn_moe_out", il);
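
A quick way to sanity-check the new op against the fallback is a standalone
CPU program that builds both subgraphs over the same input and compares the
results. The sketch below is not part of the commit: it assumes the companion
ggml change that introduces ggml_moe_sum with the (ctx, tensor, n_expert_used)
signature used above, plus the existing CPU helper ggml_graph_compute_with_ctx
(declared in ggml-cpu.h in recent trees):

    #include "ggml.h"
    #include "ggml-cpu.h" // ggml_graph_compute_with_ctx
    #include <cmath>
    #include <cstdio>
    #include <cstdlib>

    int main() {
        const int64_t n_embd = 8, n_expert_used = 4, n_tokens = 3;

        ggml_init_params params = {
            /*.mem_size   =*/ 16*1024*1024,
            /*.mem_buffer =*/ nullptr,
            /*.no_alloc   =*/ false,
        };
        ggml_context * ctx = ggml_init(params);

        // experts laid out as [n_embd, n_expert_used, n_tokens], as in build_moe_ffn
        ggml_tensor * experts = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_embd, n_expert_used, n_tokens);
        float * src = (float *) experts->data;
        for (int64_t i = 0; i < ggml_nelements(experts); ++i) {
            src[i] = (float) rand() / (float) RAND_MAX;
        }

        // fast path: a single fused node (assumes the companion ggml commit)
        ggml_tensor * fast = ggml_moe_sum(ctx, experts, n_expert_used);

        // fallback path: per-expert views accumulated with ggml_add, as in the diff
        ggml_tensor * slow = ggml_view_2d(ctx, experts, n_embd, n_tokens, experts->nb[2], 0);
        for (int64_t i = 1; i < n_expert_used; ++i) {
            slow = ggml_add(ctx, slow,
                    ggml_view_2d(ctx, experts, n_embd, n_tokens, experts->nb[2], i*experts->nb[1]));
        }

        ggml_cgraph * gf = ggml_new_graph(ctx);
        ggml_build_forward_expand(gf, fast);
        ggml_build_forward_expand(gf, slow);
        ggml_graph_compute_with_ctx(ctx, gf, /*n_threads =*/ 1);

        // both results should be the same [n_embd, n_tokens] tensor
        const float * a = (const float *) fast->data;
        const float * b = (const float *) slow->data;
        for (int64_t i = 0; i < n_embd*n_tokens; ++i) {
            if (std::fabs(a[i] - b[i]) > 1e-5f) {
                fprintf(stderr, "mismatch at %lld: %f vs %f\n", (long long) i, a[i], b[i]);
                return 1;
            }
        }
        printf("ggml_moe_sum matches the ggml_add fallback\n");

        ggml_free(ctx);
        return 0;
    }

Note that the fallback here mirrors the else branch of the diff; since
n_expert_used > 1, the ggml_add output is already contiguous and no
ggml_cont is needed.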