diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp
index 54f4ed2481..cfca89f8a9 100644
--- a/src/llama-graph.cpp
+++ b/src/llama-graph.cpp
@@ -1328,30 +1328,41 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
         cb(cur, "ffn_moe_weighted", il);
     }
 
-    ggml_tensor * cur_experts[LLAMA_MAX_EXPERTS] = { nullptr };
-
-    assert(n_expert_used > 0);
-
-    // order the views before the adds
-    for (uint32_t i = 0; i < hparams.n_expert_used; ++i) {
-        cur_experts[i] = ggml_view_2d(ctx0, experts, n_embd, n_tokens, experts->nb[2], i*experts->nb[1]);
-
-        ggml_build_forward_expand(gf, cur_experts[i]);
-    }
-
     // aggregate experts
     // note: here we explicitly use hparams.n_expert_used instead of n_expert_used
     //       to avoid potentially a large number of add nodes during warmup
     // ref: https://github.com/ggml-org/llama.cpp/pull/14753
-    ggml_tensor * moe_out = cur_experts[0];
+    //
+    // Use ggml_moe_sum when the experts tensor is contiguous and we have multiple experts.
+    // This is more efficient than the ggml_add loop, especially for GPU kernels.
 
-    for (uint32_t i = 1; i < hparams.n_expert_used; ++i) {
-        moe_out = ggml_add(ctx0, moe_out, cur_experts[i]);
-    }
+    ggml_tensor * moe_out = nullptr;
 
-    if (hparams.n_expert_used == 1) {
-        // avoid returning a non-contiguous tensor
-        moe_out = ggml_cont(ctx0, moe_out);
+    if (n_expert_used > 1 && ggml_is_contiguous(experts)) {
+        // fast path: use ggml_moe_sum for contiguous tensors
+        moe_out = ggml_moe_sum(ctx0, experts, n_expert_used);
+    } else {
+        // fall back to the ggml_add loop for non-contiguous tensors or a single expert
+        ggml_tensor * cur_experts[LLAMA_MAX_EXPERTS] = { nullptr };
+
+        assert(n_expert_used > 0);
+
+        // order the views before the adds
+        for (uint32_t i = 0; i < hparams.n_expert_used; ++i) {
+            cur_experts[i] = ggml_view_2d(ctx0, experts, n_embd, n_tokens, experts->nb[2], i*experts->nb[1]);
+            ggml_build_forward_expand(gf, cur_experts[i]);
+        }
+
+        moe_out = cur_experts[0];
+
+        for (uint32_t i = 1; i < hparams.n_expert_used; ++i) {
+            moe_out = ggml_add(ctx0, moe_out, cur_experts[i]);
+        }
+
+        if (hparams.n_expert_used == 1) {
+            // avoid returning a non-contiguous tensor
+            moe_out = ggml_cont(ctx0, moe_out);
+        }
     }
 
     cb(moe_out, "ffn_moe_out", il);
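
Note on the new fast path: ggml_moe_sum is introduced alongside this change rather than being an existing ggml op, so its semantics here are inferred from the fallback branch it replaces. As an illustration only, a plain CPU reference for the reduction it is presumed to perform (summing the expert dimension of a contiguous F32 experts tensor laid out as [n_embd, n_expert_used, n_tokens], the same layout the ggml_view_2d calls assume) might look like the sketch below; the function name, float-only handling, and loop order are assumptions made for clarity, not part of this patch.

// illustrative reference only (not part of this patch): sum the expert dimension
// of a contiguous float buffer shaped [n_embd, n_expert_used, n_tokens],
// i.e. the same result the ggml_add fallback above builds view-by-view
#include <cstdint>

static void moe_sum_reference(const float * experts, float * out,
                              int64_t n_embd, int64_t n_expert_used, int64_t n_tokens) {
    for (int64_t t = 0; t < n_tokens; ++t) {
        for (int64_t d = 0; d < n_embd; ++d) {
            float sum = 0.0f;
            for (int64_t e = 0; e < n_expert_used; ++e) {
                // element (d, e, t) with ne0 (n_embd) varying fastest, as in ggml
                sum += experts[(t*n_expert_used + e)*n_embd + d];
            }
            out[t*n_embd + d] = sum;
        }
    }
}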