diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp
index 54f4ed2481..cfca89f8a9 100644
--- a/src/llama-graph.cpp
+++ b/src/llama-graph.cpp
@@ -1328,30 +1328,41 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
         cb(cur, "ffn_moe_weighted", il);
     }
 
-    ggml_tensor * cur_experts[LLAMA_MAX_EXPERTS] = { nullptr };
-
-    assert(n_expert_used > 0);
-
-    // order the views before the adds
-    for (uint32_t i = 0; i < hparams.n_expert_used; ++i) {
-        cur_experts[i] = ggml_view_2d(ctx0, experts, n_embd, n_tokens, experts->nb[2], i*experts->nb[1]);
-
-        ggml_build_forward_expand(gf, cur_experts[i]);
-    }
-
     // aggregate experts
     // note: here we explicitly use hparams.n_expert_used instead of n_expert_used
     //       to avoid potentially a large number of add nodes during warmup
     // ref: https://github.com/ggml-org/llama.cpp/pull/14753
-    ggml_tensor * moe_out = cur_experts[0];
+    //
+    // Use ggml_moe_sum when the experts tensor is contiguous and we have multiple experts.
+    // This is more efficient than the ggml_add loop, especially for GPU kernels.
 
-    for (uint32_t i = 1; i < hparams.n_expert_used; ++i) {
-        moe_out = ggml_add(ctx0, moe_out, cur_experts[i]);
-    }
+    ggml_tensor * moe_out = nullptr;
 
-    if (hparams.n_expert_used == 1) {
-        // avoid returning a non-contiguous tensor
-        moe_out = ggml_cont(ctx0, moe_out);
+    if (n_expert_used > 1 && ggml_is_contiguous(experts)) {
+        // fast path: use ggml_moe_sum for contiguous tensors
+        moe_out = ggml_moe_sum(ctx0, experts, n_expert_used);
+    } else {
+        // fall back to the ggml_add loop for non-contiguous tensors or a single expert
+        ggml_tensor * cur_experts[LLAMA_MAX_EXPERTS] = { nullptr };
+
+        assert(n_expert_used > 0);
+
+        // order the views before the adds
+        for (uint32_t i = 0; i < hparams.n_expert_used; ++i) {
+            cur_experts[i] = ggml_view_2d(ctx0, experts, n_embd, n_tokens, experts->nb[2], i*experts->nb[1]);
+            ggml_build_forward_expand(gf, cur_experts[i]);
+        }
+
+        moe_out = cur_experts[0];
+
+        for (uint32_t i = 1; i < hparams.n_expert_used; ++i) {
+            moe_out = ggml_add(ctx0, moe_out, cur_experts[i]);
+        }
+
+        if (hparams.n_expert_used == 1) {
+            // avoid returning a non-contiguous tensor
+            moe_out = ggml_cont(ctx0, moe_out);
+        }
     }
 
     cb(moe_out, "ffn_moe_out", il);
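
Note on the new fast path: ggml_moe_sum is introduced alongside this change rather than being an existing ggml op, so its semantics here are inferred from the fallback branch it replaces. As an illustration only, a plain CPU reference for the reduction it is presumed to perform (summing the expert dimension of a contiguous F32 experts tensor laid out as [n_embd, n_expert_used, n_tokens], the same layout the ggml_view_2d calls assume) might look like the sketch below; the function name, float-only handling, and loop order are assumptions made for clarity, not part of this patch.

// illustrative reference only (not part of this patch): sum the expert dimension
// of a contiguous float buffer shaped [n_embd, n_expert_used, n_tokens],
// i.e. the same result the ggml_add fallback above builds view-by-view
#include <cstdint>

static void moe_sum_reference(const float * experts, float * out,
                              int64_t n_embd, int64_t n_expert_used, int64_t n_tokens) {
    for (int64_t t = 0; t < n_tokens; ++t) {
        for (int64_t d = 0; d < n_embd; ++d) {
            float sum = 0.0f;
            for (int64_t e = 0; e < n_expert_used; ++e) {
                // element (d, e, t) with ne0 (n_embd) varying fastest, as in ggml
                sum += experts[(t*n_expert_used + e)*n_embd + d];
            }
            out[t*n_embd + d] = sum;
        }
    }
}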