From 43fffa622ac4b872abe83610aa286156d1053620 Mon Sep 17 00:00:00 2001 From: Aman Gupta Date: Sun, 15 Mar 2026 15:30:06 +0100 Subject: [PATCH 1/3] ggml-cpu: improve `--n-cpu-moe` TG performance --- ggml/src/ggml-cpu/ggml-cpu.c | 112 ++++++++++++++++++++++++++++++++++- 1 file changed, 110 insertions(+), 2 deletions(-) diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c index 8b323bd9b0..0747c5b272 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -1681,6 +1681,88 @@ static void ggml_compute_forward_mul_mat_id( } } +static void ggml_compute_forward_fused_moe_silu( + const struct ggml_compute_params * params, + struct ggml_tensor * node0, + struct ggml_tensor * node1, + struct ggml_tensor * glu_node) { + + const struct ggml_tensor * gate_weights; + const struct ggml_tensor * up_weights; + + if (glu_node->src[0] == node0) { + gate_weights = node0->src[0]; + up_weights = node1->src[0]; + } else { + gate_weights = node1->src[0]; + up_weights = node0->src[0]; + } + + const struct ggml_tensor * src1 = node0->src[1]; + const struct ggml_tensor * ids = node0->src[2]; + + const int64_t ne00 = gate_weights->ne[0]; + const int64_t ne01 = gate_weights->ne[1]; + + const size_t gate_nb01 = gate_weights->nb[1]; + const size_t gate_nb02 = gate_weights->nb[2]; + const size_t up_nb01 = up_weights->nb[1]; + const size_t up_nb02 = up_weights->nb[2]; + + const int64_t ne10 = src1->ne[0]; + const int64_t ne11 = src1->ne[1]; + const size_t nb11 = src1->nb[1]; + + const size_t glu_nb1 = glu_node->nb[1]; + + const int ith = params->ith; + const int nth = params->nth; + + const enum ggml_type type = gate_weights->type; + + ggml_vec_dot_t const vec_dot = type_traits_cpu[type].vec_dot; + enum ggml_type const vec_dot_type = type_traits_cpu[type].vec_dot_type; + ggml_from_float_t const from_float = type_traits_cpu[vec_dot_type].from_float; + + const int n_ids = ids->ne[0]; // n_expert_used + + const size_t row_size = ggml_row_size(vec_dot_type, 
ne10); + + const char * src1_q = (const char *) src1->data; + if (src1->type != vec_dot_type) { + char * wdata = (char *) params->wdata + ith * ne11 * row_size; + GGML_ASSERT(src1->type == GGML_TYPE_F32); + for (int64_t i11 = 0; i11 < ne11; ++i11) { + from_float((float *)((char *) src1->data + i11*nb11), + (void *)(wdata + i11*row_size), + ne10); + } + src1_q = wdata; + } + + // Process each selected expert directly (no row mapping needed) + for (int id = 0; id < n_ids; ++id) { + const int32_t expert_idx = *(const int32_t *) ((const char *) ids->data + id*ids->nb[0]); + + const char * gate_cur = (const char *) gate_weights->data + expert_idx * gate_nb02; + const char * up_cur = (const char *) up_weights->data + expert_idx * up_nb02; + const char * src1_col = src1_q; + + float * glu_col = (float *) ((char *) glu_node->data + id*glu_nb1); + + // Static work division: each thread gets a contiguous range of rows + const int64_t ir0_start = (ith * ne01) / nth; + const int64_t ir0_end = ((ith + 1) * ne01) / nth; + + for (int64_t ir0 = ir0_start; ir0 < ir0_end; ++ir0) { + float gate_val, up_val; + vec_dot(ne00, &gate_val, 0, gate_cur + ir0*gate_nb01, 0, src1_col, 0, 1); + vec_dot(ne00, &up_val, 0, up_cur + ir0*up_nb01, 0, src1_col, 0, 1); + glu_col[ir0] = ggml_silu_f32(gate_val) * up_val; + } + } +} + ///////////////////////////////// static void ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) { @@ -2809,7 +2891,12 @@ struct ggml_cplan ggml_graph_plan( const int n_as = src0->ne[2]; // src1 if (src1->type != vec_dot_type) { - cur += ggml_row_size(vec_dot_type, ggml_nelements(src1)) + sizeof(int64_t); + size_t quant_buf = ggml_row_size(vec_dot_type, ggml_nelements(src1)); + // fused MoE path: each thread needs its own quantization buffer + if (src1->ne[2] == 1) { + quant_buf *= n_tasks; + } + cur += quant_buf + sizeof(int64_t); } // matrix_row_counts cur += n_as * sizeof(int64_t) + sizeof(int64_t); @@ -2981,7 +3068,28 @@ static 
thread_ret_t ggml_graph_compute_thread(void * data) { continue; } - ggml_compute_forward(&params, node); + // Try fusion: MUL_MAT_ID + MUL_MAT_ID + GLU + int fused_nodes = 0; + if (node->op == GGML_OP_MUL_MAT_ID) { + enum ggml_op fuse_ops[3] = {GGML_OP_MUL_MAT_ID, GGML_OP_MUL_MAT_ID, GGML_OP_GLU}; + int outputs[1] = {node_n + 2}; + if (ggml_can_fuse_subgraph(cgraph, node_n, 3, fuse_ops, outputs, 1)) { + struct ggml_tensor * node1 = cgraph->nodes[node_n + 1]; + struct ggml_tensor * glu = cgraph->nodes[node_n + 2]; + // Fused path for `--n-cpu-moe` when n_tokens = 1 + if (node->src[1] == node1->src[1] && node->src[2] == node1->src[2] && + node->src[1]->ne[2] == 1 && ggml_get_glu_op(glu) == GGML_GLU_OP_SWIGLU) { + ggml_compute_forward_fused_moe_silu(&params, node, node1, glu); + fused_nodes = 2; + } + } + } + + if (fused_nodes == 0) { + ggml_compute_forward(&params, node); + } + + node_n += fused_nodes; if (state->ith == 0 && cplan->abort_callback && cplan->abort_callback(cplan->abort_callback_data)) { From a57641d189f5d68087427644b0dda6a7f0776e49 Mon Sep 17 00:00:00 2001 From: Aman Gupta Date: Mon, 16 Mar 2026 05:49:02 +0100 Subject: [PATCH 2/3] rename, add cache line padding --- ggml/src/ggml-cpu/ggml-cpu.c | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c index 0747c5b272..569ba57b4e 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -1687,27 +1687,27 @@ static void ggml_compute_forward_fused_moe_silu( struct ggml_tensor * node1, struct ggml_tensor * glu_node) { - const struct ggml_tensor * gate_weights; - const struct ggml_tensor * up_weights; + const struct ggml_tensor * weights_gate; + const struct ggml_tensor * weights_up; if (glu_node->src[0] == node0) { - gate_weights = node0->src[0]; - up_weights = node1->src[0]; + weights_gate = node0->src[0]; + weights_up = node1->src[0]; } else { - gate_weights = node1->src[0]; - up_weights = 
node0->src[0]; + weights_gate = node1->src[0]; + weights_up = node0->src[0]; } const struct ggml_tensor * src1 = node0->src[1]; const struct ggml_tensor * ids = node0->src[2]; - const int64_t ne00 = gate_weights->ne[0]; - const int64_t ne01 = gate_weights->ne[1]; + const int64_t ne00 = weights_gate->ne[0]; + const int64_t ne01 = weights_gate->ne[1]; - const size_t gate_nb01 = gate_weights->nb[1]; - const size_t gate_nb02 = gate_weights->nb[2]; - const size_t up_nb01 = up_weights->nb[1]; - const size_t up_nb02 = up_weights->nb[2]; + const size_t gate_nb01 = weights_gate->nb[1]; + const size_t gate_nb02 = weights_gate->nb[2]; + const size_t up_nb01 = weights_up->nb[1]; + const size_t up_nb02 = weights_up->nb[2]; const int64_t ne10 = src1->ne[0]; const int64_t ne11 = src1->ne[1]; @@ -1718,7 +1718,7 @@ static void ggml_compute_forward_fused_moe_silu( const int ith = params->ith; const int nth = params->nth; - const enum ggml_type type = gate_weights->type; + const enum ggml_type type = weights_gate->type; ggml_vec_dot_t const vec_dot = type_traits_cpu[type].vec_dot; enum ggml_type const vec_dot_type = type_traits_cpu[type].vec_dot_type; @@ -1730,7 +1730,7 @@ static void ggml_compute_forward_fused_moe_silu( const char * src1_q = (const char *) src1->data; if (src1->type != vec_dot_type) { - char * wdata = (char *) params->wdata + ith * ne11 * row_size; + char * wdata = (char *) params->wdata + ith * (ne11 * row_size + CACHE_LINE_SIZE); GGML_ASSERT(src1->type == GGML_TYPE_F32); for (int64_t i11 = 0; i11 < ne11; ++i11) { from_float((float *)((char *) src1->data + i11*nb11), @@ -1744,8 +1744,8 @@ static void ggml_compute_forward_fused_moe_silu( for (int id = 0; id < n_ids; ++id) { const int32_t expert_idx = *(const int32_t *) ((const char *) ids->data + id*ids->nb[0]); - const char * gate_cur = (const char *) gate_weights->data + expert_idx * gate_nb02; - const char * up_cur = (const char *) up_weights->data + expert_idx * up_nb02; + const char * gate_cur = (const char *) 
weights_gate->data + expert_idx * gate_nb02; + const char * up_cur = (const char *) weights_up->data + expert_idx * up_nb02; const char * src1_col = src1_q; float * glu_col = (float *) ((char *) glu_node->data + id*glu_nb1); From 2b685f2f0572358d8ec993444ebd2c3a0d8334a4 Mon Sep 17 00:00:00 2001 From: Aman Gupta Date: Mon, 16 Mar 2026 09:30:32 +0100 Subject: [PATCH 3/3] handle broadcast --- ggml/src/ggml-cpu/ggml-cpu.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c index 569ba57b4e..2a85949765 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -3078,7 +3078,8 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { struct ggml_tensor * glu = cgraph->nodes[node_n + 2]; // Fused path for `--n-cpu-moe` when n_tokens = 1 if (node->src[1] == node1->src[1] && node->src[2] == node1->src[2] && - node->src[1]->ne[2] == 1 && ggml_get_glu_op(glu) == GGML_GLU_OP_SWIGLU) { + ggml_nrows(node->src[1]) == 1 && + ggml_get_glu_op(glu) == GGML_GLU_OP_SWIGLU) { ggml_compute_forward_fused_moe_silu(&params, node, node1, glu); fused_nodes = 2; }