From d176ae1c61e2a67293cdaf20e2fe0a5696ad584c Mon Sep 17 00:00:00 2001
From: "shaobo.xie"
Date: Thu, 5 Feb 2026 23:09:48 +0800
Subject: [PATCH] build: add GGML_DISABLE_MOE_SUM_CUDA compile flag for
 moe_sum comparison

This allows disabling the CUDA implementation of ggml_moe_sum to compare
its performance with ggml_cuda_op_fused_add.

When GGML_DISABLE_MOE_SUM_CUDA is defined:
- moesum.cu becomes empty (no CUDA kernel)
- ggml_moe_sum falls back to the CPU implementation
- setting LLAMA_DISABLE_MOE_SUM=1 makes the graph use a ggml_add loop,
  which triggers ggml_cuda_op_fused_add

Usage for comparison:
- ggml_moe_sum (CUDA): default (both flags unset)
- ggml_cuda_op_fused_add: -DGGML_DISABLE_MOE_SUM_CUDA=1 -DLLAMA_DISABLE_MOE_SUM=1
---
 .gitignore                   |   2 +
 bench_moe_sum.cpp            | 269 +++++++++++++++++++++++++++++++++++
 ggml/src/ggml-cuda/moesum.cu |   7 +
 3 files changed, 278 insertions(+)
 create mode 100644 bench_moe_sum.cpp

diff --git a/.gitignore b/.gitignore
index bb122d6924..f3467e7925 100644
--- a/.gitignore
+++ b/.gitignore
@@ -138,3 +138,5 @@ poetry.toml
 /.windsurf/
 # emscripten
 a.out.*
+.clangd/
+compile_commands.json
diff --git a/bench_moe_sum.cpp b/bench_moe_sum.cpp
new file mode 100644
index 0000000000..1912ac5a05
--- /dev/null
+++ b/bench_moe_sum.cpp
@@ -0,0 +1,269 @@
+// Simple benchmark for GGML_OP_MOE_SUM
+#include <ggml.h>
+#include <ggml-alloc.h>
+#include <ggml-backend.h>
+#include <chrono>
+#include <cmath>
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <vector>
+
+static double get_time_ms() {
+    using namespace std::chrono;
+    // steady_clock is monotonic; system_clock can jump mid-benchmark
+    return duration_cast<microseconds>(steady_clock::now().time_since_epoch()).count() / 1000.0;
+}
+
+struct BenchResult {
+    const char * backend;
+    double moe_sum_ms;
+    double add_loop_ms;
+    double speedup;
+};
+
+// Benchmark 1: the fused moe_sum operator
+static double benchmark_moe_sum(
+        ggml_backend_t backend,
+        int64_t hidden_dim,
+        int64_t n_expert_used,
+        int64_t n_tokens,
+        int iterations) {
+
+    struct ggml_init_params params = {
+        /*.mem_size   =*/ 16*1024*1024,
+        /*.mem_buffer =*/ NULL,
+        /*.no_alloc   =*/ true,
+    };
+    ggml_context * ctx = ggml_init(params);
+
+    // Input: [hidden_dim, n_expert_used, n_tokens]
+    ggml_tensor * input  = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, hidden_dim, n_expert_used, n_tokens);
+    ggml_tensor * output = ggml_moe_sum(ctx, input, n_expert_used);
+
+    ggml_cgraph * gf = ggml_new_graph(ctx);
+    ggml_build_forward_expand(gf, output);
+
+    ggml_backend_buffer_t buffer = ggml_backend_alloc_ctx_tensors(ctx, backend);
+    if (!buffer) {
+        fprintf(stderr, "Failed to allocate tensors\n");
+        ggml_free(ctx);
+        return -1.0;
+    }
+
+    // Initialize input data
+    std::vector<float> input_data(hidden_dim * n_expert_used * n_tokens);
+    for (size_t i = 0; i < input_data.size(); i++) {
+        input_data[i] = (float)(i % 100) / 100.0f;
+    }
+    ggml_backend_tensor_set(input, input_data.data(), 0, input_data.size() * sizeof(float));
+
+    // Warmup
+    ggml_backend_graph_compute(backend, gf);
+
+    // Benchmark
+    double start = get_time_ms();
+    for (int i = 0; i < iterations; i++) {
+        ggml_backend_graph_compute(backend, gf);
+    }
+    double end = get_time_ms();
+
+    ggml_backend_buffer_free(buffer);
+    ggml_free(ctx);
+
+    return end - start;
+}
+
+// Benchmark 2: the same reduction as a chain of ggml_add ops
+// (what the graph falls back to when moe_sum is disabled)
+static double benchmark_add_loop(
+        ggml_backend_t backend,
+        int64_t hidden_dim,
+        int64_t n_expert_used,
+        int64_t n_tokens,
+        int iterations) {
+
+    struct ggml_init_params params = {
+        /*.mem_size   =*/ 16*1024*1024,
+        /*.mem_buffer =*/ NULL,
+        /*.no_alloc   =*/ true,
+    };
+    ggml_context * ctx = ggml_init(params);
+
+    // Input: [hidden_dim, n_expert_used, n_tokens]
+    ggml_tensor * input = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, hidden_dim, n_expert_used, n_tokens);
+
+    // Build graph: simulate moe_sum by viewing each expert slice and summing them.
+    // The accumulator starts from the first expert's view, so summing
+    // n_expert_used slices costs n_expert_used - 1 adds. (An uninitialized
+    // "zero" tensor seeded via ggml_mul would be both extra work and
+    // undefined data.)
+    ggml_tensor * cur = NULL;
+    for (int64_t k = 0; k < n_expert_used; k++) {
+        // Expert k of every token: a [hidden_dim, n_tokens] slice whose rows
+        // are nb[2] bytes apart, starting at byte offset k*nb[1]
+        ggml_tensor * expert_view = ggml_view_2d(ctx, input,
+            hidden_dim, n_tokens,
+            input->nb[2], k * input->nb[1]);
+        cur = cur ? ggml_add(ctx, cur, expert_view) : expert_view;
+    }
+
+    ggml_cgraph * gf = ggml_new_graph(ctx);
+    ggml_build_forward_expand(gf, cur);
+
+    ggml_backend_buffer_t buffer = ggml_backend_alloc_ctx_tensors(ctx, backend);
+    if (!buffer) {
+        fprintf(stderr, "Failed to allocate tensors\n");
+        ggml_free(ctx);
+        return -1.0;
+    }
+
+    // Initialize input data
+    std::vector<float> input_data(hidden_dim * n_expert_used * n_tokens);
+    for (size_t i = 0; i < input_data.size(); i++) {
+        input_data[i] = (float)(i % 100) / 100.0f;
+    }
+    ggml_backend_tensor_set(input, input_data.data(), 0, input_data.size() * sizeof(float));
+
+    // Warmup
+    ggml_backend_graph_compute(backend, gf);
+
+    // Benchmark
+    double start = get_time_ms();
+    for (int i = 0; i < iterations; i++) {
+        ggml_backend_graph_compute(backend, gf);
+    }
+    double end = get_time_ms();
+
+    ggml_backend_buffer_free(buffer);
+    ggml_free(ctx);
+
+    return end - start;
+}
+
+static BenchResult run_benchmark(
+        ggml_backend_t backend,
+        const char * backend_name,
+        int64_t hidden_dim,
+        int64_t n_expert_used,
+        int64_t n_tokens,
+        int iterations) {
+
+    printf("\n=================================================\n");
+    printf("Testing %s backend:\n", backend_name);
+    printf("=================================================\n");
+    printf("  Hidden dimension:  %lld\n", (long long) hidden_dim);
+    printf("  Number of experts: %lld\n", (long long) n_expert_used);
+    printf("  Number of tokens:  %lld\n", (long long) n_tokens);
+    printf("  Iterations:        %d\n", iterations);
+    printf("=================================================\n");
+
+    double time_moe_sum  = benchmark_moe_sum(backend, hidden_dim, n_expert_used, n_tokens, iterations);
+    double time_add_loop = benchmark_add_loop(backend, hidden_dim, n_expert_used, n_tokens, iterations);
+
+    printf("\nResults (total over %d iterations):\n", iterations);
+    if (time_moe_sum >= 0) {
+        printf("  moe_sum:  %8.2f ms (%8.2f us/iter)\n", time_moe_sum, time_moe_sum * 1000.0 / iterations);
+    } else {
+        printf("  moe_sum:  NOT SUPPORTED\n");
+    }
+
+    if (time_add_loop >= 0) {
+        printf("  add_loop: %8.2f ms (%8.2f us/iter)\n", time_add_loop, time_add_loop * 1000.0 / iterations);
+    }
+
+    double speedup = 0.0;
+    if (time_moe_sum >= 0 && time_add_loop >= 0) {
+        speedup = time_add_loop / time_moe_sum;
+        printf("\n  Speedup: %.2fx\n", speedup);
+
+        // Effective bandwidth: moe_sum reads the full input once and writes
+        // one [hidden_dim, n_tokens] output per pass
+        size_t bytes_read    = (size_t) (hidden_dim * n_expert_used * n_tokens) * sizeof(float);
+        size_t bytes_written = (size_t) (hidden_dim * n_tokens) * sizeof(float);
+        size_t total_bytes   = (bytes_read + bytes_written) * iterations;
+        double gb_per_sec = (total_bytes / 1e9) / (time_moe_sum / 1000.0);
+        printf("  moe_sum bandwidth: %.2f GB/s\n", gb_per_sec);
+    }
+
+    printf("=================================================\n");
+
+    return {backend_name, time_moe_sum, time_add_loop, speedup};
+}
+
+int main(int argc, char ** argv) {
+    int64_t hidden_dim    = 4096;
+    int64_t n_expert_used = 4;
+    int64_t n_tokens      = 256;
+    int iterations = 100;
+    bool test_gpu = true;
+    bool test_cpu = true;
+
+    // Parse command line arguments
+    for (int i = 1; i < argc; i++) {
+        if (strcmp(argv[i], "-h") == 0 || strcmp(argv[i], "--hidden") == 0) {
+            if (i + 1 < argc) hidden_dim = atoll(argv[++i]);
+        } else if (strcmp(argv[i], "-e") == 0 || strcmp(argv[i], "--experts") == 0) {
+            if (i + 1 < argc) n_expert_used = atoll(argv[++i]);
+        } else if (strcmp(argv[i], "-t") == 0 || strcmp(argv[i], "--tokens") == 0) {
+            if (i + 1 < argc) n_tokens = atoll(argv[++i]);
+        } else if (strcmp(argv[i], "-i") == 0 || strcmp(argv[i], "--iterations") == 0) {
+            if (i + 1 < argc) iterations = atoi(argv[++i]);
+        } else if (strcmp(argv[i], "--cpu-only") == 0) {
+            test_gpu = false;
+        } else if (strcmp(argv[i], "--gpu-only") == 0) {
+            test_cpu = false;
+        }
+    }
+
+    printf("=================================================\n");
+    printf("GGML_OP_MOE_SUM Performance Benchmark\n");
+    printf("=================================================\n");
+    printf("Configuration:\n");
+    printf("  Hidden dimension:  %lld\n", (long long) hidden_dim);
+    printf("  Number of experts: %lld\n", (long long) n_expert_used);
+    printf("  Number of tokens:  %lld\n", (long long) n_tokens);
+    printf("  Iterations:        %d\n", iterations);
+    printf("=================================================\n\n");
+
+    // Initialize backends - load all available backends
+    ggml_backend_load_all();
+
+    std::vector<BenchResult> results;
+
+    // Test CPU backend
+    if (test_cpu) {
+        ggml_backend_reg_t cpu_reg = ggml_backend_reg_by_name("CPU");
+        if (cpu_reg) {
+            ggml_backend_dev_t cpu_dev = ggml_backend_reg_dev_get(cpu_reg, 0);
+            ggml_backend_t backend = ggml_backend_dev_init(cpu_dev, NULL);
+            if (backend) {
+                results.push_back(run_benchmark(backend, "CPU", hidden_dim, n_expert_used, n_tokens, iterations));
+                ggml_backend_free(backend);
+            }
+        }
+    }
+
+    // Test GPU backend
+    if (test_gpu) {
+        ggml_backend_reg_t gpu_reg = ggml_backend_reg_by_name("CUDA");
+        if (!gpu_reg) {
+            gpu_reg = ggml_backend_reg_by_name("GPU");
+        }
+        if (gpu_reg) {
+            ggml_backend_dev_t gpu_dev = ggml_backend_reg_dev_get(gpu_reg, 0);
+            ggml_backend_t backend = ggml_backend_dev_init(gpu_dev, NULL);
+            if (backend) {
+                results.push_back(run_benchmark(backend, "GPU", hidden_dim, n_expert_used, n_tokens, iterations));
+                ggml_backend_free(backend);
+            }
+        }
+    }
+
+    // Summary
+    if (results.size() >= 2) {
+        printf("\n=================================================\n");
+        printf("Performance Summary:\n");
+        printf("=================================================\n");
+        for (const auto & r : results) {
+            printf("  %s: %.2fx speedup\n", r.backend, r.speedup);
+        }
+        printf("=================================================\n");
+    }
+
+    return 0;
+}
diff --git a/ggml/src/ggml-cuda/moesum.cu b/ggml/src/ggml-cuda/moesum.cu
index d1c3b07345..db7fe7c8fe 100644
--- a/ggml/src/ggml-cuda/moesum.cu
+++ b/ggml/src/ggml-cuda/moesum.cu
@@ -1,3 +1,6 @@
+// Set GGML_DISABLE_MOE_SUM_CUDA=1 to disable the moe_sum CUDA implementation.
+// This allows fallback to ggml_cuda_op_fused_add for comparison testing.
+#ifndef GGML_DISABLE_MOE_SUM_CUDA
 #include "moesum.cuh"
 
 template
@@ -340,3 +343,7 @@ void ggml_cuda_op_moe_sum(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
         }
     }
 }
+#else
+// When GGML_DISABLE_MOE_SUM_CUDA is defined, this translation unit is empty;
+// ggml_moe_sum falls back to the CPU implementation or ggml_cuda_op_fused_add.
+#endif // GGML_DISABLE_MOE_SUM_CUDA
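
---
Reviewer note (not part of the patch): a minimal sketch of how the A/B
comparison could be driven. bench_moe_sum.cpp is not wired into any build
target by this patch, so the compile line, library name, and paths below are
assumptions about a typical ggml CMake layout, not something the patch
provides:

  # Baseline: fused CUDA moe_sum kernel (both flags unset)
  cmake -B build -DGGML_CUDA=ON
  cmake --build build
  # hypothetical standalone link line for the benchmark
  g++ -O2 bench_moe_sum.cpp -Iggml/include -Lbuild/ggml/src -lggml -o bench_moe_sum
  ./bench_moe_sum --gpu-only

  # Comparison: compile out the CUDA kernel and disable graph-level moe_sum,
  # so the ggml_add loop (ggml_cuda_op_fused_add) runs instead; here the
  # defines are forwarded via the compiler flag variables
  cmake -B build -DGGML_CUDA=ON \
        -DCMAKE_CUDA_FLAGS="-DGGML_DISABLE_MOE_SUM_CUDA=1" \
        -DCMAKE_CXX_FLAGS="-DLLAMA_DISABLE_MOE_SUM=1"
  cmake --build build
  ./bench_moe_sum --gpu-only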