diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp
index cfca89f8a9..aa6e17cf9d 100644
--- a/src/llama-graph.cpp
+++ b/src/llama-graph.cpp
@@ -12,6 +12,7 @@
 #include
 #include
+#include <cstdlib>
 #include
 #include
@@ -1335,10 +1336,19 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
     //
     // Use ggml_moe_sum when experts tensor is contiguous and we have multiple experts
     // This is more efficient than the ggml_add loop, especially for GPU kernels
+    //
+    // Set the LLAMA_DISABLE_MOE_SUM=1 environment variable to disable moe_sum and use
+    // the ggml_add loop instead (useful for performance comparison)
     ggml_tensor * moe_out = nullptr;
-    if (n_expert_used > 1 && ggml_is_contiguous(experts)) {
+    // Check for environment variable to disable moe_sum (for benchmarking/comparison)
+    static const bool disable_moe_sum = []() {
+        const char * env = std::getenv("LLAMA_DISABLE_MOE_SUM");
+        return env != nullptr && (env[0] == '1' || strcmp(env, "true") == 0);
+    }();
+
+    if (n_expert_used > 1 && !disable_moe_sum && ggml_is_contiguous(experts)) {
         // Fast path: use ggml_moe_sum for contiguous tensors
         moe_out = ggml_moe_sum(ctx0, experts, n_expert_used);
     } else {
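
For reference, here is a minimal standalone sketch of the environment-variable gate the patch adds (the moe_sum_disabled helper and the main driver are illustrative, not part of the patch): the flag is read once through a static local initialized by an immediately-invoked lambda, so std::getenv runs a single time even though build_moe_ffn sits on a hot path, and only the values "1" and "true" count as set.

// Standalone sketch of the LLAMA_DISABLE_MOE_SUM gate (illustrative names).
#include <cstdio>
#include <cstdlib>
#include <cstring>

// Mirrors the patch: the static local is initialized exactly once, so the
// environment is queried a single time regardless of how often this is called.
static bool moe_sum_disabled() {
    static const bool disabled = []() {
        const char * env = std::getenv("LLAMA_DISABLE_MOE_SUM");
        return env != nullptr && (env[0] == '1' || std::strcmp(env, "true") == 0);
    }();
    return disabled;
}

int main() {
    if (moe_sum_disabled()) {
        std::printf("moe_sum disabled: falling back to the ggml_add loop\n");
    } else {
        std::printf("moe_sum enabled: using the contiguous fast path\n");
    }
    return 0;
}

To compare the two paths, run the same workload twice, once with LLAMA_DISABLE_MOE_SUM=1 set in the environment and once without, and compare the timings.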