feat(moe): add LLAMA_DISABLE_MOE_SUM env var for benchmarking

shaobo.xie 2026-02-05 17:37:34 +08:00
parent f01ce7ba30
commit b089db5aa7
1 changed file with 11 additions and 1 deletion


@@ -12,6 +12,7 @@
 #include <cassert>
 #include <cmath>
 #include <cstdlib>
+#include <cstring>
 #include <unordered_set>
@@ -1335,10 +1336,19 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
     //
     // Use ggml_moe_sum when the experts tensor is contiguous and multiple experts are used.
     // This is more efficient than the ggml_add loop, especially for GPU kernels.
     //
+    // Set the LLAMA_DISABLE_MOE_SUM=1 environment variable to disable moe_sum and
+    // use the ggml_add loop instead (useful for performance comparison).
     ggml_tensor * moe_out = nullptr;
-    if (n_expert_used > 1 && ggml_is_contiguous(experts)) {
+    // Check the environment variable once to disable moe_sum (for benchmarking/comparison).
+    static const bool disable_moe_sum = []() {
+        const char * env = std::getenv("LLAMA_DISABLE_MOE_SUM");
+        return env != nullptr && (env[0] == '1' || std::strcmp(env, "true") == 0);
+    }();
+    if (n_expert_used > 1 && !disable_moe_sum && ggml_is_contiguous(experts)) {
         // Fast path: use ggml_moe_sum for contiguous tensors
         moe_out = ggml_moe_sum(ctx0, experts, n_expert_used);
     } else {
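
For illustration, here is a minimal standalone sketch of the once-only environment-variable gate used in this patch. The function name moe_sum_disabled and the demo main are hypothetical, not part of the change; the point is that the lambda initializing the static const bool runs exactly once, so the environment is read a single time no matter how many graphs are built.

// Standalone sketch of the env-var gate pattern above (illustrative only;
// moe_sum_disabled() is a hypothetical name, not part of the patch).
#include <cstdio>
#include <cstdlib>
#include <cstring>

// Returns true when LLAMA_DISABLE_MOE_SUM starts with '1' or equals "true".
// The lambda initializing the static runs once, on first call, so later
// changes to the environment are intentionally ignored.
static bool moe_sum_disabled() {
    static const bool disabled = []() {
        const char * env = std::getenv("LLAMA_DISABLE_MOE_SUM");
        return env != nullptr && (env[0] == '1' || std::strcmp(env, "true") == 0);
    }();
    return disabled;
}

int main() {
    std::printf("moe_sum disabled: %s\n", moe_sum_disabled() ? "yes" : "no");
    return 0;
}

To compare the two paths with a build containing this patch, run the same benchmark twice, once with LLAMA_DISABLE_MOE_SUM=1 in the environment and once without. Note that any value beginning with '1' (or exactly "true") counts as set.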