feat(moe): add LLAMA_DISABLE_MOE_SUM env var for benchmarking
This commit is contained in:
parent
f01ce7ba30
commit
b089db5aa7
|
|
@ -12,6 +12,7 @@
|
||||||
|
|
||||||
#include <cassert>
|
#include <cassert>
|
||||||
#include <cmath>
|
#include <cmath>
|
||||||
|
#include <cstdlib>
|
||||||
#include <cstring>
|
#include <cstring>
|
||||||
#include <unordered_set>
|
#include <unordered_set>
|
||||||
|
|
||||||
|
|
@ -1335,10 +1336,19 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
|
||||||
//
|
//
|
||||||
// Use ggml_moe_sum when experts tensor is contiguous and we have multiple experts
|
// Use ggml_moe_sum when experts tensor is contiguous and we have multiple experts
|
||||||
// This is more efficient than the ggml_add loop, especially for GPU kernels
|
// This is more efficient than the ggml_add loop, especially for GPU kernels
|
||||||
|
//
|
||||||
|
// Set LLAMA_DISABLE_MOE_SUM=1 environment variable to disable moe_sum and use
|
||||||
|
// the ggml_add loop instead (useful for performance comparison)
|
||||||
|
|
||||||
ggml_tensor * moe_out = nullptr;
|
ggml_tensor * moe_out = nullptr;
|
||||||
|
|
||||||
if (n_expert_used > 1 && ggml_is_contiguous(experts)) {
|
// Check for environment variable to disable moe_sum (for benchmarking/comparison)
|
||||||
|
static const bool disable_moe_sum = []() {
|
||||||
|
const char * env = std::getenv("LLAMA_DISABLE_MOE_SUM");
|
||||||
|
return env != nullptr && (env[0] == '1' || strcmp(env, "true") == 0);
|
||||||
|
}();
|
||||||
|
|
||||||
|
if (n_expert_used > 1 && !disable_moe_sum && ggml_is_contiguous(experts)) {
|
||||||
// Fast path: use ggml_moe_sum for contiguous tensors
|
// Fast path: use ggml_moe_sum for contiguous tensors
|
||||||
moe_out = ggml_moe_sum(ctx0, experts, n_expert_used);
|
moe_out = ggml_moe_sum(ctx0, experts, n_expert_used);
|
||||||
} else {
|
} else {
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue