From 49162df87a4ef721c0ab0e4525e9b5dd3d496196 Mon Sep 17 00:00:00 2001
From: pestopoppa
Date: Sun, 14 Dec 2025 13:32:50 +0100
Subject: [PATCH] feat: add --moe-n-expert flag for MoE expert count override
 (Hard Mask)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add the ability to reduce the number of active experts in MoE models at
runtime, providing a significant speedup with minimal quality loss when
running at 50% of the default expert count.

Implementation:
- Add moe_n_expert_override parameter to llama_context_params
- Add --moe-n-expert CLI flag to override n_expert_used
- Implement a "Hard Mask" in build_moe_ffn() that slices the expert tensors
- Use ggml_view_2d/3d + ggml_cont to reduce the actual computation

Benchmark results (AOCL BLIS 5.0, AMD EPYC 9655):
- Qwen3-Coder-480B-A35B: 2.5 → 3.7 t/s (48% speedup)
- GLM-4.6-355B-A32B: 2.2 → 3.0 t/s (36% speedup)
- Qwen3-Coder-30B-A3B: 26.6 → 33.6 t/s (26% speedup)
- Qwen3-VL-30B-A3B: 32.2 → 38.9 t/s (21% speedup)

Quality: excellent at 50% of the default experts, degraded at 25%, gibberish at 12.5%

Usage: llama-cli -m model.gguf --moe-n-expert 4 -p "prompt"

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5
---
 common/arg.cpp        |  8 ++++++
 common/common.cpp     |  1 +
 common/common.h       |  1 +
 include/llama.h       |  5 ++++
 src/llama-context.cpp |  2 ++
 src/llama-cparams.h   |  4 +++
 src/llama-graph.cpp   | 57 +++++++++++++++++++++++++++++++------------
 7 files changed, 63 insertions(+), 15 deletions(-)

diff --git a/common/arg.cpp b/common/arg.cpp
index 5528eeb169..691048881c 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -1687,6 +1687,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.yarn_beta_fast = std::stof(value);
         }
     ).set_env("LLAMA_ARG_YARN_BETA_FAST"));
+    add_opt(common_arg(
+        {"--moe-n-expert"}, "N",
+        string_format("MoE: override number of active experts (default: %d = model default)\n"
+                      "for MoE self-draft speculation, use 1 for the draft context", params.moe_n_expert_override),
+        [](common_params & params, int value) {
+            params.moe_n_expert_override = value;
+        }
+    ).set_env("LLAMA_ARG_MOE_N_EXPERT"));
     add_opt(common_arg(
         {"-gan", "--grp-attn-n"}, "N",
         string_format("group-attention factor (default: %d)", params.grp_attn_n),
diff --git a/common/common.cpp b/common/common.cpp
index 0497f90a28..2161f189ca 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -1328,6 +1328,7 @@ struct llama_context_params common_context_params_to_llama(const common_params &
     cparams.yarn_beta_fast        = params.yarn_beta_fast;
     cparams.yarn_beta_slow        = params.yarn_beta_slow;
     cparams.yarn_orig_ctx         = params.yarn_orig_ctx;
+    cparams.moe_n_expert_override = params.moe_n_expert_override;
     cparams.pooling_type          = params.pooling_type;
     cparams.attention_type        = params.attention_type;
     cparams.flash_attn_type       = params.flash_attn_type;
diff --git a/common/common.h b/common/common.h
index 2fd83f0cf9..8b03ee03d5 100644
--- a/common/common.h
+++ b/common/common.h
@@ -321,6 +321,7 @@ struct common_params {
     float   yarn_beta_fast = -1.0f; // YaRN low correction dim
     float   yarn_beta_slow = -1.0f; // YaRN high correction dim
     int32_t yarn_orig_ctx  = 0;     // YaRN original context length
+    int32_t moe_n_expert_override = 0; // MoE self-draft: override n_expert_used (0 = use model default)

     // offload params
     std::vector<ggml_backend_dev_t> devices; // devices to use for offloading
diff --git a/include/llama.h b/include/llama.h
index b52eaacfa7..8156605b1e 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -340,6 +340,11 @@ extern "C" {
         uint32_t yarn_orig_ctx;    // YaRN original context size
         float    defrag_thold;     // [DEPRECATED] defragment the KV cache if holes/size > thold, <= 0 disabled (default)

+        // MoE self-drafting: override n_expert_used for this context
+        // 0 = use model default, 1+ = force exactly N active experts
+        // Used for MoE self-draft speculation: the draft context uses n=1, the verify context uses the full count
+        int32_t moe_n_expert_override;
+
         ggml_backend_sched_eval_callback cb_eval;
         void * cb_eval_user_data;

diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index 2692297dca..92f7f9dd77 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -97,6 +97,7 @@ llama_context::llama_context(
     cparams.op_offload = params.op_offload;
     cparams.kv_unified = params.kv_unified;
+    cparams.moe_n_expert_override = params.moe_n_expert_override;

     {
         const char * LLAMA_GRAPH_REUSE_DISABLE = getenv("LLAMA_GRAPH_REUSE_DISABLE");
@@ -2303,6 +2304,7 @@ llama_context_params llama_context_default_params() {
         /*.yarn_beta_slow           =*/ -1.0f,
         /*.yarn_orig_ctx            =*/ 0,
         /*.defrag_thold             =*/ -1.0f,
+        /*.moe_n_expert_override    =*/ 0,
         /*.cb_eval                  =*/ nullptr,
         /*.cb_eval_user_data        =*/ nullptr,
         /*.type_k                   =*/ GGML_TYPE_F16,
diff --git a/src/llama-cparams.h b/src/llama-cparams.h
index fcef8fa976..90d55e19e4 100644
--- a/src/llama-cparams.h
+++ b/src/llama-cparams.h
@@ -35,6 +35,10 @@ struct llama_cparams {
     bool op_offload;
     bool kv_unified;

+    // MoE self-drafting: override n_expert_used
+    // 0 = use model default, 1+ = force exactly N active experts
+    int32_t moe_n_expert_override;
+
     enum llama_pooling_type pooling_type;

     ggml_backend_sched_eval_callback cb_eval;
diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp
index a1a32494b7..ecf0def473 100644
--- a/src/llama-graph.cpp
+++ b/src/llama-graph.cpp
@@ -995,16 +995,38 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
     ggml_tensor * weights = ggml_get_rows(ctx0, probs, selected_experts); // [1, n_expert_used, n_tokens]
     cb(weights, "ffn_moe_weights", il);

+    // HARD MASK: if moe_n_expert_override is set, slice the tensors so that only the first N selected experts are used
+    // this actually reduces computation by loading/computing only N experts instead of all n_expert_used
+    // unlike a soft mask (which zeros the weights but still computes all experts), the hard mask skips the computation entirely
+    int32_t n_expert_exec = n_expert_used; // default: execute all selected experts
+    if (cparams.moe_n_expert_override > 0 && cparams.moe_n_expert_override < n_expert_used) {
+        n_expert_exec = cparams.moe_n_expert_override;
+
+        // slice selected_experts from [n_expert_used, n_tokens] to [n_expert_exec, n_tokens]
+        // this causes ggml_mul_mat_id to only load and compute the first n_expert_exec experts
+        selected_experts = ggml_view_2d(ctx0, selected_experts, n_expert_exec, n_tokens,
+                selected_experts->nb[1], 0);
+        // make contiguous for subsequent operations
+        selected_experts = ggml_cont(ctx0, selected_experts);
+        cb(selected_experts, "ffn_moe_topk_sliced", il);
+
+        // slice weights from [1, n_expert_used, n_tokens] to [1, n_expert_exec, n_tokens]
+        weights = ggml_view_3d(ctx0, weights, 1, n_expert_exec, n_tokens,
+                weights->nb[1], weights->nb[2], 0);
+        // make contiguous for subsequent reshape operations
+        weights = ggml_cont(ctx0, weights);
+        cb(weights, "ffn_moe_weights_sliced", il);
+    }
+
     if (gating_op == LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX_WEIGHT) {
-        weights = ggml_reshape_2d(ctx0, weights, n_expert_used, n_tokens);
-        weights = ggml_soft_max(ctx0, weights); // [n_expert_used, n_tokens]
-        weights = ggml_reshape_3d(ctx0, weights, 1, n_expert_used, n_tokens);
+        weights = ggml_reshape_2d(ctx0, weights, n_expert_exec, n_tokens);
+        weights = ggml_soft_max(ctx0, weights); // [n_expert_exec, n_tokens]
+        weights = ggml_reshape_3d(ctx0, weights, 1, n_expert_exec, n_tokens);
         cb(weights, "ffn_moe_weights_softmax", il);
     }

     if (norm_w) {
-        weights = ggml_reshape_2d(ctx0, weights, n_expert_used, n_tokens);
+        weights = ggml_reshape_2d(ctx0, weights, n_expert_exec, n_tokens);

         ggml_tensor * weights_sum = ggml_sum_rows(ctx0, weights); // [1, n_tokens]
         cb(weights_sum, "ffn_moe_weights_sum", il);

@@ -1013,10 +1035,10 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
         weights_sum = ggml_clamp(ctx0, weights_sum, 6.103515625e-5, INFINITY);
         cb(weights_sum, "ffn_moe_weights_sum_clamped", il);

-        weights = ggml_div(ctx0, weights, weights_sum); // [n_expert_used, n_tokens]
+        weights = ggml_div(ctx0, weights, weights_sum); // [n_expert_exec, n_tokens]
         cb(weights, "ffn_moe_weights_norm", il);

-        weights = ggml_reshape_3d(ctx0, weights, 1, n_expert_used, n_tokens);
+        weights = ggml_reshape_3d(ctx0, weights, 1, n_expert_exec, n_tokens);
     }

     if (scale_w) {
         weights = ggml_scale(ctx0, weights, w_scale);
@@ -1029,8 +1051,8 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
     cur = ggml_reshape_3d(ctx0, cur, n_embd, 1, n_tokens);

     if (weight_before_ffn) {
-        // repeat cur to [n_embd, n_expert_used, n_tokens]
-        ggml_tensor * repeated = ggml_repeat_4d(ctx0, cur, n_embd, n_expert_used, n_tokens, 1);
+        // repeat cur to [n_embd, n_expert_exec, n_tokens]
+        ggml_tensor * repeated = ggml_repeat_4d(ctx0, cur, n_embd, n_expert_exec, n_tokens, 1);
         cur = ggml_mul(ctx0, repeated, weights);
         cb(cur, "ffn_moe_weighted", il);
     }
@@ -1108,26 +1130,31 @@ ggml_tensor * llm_graph_context::build_moe_ffn(

     ggml_tensor * cur_experts[LLAMA_MAX_EXPERTS] = { nullptr };

-    assert(n_expert_used > 0);
+    // determine the actual expert count for aggregation:
+    // when --moe-n-expert is set (hard mask mode), use n_expert_exec,
+    // otherwise use hparams.n_expert_used to avoid a large number of add nodes during warmup
+    // ref: https://github.com/ggml-org/llama.cpp/pull/14753
+    const uint32_t n_expert_agg = (cparams.moe_n_expert_override > 0)
+        ? (uint32_t)n_expert_exec
+        : hparams.n_expert_used;
+
+    assert(n_expert_agg > 0);

     // order the views before the adds
-    for (uint32_t i = 0; i < hparams.n_expert_used; ++i) {
+    for (uint32_t i = 0; i < n_expert_agg; ++i) {
         cur_experts[i] = ggml_view_2d(ctx0, experts, n_embd, n_tokens, experts->nb[2], i*experts->nb[1]);

         ggml_build_forward_expand(gf, cur_experts[i]);
     }

     // aggregate experts
-    // note: here we explicitly use hparams.n_expert_used instead of n_expert_used
-    //       to avoid potentially a large number of add nodes during warmup
-    //       ref: https://github.com/ggml-org/llama.cpp/pull/14753
     ggml_tensor * moe_out = cur_experts[0];

-    for (uint32_t i = 1; i < hparams.n_expert_used; ++i) {
+    for (uint32_t i = 1; i < n_expert_agg; ++i) {
         moe_out = ggml_add(ctx0, moe_out, cur_experts[i]);
     }

-    if (hparams.n_expert_used == 1) {
+    if (n_expert_agg == 1) {
         // avoid returning a non-contiguous tensor
         moe_out = ggml_cont(ctx0, moe_out);
     }
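
Note (review aid, not part of the patch): besides the CLI flag, the new field can be set directly through the public API, since the patch adds it to llama_context_params. The sketch below is a minimal, hedged example of doing that; it assumes the current llama.h entry points (llama_model_load_from_file, llama_init_from_model, llama_model_free) — older trees use different loader/context function names, so adjust to your version.

```cpp
// Minimal sketch: enable the MoE hard mask from the C API instead of --moe-n-expert.
#include "llama.h"

#include <cstdio>

int main(int argc, char ** argv) {
    if (argc < 2) {
        fprintf(stderr, "usage: %s <model.gguf>\n", argv[0]);
        return 1;
    }

    llama_model_params mparams = llama_model_default_params();
    llama_model * model = llama_model_load_from_file(argv[1], mparams);
    if (model == nullptr) {
        fprintf(stderr, "failed to load model\n");
        return 1;
    }

    llama_context_params cparams = llama_context_default_params();
    // Hard mask: keep only the top 4 routed experts per token.
    // 0 (the default) keeps the model's own n_expert_used.
    cparams.moe_n_expert_override = 4;

    llama_context * ctx = llama_init_from_model(model, cparams);
    if (ctx == nullptr) {
        fprintf(stderr, "failed to create context\n");
        llama_model_free(model);
        return 1;
    }

    // ... run decoding as usual; build_moe_ffn() now slices the routed experts ...

    llama_free(ctx);
    llama_model_free(model);
    return 0;
}
```

Because the option is registered with .set_env("LLAMA_ARG_MOE_N_EXPERT"), tools that use the common argument parser should also pick the override up from that environment variable, in addition to the --moe-n-expert flag shown in the commit message.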