Daniele Pinna 2025-12-17 11:06:32 +07:00 committed by GitHub
commit 9600225304
7 changed files with 63 additions and 15 deletions


@@ -1758,6 +1758,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.yarn_beta_fast = std::stof(value);
         }
     ).set_env("LLAMA_ARG_YARN_BETA_FAST"));
+    add_opt(common_arg(
+        {"--moe-n-expert"}, "N",
+        string_format("MoE: override number of active experts (default: %d = model default)\n"
+                      "for MoE self-draft speculation, use 1 for draft context", params.moe_n_expert_override),
+        [](common_params & params, int value) {
+            params.moe_n_expert_override = value;
+        }
+    ).set_env("LLAMA_ARG_MOE_N_EXPERT"));
     add_opt(common_arg(
         {"-gan", "--grp-attn-n"}, "N",
         string_format("group-attention factor (default: %d)", params.grp_attn_n),

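Not part of the diff above: a minimal sketch of how the new flag's value is intended to flow into a context, assuming only the helpers already touched by this change (common_params, common_context_params_to_llama). Passing `--moe-n-expert 1` on the command line, or setting LLAMA_ARG_MOE_N_EXPERT=1, has the same effect as the manual assignment shown here.

    // illustrative sketch, not part of this commit (assumes common/common.h)
    common_params params;
    params.moe_n_expert_override = 1;  // same as `--moe-n-expert 1` / LLAMA_ARG_MOE_N_EXPERT=1

    // the copy added in common_context_params_to_llama (next file) carries the
    // override into the llama_context_params used to create the context
    llama_context_params cparams = common_context_params_to_llama(params);
    // cparams.moe_n_expert_override is now 1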

@@ -1391,6 +1391,7 @@ struct llama_context_params common_context_params_to_llama(const common_params &
     cparams.yarn_beta_fast = params.yarn_beta_fast;
     cparams.yarn_beta_slow = params.yarn_beta_slow;
     cparams.yarn_orig_ctx = params.yarn_orig_ctx;
+    cparams.moe_n_expert_override = params.moe_n_expert_override;
     cparams.pooling_type = params.pooling_type;
     cparams.attention_type = params.attention_type;
     cparams.flash_attn_type = params.flash_attn_type;


@@ -325,6 +325,7 @@ struct common_params {
     float yarn_beta_fast = -1.0f; // YaRN low correction dim
     float yarn_beta_slow = -1.0f; // YaRN high correction dim
     int32_t yarn_orig_ctx = 0; // YaRN original context length
+    int32_t moe_n_expert_override = 0; // MoE self-draft: override n_expert_used (0 = use model default)
 
     // offload params
     std::vector<ggml_backend_dev_t> devices; // devices to use for offloading


@@ -341,6 +341,11 @@ extern "C" {
         uint32_t yarn_orig_ctx; // YaRN original context size
         float defrag_thold; // [DEPRECATED] defragment the KV cache if holes/size > thold, <= 0 disabled (default)
 
+        // MoE self-drafting: override n_expert_used for this context
+        // 0 = use model default, 1+ = force exactly N active experts
+        // Used for MoE self-draft speculation: draft context uses n=1, verify uses full
+        int32_t moe_n_expert_override;
+
         ggml_backend_sched_eval_callback cb_eval;
         void * cb_eval_user_data;

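Not part of the diff: a minimal sketch of the draft/verify split the comment above describes, assuming the current public API names in llama.h (llama_model_load_from_file, llama_init_from_model); the model path is illustrative only.

    // illustrative sketch, not part of this commit
    #include "llama.h"

    llama_model_params mparams = llama_model_default_params();
    llama_model * model = llama_model_load_from_file("model.gguf", mparams);  // hypothetical path

    // draft context: force a single active expert per token (cheap forward pass)
    llama_context_params p_draft = llama_context_default_params();
    p_draft.moe_n_expert_override = 1;
    llama_context * ctx_draft = llama_init_from_model(model, p_draft);

    // verify context: 0 keeps the model's own n_expert_used (full-quality pass)
    llama_context_params p_verify = llama_context_default_params();
    p_verify.moe_n_expert_override = 0;
    llama_context * ctx_verify = llama_init_from_model(model, p_verify);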

@@ -135,6 +135,7 @@ llama_context::llama_context(
     cparams.op_offload = params.op_offload;
     cparams.kv_unified = params.kv_unified;
+    cparams.moe_n_expert_override = params.moe_n_expert_override;
 
     {
         const char * LLAMA_GRAPH_REUSE_DISABLE = getenv("LLAMA_GRAPH_REUSE_DISABLE");
@@ -2381,6 +2382,7 @@ llama_context_params llama_context_default_params() {
         /*.yarn_beta_slow =*/ -1.0f,
         /*.yarn_orig_ctx =*/ 0,
         /*.defrag_thold =*/ -1.0f,
+        /*.moe_n_expert_override =*/ 0,
         /*.cb_eval =*/ nullptr,
         /*.cb_eval_user_data =*/ nullptr,
         /*.type_k =*/ GGML_TYPE_F16,


@@ -35,6 +35,10 @@ struct llama_cparams {
     bool op_offload;
     bool kv_unified;
 
+    // MoE self-drafting: override n_expert_used
+    // 0 = use model default, 1+ = force exactly N active experts
+    int32_t moe_n_expert_override;
+
     enum llama_pooling_type pooling_type;
 
     ggml_backend_sched_eval_callback cb_eval;


@@ -1051,16 +1051,38 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
     ggml_tensor * weights = ggml_get_rows(ctx0, probs, selected_experts); // [1, n_expert_used, n_tokens]
     cb(weights, "ffn_moe_weights", il);
 
+    // HARD MASK: If moe_n_expert_override is set, slice tensors to only use first N experts
+    // This actually reduces computation by only loading/computing N experts instead of all n_expert_used
+    // Unlike soft mask (which zeros weights but still computes all experts), hard mask skips the computation entirely
+    int32_t n_expert_exec = n_expert_used; // Default: execute all selected experts
+    if (cparams.moe_n_expert_override > 0 && cparams.moe_n_expert_override < n_expert_used) {
+        n_expert_exec = cparams.moe_n_expert_override;
+
+        // Slice selected_experts from [n_expert_used, n_tokens] to [n_expert_exec, n_tokens]
+        // This causes ggml_mul_mat_id to only load and compute the first n_expert_exec experts
+        selected_experts = ggml_view_2d(ctx0, selected_experts, n_expert_exec, n_tokens,
+                selected_experts->nb[1], 0);
+        // Make contiguous for subsequent operations
+        selected_experts = ggml_cont(ctx0, selected_experts);
+        cb(selected_experts, "ffn_moe_topk_sliced", il);
+
+        // Slice weights from [1, n_expert_used, n_tokens] to [1, n_expert_exec, n_tokens]
+        weights = ggml_view_3d(ctx0, weights, 1, n_expert_exec, n_tokens,
+                weights->nb[1], weights->nb[2], 0);
+        // Make contiguous for subsequent reshape operations
+        weights = ggml_cont(ctx0, weights);
+        cb(weights, "ffn_moe_weights_sliced", il);
+    }
+
     if (gating_op == LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX_WEIGHT) {
-        weights = ggml_reshape_2d(ctx0, weights, n_expert_used, n_tokens);
-        weights = ggml_soft_max(ctx0, weights); // [n_expert_used, n_tokens]
-        weights = ggml_reshape_3d(ctx0, weights, 1, n_expert_used, n_tokens);
+        weights = ggml_reshape_2d(ctx0, weights, n_expert_exec, n_tokens);
+        weights = ggml_soft_max(ctx0, weights); // [n_expert_exec, n_tokens]
+        weights = ggml_reshape_3d(ctx0, weights, 1, n_expert_exec, n_tokens);
         cb(weights, "ffn_moe_weights_softmax", il);
     }
 
     if (norm_w) {
-        weights = ggml_reshape_2d(ctx0, weights, n_expert_used, n_tokens);
+        weights = ggml_reshape_2d(ctx0, weights, n_expert_exec, n_tokens);
 
         ggml_tensor * weights_sum = ggml_sum_rows(ctx0, weights); // [1, n_tokens]
         cb(weights_sum, "ffn_moe_weights_sum", il);
@@ -1069,10 +1091,10 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
         weights_sum = ggml_clamp(ctx0, weights_sum, 6.103515625e-5, INFINITY);
         cb(weights_sum, "ffn_moe_weights_sum_clamped", il);
 
-        weights = ggml_div(ctx0, weights, weights_sum); // [n_expert_used, n_tokens]
+        weights = ggml_div(ctx0, weights, weights_sum); // [n_expert_exec, n_tokens]
         cb(weights, "ffn_moe_weights_norm", il);
 
-        weights = ggml_reshape_3d(ctx0, weights, 1, n_expert_used, n_tokens);
+        weights = ggml_reshape_3d(ctx0, weights, 1, n_expert_exec, n_tokens);
     }
 
     if (scale_w) {
         weights = ggml_scale(ctx0, weights, w_scale);
@@ -1085,8 +1107,8 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
     cur = ggml_reshape_3d(ctx0, cur, n_embd, 1, n_tokens);
 
     if (weight_before_ffn) {
-        // repeat cur to [n_embd, n_expert_used, n_tokens]
-        ggml_tensor * repeated = ggml_repeat_4d(ctx0, cur, n_embd, n_expert_used, n_tokens, 1);
+        // repeat cur to [n_embd, n_expert_exec, n_tokens]
+        ggml_tensor * repeated = ggml_repeat_4d(ctx0, cur, n_embd, n_expert_exec, n_tokens, 1);
         cur = ggml_mul(ctx0, repeated, weights);
         cb(cur, "ffn_moe_weighted", il);
     }
@@ -1173,26 +1195,31 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
     ggml_tensor * cur_experts[LLAMA_MAX_EXPERTS] = { nullptr };
 
-    assert(n_expert_used > 0);
+    // Determine actual expert count for aggregation
+    // When --moe-n-expert is set (hard mask mode), use n_expert_exec
+    // Otherwise use hparams.n_expert_used to avoid dynamic allocation issues during warmup
+    // ref: https://github.com/ggml-org/llama.cpp/pull/14753
+    const uint32_t n_expert_agg = (cparams.moe_n_expert_override > 0)
+        ? (uint32_t)n_expert_exec
+        : hparams.n_expert_used;
+
+    assert(n_expert_agg > 0);
 
     // order the views before the adds
-    for (uint32_t i = 0; i < hparams.n_expert_used; ++i) {
+    for (uint32_t i = 0; i < n_expert_agg; ++i) {
         cur_experts[i] = ggml_view_2d(ctx0, experts, n_embd, n_tokens, experts->nb[2], i*experts->nb[1]);
         ggml_build_forward_expand(gf, cur_experts[i]);
     }
 
     // aggregate experts
-    // note: here we explicitly use hparams.n_expert_used instead of n_expert_used
-    // to avoid potentially a large number of add nodes during warmup
-    // ref: https://github.com/ggml-org/llama.cpp/pull/14753
     ggml_tensor * moe_out = cur_experts[0];
-    for (uint32_t i = 1; i < hparams.n_expert_used; ++i) {
+    for (uint32_t i = 1; i < n_expert_agg; ++i) {
         moe_out = ggml_add(ctx0, moe_out, cur_experts[i]);
     }
 
-    if (hparams.n_expert_used == 1) {
+    if (n_expert_agg == 1) {
         // avoid returning a non-contiguous tensor
         moe_out = ggml_cont(ctx0, moe_out);
     }
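
To make the hard-mask slicing above concrete, a short worked example under assumed sizes, not part of the commit: n_expert_used = 8, n_tokens = 4, and --moe-n-expert 1, assuming (as in build_moe_ffn) that ggml_top_k returns expert ids ordered by descending router score.

    // selected_experts : [8, 4]     top-k expert ids per token, highest score first
    // weights          : [1, 8, 4]  matching gating probabilities
    //
    // with cparams.moe_n_expert_override = 1 -> n_expert_exec = 1:
    //   ggml_view_2d(ctx0, selected_experts, 1, 4, selected_experts->nb[1], 0)
    //     keeps only the first (top-1) expert id of each token   -> [1, 4]
    //   ggml_view_3d(ctx0, weights, 1, 1, 4, weights->nb[1], weights->nb[2], 0)
    //     keeps the matching weight per token                    -> [1, 1, 4]
    //
    // ggml_mul_mat_id then gathers and runs only that one expert per token instead
    // of all 8, and the aggregation loop adds a single view (n_expert_agg = 1),
    // which is where the draft-pass speedup comes from.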