#include "models.h"
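
// Note: the defer_graph_build_t tag asks the base llm_build_qwen3_5 constructor to skip
// graph construction; build_graph() is then called here, once this object is fully
// constructed, so the build_layer_ffn() override below is the one used during the build.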
llm_build_qwen3_5_moe::llm_build_qwen3_5_moe(const llama_model & model, const llm_graph_params & params) :
    llm_build_qwen3_5(model, params, defer_graph_build_t{}) {
    build_graph();
}
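
// Per-layer FFN: routed experts (softmax gating, n_expert_used of n_expert active) plus,
// when present, a shared expert whose output is scaled by a sigmoid gate and added to the
// routed output. Layers without ffn_gate_inp fall back to the dense Qwen3.5 FFN.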
ggml_tensor * llm_build_qwen3_5_moe::build_layer_ffn(ggml_tensor * cur, const int il) {
    // Check if this is an MoE layer
    if (model.layers[il].ffn_gate_inp != nullptr) {
        // MoE branch
        ggml_tensor * moe_out =
            build_moe_ffn(cur,
                    model.layers[il].ffn_gate_inp,   model.layers[il].ffn_up_exps,
                    model.layers[il].ffn_gate_exps,  model.layers[il].ffn_down_exps,
                    nullptr,
                    n_expert, n_expert_used, LLM_FFN_SILU,
                    true, false, 0.0, LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, il);
        cb(moe_out, "ffn_moe_out", il);

        // Add shared experts if present
        if (model.layers[il].ffn_up_shexp != nullptr) {
            ggml_tensor * ffn_shexp =
                build_ffn(cur,
                        model.layers[il].ffn_up_shexp,   NULL, NULL,
                        model.layers[il].ffn_gate_shexp, NULL, NULL,
                        model.layers[il].ffn_down_shexp, NULL, NULL,
                        NULL,
                        LLM_FFN_SILU, LLM_FFN_PAR, il);
            cb(ffn_shexp, "ffn_shexp", il);

            // Apply shared expert gating (sigmoid)
            ggml_tensor * shared_gate = build_lora_mm(model.layers[il].ffn_gate_inp_shexp, cur);
            cb(shared_gate, "shared_expert_gate", il);

            shared_gate = ggml_sigmoid(ctx0, shared_gate);
            cb(shared_gate, "shared_expert_gate_sigmoid", il);

            ffn_shexp = ggml_mul(ctx0, ffn_shexp, shared_gate);
            cb(ffn_shexp, "ffn_shexp_gated", il);

            cur = ggml_add(ctx0, moe_out, ffn_shexp);
            cb(cur, "ffn_out", il);
        } else {
            cur = moe_out;
        }
    } else {
        // Dense FFN branch (fallback)
        cur = llm_build_qwen3_5::build_layer_ffn(cur, il);
    }

    return cur;
}