model: using single llm_build per arch

This commit is contained in:
Xuan Son Nguyen 2026-04-16 00:21:01 +02:00
parent 707c0b7a6e
commit be46a5096f
7 changed files with 138 additions and 127 deletions

View File

@ -92,8 +92,8 @@ add_library(llama
models/lfm2.cpp
models/llada-moe.cpp
models/llada.cpp
models/llama-iswa.cpp
models/llama.cpp
models/llama4.cpp
models/maincoder.cpp
models/mamba-base.cpp
models/mamba.cpp
@ -145,8 +145,8 @@ add_library(llama
models/starcoder.cpp
models/starcoder2.cpp
models/step35-iswa.cpp
models/t5-dec.cpp
models/t5-enc.cpp
models/t5.cpp
models/t5encoder.cpp
models/wavtokenizer-dec.cpp
models/xverse.cpp
)

View File

@ -1274,8 +1274,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
}
// Set non-causal attention for diffusion models
hparams.causal_attn = false;
}
break;
} break;
case LLM_ARCH_LLADA:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
@ -1289,8 +1288,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
}
// Set non-causal attention for diffusion models
hparams.causal_attn = false;
}
break;
} break;
case LLM_ARCH_LLADA_MOE:
{
ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
@ -8769,9 +8767,9 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
case LLM_ARCH_LLAMA4:
{
if (hparams.swa_type == LLAMA_SWA_TYPE_NONE) {
llm = std::make_unique<llm_build_llama<false>>(*this, params);
llm = std::make_unique<llm_build_llama4<false>>(*this, params);
} else {
llm = std::make_unique<llm_build_llama_iswa>(*this, params);
llm = std::make_unique<llm_build_llama4<true>>(*this, params);
}
} break;
case LLM_ARCH_LLAMA_EMBED:
@ -8849,23 +8847,19 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
case LLM_ARCH_DREAM:
{
llm = std::make_unique<llm_build_dream>(*this, params);
}
break;
} break;
case LLM_ARCH_LLADA:
{
llm = std::make_unique<llm_build_llada>(*this, params);
}
break;
} break;
case LLM_ARCH_LLADA_MOE:
{
llm = std::make_unique<llm_build_llada_moe>(*this, params);
}
break;
} break;
case LLM_ARCH_RND1:
{
llm = std::make_unique<llm_build_rnd1>(*this, params);
}
break;
} break;
case LLM_ARCH_QWEN2VL:
{
llm = std::make_unique<llm_build_qwen2vl>(*this, params);
@ -9055,11 +9049,11 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
{
switch (params.gtype) {
case LLM_GRAPH_TYPE_ENCODER:
llm = std::make_unique<llm_build_t5_enc>(*this, params);
llm = std::make_unique<llm_build_t5<true>>(*this, params);
break;
case LLM_GRAPH_TYPE_DEFAULT:
case LLM_GRAPH_TYPE_DECODER:
llm = std::make_unique<llm_build_t5_dec>(*this, params);
llm = std::make_unique<llm_build_t5<false>>(*this, params);
break;
default:
GGML_ABORT("invalid graph type");
@ -9067,9 +9061,8 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
} break;
case LLM_ARCH_T5ENCODER:
{
llm = std::make_unique<llm_build_t5_enc>(*this, params);
}
break;
llm = std::make_unique<llm_build_t5encoder>(*this, params);
} break;
case LLM_ARCH_JAIS:
{
llm = std::make_unique<llm_build_jais>(*this, params);

View File

@ -1,6 +1,7 @@
#include "models.h"
llm_build_llama_iswa::llm_build_llama_iswa(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
template <bool iswa>
llm_build_llama4<iswa>::llm_build_llama4(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v();
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
@ -18,7 +19,14 @@ llm_build_llama_iswa::llm_build_llama_iswa(const llama_model & model, const llm_
ggml_tensor * inp_attn_scale = nullptr;
inp_attn_scale = build_inp_attn_scale();
auto * inp_attn = build_attn_inp_kv_iswa();
using inp_attn_type = std::conditional_t<iswa, llm_graph_input_attn_kv_iswa, llm_graph_input_attn_kv>;
inp_attn_type * inp_attn = nullptr;
if constexpr (iswa) {
inp_attn = build_attn_inp_kv_iswa();
} else {
inp_attn = build_attn_inp_kv();
}
const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
@ -176,3 +184,7 @@ llm_build_llama_iswa::llm_build_llama_iswa(const llama_model & model, const llm_
ggml_build_forward_expand(gf, cur);
}
// Explicit template instantiations
template struct llm_build_llama4<false>;
template struct llm_build_llama4<true>;

View File

@ -407,8 +407,9 @@ struct llm_build_llama : public llm_graph_context {
llm_build_llama(const llama_model & model, const llm_graph_params & params);
};
struct llm_build_llama_iswa : public llm_graph_context {
llm_build_llama_iswa(const llama_model & model, const llm_graph_params & params);
// Unified graph builder for LLM_ARCH_LLAMA4 (replaces the former
// llm_build_llama_iswa). The `iswa` parameter selects the attention input:
// true  -> interleaved sliding-window attention KV cache (build_attn_inp_kv_iswa)
// false -> regular KV cache (build_attn_inp_kv)
// Both instantiations (<false>/<true>) are explicitly instantiated in llama4.cpp.
template <bool iswa>
struct llm_build_llama4 : public llm_graph_context {
llm_build_llama4(const llama_model & model, const llm_graph_params & params);
};
struct llm_build_maincoder : public llm_graph_context {
@ -701,12 +702,13 @@ struct llm_build_step35_iswa : public llm_graph_context {
llm_build_step35_iswa(const llama_model & model, const llm_graph_params & params);
};
struct llm_build_t5_dec : public llm_graph_context {
llm_build_t5_dec(const llama_model & model, const llm_graph_params & params);
// Unified graph builder for LLM_ARCH_T5 (replaces llm_build_t5_enc /
// llm_build_t5_dec). The `is_enc` parameter selects which graph is built:
// true  -> encoder graph (non-causal self-attention, no KV cache)
// false -> decoder graph (self-attention + cross-attention)
// The constructor is explicitly specialized for both values in t5.cpp.
template <bool is_enc>
struct llm_build_t5 : public llm_graph_context {
llm_build_t5(const llama_model & model, const llm_graph_params & params);
};
struct llm_build_t5_enc : public llm_graph_context {
llm_build_t5_enc(const llama_model & model, const llm_graph_params & params);
// Thin wrapper used by LLM_ARCH_T5ENCODER: an encoder-only T5 model,
// implemented by delegating to the encoder specialization llm_build_t5<true>.
struct llm_build_t5encoder : public llm_build_t5<true> {
llm_build_t5encoder(const llama_model & model, const llm_graph_params & params);
};
struct llm_build_wavtokenizer_dec : public llm_graph_context {

View File

@ -1,96 +0,0 @@
#include "models.h"
// Builds the T5 encoder graph: a stack of non-causal self-attention layers
// using the *_enc weight tensors and T5's relative position-bucket attention
// bias. (Legacy standalone builder; superseded by llm_build_t5<true>.)
llm_build_t5_enc::llm_build_t5_enc(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v();
// K and V heads must have the same size here
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
ggml_tensor * cur;
ggml_tensor * inpL;
inpL = build_inp_embd(model.tok_embd);
// relative position buckets feeding the encoder attention bias
ggml_tensor * pos_bucket_enc = build_inp_pos_bucket_enc();
// encoder attention runs without a KV cache
auto * inp_attn = build_attn_inp_no_cache();
ggml_tensor * inp_out_ids = build_inp_out_ids();
for (int il = 0; il < n_layer; ++il) {
ggml_tensor * inpSA = inpL;
// norm
cur = build_norm(inpL,
model.layers[il].attn_norm_enc, NULL,
LLM_NORM_RMS, il);
cb(cur, "attn_norm", il);
// self-attention
{
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq_enc, cur);
cb(Qcur, "Qcur", il);
ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk_enc, cur);
cb(Kcur, "Kcur", il);
ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv_enc, cur);
cb(Vcur, "Vcur", il);
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
// layers without their own relative-bias tensor fall back to layer 0's
ggml_tensor * attn_rel_b = model.layers[il].attn_rel_b_enc ? model.layers[il].attn_rel_b_enc : model.layers[0].attn_rel_b_enc;
ggml_tensor * kq_b = build_pos_bias(pos_bucket_enc, attn_rel_b);
// attention scale is 1.0f — no 1/sqrt(n_embd_head) scaling applied here
cur = build_attn(inp_attn,
model.layers[il].wo_enc, nullptr,
Qcur, Kcur, Vcur, kq_b, nullptr, nullptr, 1.0f, il);
cb(cur, "kqv_out", il);
}
// on the final layer, keep only the rows selected for output
if (il == n_layer - 1 && inp_out_ids) {
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
}
// residual connection around attention
ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
cb(ffn_inp, "ffn_inp", il);
// feed-forward network
{
cur = build_norm(ffn_inp,
model.layers[il].ffn_norm_enc, NULL,
LLM_NORM_RMS, il);
cb(cur, "ffn_norm", il);
// T5 uses relu, flan-T5 uses gelu-gated
cur = build_ffn(cur,
model.layers[il].ffn_up_enc, NULL, NULL,
model.layers[il].ffn_gate_enc, NULL, NULL,
model.layers[il].ffn_down_enc, NULL, NULL,
NULL,
model.layers[il].ffn_gate_enc ? LLM_FFN_GELU : LLM_FFN_RELU,
model.layers[il].ffn_gate_enc ? LLM_FFN_PAR : LLM_FFN_SEQ,
il);
cb(cur, "ffn_out", il);
}
// residual connection around the FFN
cur = ggml_add(ctx0, cur, ffn_inp);
cb(cur, "ffn_out", il);
cur = build_cvec(cur, il);
cb(cur, "l_out", il);
// input for next layer
inpL = cur;
}
cur = inpL;
cb(cur, "result_embd", -1);
// final RMS norm over the encoder output
cur = build_norm(cur,
model.output_norm_enc, NULL,
LLM_NORM_RMS, -1);
cb(cur, "result_norm", -1);
res->t_embd = cur;
ggml_build_forward_expand(gf, cur);
}

View File

@ -1,6 +1,7 @@
#include "models.h"
llm_build_t5_dec::llm_build_t5_dec(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
template <>
llm_build_t5<false>::llm_build_t5(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v();
//const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
@ -164,3 +165,99 @@ llm_build_t5_dec::llm_build_t5_dec(const llama_model & model, const llm_graph_pa
ggml_build_forward_expand(gf, cur);
}
// Encoder specialization of the unified T5 builder: a stack of non-causal
// self-attention layers using the *_enc weight tensors and T5's relative
// position-bucket attention bias. The decoder lives in llm_build_t5<false>.
template <>
llm_build_t5<true>::llm_build_t5(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v();
// K and V heads must have the same size here
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
ggml_tensor * cur;
ggml_tensor * inpL;
inpL = build_inp_embd(model.tok_embd);
// relative position buckets feeding the encoder attention bias
ggml_tensor * pos_bucket_enc = build_inp_pos_bucket_enc();
// encoder attention runs without a KV cache
auto * inp_attn = build_attn_inp_no_cache();
ggml_tensor * inp_out_ids = build_inp_out_ids();
for (int il = 0; il < n_layer; ++il) {
ggml_tensor * inpSA = inpL;
// norm
cur = build_norm(inpL,
model.layers[il].attn_norm_enc, NULL,
LLM_NORM_RMS, il);
cb(cur, "attn_norm", il);
// self-attention
{
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq_enc, cur);
cb(Qcur, "Qcur", il);
ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk_enc, cur);
cb(Kcur, "Kcur", il);
ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv_enc, cur);
cb(Vcur, "Vcur", il);
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
// layers without their own relative-bias tensor fall back to layer 0's
ggml_tensor * attn_rel_b = model.layers[il].attn_rel_b_enc ? model.layers[il].attn_rel_b_enc : model.layers[0].attn_rel_b_enc;
ggml_tensor * kq_b = build_pos_bias(pos_bucket_enc, attn_rel_b);
// attention scale is 1.0f — no 1/sqrt(n_embd_head) scaling applied here
cur = build_attn(inp_attn,
model.layers[il].wo_enc, nullptr,
Qcur, Kcur, Vcur, kq_b, nullptr, nullptr, 1.0f, il);
cb(cur, "kqv_out", il);
}
// on the final layer, keep only the rows selected for output
if (il == n_layer - 1 && inp_out_ids) {
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
}
// residual connection around attention
ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
cb(ffn_inp, "ffn_inp", il);
// feed-forward network
{
cur = build_norm(ffn_inp,
model.layers[il].ffn_norm_enc, NULL,
LLM_NORM_RMS, il);
cb(cur, "ffn_norm", il);
// T5 uses relu, flan-T5 uses gelu-gated
cur = build_ffn(cur,
model.layers[il].ffn_up_enc, NULL, NULL,
model.layers[il].ffn_gate_enc, NULL, NULL,
model.layers[il].ffn_down_enc, NULL, NULL,
NULL,
model.layers[il].ffn_gate_enc ? LLM_FFN_GELU : LLM_FFN_RELU,
model.layers[il].ffn_gate_enc ? LLM_FFN_PAR : LLM_FFN_SEQ,
il);
cb(cur, "ffn_out", il);
}
// residual connection around the FFN
cur = ggml_add(ctx0, cur, ffn_inp);
cb(cur, "ffn_out", il);
cur = build_cvec(cur, il);
cb(cur, "l_out", il);
// input for next layer
inpL = cur;
}
cur = inpL;
cb(cur, "result_embd", -1);
// final RMS norm over the encoder output
cur = build_norm(cur,
model.output_norm_enc, NULL,
LLM_NORM_RMS, -1);
cb(cur, "result_norm", -1);
res->t_embd = cur;
ggml_build_forward_expand(gf, cur);
}

3
src/models/t5encoder.cpp Normal file
View File

@ -0,0 +1,3 @@
#include "models.h"
// LLM_ARCH_T5ENCODER delegating constructor: an encoder-only T5 graph is
// exactly the encoder specialization, so forward everything to llm_build_t5<true>.
llm_build_t5encoder::llm_build_t5encoder(const llama_model & model, const llm_graph_params & params) : llm_build_t5<true>(model, params) {}