model : support NVFP4 tensors for Gemma4 (#21971)

* support NVFP4 tensors for Gemma4

* add wo_s to build_attn

* add wo_s to build_attn

* fix glm4 (keep the F32-precision wo matmul and apply wo_s after it)
This commit is contained in:
Sigbjørn Skjæret 2026-04-16 16:51:47 +02:00 committed by GitHub
parent b572d1ecd6
commit f772f6e434
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
105 changed files with 149 additions and 148 deletions

View File

@ -2011,6 +2011,7 @@ ggml_tensor * llm_graph_context::build_attn(
llm_graph_input_attn_no_cache * inp,
ggml_tensor * wo,
ggml_tensor * wo_b,
ggml_tensor * wo_s,
ggml_tensor * q_cur,
ggml_tensor * k_cur,
ggml_tensor * v_cur,
@ -2044,7 +2045,7 @@ ggml_tensor * llm_graph_context::build_attn(
cb(cur, "kqv_out", il);
if (wo) {
cur = build_lora_mm(wo, cur);
cur = build_lora_mm(wo, cur, wo_s);
}
if (wo_b) {
@ -2095,6 +2096,7 @@ ggml_tensor * llm_graph_context::build_attn(
llm_graph_input_attn_kv * inp,
ggml_tensor * wo,
ggml_tensor * wo_b,
ggml_tensor * wo_s,
ggml_tensor * q_cur,
ggml_tensor * k_cur,
ggml_tensor * v_cur,
@ -2146,10 +2148,15 @@ ggml_tensor * llm_graph_context::build_attn(
}
if (wo) {
cur = build_lora_mm(wo, cur);
if (arch == LLM_ARCH_GLM4 || arch == LLM_ARCH_GLM4_MOE || arch == LLM_ARCH_JAIS2) {
// GLM4, GLM4_MOE, and JAIS2 seem to have numerical issues with half-precision accumulators
cur = build_lora_mm(wo, cur);
ggml_mul_mat_set_prec(cur, GGML_PREC_F32);
if (wo_s) {
cur = ggml_mul(ctx0, cur, wo_s);
}
} else {
cur = build_lora_mm(wo, cur, wo_s);
}
}
@ -2193,6 +2200,7 @@ ggml_tensor * llm_graph_context::build_attn(
llm_graph_input_attn_k * inp,
ggml_tensor * wo,
ggml_tensor * wo_b,
ggml_tensor * wo_s,
ggml_tensor * q_cur,
ggml_tensor * k_cur,
ggml_tensor * v_cur,
@ -2227,10 +2235,15 @@ ggml_tensor * llm_graph_context::build_attn(
cb(cur, "kqv_out", il);
if (wo) {
cur = build_lora_mm(wo, cur);
if (arch == LLM_ARCH_GLM4 || arch == LLM_ARCH_GLM4_MOE) {
// GLM4 and GLM4_MOE seem to have numerical issues with half-precision accumulators
cur = build_lora_mm(wo, cur);
ggml_mul_mat_set_prec(cur, GGML_PREC_F32);
if (wo_s) {
cur = ggml_mul(ctx0, cur, wo_s);
}
} else {
cur = build_lora_mm(wo, cur, wo_s);
}
}
@ -2245,6 +2258,7 @@ ggml_tensor * llm_graph_context::build_attn(
llm_graph_input_attn_kv_iswa * inp,
ggml_tensor * wo,
ggml_tensor * wo_b,
ggml_tensor * wo_s,
ggml_tensor * q_cur,
ggml_tensor * k_cur,
ggml_tensor * v_cur,
@ -2313,7 +2327,7 @@ ggml_tensor * llm_graph_context::build_attn(
}
if (wo) {
cur = build_lora_mm(wo, cur);
cur = build_lora_mm(wo, cur, wo_s);
}
if (wo_b) {
@ -2344,6 +2358,7 @@ ggml_tensor * llm_graph_context::build_attn(
llm_graph_input_attn_cross * inp,
ggml_tensor * wo,
ggml_tensor * wo_b,
ggml_tensor * wo_s,
ggml_tensor * q_cur,
ggml_tensor * k_cur,
ggml_tensor * v_cur,
@ -2368,7 +2383,7 @@ ggml_tensor * llm_graph_context::build_attn(
cb(cur, "kqv_out", il);
if (wo) {
cur = build_lora_mm(wo, cur);
cur = build_lora_mm(wo, cur, wo_s);
}
if (wo_b) {

View File

@ -892,6 +892,7 @@ struct llm_graph_context {
llm_graph_input_attn_no_cache * inp,
ggml_tensor * wo,
ggml_tensor * wo_b,
ggml_tensor * wo_s,
ggml_tensor * q_cur, // [n_embd_head_q, n_head_q, n_tokens]
ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens]
ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens]
@ -907,6 +908,7 @@ struct llm_graph_context {
llm_graph_input_attn_kv * inp,
ggml_tensor * wo,
ggml_tensor * wo_b,
ggml_tensor * wo_s,
ggml_tensor * q_cur, // [n_embd_head_q, n_head_q, n_tokens]
ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens]
ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens]
@ -922,6 +924,7 @@ struct llm_graph_context {
llm_graph_input_attn_k * inp,
ggml_tensor * wo,
ggml_tensor * wo_b,
ggml_tensor * wo_s,
ggml_tensor * q_cur, // [n_embd_head_q, n_head_q, n_tokens]
ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens]
ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens]
@ -938,6 +941,7 @@ struct llm_graph_context {
llm_graph_input_attn_kv_iswa * inp,
ggml_tensor * wo,
ggml_tensor * wo_b,
ggml_tensor * wo_s,
ggml_tensor * q_cur, // [n_embd_head_q, n_head_q, n_tokens]
ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens] optional
ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens] optional
@ -953,6 +957,7 @@ struct llm_graph_context {
llm_graph_input_attn_cross * inp,
ggml_tensor * wo,
ggml_tensor * wo_b,
ggml_tensor * wo_s,
ggml_tensor * q_cur, // [n_embd_head_q, n_head_q, n_tokens]
ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens]
ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens]

View File

@ -80,7 +80,7 @@ llm_build_afmoe::llm_build_afmoe(const llama_model & model, const llm_graph_para
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
cur = build_attn(inp_attn,
NULL, NULL, // wo will be applied after gating
NULL, NULL, NULL, // wo will be applied after gating
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
cb(cur, "attn_out", il);
@ -91,7 +91,7 @@ llm_build_afmoe::llm_build_afmoe(const llama_model & model, const llm_graph_para
cb(cur, "attn_gated", il);
// now apply output projection
cur = build_lora_mm(model.layers[il].wo, cur);
cur = build_lora_mm(model.layers[il].wo, cur, model.layers[il].wo_s);
cb(cur, "attn_o_proj", il);
}

View File

@ -1,7 +1,5 @@
#include "models.h"
llm_build_apertus::llm_build_apertus(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v();
@ -62,7 +60,7 @@ llm_build_apertus::llm_build_apertus(const llama_model & model, const llm_graph_
cb(Vcur, "Vcur_pos", il);
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
cb(cur, "attn_out", il);
}

View File

@ -1,6 +1,5 @@
#include "models.h"
llm_build_arcee::llm_build_arcee(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v();
@ -78,7 +77,7 @@ llm_build_arcee::llm_build_arcee(const llama_model & model, const llm_graph_para
cb(Vcur, "Vcur", il);
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
cb(cur, "attn_out", il);
}

View File

@ -60,7 +60,7 @@ llm_build_arctic::llm_build_arctic(const llama_model & model, const llm_graph_pa
cb(Vcur, "Vcur", il);
cur = build_attn(inp_attn,
model.layers[il].wo, NULL,
model.layers[il].wo, NULL, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}

View File

@ -1,6 +1,5 @@
#include "models.h"
llm_build_baichuan::llm_build_baichuan(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v();
@ -67,7 +66,7 @@ llm_build_baichuan::llm_build_baichuan(const llama_model & model, const llm_grap
cb(Vcur, "Vcur", il);
cur = build_attn(inp_attn,
model.layers[il].wo, NULL,
model.layers[il].wo, NULL, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}

View File

@ -70,7 +70,7 @@ llm_build_bailingmoe::llm_build_bailingmoe(const llama_model & model, const llm_
cb(Vcur, "Vcur", il);
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_rot)), il);
}

View File

@ -56,7 +56,7 @@ llm_build_bailingmoe2::llm_build_bailingmoe2(const llama_model & model, const ll
cb(Vcur, "Vcur", il);
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
}

View File

@ -100,7 +100,7 @@ llm_build_bert::llm_build_bert(const llama_model & model, const llm_graph_params
cb(Vcur, "Vcur", il);
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
cb(cur, "kqv_out", il);
}

View File

@ -73,7 +73,7 @@ llm_build_bitnet::llm_build_bitnet(const llama_model & model, const llm_graph_pa
cb(Vcur, "Vcur", il);
cur = build_attn(inp_attn,
NULL, NULL,
NULL, NULL, NULL,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
cur = build_norm(cur,

View File

@ -45,7 +45,7 @@ llm_build_bloom::llm_build_bloom(const llama_model & model, const llm_graph_para
cb(Vcur, "Vcur", il);
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}

View File

@ -94,7 +94,7 @@ llm_build_chameleon::llm_build_chameleon(const llama_model & model, const llm_gr
cb(Vcur, "Vcur", il);
cur = build_attn(inp_attn,
model.layers[il].wo, nullptr,
model.layers[il].wo, nullptr, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}

View File

@ -80,7 +80,7 @@ llm_build_chatglm::llm_build_chatglm(const llama_model & model, const llm_graph_
cb(Vcur, "Vcur", il);
cur = build_attn(inp_attn,
model.layers[il].wo, NULL,
model.layers[il].wo, NULL, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}

View File

@ -55,7 +55,7 @@ llm_build_codeshell::llm_build_codeshell(const llama_model & model, const llm_gr
cb(Vcur, "Vcur", il);
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}

View File

@ -28,18 +28,20 @@ llm_build_cogvlm::llm_build_cogvlm(const llama_model & model, const llm_graph_pa
for (int il = 0; il < n_layer; ++il) {
// get either the text or image weight tensors
ggml_tensor *wqkv, *wo;
ggml_tensor *wqkv, *wo, *wo_s;
ggml_tensor *ffn_gate, *ffn_down, *ffn_up;
if (is_text) {
wqkv = model.layers[il].wqkv;
wo = model.layers[il].wo;
wo_s = model.layers[il].wo_s;
ffn_gate = model.layers[il].ffn_gate;
ffn_down = model.layers[il].ffn_down;
ffn_up = model.layers[il].ffn_up;
} else {
wqkv = model.layers[il].visexp_attn_wqkv;
wo = model.layers[il].visexp_attn_wo;
wo_s = nullptr;
ffn_gate = model.layers[il].visexp_ffn_gate;
ffn_down = model.layers[il].visexp_ffn_down;
ffn_up = model.layers[il].visexp_ffn_up;
@ -64,7 +66,7 @@ llm_build_cogvlm::llm_build_cogvlm(const llama_model & model, const llm_graph_pa
Kcur = ggml_rope(ctx0, Kcur, inp_pos, n_embd_head, rope_type);
cur = build_attn(inp_attn,
wo, nullptr,
wo, nullptr, wo_s,
Qcur, Kcur, Vcur,
nullptr, nullptr, nullptr,
kq_scale, il);

View File

@ -80,7 +80,7 @@ llm_build_cohere2_iswa::llm_build_cohere2_iswa(const llama_model & model, const
cb(Vcur, "Vcur", il);
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}

View File

@ -73,7 +73,7 @@ llm_build_command_r::llm_build_command_r(const llama_model & model, const llm_gr
cb(Vcur, "Vcur", il);
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
}
if (il == n_layer - 1 && inp_out_ids) {

View File

@ -61,7 +61,7 @@ llm_build_dbrx::llm_build_dbrx(const llama_model & model, const llm_graph_params
cb(Vcur, "Vcur", il);
cur = build_attn(inp_attn,
model.layers[il].wo, NULL,
model.layers[il].wo, NULL, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}

View File

@ -1,7 +1,5 @@
#include "models.h"
llm_build_deci::llm_build_deci(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v();
@ -80,7 +78,7 @@ llm_build_deci::llm_build_deci(const llama_model & model, const llm_graph_params
cb(Vcur, "Vcur", il);
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
}
if (il == n_layer - 1 && inp_out_ids) {

View File

@ -68,7 +68,7 @@ llm_build_deepseek::llm_build_deepseek(const llama_model & model, const llm_grap
cb(Vcur, "Vcur", il);
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
}
if (il == n_layer - 1 && inp_out_ids) {

View File

@ -84,7 +84,7 @@ llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_gr
cb(Kcur, "k_pe", il);
cur = build_attn(inp_attn_kv,
model.layers[il].wo, NULL,
model.layers[il].wo, NULL, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
cb(cur, "attn_out", il);
}
@ -182,7 +182,7 @@ llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_gr
// note: MLA with the absorption optimization converts into MQA (ie: GQA with 1 group)
cur = build_attn(inp_attn_k,
model.layers[il].wo, NULL,
model.layers[il].wo, NULL, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, model.layers[il].wv_b, kq_scale, il);
} else {
ggml_tensor * kv = ggml_mul_mat(ctx0, model.layers[il].wkv_b, kv_cmpr);
@ -219,7 +219,7 @@ llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_gr
// note: MLA without the absorption optimization converts into MHA (ie: GQA with full n_head groups)
cur = build_attn(inp_attn_kv,
model.layers[il].wo, NULL,
model.layers[il].wo, NULL, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
}
}

View File

@ -59,7 +59,7 @@ llm_build_dots1::llm_build_dots1(const llama_model & model, const llm_graph_para
cb(Vcur, "Vcur", il);
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
}
if (il == n_layer - 1 && inp_out_ids) {

View File

@ -1,7 +1,5 @@
#include "models.h"
llm_build_dream::llm_build_dream(const llama_model & model, const llm_graph_params & params) :
llm_graph_context(params) {
//copied from qwen2
@ -59,7 +57,7 @@ llm_build_dream::llm_build_dream(const llama_model & model, const llm_graph_para
cb(Vcur, "Vcur", il);
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
}
if (il == n_layer - 1 && inp_out_ids) {

View File

@ -63,7 +63,7 @@ llm_build_ernie4_5_moe::llm_build_ernie4_5_moe(const llama_model & model, const
cb(Vcur, "Vcur", il);
cur = build_attn(inp_attn,
model.layers[il].wo, NULL,
model.layers[il].wo, NULL, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
cb(cur, "attn_out", il);
}

View File

@ -62,7 +62,7 @@ llm_build_ernie4_5::llm_build_ernie4_5(const llama_model & model, const llm_grap
cb(Vcur, "Vcur", il);
cur = build_attn(inp_attn,
model.layers[il].wo, NULL,
model.layers[il].wo, NULL, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
}
if (il == n_layer - 1) {

View File

@ -53,7 +53,7 @@ llm_build_eurobert::llm_build_eurobert(const llama_model & model, const llm_grap
cb(Vcur, "Vcur", il);
cur = build_attn(inp_attn,
model.layers[il].wo, nullptr,
model.layers[il].wo, nullptr, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
cb(cur, "kqv_out", il);
}

View File

@ -65,7 +65,7 @@ llm_build_exaone_moe::llm_build_exaone_moe(const llama_model & model, const llm_
cb(Vcur, "Vcur", il);
cur = build_attn(inp_attn_iswa,
model.layers[il].wo, NULL,
model.layers[il].wo, NULL, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
cb(cur, "attn_out", il);
}

View File

@ -1,7 +1,5 @@
#include "models.h"
llm_build_exaone::llm_build_exaone(const llama_model & model, const llm_graph_params & params) :
llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v();
@ -67,7 +65,7 @@ llm_build_exaone::llm_build_exaone(const llama_model & model, const llm_graph_pa
cb(Vcur, "Vcur", il);
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
}
if (il == n_layer - 1 && inp_out_ids) {

View File

@ -1,6 +1,5 @@
#include "models.h"
template <bool iswa>
llm_build_exaone4<iswa>::llm_build_exaone4(const llama_model & model, const llm_graph_params & params) :
llm_graph_context(params) {
@ -69,7 +68,7 @@ llm_build_exaone4<iswa>::llm_build_exaone4(const llama_model & model, const llm_
cb(Vcur, "Vcur", il);
cur = build_attn(inp_attn,
model.layers[il].wo, NULL,
model.layers[il].wo, NULL, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
cb(cur, "attn_out", il);
}

View File

@ -52,7 +52,7 @@ llm_build_falcon_h1::llm_build_falcon_h1(const llama_model & model, const llm_gr
cb(Vcur, "Vcur-post-rope", il);
ggml_tensor * attn_out = build_attn(inp->get_attn(),
model.layers[il].wo, NULL,
model.layers[il].wo, NULL, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
cb(attn_out, "attn_out", il);

View File

@ -1,6 +1,5 @@
#include "models.h"
llm_build_falcon::llm_build_falcon(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v();
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
@ -67,7 +66,7 @@ llm_build_falcon::llm_build_falcon(const llama_model & model, const llm_graph_pa
cb(Vcur, "Vcur", il);
cur = build_attn(inp_attn,
model.layers[il].wo, NULL,
model.layers[il].wo, NULL, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}

View File

@ -65,7 +65,7 @@ llm_build_gemma_embedding::llm_build_gemma_embedding(const llama_model & model,
cur =
build_attn(inp_attn,
model.layers[il].wo, NULL,
model.layers[il].wo, NULL, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f, il);
}

View File

@ -1,6 +1,5 @@
#include "models.h"
llm_build_gemma::llm_build_gemma(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v();
@ -60,7 +59,7 @@ llm_build_gemma::llm_build_gemma(const llama_model & model, const llm_graph_para
cb(Qcur, "Qcur_scaled", il);
cur = build_attn(inp_attn,
model.layers[il].wo, NULL,
model.layers[il].wo, NULL, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f, il);
}
if (il == n_layer - 1 && inp_out_ids) {

View File

@ -61,7 +61,7 @@ llm_build_gemma2_iswa::llm_build_gemma2_iswa(const llama_model & model, const ll
Qcur = ggml_scale(ctx0, Qcur, hparams.f_attention_scale);
cur = build_attn(inp_attn,
model.layers[il].wo, NULL,
model.layers[il].wo, NULL, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f, il);
}
if (il == n_layer - 1 && inp_out_ids) {

View File

@ -84,7 +84,7 @@ llm_build_gemma3<iswa>::llm_build_gemma3(const llama_model & model, const llm_gr
Qcur = ggml_scale(ctx0, Qcur, hparams.f_attention_scale);
cur = build_attn(inp_attn,
model.layers[il].wo, NULL,
model.layers[il].wo, NULL, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f, il);
}
if (il == n_layer - 1 && inp_out_ids) {

View File

@ -103,7 +103,7 @@ llm_build_gemma3n_iswa::llm_build_gemma3n_iswa(const llama_model & model, const
cb(Kcur, "Kcur_pos", il);
cur = build_attn(inp_attn, model.layers[il].wo,
NULL, Qcur, Kcur, Vcur, nullptr, nullptr, nullptr,
NULL, model.layers[il].wo_s, Qcur, Kcur, Vcur, nullptr, nullptr, nullptr,
hparams.f_attention_scale, il);
} else {
// reuse KV cache of earlier layers
@ -119,7 +119,7 @@ llm_build_gemma3n_iswa::llm_build_gemma3n_iswa(const llama_model & model, const
cb(Qcur, "Qcur_pos", il);
cur = build_attn(inp_attn,
model.layers[il].wo, NULL,
model.layers[il].wo, NULL, model.layers[il].wo_s,
Qcur, nullptr, nullptr, nullptr, nullptr, nullptr, hparams.f_attention_scale, il);
}
cur = build_norm(cur, model.layers[il].attn_post_norm, NULL, LLM_NORM_RMS, il);

View File

@ -62,7 +62,7 @@ llm_build_gemma4_iswa::llm_build_gemma4_iswa(const llama_model & model, const ll
// this is to mirror Gemma4Attention in pytorch code
ggml_tensor * Qcur;
{
Qcur = build_lora_mm(model.layers[il].wq, cur);
Qcur = build_lora_mm(model.layers[il].wq, cur, model.layers[il].wq_s);
cb(Qcur, "Qcur", il);
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
@ -77,11 +77,11 @@ llm_build_gemma4_iswa::llm_build_gemma4_iswa(const llama_model & model, const ll
// self-attention
if (hparams.has_kv(il)) {
ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur, model.layers[il].wk_s);
cb(Kcur, "Kcur", il);
ggml_tensor * Vcur = model.layers[il].wv
? build_lora_mm(model.layers[il].wv, cur)
? build_lora_mm(model.layers[il].wv, cur, model.layers[il].wv_s)
: Kcur; // if v_proj is not present, use Kcur as Vcur
cb(Vcur, "Vcur", il);
@ -100,12 +100,12 @@ llm_build_gemma4_iswa::llm_build_gemma4_iswa(const llama_model & model, const ll
cb(Kcur, "Kcur_pos", il);
cur = build_attn(inp_attn, model.layers[il].wo,
nullptr, Qcur, Kcur, Vcur, nullptr, nullptr, nullptr,
nullptr, model.layers[il].wo_s, Qcur, Kcur, Vcur, nullptr, nullptr, nullptr,
hparams.f_attention_scale, il);
} else {
// reuse KV cache of earlier layers
cur = build_attn(inp_attn,
model.layers[il].wo, nullptr,
model.layers[il].wo, nullptr, model.layers[il].wo_s,
Qcur, nullptr, nullptr, nullptr, nullptr, nullptr, hparams.f_attention_scale, il);
}
@ -132,9 +132,9 @@ llm_build_gemma4_iswa::llm_build_gemma4_iswa(const llama_model & model, const ll
cb(cur_mlp, "ffn_norm_1", il);
cur_mlp = build_ffn(cur_mlp,
model.layers[il].ffn_up, nullptr, nullptr,
model.layers[il].ffn_gate, nullptr, nullptr,
model.layers[il].ffn_down, nullptr, nullptr,
model.layers[il].ffn_up, nullptr, model.layers[il].ffn_up_s,
model.layers[il].ffn_gate, nullptr, model.layers[il].ffn_gate_s,
model.layers[il].ffn_down, nullptr, model.layers[il].ffn_down_s,
nullptr,
LLM_FFN_GELU, LLM_FFN_PAR, il);
cur_mlp = build_norm(cur_mlp,
@ -184,9 +184,9 @@ llm_build_gemma4_iswa::llm_build_gemma4_iswa(const llama_model & model, const ll
cb(cur, "ffn_norm", il);
cur = build_ffn(cur,
model.layers[il].ffn_up, nullptr, nullptr,
model.layers[il].ffn_gate, nullptr, nullptr,
model.layers[il].ffn_down, nullptr, nullptr,
model.layers[il].ffn_up, nullptr, model.layers[il].ffn_up_s,
model.layers[il].ffn_gate, nullptr, model.layers[il].ffn_gate_s,
model.layers[il].ffn_down, nullptr, model.layers[il].ffn_down_s,
nullptr,
LLM_FFN_GELU, LLM_FFN_PAR, il);
cb(cur, "ffn_out", il);

View File

@ -94,7 +94,7 @@ llm_build_glm4_moe::llm_build_glm4_moe(const llama_model & model, const llm_grap
cb(Vcur, "Vcur", il);
cur = build_attn(inp_attn,
model.layers[il].wo, NULL,
model.layers[il].wo, NULL, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}
if (il == n_transformer_layers - 1 && inp_out_ids) {

View File

@ -1,7 +1,5 @@
#include "models.h"
llm_build_glm4::llm_build_glm4(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v();
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
@ -100,7 +98,7 @@ llm_build_glm4::llm_build_glm4(const llama_model & model, const llm_graph_params
cb(Vcur, "Vcur", il);
cur = build_attn(inp_attn,
model.layers[il].wo, NULL,
model.layers[il].wo, NULL, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
}
if (il == n_transformer_layers - 1 && inp_out_ids) {

View File

@ -49,7 +49,7 @@ llm_build_gpt2::llm_build_gpt2(const llama_model & model, const llm_graph_params
cb(Vcur, "Vcur", il);
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}

View File

@ -1,6 +1,5 @@
#include "models.h"
llm_build_gptneox::llm_build_gptneox(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v();
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
@ -55,7 +54,7 @@ llm_build_gptneox::llm_build_gptneox(const llama_model & model, const llm_graph_
cb(Vcur, "Vcur", il);
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}

View File

@ -116,7 +116,7 @@ ggml_tensor * llm_build_granite_hybrid::build_attention_layer(ggml_tensor *
const float kq_scale =
hparams.f_attention_scale == 0.0f ? 1.0f / sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
cb(cur, "attn_out", il);
return cur;

View File

@ -124,7 +124,7 @@ ggml_tensor * llm_build_granite::build_attention_layer(
const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
cb(cur, "attn_out", il);
return cur;

View File

@ -69,7 +69,7 @@ llm_build_grok::llm_build_grok(const llama_model & model, const llm_graph_params
cb(Vcur, "Vcur", il);
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f, il);
}
if (il == n_layer - 1 && inp_out_ids) {

View File

@ -60,7 +60,7 @@ llm_build_grovemoe::llm_build_grovemoe(const llama_model & model, const llm_grap
cb(Vcur, "Vcur", il);
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
}

View File

@ -83,7 +83,7 @@ llm_build_hunyuan_dense::llm_build_hunyuan_dense(const llama_model & model, cons
cb(Qcur, "Qcur_norm", il);
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
cb(cur, "attn_out", il);
}

View File

@ -84,7 +84,7 @@ llm_build_hunyuan_moe::llm_build_hunyuan_moe(const llama_model & model, const ll
cb(Qcur, "Qcur_norm", il);
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
cb(cur, "attn_out", il);
}

View File

@ -69,7 +69,7 @@ llm_build_internlm2::llm_build_internlm2(const llama_model & model, const llm_gr
cb(Vcur, "Vcur", il);
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}
if (il == n_layer - 1 && inp_out_ids) {

View File

@ -39,7 +39,7 @@ llm_build_jais::llm_build_jais(const llama_model & model, const llm_graph_params
cb(Vcur, "Vcur", il);
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/float(n_embd_head), il);
}
if (il == n_layer - 1 && inp_out_ids) {

View File

@ -68,7 +68,7 @@ llm_build_jais2::llm_build_jais2(const llama_model & model, const llm_graph_para
cb(Kcur, "Kcur_rope", il);
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}

View File

@ -42,7 +42,7 @@ llm_build_jamba::llm_build_jamba(const llama_model & model, const llm_graph_para
// No RoPE :)
cur = build_attn(inp_hybrid->get_attn(),
model.layers[il].wo, NULL,
model.layers[il].wo, NULL, model.layers[il].wo_s,
Qcur, Kcur, Vcur, NULL, NULL, NULL, 1.0f/sqrtf(float(n_embd_head)), il);
}
if (il == n_layer - 1 && inp_out_ids) {

View File

@ -268,7 +268,7 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll
ggml_tensor * Vcur = kv_cmpr;
cb(Vcur, "Vcur", il);
cur = build_attn(inp_attn_k, layer.wo, NULL, Qcur, Kcur, Vcur, nullptr, nullptr, layer.wv_b, kq_scale_mla, il);
cur = build_attn(inp_attn_k, layer.wo, NULL, layer.wo_s, Qcur, Kcur, Vcur, nullptr, nullptr, layer.wv_b, kq_scale_mla, il);
cb(cur, "mla_out", il);
} else { // MLA KV cache disabled. Fall back to MHA KV cache.
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head_k_mla, n_head, n_tokens);
@ -299,7 +299,7 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll
// Direct softmax attention (with MHA KV cache)
// Use build_attn with inp_attn for proper mask handling
cur = build_attn(inp_attn_kv, layer.wo, NULL, Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale_mla, il);
cur = build_attn(inp_attn_kv, layer.wo, NULL, layer.wo_s, Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale_mla, il);
cb(cur, "mla_out", il);
}
}

View File

@ -66,7 +66,7 @@ llm_build_lfm2<iswa>::llm_build_lfm2(const llama_model & model, const llm_graph_
attn_factor, beta_fast, beta_slow);
cur = build_attn(inp_attn,
model.layers[il].wo, NULL,
model.layers[il].wo, NULL, model.layers[il].wo_s,
q, k, v, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
cb(cur, "model.layers.{}.self_attn.out_proj", il);

View File

@ -66,7 +66,7 @@ llm_build_llada_moe::llm_build_llada_moe(const llama_model & model, const llm_gr
cb(Vcur, "Vcur", il);
cur = build_attn(inp_attn,
model.layers[il].wo, NULL,
model.layers[il].wo, NULL, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}
if (il == n_layer - 1 && inp_out_ids) {

View File

@ -53,7 +53,7 @@ llm_build_llada::llm_build_llada(const llama_model & model, const llm_graph_para
cb(Vcur, "Vcur", il);
cur = build_attn(inp_attn,
model.layers[il].wo, NULL,
model.layers[il].wo, NULL, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
}
if (il == n_layer - 1 && inp_out_ids) {

View File

@ -95,7 +95,7 @@ llm_build_llama_iswa::llm_build_llama_iswa(const llama_model & model, const llm_
cb(Kcur, "Kcur_normed", il);
}
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
cb(cur, "attn_out", il);
}

View File

@ -89,7 +89,7 @@ llm_build_llama<embed>::llm_build_llama(const llama_model & model, const llm_gra
cb(Kcur, "Kcur_normed", il);
}
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
if (model.layers[il].wo_s) {
cur = ggml_mul(ctx0, cur, model.layers[il].wo_s);

View File

@ -66,7 +66,7 @@ llm_build_maincoder::llm_build_maincoder(const llama_model & model, const llm_gr
cb(Vcur, "Vcur", il);
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}
if (il == n_layer - 1 && inp_out_ids) {

View File

@ -58,7 +58,7 @@ llm_build_mimo2_iswa::llm_build_mimo2_iswa(const llama_model & model, const llm_
ggml_tensor * sinks = model.layers[il].attn_sinks;
cur = build_attn(inp_attn,
model.layers[il].wo, NULL,
model.layers[il].wo, NULL, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, sinks, nullptr, 1.0f/sqrtf(float(n_embd_head_k)), il);
}

View File

@ -134,7 +134,7 @@ llm_build_minicpm3::llm_build_minicpm3(const llama_model & model, const llm_grap
cb(k_states, "k_states", il);
cur = build_attn(inp_attn,
model.layers[il].wo, NULL,
model.layers[il].wo, NULL, model.layers[il].wo_s,
q_states, k_states, v_states, nullptr, nullptr, nullptr, kq_scale, il);
}
if (il == n_layer - 1 && inp_out_ids) {

View File

@ -64,7 +64,7 @@ llm_build_minimax_m2::llm_build_minimax_m2(const llama_model & model, const llm_
cb(Vcur, "Vcur", il);
cur = build_attn(inp_attn,
model.layers[il].wo, NULL,
model.layers[il].wo, NULL, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}

View File

@ -86,7 +86,7 @@ llm_build_mistral3::llm_build_mistral3(const llama_model & model, const llm_grap
}
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
cb(cur, "attn_out", il);
}

View File

@ -64,7 +64,7 @@ llm_build_modern_bert::llm_build_modern_bert(const llama_model & model, const ll
cb(Vcur, "Vcur", il);
cur = build_attn(inp_attn,
model.layers[il].wo, nullptr,
model.layers[il].wo, nullptr, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
cb(cur, "kqv_out", il);

View File

@ -1,7 +1,5 @@
#include "models.h"
llm_build_mpt::llm_build_mpt(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v();
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
@ -76,7 +74,7 @@ llm_build_mpt::llm_build_mpt(const llama_model & model, const llm_graph_params &
cb(Vcur, "Vcur", il);
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
}

View File

@ -98,7 +98,7 @@ ggml_tensor * llm_build_nemotron_h::build_attention_layer(ggml_tensor *
const float kq_scale =
hparams.f_attention_scale == 0.0f ? 1.0f / sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
cb(cur, "attn_out", il);
return cur;

View File

@ -70,7 +70,7 @@ llm_build_nemotron::llm_build_nemotron(const llama_model & model, const llm_grap
cb(Vcur, "Vcur", il);
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}
if (il == n_layer - 1 && inp_out_ids) {

View File

@ -57,7 +57,7 @@ llm_build_neo_bert::llm_build_neo_bert(const llama_model & model, const llm_grap
cb(Vcur, "Vcur", il);
cur = build_attn(inp_attn,
model.layers[il].wo, nullptr,
model.layers[il].wo, nullptr, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
cb(cur, "kqv_out", il);
}

View File

@ -69,7 +69,7 @@ llm_build_olmo::llm_build_olmo(const llama_model & model, const llm_graph_params
cb(Vcur, "Vcur", il);
cur = build_attn(inp_attn,
model.layers[il].wo, nullptr,
model.layers[il].wo, nullptr, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}
if (il == n_layer - 1 && inp_out_ids) {

View File

@ -89,7 +89,7 @@ llm_build_olmo2<iswa>::llm_build_olmo2(const llama_model & model, const llm_grap
cb(Vcur, "Vcur", il);
cur = build_attn(inp_attn,
model.layers[il].wo, NULL,
model.layers[il].wo, NULL, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}
if (il == n_layer - 1 && inp_out_ids) {

View File

@ -68,7 +68,7 @@ llm_build_olmoe::llm_build_olmoe(const llama_model & model, const llm_graph_para
cb(Vcur, "Vcur", il);
cur = build_attn(inp_attn,
model.layers[il].wo, NULL,
model.layers[il].wo, NULL, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}
if (il == n_layer - 1 && inp_out_ids) {

View File

@ -67,7 +67,7 @@ llm_build_openai_moe_iswa::llm_build_openai_moe_iswa(const llama_model & model,
cb(Vcur, "Vcur", il);
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, model.layers[il].attn_sinks, nullptr, 1.0f/sqrtf(float(n_rot)), il);
cb(cur, "attn_out", il);

View File

@ -73,7 +73,7 @@ llm_build_openelm::llm_build_openelm(const llama_model & model, const llm_graph_
cb(Qcur, "Vcur", il);
cur = build_attn(inp_attn,
model.layers[il].wo, NULL,
model.layers[il].wo, NULL, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}
if (il == n_layer - 1 && inp_out_ids) {

View File

@ -72,7 +72,7 @@ llm_build_orion::llm_build_orion(const llama_model & model, const llm_graph_para
cb(Vcur, "Vcur", il);
cur = build_attn(inp_attn,
model.layers[il].wo, NULL,
model.layers[il].wo, NULL, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}
if (il == n_layer - 1 && inp_out_ids) {

View File

@ -74,7 +74,7 @@ llm_build_paddleocr::llm_build_paddleocr(const llama_model & model, const llm_gr
cb(Vcur, "Vcur", il);
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}
if (il == n_layer - 1) {

View File

@ -1,6 +1,5 @@
#include "models.h"
llm_build_pangu_embedded::llm_build_pangu_embedded(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v();
@ -63,7 +62,7 @@ llm_build_pangu_embedded::llm_build_pangu_embedded(const llama_model & model, co
cb(Vcur, "Vcur", il);
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}

View File

@ -1,6 +1,5 @@
#include "models.h"
llm_build_phi2::llm_build_phi2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v();
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
@ -74,7 +73,7 @@ llm_build_phi2::llm_build_phi2(const llama_model & model, const llm_graph_params
Qcur = ggml_scale(ctx0, Qcur, 1.0f/sqrtf(float(n_embd_head)));
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f, il);
}
if (il == n_layer - 1 && inp_out_ids) {

View File

@ -80,7 +80,7 @@ llm_build_phi3<iswa>::llm_build_phi3(const llama_model & model, const llm_graph_
cb(Qcur, "Qcur", il);
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f, il);
}
if (il == n_layer - 1 && inp_out_ids) {

View File

@ -60,7 +60,7 @@ llm_build_plamo::llm_build_plamo(const llama_model & model, const llm_graph_para
cb(Vcur, "Vcur", il);
cur = build_attn(inp_attn,
model.layers[il].wo, NULL,
model.layers[il].wo, NULL, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}
if (il == n_layer - 1 && inp_out_ids) {

View File

@ -141,7 +141,7 @@ ggml_tensor * llm_build_plamo2::build_plamo2_attn_layer(llm_graph_input_attn_kv
ext_factor, attn_factor, beta_fast, beta_slow);
cur = build_attn(inp,
model.layers[il].wo, NULL,
model.layers[il].wo, NULL, model.layers[il].wo_s,
Qcur, Kcur, Vcur, NULL, NULL, NULL, 1.0f / sqrtf(float(n_embd_head_v)), il);
}

View File

@ -73,7 +73,7 @@ llm_build_plamo3<iswa>::llm_build_plamo3(const llama_model & model, const llm_gr
const float attn_scale = 1.0f / sqrtf(float(head_dim_q));
cur = build_attn(inp_attn,
model.layers[il].wo, NULL,
model.layers[il].wo, NULL, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, attn_scale, il);
cb(cur, "attn_out", il);

View File

@ -120,7 +120,7 @@ llm_build_plm::llm_build_plm(const llama_model & model, const llm_graph_params &
cb(k_states, "k_states", il);
cur = build_attn(inp_attn,
model.layers[il].wo, NULL,
model.layers[il].wo, NULL, model.layers[il].wo_s,
q_states, k_states, v_states, nullptr, nullptr, nullptr, kq_scale, il);
}
if (il == n_layer - 1 && inp_out_ids) {

View File

@ -1,6 +1,5 @@
#include "models.h"
llm_build_qwen::llm_build_qwen(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v();
@ -56,7 +55,7 @@ llm_build_qwen::llm_build_qwen(const llama_model & model, const llm_graph_params
cb(Vcur, "Vcur", il);
cur = build_attn(inp_attn,
model.layers[il].wo, NULL,
model.layers[il].wo, NULL, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}
if (il == n_layer - 1 && inp_out_ids) {

View File

@ -72,7 +72,7 @@ llm_build_qwen2::llm_build_qwen2(const llama_model & model, const llm_graph_para
cb(Vcur, "Vcur", il);
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}
if (il == n_layer - 1 && inp_out_ids) {

View File

@ -69,7 +69,7 @@ llm_build_qwen2moe::llm_build_qwen2moe(const llama_model & model, const llm_grap
cb(Vcur, "Vcur", il);
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}
if (il == n_layer - 1 && inp_out_ids) {

View File

@ -66,7 +66,7 @@ llm_build_qwen2vl::llm_build_qwen2vl(const llama_model & model, const llm_graph_
cb(Vcur, "Vcur", il);
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}
if (il == n_layer - 1 && inp_out_ids) {

View File

@ -66,7 +66,7 @@ llm_build_qwen3::llm_build_qwen3(const llama_model & model, const llm_graph_para
cb(Vcur, "Vcur", il);
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
if (model.layers[il].wo_s) {
cur = ggml_mul(ctx0, cur, model.layers[il].wo_s);

View File

@ -179,7 +179,7 @@ ggml_tensor * llm_build_qwen35::build_layer_attn(
const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f / sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
cur = build_attn(inp,
nullptr, nullptr,
nullptr, nullptr, nullptr,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
cb(cur, "attn_pregate", il);

View File

@ -179,7 +179,7 @@ ggml_tensor * llm_build_qwen35moe ::build_layer_attn(
const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f / sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
cur = build_attn(inp,
nullptr, nullptr,
nullptr, nullptr, nullptr,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
cb(cur, "attn_pregate", il);

View File

@ -66,7 +66,7 @@ llm_build_qwen3moe::llm_build_qwen3moe(const llama_model & model, const llm_grap
cb(Vcur, "Vcur", il);
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
if (model.layers[il].wo_s) {
cur = ggml_mul(ctx0, cur, model.layers[il].wo_s);

View File

@ -157,7 +157,7 @@ ggml_tensor * llm_build_qwen3next::build_layer_attn(
const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f / sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
cur = build_attn(inp,
nullptr, nullptr,
nullptr, nullptr, nullptr,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
cb(cur, "attn_pregate", il);
@ -172,7 +172,7 @@ ggml_tensor * llm_build_qwen3next::build_layer_attn(
cur = ggml_mul(ctx0, cur, gate);
cb(cur, "attn_gated", il);
cur = build_lora_mm(model.layers[il].wo, cur);
cur = build_lora_mm(model.layers[il].wo, cur, model.layers[il].wo_s);
cb(cur, "attn_output", il);
return cur;

View File

@ -72,7 +72,7 @@ llm_build_qwen3vlmoe::llm_build_qwen3vlmoe(const llama_model & model, const llm_
cb(Vcur, "Vcur", il);
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}

View File

@ -72,7 +72,7 @@ llm_build_qwen3vl::llm_build_qwen3vl(const llama_model & model, const llm_graph_
cb(Vcur, "Vcur", il);
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}

View File

@ -42,7 +42,7 @@ llm_build_refact::llm_build_refact(const llama_model & model, const llm_graph_pa
cb(Vcur, "Vcur", il);
cur = build_attn(inp_attn,
model.layers[il].wo, NULL,
model.layers[il].wo, NULL, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}
if (il == n_layer - 1 && inp_out_ids) {

View File

@ -68,7 +68,7 @@ llm_build_rnd1::llm_build_rnd1(const llama_model & model, const llm_graph_params
cb(Vcur, "Vcur", il);
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}
if (il == n_layer - 1 && inp_out_ids) {

View File

@ -71,7 +71,7 @@ llm_build_seed_oss::llm_build_seed_oss(const llama_model & model, const llm_grap
cb(Vcur, "Vcur", il);
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
cb(cur, "attn_out", il);
}

View File

@ -69,7 +69,7 @@ llm_build_smallthinker<iswa>::llm_build_smallthinker(const llama_model & model,
cb(Kcur, "Kcur", il);
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
}
if (il == n_layer - 1 && inp_out_ids) {

View File

@ -74,7 +74,7 @@ llm_build_smollm3::llm_build_smollm3(const llama_model & model, const llm_graph_
cb(Vcur, "Vcur", il);
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
cb(cur, "attn_out", il);
}

View File

@ -87,7 +87,7 @@ llm_build_stablelm::llm_build_stablelm(const llama_model & model, const llm_grap
cb(Vcur, "Vcur", il);
cur = build_attn(inp_attn,
model.layers[il].wo, NULL,
model.layers[il].wo, NULL, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}
if (il == n_layer - 1 && inp_out_ids) {

View File

@ -48,7 +48,7 @@ llm_build_starcoder::llm_build_starcoder(const llama_model & model, const llm_gr
cb(Vcur, "Vcur", il);
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}
if (il == n_layer - 1 && inp_out_ids) {

Some files were not shown because too many files have changed in this diff Show More