cleanup
This commit is contained in:
parent
bffe3c9092
commit
8f328431a1
|
|
@ -645,11 +645,8 @@ ggml_tensor * llm_graph_context::build_ffn(
|
||||||
llm_ffn_gate_type type_gate,
|
llm_ffn_gate_type type_gate,
|
||||||
int il) const {
|
int il) const {
|
||||||
|
|
||||||
LLAMA_LOG_INFO("building lora: up is {%lld, %lld}\n input is {%lld, %lld}\n", up->ne[0], up->ne[1], cur->ne[0], cur->ne[1]);
|
|
||||||
|
|
||||||
ggml_tensor * tmp = up ? build_lora_mm(up, cur) : cur;
|
ggml_tensor * tmp = up ? build_lora_mm(up, cur) : cur;
|
||||||
LLAMA_LOG_INFO("Building FFN\n");
|
|
||||||
LLAMA_LOG_INFO("built lora: tmp is {%lld, %lld}\n", tmp->ne[0], tmp->ne[1]);
|
|
||||||
cb(tmp, "ffn_up", il);
|
cb(tmp, "ffn_up", il);
|
||||||
|
|
||||||
if (up_b) {
|
if (up_b) {
|
||||||
|
|
@ -672,8 +669,6 @@ ggml_tensor * llm_graph_context::build_ffn(
|
||||||
case LLM_FFN_PAR:
|
case LLM_FFN_PAR:
|
||||||
{
|
{
|
||||||
cur = build_lora_mm(gate, cur);
|
cur = build_lora_mm(gate, cur);
|
||||||
LLAMA_LOG_INFO("built lora: cur is {%lld, %lld}\n", cur->ne[0], cur->ne[1]);
|
|
||||||
|
|
||||||
cb(cur, "ffn_gate", il);
|
cb(cur, "ffn_gate", il);
|
||||||
} break;
|
} break;
|
||||||
}
|
}
|
||||||
|
|
@ -692,10 +687,6 @@ ggml_tensor * llm_graph_context::build_ffn(
|
||||||
cur = tmp;
|
cur = tmp;
|
||||||
}
|
}
|
||||||
|
|
||||||
if( gate && type_gate == LLM_FFN_PAR ) {
|
|
||||||
LLAMA_LOG_INFO("Gate Exists and In Paralell\n");
|
|
||||||
}
|
|
||||||
|
|
||||||
switch (type_op) {
|
switch (type_op) {
|
||||||
case LLM_FFN_SILU:
|
case LLM_FFN_SILU:
|
||||||
if (gate && type_gate == LLM_FFN_PAR) {
|
if (gate && type_gate == LLM_FFN_PAR) {
|
||||||
|
|
@ -744,7 +735,6 @@ ggml_tensor * llm_graph_context::build_ffn(
|
||||||
case LLM_FFN_GEGLU:
|
case LLM_FFN_GEGLU:
|
||||||
{
|
{
|
||||||
cur = ggml_geglu(ctx0, cur);
|
cur = ggml_geglu(ctx0, cur);
|
||||||
LLAMA_LOG_INFO("geglu split: cur is {%lld, %lld}\n", cur->ne[0], cur->ne[1]);
|
|
||||||
cb(cur, "ffn_geglu", il);
|
cb(cur, "ffn_geglu", il);
|
||||||
} break;
|
} break;
|
||||||
case LLM_FFN_REGLU:
|
case LLM_FFN_REGLU:
|
||||||
|
|
@ -757,16 +747,12 @@ ggml_tensor * llm_graph_context::build_ffn(
|
||||||
}
|
}
|
||||||
|
|
||||||
if (gate && type_gate == LLM_FFN_PAR) {
|
if (gate && type_gate == LLM_FFN_PAR) {
|
||||||
LLAMA_LOG_INFO("cur @ tmp: cur is {%lld, %lld}\n tmp is {%lld, %lld}\n", cur->ne[0], cur->ne[1], tmp->ne[0], tmp->ne[1]);
|
|
||||||
cur = ggml_mul(ctx0, cur, tmp);
|
cur = ggml_mul(ctx0, cur, tmp);
|
||||||
LLAMA_LOG_INFO("res is {%lld, %lld}\n", cur->ne[0], cur->ne[1]);
|
|
||||||
cb(cur, "ffn_gate_par", il);
|
cb(cur, "ffn_gate_par", il);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (down) {
|
if (down) {
|
||||||
cur = build_lora_mm(down, cur);
|
cur = build_lora_mm(down, cur);
|
||||||
LLAMA_LOG_INFO("built lora: cur is {%lld, %lld}\n", cur->ne[0], cur->ne[1]);
|
|
||||||
|
|
||||||
if (arch == LLM_ARCH_GLM4 || arch == LLM_ARCH_GLM4_MOE) {
|
if (arch == LLM_ARCH_GLM4 || arch == LLM_ARCH_GLM4_MOE) {
|
||||||
// GLM4 and GLM4_MOE seem to have numerical issues with half-precision accumulators
|
// GLM4 and GLM4_MOE seem to have numerical issues with half-precision accumulators
|
||||||
ggml_mul_mat_set_prec(cur, GGML_PREC_F32);
|
ggml_mul_mat_set_prec(cur, GGML_PREC_F32);
|
||||||
|
|
|
||||||
|
|
@ -2696,11 +2696,12 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
||||||
for(int i = 0; i < n_layer; ++i) {
|
for(int i = 0; i < n_layer; ++i) {
|
||||||
auto& layer = layers[i];
|
auto& layer = layers[i];
|
||||||
|
|
||||||
// layer 0 uses identity so we dont need weights for said layer
|
|
||||||
if ( i != 0 ) {
|
if ( i != 0 ) {
|
||||||
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
|
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
|
||||||
|
|
||||||
}
|
}
|
||||||
else{
|
else{
|
||||||
|
// layer 0 uses identity so we don't need weights for said layer
|
||||||
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
|
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -7546,14 +7547,14 @@ struct llm_build_modern_bert : public llm_graph_context {
|
||||||
const int64_t n_head = hparams.n_head();
|
const int64_t n_head = hparams.n_head();
|
||||||
const int64_t n_head_kv = hparams.n_head_kv();
|
const int64_t n_head_kv = hparams.n_head_kv();
|
||||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||||
const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); // == n_head_kv * n_embd_head
|
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
|
||||||
const int64_t n_tokens = ubatch.n_tokens;
|
const int64_t n_tokens = ubatch.n_tokens;
|
||||||
const int64_t n_ff = hparams.n_ff();
|
const int64_t n_ff = hparams.n_ff();
|
||||||
|
|
||||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||||
|
|
||||||
// RoPE params
|
// RoPE params
|
||||||
const int32_t rope_type = LLAMA_ROPE_TYPE_NEOX; // ModernBERT uses rotary
|
const int32_t rope_type = LLAMA_ROPE_TYPE_NEOX; // uses rotary
|
||||||
const int32_t n_rot = hparams.n_rot;
|
const int32_t n_rot = hparams.n_rot;
|
||||||
const int32_t n_ctx_orig = hparams.n_ctx_train;
|
const int32_t n_ctx_orig = hparams.n_ctx_train;
|
||||||
|
|
||||||
|
|
@ -7561,7 +7562,7 @@ struct llm_build_modern_bert : public llm_graph_context {
|
||||||
ggml_tensor * inpL;
|
ggml_tensor * inpL;
|
||||||
ggml_tensor * inp_pos = nullptr;
|
ggml_tensor * inp_pos = nullptr;
|
||||||
|
|
||||||
// ModernBERT needs positions for RoPE
|
// needs positions for RoPE
|
||||||
inp_pos = build_inp_pos();
|
inp_pos = build_inp_pos();
|
||||||
|
|
||||||
// embeddings (token + optional type), NO absolute pos embed
|
// embeddings (token + optional type), NO absolute pos embed
|
||||||
|
|
@ -7583,7 +7584,7 @@ struct llm_build_modern_bert : public llm_graph_context {
|
||||||
for (int il = 0; il < n_layer; ++il) {
|
for (int il = 0; il < n_layer; ++il) {
|
||||||
ggml_tensor * x = inpL;
|
ggml_tensor * x = inpL;
|
||||||
|
|
||||||
// pre-attention norm (attn_norm). Layer 0 may be Identity() -> nullptr
|
// pre attention norm (attn_norm). Layer 0 may be Identity() -> nullptr
|
||||||
ggml_tensor * x_attn_in = x;
|
ggml_tensor * x_attn_in = x;
|
||||||
if (model.layers[il].attn_norm) {
|
if (model.layers[il].attn_norm) {
|
||||||
x_attn_in = build_norm(x,
|
x_attn_in = build_norm(x,
|
||||||
|
|
@ -7592,6 +7593,7 @@ struct llm_build_modern_bert : public llm_graph_context {
|
||||||
LLM_NORM, il);
|
LLM_NORM, il);
|
||||||
cb(x_attn_in, "attn_pre_norm", il);
|
cb(x_attn_in, "attn_pre_norm", il);
|
||||||
} else {
|
} else {
|
||||||
|
LLAMA_LOG_INFO("Identity Tensor\n");
|
||||||
cb(x_attn_in, "attn_pre_norm_identity", il);
|
cb(x_attn_in, "attn_pre_norm_identity", il);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -7601,7 +7603,7 @@ struct llm_build_modern_bert : public llm_graph_context {
|
||||||
ggml_tensor * Kcur;
|
ggml_tensor * Kcur;
|
||||||
ggml_tensor * Vcur;
|
ggml_tensor * Vcur;
|
||||||
|
|
||||||
GGML_ASSERT(model.layers[il].wqkv); // ModernBERT uses fused QKV
|
GGML_ASSERT(model.layers[il].wqkv); // fused QKV
|
||||||
qkv = build_lora_mm(model.layers[il].wqkv, x_attn_in);
|
qkv = build_lora_mm(model.layers[il].wqkv, x_attn_in);
|
||||||
cb(qkv, "wqkv", il);
|
cb(qkv, "wqkv", il);
|
||||||
|
|
||||||
|
|
@ -7615,7 +7617,7 @@ struct llm_build_modern_bert : public llm_graph_context {
|
||||||
Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, qkv, n_embd_gqa, n_tokens, qkv->nb[1], 1*sizeof(float)*(n_embd)));
|
Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, qkv, n_embd_gqa, n_tokens, qkv->nb[1], 1*sizeof(float)*(n_embd)));
|
||||||
Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, qkv, n_embd_gqa, n_tokens, qkv->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
|
Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, qkv, n_embd_gqa, n_tokens, qkv->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
|
||||||
|
|
||||||
// Optional per Q/K
|
// optional per Q/K
|
||||||
if (model.layers[il].attn_q_norm) {
|
if (model.layers[il].attn_q_norm) {
|
||||||
Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, model.layers[il].attn_q_norm_b, LLM_NORM, il);
|
Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, model.layers[il].attn_q_norm_b, LLM_NORM, il);
|
||||||
}
|
}
|
||||||
|
|
@ -7623,12 +7625,12 @@ struct llm_build_modern_bert : public llm_graph_context {
|
||||||
Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, model.layers[il].attn_k_norm_b, LLM_NORM, il);
|
Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, model.layers[il].attn_k_norm_b, LLM_NORM, il);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Heads
|
// heads
|
||||||
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
||||||
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
||||||
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
||||||
|
|
||||||
// RoPE (NEOX) on Q and K
|
// RoPE (NEOX ... maybe?) on Q and K
|
||||||
Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr,
|
Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr,
|
||||||
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
||||||
ext_factor, attn_factor, beta_fast, beta_slow);
|
ext_factor, attn_factor, beta_fast, beta_slow);
|
||||||
|
|
@ -7650,99 +7652,33 @@ struct llm_build_modern_bert : public llm_graph_context {
|
||||||
il);
|
il);
|
||||||
cb(attn_out, "attn_out", il);
|
cb(attn_out, "attn_out", il);
|
||||||
|
|
||||||
// Residual after attention
|
// residual after attention
|
||||||
ggml_tensor * cur_attn = ggml_add(ctx0, attn_out, x);
|
ggml_tensor * cur_attn = ggml_add(ctx0, attn_out, x);
|
||||||
|
|
||||||
// If we subselect outputs, do it at the last layer after attn resid
|
// if we subselect outputs, do it at the last layer after attn resid
|
||||||
if (il == n_layer - 1 && inp_out_ids) {
|
if (il == n_layer - 1 && inp_out_ids) {
|
||||||
cur_attn = ggml_get_rows(ctx0, cur_attn, inp_out_ids);
|
cur_attn = ggml_get_rows(ctx0, cur_attn, inp_out_ids);
|
||||||
x = ggml_get_rows(ctx0, x, inp_out_ids);
|
x = ggml_get_rows(ctx0, x, inp_out_ids);
|
||||||
}
|
}
|
||||||
|
|
||||||
// pre-MLP norm (mlp_norm)
|
// pre mlp norm
|
||||||
ggml_tensor * h = build_norm(cur_attn,
|
ggml_tensor * h = build_norm(cur_attn,
|
||||||
model.layers[il].ffn_norm,
|
model.layers[il].ffn_norm,
|
||||||
model.layers[il].ffn_norm_b,
|
model.layers[il].ffn_norm_b,
|
||||||
LLM_NORM, il);
|
LLM_NORM, il);
|
||||||
cb(h, "mlp_pre_norm", il);
|
cb(h, "mlp_pre_norm", il);
|
||||||
|
|
||||||
// MLP (prefer GEGLU if gate exists or up has 2*n_ff rows)
|
// GEGLU because we will split ffn_up which has shape [n_embd, n_ff * 2] and ffn_down has shape [n_ff, n_embd]
|
||||||
ggml_tensor * mlp_out = nullptr;
|
ggml_tensor * mlp_out = build_ffn(
|
||||||
ggml_tensor * ffn_gate_view = model.layers[il].ffn_gate;
|
h,
|
||||||
ggml_tensor * ffn_up_view = model.layers[il].ffn_up;
|
model.layers[il].ffn_up, /*up_b*/ NULL, /*up_shexp*/ NULL,
|
||||||
|
/*gate*/ NULL , /*gate_b*/ NULL, /*gate_shexp*/ NULL,
|
||||||
if (ffn_gate_view == nullptr && ffn_up_view) {
|
model.layers[il].ffn_down, /*down_b*/ NULL, /*down_shexp*/ NULL,
|
||||||
|
/*act_scales*/ NULL,
|
||||||
// Case A: weight stored as (2*ffn, hidden) -> split rows into two (ffn x hidden)
|
LLM_FFN_GEGLU, LLM_FFN_PAR, il
|
||||||
if( ffn_up_view->ne[0] == 2 * n_ff and ffn_up_view->ne[1] == n_embd) {
|
);
|
||||||
|
|
||||||
// top half, (ffn up)
|
|
||||||
ffn_up_view = ggml_view_2d(ctx0, model.layers[il].ffn_up,
|
|
||||||
/*ne0*/ n_ff, /*ne1*/ n_embd,
|
|
||||||
/*nb1*/ model.layers[il].ffn_up->nb[1],
|
|
||||||
/*offset_bytes*/ (size_t)0);
|
|
||||||
// bottom half (gate)
|
|
||||||
ffn_gate_view = ggml_view_2d(ctx0, model.layers[il].ffn_up,
|
|
||||||
/*ne0*/ n_ff, /*ne1*/ n_embd,
|
|
||||||
/*nb1*/ model.layers[il].ffn_up->nb[1],
|
|
||||||
|
|
||||||
/*offset_bytes*/ (size_t)n_ff * model.layers[il].ffn_up->nb[1]);
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
else if ( ffn_up_view->ne[0] == n_embd && ffn_up_view->ne[1] == 2 * n_ff) {
|
|
||||||
// top half
|
|
||||||
LLAMA_LOG_INFO("Case B:\n");
|
|
||||||
ffn_up_view = ggml_view_2d(ctx0, model.layers[il].ffn_up,
|
|
||||||
n_embd, n_ff,
|
|
||||||
model.layers[il].ffn_up->nb[1],
|
|
||||||
0);
|
|
||||||
|
|
||||||
ffn_up_view = ggml_cont(ctx0, ffn_up_view);
|
|
||||||
|
|
||||||
ffn_gate_view = ggml_view_2d(ctx0, model.layers[il].ffn_up,
|
|
||||||
n_embd, n_ff,
|
|
||||||
model.layers[il].ffn_up->nb[1],
|
|
||||||
n_ff * model.layers[il].ffn_up->nb[0]);
|
|
||||||
ffn_gate_view = ggml_cont(ctx0, ffn_gate_view);
|
|
||||||
}
|
|
||||||
*/
|
|
||||||
//ggml_tensor * ffn_down_view = model.layers[il].ffn_down;
|
|
||||||
//LLAMA_LOG_INFO("ffn shapes: Up: {%lld, %lld}, Gate: {%lld, %lld}, Down: {%lld, %lld}\n",
|
|
||||||
// ffn_up_view->ne[0], ffn_up_view->ne[1], ffn_gate_view->ne[0], ffn_gate_view->ne[1], ffn_down_view->ne[0], ffn_down_view->ne[1]);
|
|
||||||
/*
|
|
||||||
ggml_tensor * cur,
|
|
||||||
ggml_tensor * up,
|
|
||||||
ggml_tensor * up_b,
|
|
||||||
ggml_tensor * up_s,
|
|
||||||
ggml_tensor * gate,
|
|
||||||
ggml_tensor * gate_b,
|
|
||||||
ggml_tensor * gate_s,
|
|
||||||
ggml_tensor * down,
|
|
||||||
ggml_tensor * down_b,
|
|
||||||
ggml_tensor * down_s,
|
|
||||||
ggml_tensor * act_scales,*/
|
|
||||||
mlp_out = build_ffn(
|
|
||||||
h,
|
|
||||||
model.layers[il].ffn_up, /*up_b*/ NULL, /*up_shexp*/ NULL,
|
|
||||||
NULL , /*gate_b*/ NULL, /*gate_shexp*/ NULL,
|
|
||||||
model.layers[il].ffn_down, /*down_b*/ NULL, /*down_shexp*/ NULL,
|
|
||||||
/*act_scales*/ NULL,
|
|
||||||
LLM_FFN_GEGLU, LLM_FFN_PAR, il
|
|
||||||
);
|
|
||||||
cb(mlp_out, "ffn_out_geglu", il);
|
|
||||||
} else {
|
|
||||||
mlp_out = build_ffn(
|
|
||||||
h,
|
|
||||||
model.layers[il].ffn_up, NULL, NULL,
|
|
||||||
model.layers[il].ffn_gate, NULL, NULL,
|
|
||||||
model.layers[il].ffn_down, NULL, NULL,
|
|
||||||
NULL,
|
|
||||||
LLM_FFN_GEGLU, LLM_FFN_PAR, il
|
|
||||||
);
|
|
||||||
cb(mlp_out, "ffn_out_geglu", il);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
cb(mlp_out, "ffn_out_geglu", il);
|
||||||
// Residual after MLP
|
// Residual after MLP
|
||||||
ggml_tensor * cur_layer = ggml_add(ctx0, mlp_out, cur_attn);
|
ggml_tensor * cur_layer = ggml_add(ctx0, mlp_out, cur_attn);
|
||||||
|
|
||||||
|
|
@ -7750,7 +7686,7 @@ struct llm_build_modern_bert : public llm_graph_context {
|
||||||
inpL = cur_layer;
|
inpL = cur_layer;
|
||||||
}
|
}
|
||||||
|
|
||||||
// 9) final model norm (final_norm)
|
// final model norm (final_norm)
|
||||||
cur = build_norm(inpL, model.output_norm, model.output_norm_b, LLM_NORM, -1);
|
cur = build_norm(inpL, model.output_norm, model.output_norm_b, LLM_NORM, -1);
|
||||||
cb(cur, "final_norm", -1);
|
cb(cur, "final_norm", -1);
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue