tensor debugging now works (llama-eval-callback); instead of simulating the gate split with views, GEGLU is now used, which does exactly that
parent 18c0c23ed8
commit bffe3c9092
@@ -645,8 +645,11 @@ ggml_tensor * llm_graph_context::build_ffn(
        llm_ffn_gate_type   type_gate,
                      int   il) const {


    LLAMA_LOG_INFO("building lora: up is {%lld, %lld}\n input is {%lld, %lld}\n", up->ne[0], up->ne[1], cur->ne[0], cur->ne[1]);

    ggml_tensor * tmp = up ? build_lora_mm(up, cur) : cur;
    LLAMA_LOG_INFO("Building FFN\n");
    LLAMA_LOG_INFO("built lora: tmp is {%lld, %lld}\n", tmp->ne[0], tmp->ne[1]);
    cb(tmp, "ffn_up", il);

    if (up_b) {
@@ -669,6 +672,8 @@ ggml_tensor * llm_graph_context::build_ffn(
        case LLM_FFN_PAR:
            {
                cur = build_lora_mm(gate, cur);
                LLAMA_LOG_INFO("built lora: cur is {%lld, %lld}\n", cur->ne[0], cur->ne[1]);

                cb(cur, "ffn_gate", il);
            } break;
    }
@@ -687,6 +692,10 @@ ggml_tensor * llm_graph_context::build_ffn(
        cur = tmp;
    }

    if (gate && type_gate == LLM_FFN_PAR) {
        LLAMA_LOG_INFO("Gate Exists and In Parallel\n");
    }

    switch (type_op) {
        case LLM_FFN_SILU:
            if (gate && type_gate == LLM_FFN_PAR) {
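Note on the LLM_FFN_PAR path above: the gate projection runs on the same input as the up projection, and the two branches are combined later by an element-wise multiply. A minimal sketch of that data flow against the raw ggml API follows; the weight names and the helper function are illustrative assumptions, not part of this patch:

#include "ggml.h"

// parallel gated FFN: y = W_down * (silu(W_gate * x) * (W_up * x))
struct ggml_tensor * ffn_par_silu(struct ggml_context * ctx,
                                  struct ggml_tensor  * w_up,
                                  struct ggml_tensor  * w_gate,
                                  struct ggml_tensor  * w_down,
                                  struct ggml_tensor  * x) {
    struct ggml_tensor * up   = ggml_mul_mat(ctx, w_up,   x); // "ffn_up"   (tmp in build_ffn)
    struct ggml_tensor * gate = ggml_mul_mat(ctx, w_gate, x); // "ffn_gate" (cur in build_ffn)
    gate = ggml_silu(ctx, gate);                              // type_op activation
    struct ggml_tensor * cur  = ggml_mul(ctx, gate, up);      // "ffn_gate_par"
    return ggml_mul_mat(ctx, w_down, cur);                    // "ffn_down"
}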
@@ -735,6 +744,7 @@ ggml_tensor * llm_graph_context::build_ffn(
        case LLM_FFN_GEGLU:
            {
                cur = ggml_geglu(ctx0, cur);
                LLAMA_LOG_INFO("geglu split: cur is {%lld, %lld}\n", cur->ne[0], cur->ne[1]);
                cb(cur, "ffn_geglu", il);
            } break;
        case LLM_FFN_REGLU:
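Note on the LLM_FFN_GEGLU case: ggml_geglu consumes the fused up+gate activation directly, halving ne[0], so the view-based split this commit removes is no longer needed. A standalone sketch of the shape behavior, assuming a ggml recent enough to have the GLU ops (which half receives the GELU internally is a ggml implementation detail and not asserted here):

#include "ggml.h"
#include <stdio.h>

int main(void) {
    struct ggml_init_params params = { 16*1024*1024, NULL, false };
    struct ggml_context * ctx = ggml_init(params);

    const int64_t n_ff = 4, n_tokens = 2;

    // fused activation, as produced by the up matmul: ne = {2*n_ff, n_tokens}
    struct ggml_tensor * t = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 2*n_ff, n_tokens);
    const int n = (int) ggml_nelements(t);
    for (int i = 0; i < n; ++i) {
        ggml_set_f32_1d(t, i, 0.1f*i - 0.3f);
    }

    // GEGLU fuses split + GELU + multiply; the result has ne = {n_ff, n_tokens}
    struct ggml_tensor * out = ggml_geglu(ctx, t);

    struct ggml_cgraph * gf = ggml_new_graph(ctx);
    ggml_build_forward_expand(gf, out);
    ggml_graph_compute_with_ctx(ctx, gf, 1);

    printf("in  {%lld, %lld}\n", (long long) t->ne[0],   (long long) t->ne[1]);
    printf("out {%lld, %lld}\n", (long long) out->ne[0], (long long) out->ne[1]);

    ggml_free(ctx);
    return 0;
}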
@@ -747,12 +757,16 @@ ggml_tensor * llm_graph_context::build_ffn(
    }

    if (gate && type_gate == LLM_FFN_PAR) {
        LLAMA_LOG_INFO("cur @ tmp: cur is {%lld, %lld}\n tmp is {%lld, %lld}\n", cur->ne[0], cur->ne[1], tmp->ne[0], tmp->ne[1]);
        cur = ggml_mul(ctx0, cur, tmp);
        LLAMA_LOG_INFO("res is {%lld, %lld}\n", cur->ne[0], cur->ne[1]);
        cb(cur, "ffn_gate_par", il);
    }

    if (down) {
        cur = build_lora_mm(down, cur);
        LLAMA_LOG_INFO("built lora: cur is {%lld, %lld}\n", cur->ne[0], cur->ne[1]);

        if (arch == LLM_ARCH_GLM4 || arch == LLM_ARCH_GLM4_MOE) {
            // GLM4 and GLM4_MOE seem to have numerical issues with half-precision accumulators
            ggml_mul_mat_set_prec(cur, GGML_PREC_F32);
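Note on ggml_mul_mat_set_prec above: it is a per-node hint that asks the backend to accumulate that specific matmul in f32 rather than f16. A minimal sketch of the pattern (the context and weight tensors are assumed, and the wrapper function is hypothetical):

#include "ggml.h"

struct ggml_tensor * mul_mat_f32_prec(struct ggml_context * ctx,
                                      struct ggml_tensor  * w,
                                      struct ggml_tensor  * x) {
    struct ggml_tensor * y = ggml_mul_mat(ctx, w, x);
    // per-node precision hint: backends that honor it accumulate in f32,
    // avoiding the half-precision accumulator issues noted in the comment above
    ggml_mul_mat_set_prec(y, GGML_PREC_F32);
    return y;
}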
@@ -7672,7 +7672,7 @@ struct llm_build_modern_bert : public llm_graph_context {
        ggml_tensor * ffn_up_view = model.layers[il].ffn_up;

        if (ffn_gate_view == nullptr && ffn_up_view) {


            // Case A: weight stored as (2*ffn, hidden) -> split rows into two (ffn x hidden)
            if (ffn_up_view->ne[0] == 2 * n_ff && ffn_up_view->ne[1] == n_embd) {
@@ -7685,33 +7685,49 @@ struct llm_build_modern_bert : public llm_graph_context {
                ffn_gate_view = ggml_view_2d(ctx0, model.layers[il].ffn_up,
                        /*ne0*/ n_ff, /*ne1*/ n_embd,
                        /*nb1*/ model.layers[il].ffn_up->nb[1],
                        /*offset_bytes*/ (size_t)n_ff * model.layers[il].ffn_up->nb[1]);
            }

            /*
            else if (ffn_up_view->ne[0] == n_embd && ffn_up_view->ne[1] == 2 * n_ff) {
                // top half
                LLAMA_LOG_INFO("Case B:\n");
                ffn_up_view = ggml_view_2d(ctx0, model.layers[il].ffn_up,
                        n_embd, n_ff,
                        model.layers[il].ffn_up->nb[1],
                        0);

                ffn_up_view = ggml_cont(ctx0, ffn_up_view);

                ffn_gate_view = ggml_view_2d(ctx0, model.layers[il].ffn_up,
                        n_embd, n_ff,
                        model.layers[il].ffn_up->nb[1],
                        n_ff * sizeof(float));
                        n_ff * model.layers[il].ffn_up->nb[0]);
                ffn_gate_view = ggml_cont(ctx0, ffn_gate_view);
            }

            ggml_tensor * ffn_down_view = model.layers[il].ffn_down;
            LLAMA_LOG_INFO("ffn shapes: Up: {%lld, %lld}, Gate: {%lld, %lld}, Down: {%lld, %lld}",
                    ffn_up_view->ne[0], ffn_up_view->ne[1], ffn_gate_view->ne[0], ffn_gate_view->ne[1], ffn_down_view->ne[0], ffn_down_view->ne[1]);

            */
            //ggml_tensor * ffn_down_view = model.layers[il].ffn_down;
            //LLAMA_LOG_INFO("ffn shapes: Up: {%lld, %lld}, Gate: {%lld, %lld}, Down: {%lld, %lld}\n",
            //        ffn_up_view->ne[0], ffn_up_view->ne[1], ffn_gate_view->ne[0], ffn_gate_view->ne[1], ffn_down_view->ne[0], ffn_down_view->ne[1]);
            /*
            ggml_tensor * cur,
            ggml_tensor * up,
            ggml_tensor * up_b,
            ggml_tensor * up_s,
            ggml_tensor * gate,
            ggml_tensor * gate_b,
            ggml_tensor * gate_s,
            ggml_tensor * down,
            ggml_tensor * down_b,
            ggml_tensor * down_s,
            ggml_tensor * act_scales,*/
            mlp_out = build_ffn(
                    h,
                    model.layers[il].ffn_up,   /*up_b*/   NULL, /*up_shexp*/   NULL,
                    ffn_gate_view,             /*gate_b*/ NULL, /*gate_shexp*/ NULL,
                    model.layers[il].ffn_up,   /*up_b*/   NULL, /*up_shexp*/   NULL,
                    NULL,                      /*gate_b*/ NULL, /*gate_shexp*/ NULL,
                    model.layers[il].ffn_down, /*down_b*/ NULL, /*down_shexp*/ NULL,
                    /*expert_scores*/ NULL,
                    /*act_scales*/ NULL,
                    LLM_FFN_GEGLU, LLM_FFN_PAR, il
            );
            cb(mlp_out, "ffn_out_geglu", il);
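Note on the sizeof(float) -> nb[0] change inside the commented-out Case B: nb[0] is the element size in bytes, so it only covers offsets within a single row; skipping whole rows of a 2D tensor uses nb[1], the per-row byte stride, which stays correct for f32, f16, and quantized row layouts alike. A sketch of a stride-based split under assumed shapes (the helper name is hypothetical, not from llama.cpp):

#include "ggml.h"

// split a fused weight with ne = {n_embd, 2*n_ff} into up/gate row halves
void split_fused_ffn(struct ggml_context * ctx,
                     struct ggml_tensor  * fused,
                     int64_t               n_ff,
                     struct ggml_tensor ** up,
                     struct ggml_tensor ** gate) {
    const int64_t n_embd = fused->ne[0];

    // first n_ff rows
    *up   = ggml_view_2d(ctx, fused, n_embd, n_ff, fused->nb[1], 0);
    // second n_ff rows: nb[1] is the byte stride of one row, so this offset
    // is valid regardless of the tensor's element type
    *gate = ggml_view_2d(ctx, fused, n_embd, n_ff, fused->nb[1],
                         (size_t) n_ff * fused->nb[1]);
}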