diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp
index ae8b150d28..972d37306c 100644
--- a/src/llama-graph.cpp
+++ b/src/llama-graph.cpp
@@ -645,8 +645,11 @@ ggml_tensor * llm_graph_context::build_ffn(
   llm_ffn_gate_type   type_gate,
                 int   il) const {
-
+    if (up) { LLAMA_LOG_INFO("building lora: up is {%lld, %lld}\n input is {%lld, %lld}\n", up->ne[0], up->ne[1], cur->ne[0], cur->ne[1]); }
+
     ggml_tensor * tmp = up ? build_lora_mm(up, cur) : cur;
+    LLAMA_LOG_INFO("Building FFN\n");
+    LLAMA_LOG_INFO("built lora: tmp is {%lld, %lld}\n", tmp->ne[0], tmp->ne[1]);
     cb(tmp, "ffn_up", il);
 
     if (up_b) {
@@ -669,6 +672,8 @@ ggml_tensor * llm_graph_context::build_ffn(
             case LLM_FFN_PAR:
                 {
                     cur = build_lora_mm(gate, cur);
+                    LLAMA_LOG_INFO("built lora: cur is {%lld, %lld}\n", cur->ne[0], cur->ne[1]);
+
                     cb(cur, "ffn_gate", il);
                 } break;
         }
@@ -687,6 +692,10 @@ ggml_tensor * llm_graph_context::build_ffn(
         cur = tmp;
     }
 
+    if (gate && type_gate == LLM_FFN_PAR) {
+        LLAMA_LOG_INFO("Gate Exists and In Parallel\n");
+    }
+
     switch (type_op) {
         case LLM_FFN_SILU:
             if (gate && type_gate == LLM_FFN_PAR) {
@@ -735,6 +744,7 @@ ggml_tensor * llm_graph_context::build_ffn(
         case LLM_FFN_GEGLU:
             {
                 cur = ggml_geglu(ctx0, cur);
+                LLAMA_LOG_INFO("geglu split: cur is {%lld, %lld}\n", cur->ne[0], cur->ne[1]);
                 cb(cur, "ffn_geglu", il);
             } break;
         case LLM_FFN_REGLU:
@@ -747,12 +757,16 @@ ggml_tensor * llm_graph_context::build_ffn(
     }
 
     if (gate && type_gate == LLM_FFN_PAR) {
+        LLAMA_LOG_INFO("cur @ tmp: cur is {%lld, %lld}\n tmp is {%lld, %lld}\n", cur->ne[0], cur->ne[1], tmp->ne[0], tmp->ne[1]);
         cur = ggml_mul(ctx0, cur, tmp);
+        LLAMA_LOG_INFO("res is {%lld, %lld}\n", cur->ne[0], cur->ne[1]);
         cb(cur, "ffn_gate_par", il);
     }
 
     if (down) {
         cur = build_lora_mm(down, cur);
+        LLAMA_LOG_INFO("built lora: cur is {%lld, %lld}\n", cur->ne[0], cur->ne[1]);
+
         if (arch == LLM_ARCH_GLM4 || arch == LLM_ARCH_GLM4_MOE) {
             // GLM4 and GLM4_MOE seem to have numerical issues with half-precision accumulators
             ggml_mul_mat_set_prec(cur, GGML_PREC_F32);
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 6a8953af33..c02a3078d7 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -7672,7 +7672,7 @@ struct llm_build_modern_bert : public llm_graph_context {
                 ggml_tensor * ffn_up_view   = model.layers[il].ffn_up;
 
                 if (ffn_gate_view == nullptr && ffn_up_view) {
-
+                    // Case A: weight stored as (2*ffn, hidden) -> split rows into two (ffn x hidden)
                     if( ffn_up_view->ne[0] == 2 * n_ff and ffn_up_view->ne[1] == n_embd) {
@@ -7685,33 +7685,49 @@
                         ffn_gate_view = ggml_view_2d(ctx0, model.layers[il].ffn_up,
                                                      /*ne0*/ n_ff,
                                                      /*ne1*/ n_embd,
                                                      /*nb1*/ model.layers[il].ffn_up->nb[1],
+                                                     /*offset_bytes*/ (size_t)n_ff * model.layers[il].ffn_up->nb[1]);
                     }
+
+                    /*
                     else if ( ffn_up_view->ne[0] == n_embd && ffn_up_view->ne[1] == 2 * n_ff) {
                         // top half
+                        LLAMA_LOG_INFO("Case B:\n");
                         ffn_up_view = ggml_view_2d(ctx0, model.layers[il].ffn_up,
                                                    n_embd, n_ff,
                                                    model.layers[il].ffn_up->nb[1],
                                                    0);
+                        ffn_up_view = ggml_cont(ctx0, ffn_up_view);
                         ffn_gate_view = ggml_view_2d(ctx0, model.layers[il].ffn_up,
                                                      n_embd, n_ff,
                                                      model.layers[il].ffn_up->nb[1],
-                                                     n_ff * sizeof(float));
+                                                     n_ff * model.layers[il].ffn_up->nb[0]);
+                        ffn_gate_view = ggml_cont(ctx0, ffn_gate_view);
                     }
-
-                    ggml_tensor * ffn_down_view = model.layers[il].ffn_down;
-                    LLAMA_LOG_INFO("ffn shapes: Up: {%lld, %lld}, Gate: {%lld, %lld}, Down: {%lld, %lld}",
-                        ffn_up_view->ne[0], ffn_up_view->ne[1], ffn_gate_view->ne[0], ffn_gate_view->ne[1], ffn_down_view->ne[0], ffn_down_view->ne[1]);
-
+                    */
+
+                    //ggml_tensor * ffn_down_view = model.layers[il].ffn_down;
+                    //LLAMA_LOG_INFO("ffn shapes: Up: {%lld, %lld}, Gate: {%lld, %lld}, Down: {%lld, %lld}\n",
+                    //    ffn_up_view->ne[0], ffn_up_view->ne[1], ffn_gate_view->ne[0], ffn_gate_view->ne[1], ffn_down_view->ne[0], ffn_down_view->ne[1]);
+                    /*
+                    ggml_tensor * cur,
+                    ggml_tensor * up,
+                    ggml_tensor * up_b,
+                    ggml_tensor * up_s,
+                    ggml_tensor * gate,
+                    ggml_tensor * gate_b,
+                    ggml_tensor * gate_s,
+                    ggml_tensor * down,
+                    ggml_tensor * down_b,
+                    ggml_tensor * down_s,
+                    ggml_tensor * act_scales,*/
 
                 mlp_out = build_ffn(
                         h,
-                        model.layers[il].ffn_up,   /*up_b*/ NULL, /*up_shexp*/ NULL,
-                        ffn_gate_view ,            /*gate_b*/ NULL, /*gate_shexp*/ NULL,
+                        model.layers[il].ffn_up,   /*up_b*/ NULL, /*up_shexp*/ NULL,
+                        NULL,                      /*gate_b*/ NULL, /*gate_shexp*/ NULL,
                         model.layers[il].ffn_down, /*down_b*/ NULL, /*down_shexp*/ NULL,
-                        /*expert_scores*/ NULL,
+                        /*act_scales*/ NULL,
                         LLM_FFN_GEGLU, LLM_FFN_PAR, il
                     );
                 cb(mlp_out, "ffn_out_geglu", il);
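Reviewer sketch, not part of the patch: the Case A / Case B branches above carve a fused up+gate weight into two ggml_view_2d views, and the byte offset of the second view is the crux. Below is a minimal standalone illustration of that split; n_embd, n_ff, the fused tensor, and the main() harness are all illustrative assumptions, not code from this PR. The only API facts it leans on are that ggml_view_2d takes its last argument as a byte offset and that nb[1] is the byte stride between rows, so skipping the first n_ff rows means n_ff * nb[1] bytes (an offset like the pre-patch n_ff * sizeof(float) counts n_ff elements, not n_ff rows).

#include "ggml.h"

#include <cstdio>

int main() {
    // small scratch context; with no_alloc = false, tensor data lives in this buffer
    ggml_init_params params = { /*mem_size*/ 16 * 1024 * 1024, /*mem_buffer*/ nullptr, /*no_alloc*/ false };
    ggml_context * ctx = ggml_init(params);

    const int64_t n_embd = 4; // hypothetical hidden size
    const int64_t n_ff   = 8; // hypothetical FFN width

    // fused weight in the "Case B" layout: ne = {n_embd, 2*n_ff};
    // rows 0..n_ff-1 hold the up projection, rows n_ff..2*n_ff-1 the gate
    ggml_tensor * fused = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, 2 * n_ff);

    // top half: same row stride as the parent, byte offset 0
    ggml_tensor * up = ggml_view_2d(ctx, fused, n_embd, n_ff, fused->nb[1], 0);

    // bottom half: skip n_ff whole rows -> n_ff * nb[1] bytes
    ggml_tensor * gate = ggml_view_2d(ctx, fused, n_embd, n_ff, fused->nb[1], (size_t) n_ff * fused->nb[1]);

    // ne[] is int64_t, so cast for printf rather than trusting %lld to match
    printf("up: {%lld, %lld}, gate: {%lld, %lld}\n",
           (long long) up->ne[0], (long long) up->ne[1],
           (long long) gate->ne[0], (long long) gate->ne[1]);

    ggml_free(ctx);
    return 0;
}

With the gate argument passed as NULL and LLM_FFN_GEGLU selected, the patch instead lets the GLU op do the split: as far as I can tell, ggml_geglu halves its input along ne[0] and gates one half with the other, which is what the "geglu split" shape log in build_ffn is meant to confirm (cur->ne[0] should come out at half the fused width).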