From 8f328431a1032ef63bbdf386f18c8f18f4bcc088 Mon Sep 17 00:00:00 2001
From: ryan-mangeno
Date: Thu, 28 Aug 2025 12:33:52 -0400
Subject: [PATCH] cleanup

---
 src/llama-graph.cpp |  14 ------
 src/llama-model.cpp | 114 ++++++++++----------------------------------
 2 files changed, 25 insertions(+), 103 deletions(-)

diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp
index 972d37306c..1512869ec6 100644
--- a/src/llama-graph.cpp
+++ b/src/llama-graph.cpp
@@ -645,11 +645,8 @@ ggml_tensor * llm_graph_context::build_ffn(
         llm_ffn_gate_type   type_gate,
         int                 il) const {
-    LLAMA_LOG_INFO("building lora: up is {%lld, %lld}\n input is {%lld, %lld}\n", up->ne[0], up->ne[1], cur->ne[0], cur->ne[1]);
     ggml_tensor * tmp = up ? build_lora_mm(up, cur) : cur;
-    LLAMA_LOG_INFO("Building FFN\n");
-    LLAMA_LOG_INFO("built lora: tmp is {%lld, %lld}\n", tmp->ne[0], tmp->ne[1]);
     cb(tmp, "ffn_up", il);
 
     if (up_b) {
@@ -672,8 +669,6 @@ ggml_tensor * llm_graph_context::build_ffn(
             case LLM_FFN_PAR:
                 {
                     cur = build_lora_mm(gate, cur);
-                    LLAMA_LOG_INFO("built lora: cur is {%lld, %lld}\n", cur->ne[0], cur->ne[1]);
-
                     cb(cur, "ffn_gate", il);
                 } break;
         }
@@ -692,10 +687,6 @@ ggml_tensor * llm_graph_context::build_ffn(
         cur = tmp;
     }
 
-    if( gate && type_gate == LLM_FFN_PAR ) {
-        LLAMA_LOG_INFO("Gate Exists and In Paralell\n");
-    }
-
     switch (type_op) {
         case LLM_FFN_SILU:
             if (gate && type_gate == LLM_FFN_PAR) {
@@ -744,7 +735,6 @@ ggml_tensor * llm_graph_context::build_ffn(
         case LLM_FFN_GEGLU:
             {
                 cur = ggml_geglu(ctx0, cur);
-                LLAMA_LOG_INFO("geglu split: cur is {%lld, %lld}\n", cur->ne[0], cur->ne[1]);
                 cb(cur, "ffn_geglu", il);
             } break;
         case LLM_FFN_REGLU:
@@ -757,16 +747,12 @@ ggml_tensor * llm_graph_context::build_ffn(
     }
 
     if (gate && type_gate == LLM_FFN_PAR) {
-        LLAMA_LOG_INFO("cur @ tmp: cur is {%lld, %lld}\n tmp is {%lld, %lld}\n", cur->ne[0], cur->ne[1], tmp->ne[0], tmp->ne[1]);
         cur = ggml_mul(ctx0, cur, tmp);
-        LLAMA_LOG_INFO("res is {%lld, %lld}\n", cur->ne[0], cur->ne[1]);
         cb(cur, "ffn_gate_par", il);
     }
 
     if (down) {
         cur = build_lora_mm(down, cur);
-        LLAMA_LOG_INFO("built lora: cur is {%lld, %lld}\n", cur->ne[0], cur->ne[1]);
-
         if (arch == LLM_ARCH_GLM4 || arch == LLM_ARCH_GLM4_MOE) {
             // GLM4 and GLM4_MOE seem to have numerical issues with half-precision accumulators
             ggml_mul_mat_set_prec(cur, GGML_PREC_F32);
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index c02a3078d7..897c58ac14 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -2696,11 +2696,12 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                     for(int i = 0; i < n_layer; ++i) {
                         auto& layer = layers[i];
 
-                        // layer 0 uses identity so we dont need weights for said layer
                         if ( i != 0 ) {
                             layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+                        } else {
+                            // layer 0 uses identity so we don't need weights for said layer
                             layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
                         }
 
@@ -7546,14 +7547,14 @@ struct llm_build_modern_bert : public llm_graph_context {
     const int64_t n_head      = hparams.n_head();
     const int64_t n_head_kv   = hparams.n_head_kv();
     const int64_t n_embd_head = hparams.n_embd_head_v;
-    const int64_t n_embd_gqa  = hparams.n_embd_v_gqa(); // == n_head_kv * n_embd_head
+    const int64_t n_embd_gqa  = hparams.n_embd_v_gqa();
    const int64_t n_tokens    = ubatch.n_tokens;
     const int64_t n_ff        = hparams.n_ff();
 
     GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
 
     // RoPE params
-    const int32_t rope_type  = LLAMA_ROPE_TYPE_NEOX; // ModernBERT uses rotary
+    const int32_t rope_type  = LLAMA_ROPE_TYPE_NEOX; // uses rotary
     const int32_t n_rot      = hparams.n_rot;
     const int32_t n_ctx_orig = hparams.n_ctx_train;
 
@@ -7561,7 +7562,7 @@ struct llm_build_modern_bert : public llm_graph_context {
     ggml_tensor * inpL;
     ggml_tensor * inp_pos = nullptr;
 
-    // ModernBERT needs positions for RoPE
+    // needs positions for RoPE
     inp_pos = build_inp_pos();
 
     // embeddings (token + optional type), NO absolute pos embed
@@ -7583,7 +7584,7 @@ struct llm_build_modern_bert : public llm_graph_context {
     for (int il = 0; il < n_layer; ++il) {
         ggml_tensor * x = inpL;
 
-        // pre-attention norm (attn_norm). Layer 0 may be Identity() -> nullptr
+        // pre attention norm (attn_norm). Layer 0 may be Identity() -> nullptr
         ggml_tensor * x_attn_in = x;
         if (model.layers[il].attn_norm) {
             x_attn_in = build_norm(x,
@@ -7592,6 +7593,7 @@ struct llm_build_modern_bert : public llm_graph_context {
                     LLM_NORM, il);
             cb(x_attn_in, "attn_pre_norm", il);
         } else {
+            LLAMA_LOG_INFO("Identity Tensor\n");
             cb(x_attn_in, "attn_pre_norm_identity", il);
         }
 
@@ -7601,7 +7603,7 @@ struct llm_build_modern_bert : public llm_graph_context {
         ggml_tensor * Kcur;
         ggml_tensor * Vcur;
 
-        GGML_ASSERT(model.layers[il].wqkv); // ModernBERT uses fused QKV
+        GGML_ASSERT(model.layers[il].wqkv); // fused QKV
         qkv = build_lora_mm(model.layers[il].wqkv, x_attn_in);
         cb(qkv, "wqkv", il);
 
@@ -7615,7 +7617,7 @@ struct llm_build_modern_bert : public llm_graph_context {
         Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, qkv, n_embd_gqa, n_tokens, qkv->nb[1], 1*sizeof(float)*(n_embd)));
         Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, qkv, n_embd_gqa, n_tokens, qkv->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
 
-        // Optional per Q/K
+        // optional per Q/K
         if (model.layers[il].attn_q_norm) {
             Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, model.layers[il].attn_q_norm_b, LLM_NORM, il);
         }
@@ -7623,12 +7625,12 @@ struct llm_build_modern_bert : public llm_graph_context {
             Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, model.layers[il].attn_k_norm_b, LLM_NORM, il);
         }
 
-        // Heads
+        // heads
         Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head,    n_tokens);
         Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
         Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
 
-        // RoPE (NEOX) on Q and K
+        // RoPE (NEOX ... maybe?) on Q and K
         Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr,
                 n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                 ext_factor, attn_factor, beta_fast, beta_slow);
@@ -7650,99 +7652,33 @@ struct llm_build_modern_bert : public llm_graph_context {
                 il);
         cb(attn_out, "attn_out", il);
 
-        // Residual after attention
+        // residual after attention
         ggml_tensor * cur_attn = ggml_add(ctx0, attn_out, x);
 
-        // If we subselect outputs, do it at the last layer after attn resid
+        // if we subselect outputs, do it at the last layer after attn resid
         if (il == n_layer - 1 && inp_out_ids) {
             cur_attn = ggml_get_rows(ctx0, cur_attn, inp_out_ids);
             x        = ggml_get_rows(ctx0, x,        inp_out_ids);
         }
 
-        // pre-MLP norm (mlp_norm)
+        // pre-MLP norm
         ggml_tensor * h = build_norm(cur_attn,
                 model.layers[il].ffn_norm,
                 model.layers[il].ffn_norm_b,
                 LLM_NORM, il);
         cb(h, "mlp_pre_norm", il);
 
-        // MLP (prefer GEGLU if gate exists or up has 2*n_ff rows)
-        ggml_tensor * mlp_out = nullptr;
-        ggml_tensor * ffn_gate_view = model.layers[il].ffn_gate;
-        ggml_tensor * ffn_up_view   = model.layers[il].ffn_up;
-
-        if (ffn_gate_view == nullptr && ffn_up_view) {
-
-            // Case A: weight stored as (2*ffn, hidden) -> split rows into two (ffn x hidden)
-            if( ffn_up_view->ne[0] == 2 * n_ff and ffn_up_view->ne[1] == n_embd) {
-
-                // top half, (ffn up)
-                ffn_up_view = ggml_view_2d(ctx0, model.layers[il].ffn_up,
-                        /*ne0*/ n_ff, /*ne1*/ n_embd,
-                        /*nb1*/ model.layers[il].ffn_up->nb[1],
-                        /*offset_bytes*/ (size_t)0);
-                // bottom half (gate)
-                ffn_gate_view = ggml_view_2d(ctx0, model.layers[il].ffn_up,
-                        /*ne0*/ n_ff, /*ne1*/ n_embd,
-                        /*nb1*/ model.layers[il].ffn_up->nb[1],
-
-                        /*offset_bytes*/ (size_t)n_ff * model.layers[il].ffn_up->nb[1]);
-            }
-
-            /*
-            else if ( ffn_up_view->ne[0] == n_embd && ffn_up_view->ne[1] == 2 * n_ff) {
-                // top half
-                LLAMA_LOG_INFO("Case B:\n");
-                ffn_up_view = ggml_view_2d(ctx0, model.layers[il].ffn_up,
-                        n_embd, n_ff,
-                        model.layers[il].ffn_up->nb[1],
-                        0);
-
-                ffn_up_view = ggml_cont(ctx0, ffn_up_view);
-
-                ffn_gate_view = ggml_view_2d(ctx0, model.layers[il].ffn_up,
-                        n_embd, n_ff,
-                        model.layers[il].ffn_up->nb[1],
-                        n_ff * model.layers[il].ffn_up->nb[0]);
-                ffn_gate_view = ggml_cont(ctx0, ffn_gate_view);
-            }
-            */
-            //ggml_tensor * ffn_down_view = model.layers[il].ffn_down;
-            //LLAMA_LOG_INFO("ffn shapes: Up: {%lld, %lld}, Gate: {%lld, %lld}, Down: {%lld, %lld}\n",
-            //    ffn_up_view->ne[0], ffn_up_view->ne[1], ffn_gate_view->ne[0], ffn_gate_view->ne[1], ffn_down_view->ne[0], ffn_down_view->ne[1]);
-            /*
-            ggml_tensor * cur,
-            ggml_tensor * up,
-            ggml_tensor * up_b,
-            ggml_tensor * up_s,
-            ggml_tensor * gate,
-            ggml_tensor * gate_b,
-            ggml_tensor * gate_s,
-            ggml_tensor * down,
-            ggml_tensor * down_b,
-            ggml_tensor * down_s,
-            ggml_tensor * act_scales,*/
-            mlp_out = build_ffn(
-                    h,
-                    model.layers[il].ffn_up,   /*up_b*/ NULL, /*up_shexp*/ NULL,
-                    NULL ,                     /*gate_b*/ NULL, /*gate_shexp*/ NULL,
-                    model.layers[il].ffn_down, /*down_b*/ NULL, /*down_shexp*/ NULL,
-                    /*act_scales*/ NULL,
-                    LLM_FFN_GEGLU, LLM_FFN_PAR, il
-            );
-            cb(mlp_out, "ffn_out_geglu", il);
-        } else {
-            mlp_out = build_ffn(
-                    h,
-                    model.layers[il].ffn_up,   NULL, NULL,
-                    model.layers[il].ffn_gate, NULL, NULL,
-                    model.layers[il].ffn_down, NULL, NULL,
-                    NULL,
-                    LLM_FFN_GEGLU, LLM_FFN_PAR, il
-            );
-            cb(mlp_out, "ffn_out_geglu", il);
-        }
+        // GEGLU: ffn_up has shape [n_embd, n_ff * 2] and is split by the GEGLU op; ffn_down has shape [n_ff, n_embd]
+        ggml_tensor * mlp_out = build_ffn(
+                h,
+                model.layers[il].ffn_up,   /*up_b*/   NULL, /*up_shexp*/   NULL,
+                /*gate*/ NULL,             /*gate_b*/ NULL, /*gate_shexp*/ NULL,
+                model.layers[il].ffn_down, /*down_b*/ NULL, /*down_shexp*/ NULL,
+                /*act_scales*/ NULL,
+                LLM_FFN_GEGLU, LLM_FFN_PAR, il
+        );
+        cb(mlp_out, "ffn_out_geglu", il);
 
         // Residual after MLP
         ggml_tensor * cur_layer = ggml_add(ctx0, mlp_out, cur_attn);
@@ -7750,7 +7686,7 @@ struct llm_build_modern_bert : public llm_graph_context {
         inpL = cur_layer;
     }
 
-    // 9) final model norm (final_norm)
+    // final model norm (final_norm)
     cur = build_norm(inpL, model.output_norm, model.output_norm_b, LLM_NORM, -1);
     cb(cur, "final_norm", -1);
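
Note (not part of the patch): the simplification above leans on the LLM_FFN_GEGLU path in build_ffn, where the fused ffn_up projection yields 2*n_ff values per token and ggml_geglu splits them into an "up" half and a "gate" half, so no separate ffn_gate tensor is needed. Below is a minimal standalone sketch of that per-token math, not ggml code: it assumes the activation is applied to the first half (as in the HF ModernBERT MLP) and uses the tanh GELU approximation; the helper names gelu and geglu_split are illustrative only.

    #include <cmath>
    #include <cstdio>
    #include <vector>

    // tanh-approximation GELU (assumption: matches the GELU variant used for GEGLU)
    static float gelu(float x) {
        const float c = 0.7978845608028654f; // sqrt(2/pi)
        return 0.5f * x * (1.0f + std::tanh(c * (x + 0.044715f * x * x * x)));
    }

    // fused: the 2*n_ff values produced by ffn_up for one token -> n_ff values.
    // Assumption: first half is the "up" input, second half is the gate,
    // i.e. out[i] = gelu(up[i]) * gate[i].
    static std::vector<float> geglu_split(const std::vector<float> & fused, int n_ff) {
        std::vector<float> out(n_ff);
        for (int i = 0; i < n_ff; ++i) {
            const float x = fused[i];        // "up" half
            const float g = fused[n_ff + i]; // "gate" half
            out[i] = gelu(x) * g;
        }
        return out;
    }

    int main() {
        const int n_ff = 4;
        // pretend output of the fused ffn_up matmul for one token: [up | gate]
        std::vector<float> fused = { 0.5f, -1.0f, 2.0f, 0.1f,   1.0f, 1.0f, -0.5f, 2.0f };
        std::vector<float> h = geglu_split(fused, n_ff);
        for (float v : h) {
            printf("%f\n", v); // these n_ff values would then go through ffn_down
        }
        return 0;
    }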