diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index 2505489b1e..79c0e437d3 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -3015,9 +3015,6 @@ struct ggml_tensor * ggml_mul_mat(
         struct ggml_context * ctx,
         struct ggml_tensor  * a,
         struct ggml_tensor  * b) {
-
-    printf("Up: {%lld, %lld}\n", a->ne[0], a->ne[1]);
-    printf("Cur: {%lld, %lld}\n", b->ne[0], b->ne[1]);
     GGML_ASSERT(ggml_can_mul_mat(a, b));
     GGML_ASSERT(!ggml_is_transposed(a));
 
diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py
index 2d3c16ab84..e775f0f575 100644
--- a/gguf-py/gguf/tensor_mapping.py
+++ b/gguf-py/gguf/tensor_mapping.py
@@ -367,7 +367,7 @@ class TensorNameMap:
             "model.layers.{bid}.mlp.up_proj",          # llama-hf refact nemotron olmo2
             "layers.{bid}.feed_forward.w3",            # llama-pth
             "encoder.layer.{bid}.intermediate.dense",  # bert
-            "layers.{bid}.mlp.Wo",                     # modern bert
+            "layers.{bid}.mlp.Wi",                     # modern bert
             "transformer.layer.{bid}.ffn.lin1",        # distillbert
             "transformer.h.{bid}.mlp.fc_in",           # gpt-j
             "transformer.h.{bid}.mlp.linear_3",        # refact
@@ -467,7 +467,7 @@ class TensorNameMap:
             "model.layers.{bid}.mlp.down_proj",                       # llama-hf nemotron olmo2
             "layers.{bid}.feed_forward.w2",                           # llama-pth
             "encoder.layer.{bid}.output.dense",                       # bert
-            "layers.{bid}.mlp.Wi",                                    # modern bert
+            "layers.{bid}.mlp.Wo",                                    # modern bert
             "transformer.layer.{bid}.ffn.lin2",                       # distillbert
             "transformer.h.{bid}.mlp.fc_out",                         # gpt-j
             "language_model.encoder.layers.{bid}.mlp.dense_4h_to_h",  # persimmon
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 92ff8b876f..6a8953af33 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -2708,8 +2708,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                     layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, 3 * n_embd }, 0);
                     layer.wo   = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
 
-                    layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_ff, n_embd}, 0); // [3072, 384]
-                    layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_embd, 2 * n_ff}, 0);
+                    layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, 2 * n_ff}, 0);
+                    layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
                     layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
                 }
             } break;
@@ -7548,6 +7548,7 @@ struct llm_build_modern_bert : public llm_graph_context {
         const int64_t n_embd_head = hparams.n_embd_head_v;
         const int64_t n_embd_gqa  = hparams.n_embd_v_gqa(); // == n_head_kv * n_embd_head
         const int64_t n_tokens    = ubatch.n_tokens;
+        const int64_t n_ff        = hparams.n_ff();
 
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
 
@@ -7667,30 +7668,63 @@ struct llm_build_modern_bert : public llm_graph_context {
 
         // MLP (prefer GEGLU if gate exists or up has 2*n_ff rows)
         ggml_tensor * mlp_out = nullptr;
-        const bool has_gate_tensor = (model.layers[il].ffn_gate != nullptr);
-        const bool up_is_2x = (model.layers[il].ffn_up && model.layers[il].ffn_up->ne[0] == 2*hparams.n_ff());
+        ggml_tensor * ffn_gate_view = model.layers[il].ffn_gate;
+        ggml_tensor * ffn_up_view   = model.layers[il].ffn_up;
+
+        if (ffn_gate_view == nullptr && ffn_up_view) {
+            // Case A: weight stored as (2*ffn, hidden) -> split each row into two (ffn x hidden) halves
+            if (ffn_up_view->ne[0] == 2 * n_ff && ffn_up_view->ne[1] == n_embd) {
+                // first half of each row (ffn up)
+                ffn_up_view = ggml_view_2d(ctx0, model.layers[il].ffn_up,
+                        /*ne0*/ n_ff, /*ne1*/ n_embd,
+                        /*nb1*/ model.layers[il].ffn_up->nb[1],
+                        /*offset_bytes*/ (size_t) 0);
+
+                // second half of each row (gate): offset is n_ff elements into the row
+                ffn_gate_view = ggml_view_2d(ctx0, model.layers[il].ffn_up,
+                        /*ne0*/ n_ff, /*ne1*/ n_embd,
+                        /*nb1*/ model.layers[il].ffn_up->nb[1],
+                        /*offset_bytes*/ ggml_row_size(model.layers[il].ffn_up->type, n_ff));
+            }
+            // Case B: weight stored as (hidden, 2*ffn) -> first n_ff rows are up, last n_ff rows are gate
+            else if (ffn_up_view->ne[0] == n_embd && ffn_up_view->ne[1] == 2 * n_ff) {
+                // first n_ff rows (up)
+                ffn_up_view = ggml_view_2d(ctx0, model.layers[il].ffn_up,
+                        n_embd, n_ff,
+                        model.layers[il].ffn_up->nb[1],
+                        0);
+                ffn_up_view = ggml_cont(ctx0, ffn_up_view);
+
+                // last n_ff rows (gate): offset is n_ff full rows, i.e. n_ff * nb[1] bytes
+                ffn_gate_view = ggml_view_2d(ctx0, model.layers[il].ffn_up,
+                        n_embd, n_ff,
+                        model.layers[il].ffn_up->nb[1],
+                        (size_t) n_ff * model.layers[il].ffn_up->nb[1]);
+                ffn_gate_view = ggml_cont(ctx0, ffn_gate_view);
+            }
+
+            ggml_tensor * ffn_down_view = model.layers[il].ffn_down;
+            LLAMA_LOG_INFO("ffn shapes: Up: {%lld, %lld}, Gate: {%lld, %lld}, Down: {%lld, %lld}\n",
+                    ffn_up_view->ne[0],   ffn_up_view->ne[1],
+                    ffn_gate_view->ne[0], ffn_gate_view->ne[1],
+                    ffn_down_view->ne[0], ffn_down_view->ne[1]);
 
-        if (has_gate_tensor || up_is_2x) {
             mlp_out = build_ffn(
                 h,
                 model.layers[il].ffn_up,   /*up_b*/ NULL, /*up_shexp*/ NULL,
-                model.layers[il].ffn_gate, /*gate_b*/ NULL, /*gate_shexp*/ NULL,
+                ffn_gate_view,             /*gate_b*/ NULL, /*gate_shexp*/ NULL,
                 model.layers[il].ffn_down, /*down_b*/ NULL, /*down_shexp*/ NULL,
                 /*expert_scores*/ NULL,
-                LLM_FFN_GEGLU, LLM_FFN_PAR, il);
-            cb(mlp_out, "ffn_out_geglu", il);
+                LLM_FFN_GEGLU, LLM_FFN_PAR, il
+            );
+            cb(mlp_out, "ffn_out_geglu", il);
         } else {
-
-            LLAMA_LOG_INFO("Ffn_up : {%lld, %lld}, ffn_down : {%lld, %lld}\n", model.layers[il].ffn_up->ne[0], model.layers[il].ffn_up->ne[1],
-                           model.layers[il].ffn_down->ne[0], model.layers[il].ffn_down->ne[0]);
             mlp_out = build_ffn(
                 h,
-                model.layers[il].ffn_up,   /*up_b*/ NULL, /*up_shexp*/ NULL,
-                /*gate*/ NULL,             /*gate_b*/ NULL, /*gate_shexp*/ NULL,
-                model.layers[il].ffn_down, /*down_b*/ NULL, /*down_shexp*/ NULL,
-                /*expert_scores*/ NULL,
-                LLM_FFN_GELU, LLM_FFN_SEQ, il);
-            cb(mlp_out, "ffn_out_gelu", il);
+                model.layers[il].ffn_up,   NULL, NULL,
+                model.layers[il].ffn_gate, NULL, NULL,
+                model.layers[il].ffn_down, NULL, NULL,
+                NULL,
+                LLM_FFN_GEGLU, LLM_FFN_PAR, il
+            );
+            cb(mlp_out, "ffn_out_geglu", il);
         }
 
         // Residual after MLP
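
Note: for reference, a minimal standalone sketch of the fused up/gate split used in the modern-bert branch above, assuming the Case B layout, an f32 ffn_up of shape {n_embd, 2*n_ff}, and ggml's layout where ne[0] is the contiguous dimension and nb[1] is the byte stride between rows. The sizes (384, 1152) are placeholder values, not taken from the model; the ggml calls are the public API used in the patch.

// split_fused_ffn_up.c: split a fused (n_embd, 2*n_ff) weight into up/gate views
#include "ggml.h"
#include <stdio.h>

int main(void) {
    const int64_t n_embd = 384;   // placeholder hidden size
    const int64_t n_ff   = 1152;  // placeholder intermediate size

    struct ggml_init_params params = {
        /*.mem_size   =*/ 32*1024*1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx = ggml_init(params);

    // fused weight: rows 0..n_ff-1 hold the up projection, rows n_ff..2*n_ff-1 the gate
    struct ggml_tensor * w = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, 2*n_ff);

    // first n_ff rows -> up (offset 0)
    struct ggml_tensor * up   = ggml_view_2d(ctx, w, n_embd, n_ff, w->nb[1], 0);
    // last n_ff rows -> gate: skip n_ff whole rows, i.e. n_ff * nb[1] bytes
    struct ggml_tensor * gate = ggml_view_2d(ctx, w, n_embd, n_ff, w->nb[1], (size_t) n_ff * w->nb[1]);

    printf("up:   {%lld, %lld}\n", (long long) up->ne[0],   (long long) up->ne[1]);
    printf("gate: {%lld, %lld}\n", (long long) gate->ne[0], (long long) gate->ne[1]);

    ggml_free(ctx);
    return 0;
}

Because the split is along ne[1], the view offset must be expressed in whole-row strides (nb[1]), not element counts; for the transposed (2*n_ff, n_embd) layout of Case A the second half instead starts ggml_row_size(type, n_ff) bytes into each row.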