fixed tensor mappings and working on building the graph
This commit is contained in:
parent 4ceb828112
commit 18c0c23ed8
@@ -3015,9 +3015,6 @@ struct ggml_tensor * ggml_mul_mat(
         struct ggml_context * ctx,
         struct ggml_tensor  * a,
         struct ggml_tensor  * b) {
 
-    printf("Up: {%lld, %lld}\n", a->ne[0], a->ne[1]);
-    printf("Cur: {%lld, %lld}\n", b->ne[0], b->ne[1]);
-
     GGML_ASSERT(ggml_can_mul_mat(a, b));
     GGML_ASSERT(!ggml_is_transposed(a));
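
The prints removed above were checking the operand shapes that ggml_can_mul_mat() asserts. A minimal sketch of that convention, not part of the commit, assuming F32 tensors and an existing context; n_in, n_out and n_tokens are illustrative names:

    // ggml_mul_mat(ctx, w, x) treats ne[0] as the shared (input) dimension:
    // w->ne = {n_in, n_out}, x->ne = {n_in, n_tokens}  ->  result ne = {n_out, n_tokens}.
    // ggml_can_mul_mat() asserts w->ne[0] == x->ne[0].
    struct ggml_tensor * w = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_in, n_out);
    struct ggml_tensor * x = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_in, n_tokens);
    struct ggml_tensor * y = ggml_mul_mat(ctx, w, x); // y->ne = {n_out, n_tokens}
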
@@ -367,7 +367,7 @@ class TensorNameMap:
             "model.layers.{bid}.mlp.up_proj",                 # llama-hf refact nemotron olmo2
             "layers.{bid}.feed_forward.w3",                   # llama-pth
             "encoder.layer.{bid}.intermediate.dense",         # bert
-            "layers.{bid}.mlp.Wo",                            # modern bert
+            "layers.{bid}.mlp.Wi",                            # modern bert
             "transformer.layer.{bid}.ffn.lin1",               # distillbert
             "transformer.h.{bid}.mlp.fc_in",                  # gpt-j
             "transformer.h.{bid}.mlp.linear_3",               # refact
@@ -467,7 +467,7 @@ class TensorNameMap:
             "model.layers.{bid}.mlp.down_proj",                         # llama-hf nemotron olmo2
             "layers.{bid}.feed_forward.w2",                             # llama-pth
             "encoder.layer.{bid}.output.dense",                         # bert
-            "layers.{bid}.mlp.Wi",                                      # modern bert
+            "layers.{bid}.mlp.Wo",                                      # modern bert
             "transformer.layer.{bid}.ffn.lin2",                         # distillbert
             "transformer.h.{bid}.mlp.fc_out",                           # gpt-j
             "language_model.encoder.layers.{bid}.mlp.dense_4h_to_h",    # persimmon
@@ -2708,8 +2708,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                     layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, 3 * n_embd}, 0);
                     layer.wo   = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
 
-                    layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_ff, n_embd}, 0); // [3072, 384]
-                    layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_embd, 2 * n_ff}, 0);
+                    layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, 2 * n_ff}, 0);
+                    layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
                     layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
                 }
             } break;
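
The swapped shapes follow the same convention: a weight's ne[0] is the dimension it consumes in ggml_mul_mat. ModernBERT's fused Wi projects n_embd to 2*n_ff (up and gate halves concatenated for the GeGLU), and Wo projects n_ff back to n_embd. A small sanity-check sketch, not part of the commit, using the names already in scope in this block:

    // Expected extents after loading (hypothetical asserts):
    // ffn_up   (fused Wi): ne = {n_embd, 2*n_ff}
    // ffn_down (Wo)      : ne = {n_ff,   n_embd}
    GGML_ASSERT(layer.ffn_up->ne[0]   == n_embd && layer.ffn_up->ne[1]   == 2 * n_ff);
    GGML_ASSERT(layer.ffn_down->ne[0] == n_ff   && layer.ffn_down->ne[1] == n_embd);
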
@@ -7548,6 +7548,7 @@ struct llm_build_modern_bert : public llm_graph_context {
         const int64_t n_embd_head = hparams.n_embd_head_v;
         const int64_t n_embd_gqa  = hparams.n_embd_v_gqa(); // == n_head_kv * n_embd_head
         const int64_t n_tokens    = ubatch.n_tokens;
+        const int64_t n_ff        = hparams.n_ff();
 
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@@ -7667,30 +7668,63 @@ struct llm_build_modern_bert : public llm_graph_context {
 
         // MLP (prefer GEGLU if gate exists or up has 2*n_ff rows)
         ggml_tensor * mlp_out = nullptr;
         const bool has_gate_tensor = (model.layers[il].ffn_gate != nullptr);
         const bool up_is_2x = (model.layers[il].ffn_up && model.layers[il].ffn_up->ne[0] == 2*hparams.n_ff());
+        ggml_tensor * ffn_gate_view = model.layers[il].ffn_gate;
+        ggml_tensor * ffn_up_view   = model.layers[il].ffn_up;
+
+        if (ffn_gate_view == nullptr && ffn_up_view) {
+            // Case A: weight stored as (2*ffn, hidden) -> split rows into two (ffn x hidden)
+            if (ffn_up_view->ne[0] == 2 * n_ff && ffn_up_view->ne[1] == n_embd) {
+                // top half (ffn up)
+                ffn_up_view = ggml_view_2d(ctx0, model.layers[il].ffn_up,
+                    /*ne0*/ n_ff, /*ne1*/ n_embd,
+                    /*nb1*/ model.layers[il].ffn_up->nb[1],
+                    /*offset_bytes*/ (size_t) 0);
+                // bottom half (gate)
+                ffn_gate_view = ggml_view_2d(ctx0, model.layers[il].ffn_up,
+                    /*ne0*/ n_ff, /*ne1*/ n_embd,
+                    /*nb1*/ model.layers[il].ffn_up->nb[1],
+                    /*offset_bytes*/ (size_t) n_ff * model.layers[il].ffn_up->nb[0]);
+            }
+            // Case B: weight stored as (hidden, 2*ffn) -> split rows of dim 1 into two (hidden x ffn)
+            else if (ffn_up_view->ne[0] == n_embd && ffn_up_view->ne[1] == 2 * n_ff) {
+                // top half (ffn up)
+                ffn_up_view = ggml_view_2d(ctx0, model.layers[il].ffn_up,
+                    n_embd, n_ff,
+                    model.layers[il].ffn_up->nb[1],
+                    0);
+                ffn_up_view = ggml_cont(ctx0, ffn_up_view);
+                // bottom half (gate)
+                ffn_gate_view = ggml_view_2d(ctx0, model.layers[il].ffn_up,
+                    n_embd, n_ff,
+                    model.layers[il].ffn_up->nb[1],
+                    (size_t) n_ff * model.layers[il].ffn_up->nb[1]);
+                ffn_gate_view = ggml_cont(ctx0, ffn_gate_view);
+            }
+        }
+
+        ggml_tensor * ffn_down_view = model.layers[il].ffn_down;
+        LLAMA_LOG_INFO("ffn shapes: Up: {%lld, %lld}, Gate: {%lld, %lld}, Down: {%lld, %lld}\n",
+            ffn_up_view->ne[0], ffn_up_view->ne[1], ffn_gate_view->ne[0], ffn_gate_view->ne[1], ffn_down_view->ne[0], ffn_down_view->ne[1]);
 
         if (has_gate_tensor || up_is_2x) {
             mlp_out = build_ffn(
                 h,
                 model.layers[il].ffn_up,   /*up_b*/   NULL, /*up_shexp*/   NULL,
-                model.layers[il].ffn_gate, /*gate_b*/ NULL, /*gate_shexp*/ NULL,
+                ffn_gate_view,             /*gate_b*/ NULL, /*gate_shexp*/ NULL,
                 model.layers[il].ffn_down, /*down_b*/ NULL, /*down_shexp*/ NULL,
                 /*expert_scores*/ NULL,
-                LLM_FFN_GEGLU, LLM_FFN_PAR, il);
-            cb(mlp_out, "ffn_out_geglu", il);
+                LLM_FFN_GEGLU, LLM_FFN_PAR, il
+            );
+            cb(mlp_out, "ffn_out_geglu", il);
         } else {
-            LLAMA_LOG_INFO("Ffn_up : {%lld, %lld}, ffn_down : {%lld, %lld}\n", model.layers[il].ffn_up->ne[0], model.layers[il].ffn_up->ne[1],
-                model.layers[il].ffn_down->ne[0], model.layers[il].ffn_down->ne[0]);
             mlp_out = build_ffn(
                 h,
-                model.layers[il].ffn_up,   /*up_b*/   NULL, /*up_shexp*/   NULL,
-                /*gate*/ NULL,             /*gate_b*/ NULL, /*gate_shexp*/ NULL,
-                model.layers[il].ffn_down, /*down_b*/ NULL, /*down_shexp*/ NULL,
-                /*expert_scores*/ NULL,
-                LLM_FFN_GELU, LLM_FFN_SEQ, il);
-            cb(mlp_out, "ffn_out_gelu", il);
+                model.layers[il].ffn_up,   NULL, NULL,
+                model.layers[il].ffn_gate, NULL, NULL,
+                model.layers[il].ffn_down, NULL, NULL,
+                NULL,
+                LLM_FFN_GEGLU, LLM_FFN_PAR, il
+            );
+            cb(mlp_out, "ffn_out_geglu", il);
         }
// Residual after MLP
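
Putting the pieces of this hunk together: the fused up weight is split into an up view and a gate view, which feed a GEGLU-style FFN. A standalone sketch of that path, not the committed code, assuming the {n_embd, 2*n_ff} layout loaded above, F32 weights, and that the first n_ff rows of the fused tensor are the up branch and the last n_ff rows the gate (the packing order in the checkpoint is an assumption here); h is the layer input as in the code above:

    // Split the fused weight along dim 1: rows [0, n_ff) and [n_ff, 2*n_ff).
    // Offsets are in bytes, so they are derived from nb[1] (the per-row stride).
    ggml_tensor * w_fused = model.layers[il].ffn_up;                        // ne = {n_embd, 2*n_ff}
    ggml_tensor * w_up    = ggml_view_2d(ctx0, w_fused, n_embd, n_ff,
                                         w_fused->nb[1], 0);
    ggml_tensor * w_gate  = ggml_view_2d(ctx0, w_fused, n_embd, n_ff,
                                         w_fused->nb[1], (size_t) n_ff * w_fused->nb[1]);
    // Some backends want contiguous weights; ggml_cont() copies the views if needed.
    w_up   = ggml_cont(ctx0, w_up);
    w_gate = ggml_cont(ctx0, w_gate);

    // GEGLU: ffn_out = W_down * (gelu(W_gate * h) .* (W_up * h)), elementwise product.
    ggml_tensor * g = ggml_mul_mat(ctx0, w_gate, h);                        // {n_ff, n_tokens}
    ggml_tensor * u = ggml_mul_mat(ctx0, w_up,   h);                        // {n_ff, n_tokens}
    ggml_tensor * ffn_out = ggml_mul_mat(ctx0, model.layers[il].ffn_down,
                                         ggml_mul(ctx0, ggml_gelu(ctx0, g), u)); // {n_embd, n_tokens}
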