diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp
index 18dcc6ddfe..031b4c486f 100644
--- a/src/llama-arch.cpp
+++ b/src/llama-arch.cpp
@@ -18,6 +18,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_STARCODER, "starcoder" },
     { LLM_ARCH_REFACT, "refact" },
     { LLM_ARCH_BERT, "bert" },
+    { LLM_ARCH_MODERN_BERT, "modern-bert" },
     { LLM_ARCH_NOMIC_BERT, "nomic-bert" },
     { LLM_ARCH_NOMIC_BERT_MOE, "nomic-bert-moe" },
     { LLM_ARCH_NEO_BERT, "neo-bert" },
@@ -505,6 +506,21 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_NAMES = {
            { LLM_TENSOR_CLS_OUT, "cls.output" },
        },
    },
+    {
+        LLM_ARCH_MODERN_BERT,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
+            { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
+            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+        },
+    },
    {
        LLM_ARCH_NOMIC_BERT,
        {
diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp
index 053c72d6dc..417552096f 100644
--- a/src/llama-graph.cpp
+++ b/src/llama-graph.cpp
@@ -1375,7 +1375,9 @@ ggml_tensor * llm_graph_context::build_attn(
    // [TAG_NO_CACHE_PAD]
    // TODO: if ubatch.equal_seqs() == true, we can split the three tensors below into ubatch.n_seqs_unq streams
-    assert(!ubatch.equal_seqs());
+    LLAMA_LOG_INFO("ubatch.equal_seqs() = %d, n_seqs = %d\n", ubatch.equal_seqs(), ubatch.n_seqs);
+
+    //assert(!ubatch.equal_seqs());
    ggml_tensor * q = q_cur;
    ggml_tensor * k = k_cur;
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 58ca7df707..67fc2d003c 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -451,6 +451,7 @@ void llama_model::load_arch(llama_model_loader & ml) {
}

void llama_model::load_hparams(llama_model_loader & ml) {
+
    const gguf_context * ctx = ml.meta.get();

    // get metadata as string
@@ -464,6 +465,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
        gguf_kv.emplace(name, value);
    }
+
    // get general kv
    ml.get_key(LLM_KV_GENERAL_NAME, name, false);
@@ -584,6 +586,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
    }

    // arch-specific KVs
+    LLAMA_LOG_INFO("Switching Arch\n");
    switch (arch) {
        case LLM_ARCH_LLAMA:
            {
@@ -757,6 +760,16 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                default: type = LLM_TYPE_UNKNOWN;
            }
        } break;
+        case LLM_ARCH_MODERN_BERT:
+            {
+                //ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
+                LLAMA_LOG_INFO("Switching Modern Bert Arch\n");
+                switch (hparams.n_layer) {
+                    case 12:
+                        type = LLM_TYPE_47M; break; // granite-embedding-small
+                    default: type = LLM_TYPE_UNKNOWN;
+                }
+            } break;
        case LLM_ARCH_JINA_BERT_V2:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
@@ -1888,7 +1901,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {

void llama_model::load_vocab(llama_model_loader & ml) {
    const auto kv = LLM_KV(arch);
-
    vocab.load(ml, kv);
}
@@ -2022,6 +2034,22 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
    const int64_t n_expert_used = hparams.n_expert_used;
    const int64_t n_ctx_train = hparams.n_ctx_train;

+    LLAMA_LOG_INFO("n_head = %lld\n", (long long) n_head);
+    LLAMA_LOG_INFO("n_head_kv = %lld\n", (long long) n_head_kv);
+    LLAMA_LOG_INFO("n_embd = %lld\n", (long long) n_embd);
+    LLAMA_LOG_INFO("n_embd_k_gqa = %lld\n", (long long) n_embd_k_gqa);
+    LLAMA_LOG_INFO("n_embd_v_gqa = %lld\n", (long long) n_embd_v_gqa);
+    LLAMA_LOG_INFO("n_embd_head_k = %lld\n", (long long) n_embd_head_k);
+    LLAMA_LOG_INFO("n_embd_head_v = %lld\n", (long long) n_embd_head_v);
+    LLAMA_LOG_INFO("n_ff = %lld\n", (long long) n_ff);
+    LLAMA_LOG_INFO("n_embd_gqa = %lld\n", (long long) n_embd_gqa);
+    LLAMA_LOG_INFO("n_vocab = %lld\n", (long long) n_vocab);
+    LLAMA_LOG_INFO("n_token_types = %lld\n", (long long) n_token_types);
+    LLAMA_LOG_INFO("n_rot = %lld\n", (long long) n_rot);
+    LLAMA_LOG_INFO("n_expert = %lld\n", (long long) n_expert);
+    LLAMA_LOG_INFO("n_expert_used = %lld\n", (long long) n_expert_used);
+    LLAMA_LOG_INFO("n_ctx_train = %lld\n", (long long) n_ctx_train);
+
    if (n_expert > 0 && hparams.n_expert_used == 0) {
        throw std::runtime_error("model has expert layers but no expert layers are used");
    }
@@ -2033,7 +2061,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
    auto create_tensor = [&](const LLM_TN_IMPL & tn, const std::initializer_list<int64_t> & ne, int flags) -> ggml_tensor * {
        ggml_tensor * t_meta = ml.get_tensor_meta(tn.str().c_str());
-
+        LLAMA_LOG_INFO("Creating Tensor: %s\n", tn.str().c_str());
        if (!t_meta) {
            if (flags & TENSOR_NOT_REQUIRED) {
                return nullptr;
            }
@@ -2108,7 +2136,6 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
        }

        ggml_backend_buffer_type_t buft = nullptr;
-
        // check overrides
        if (ml.tensor_buft_overrides) {
            std::string tensor_name = tn.str();
@@ -2156,7 +2183,6 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                first_moved_to_buft = buft;
            }
        }
-
        ggml_context * ctx = ctx_for_buft(buft);

        // if duplicated, check if the original tensor was allocated in the same buffer type context and avoid creating a new one
@@ -2614,11 +2640,14 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
            case LLM_ARCH_NOMIC_BERT_MOE:
                {
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
-                    type_embd = create_tensor(tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_token_types}, TENSOR_NOT_REQUIRED);
+                    tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0);
+
+                    type_embd = create_tensor(tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_token_types}, TENSOR_NOT_REQUIRED);
+                    tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}, 0);
+
                    if (arch == LLM_ARCH_BERT) {
                        pos_embd = create_tensor(tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, n_ctx_train}, 0);
-
                        cls = create_tensor(tn(LLM_TENSOR_CLS, "weight"), {n_embd, n_embd}, TENSOR_NOT_REQUIRED);
                        cls_b = create_tensor(tn(LLM_TENSOR_CLS, "bias"), {n_embd}, TENSOR_NOT_REQUIRED);
@@ -2626,14 +2655,11 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                        cls_out_b = create_tensor(tn(LLM_TENSOR_CLS_OUT, "bias"), {hparams.n_cls_out}, TENSOR_NOT_REQUIRED);
                    }

-                    tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0);
-                    tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}, 0);
-
                    for (int i = 0; i < n_layer; ++i) {
                        auto & layer = layers[i];

                        layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
-                        layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
+
                        if (!layer.wqkv) {
                            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
@@ -2647,7 +2673,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                        }

                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
-
+
+                        layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
                        layer.attn_out_norm = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}, 0);
                        layer.attn_out_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "bias", i), {n_embd}, 0);
@@ -2657,6 +2684,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                            layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert}, 0);
                            layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
                        } else {
+
                            layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
                            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
@@ -2671,6 +2699,33 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                        layer.layer_out_norm = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd}, 0);
                        layer.layer_out_norm_b = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "bias", i), {n_embd}, 0);
+
                    }
                } break;
+            case LLM_ARCH_MODERN_BERT:
+                {
+                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+                    tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0);
+
+                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+
+                    for (int i = 0; i < n_layer; ++i) {
+                        auto & layer = layers[i];
+
+                        // layer 0 uses Identity() in place of attn_norm, so its weights are optional
+                        if (i != 0) {
+                            layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+                        } else {
+                            layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
+                        }
+
+                        layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
+                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+
+                        layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_ff, n_embd}, 0);
+                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_embd, n_ff * 2}, 0);
+                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+                    }
+                } break;
            case LLM_ARCH_NEO_BERT:
@@ -7498,6 +7553,175 @@ struct llm_build_bert : public llm_graph_context {
    }
};

+struct llm_build_modern_bert : public llm_graph_context {
+    llm_build_modern_bert(const llama_model & model, const llm_graph_params & params)
+        : llm_graph_context(params) {
+        const int64_t n_embd = hparams.n_embd;
+        const int64_t n_layer = hparams.n_layer;
+        const int64_t n_head = hparams.n_head();
+        const int64_t n_head_kv = hparams.n_head_kv();
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+        const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); // == n_head_kv * n_embd_head
+        const int64_t n_tokens = ubatch.n_tokens;
+
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+
+        // RoPE params
+        const int32_t rope_type = LLAMA_ROPE_TYPE_NEOX; // ModernBERT uses rotary embeddings
+        const int32_t n_rot = hparams.n_rot;
+        const int32_t n_ctx_orig = hparams.n_ctx_train;
+
+        ggml_tensor * cur;
+        ggml_tensor * inpL;
+        ggml_tensor * inp_pos = nullptr;
+
+        // ModernBERT needs positions for RoPE
+        inp_pos = build_inp_pos();
+
+        // 1) embeddings (token + optional type), NO absolute pos embed
+        inpL = build_inp_embd(model.tok_embd);
+
+        if (model.type_embd) {
+            ggml_tensor * type_row0 = ggml_view_1d(ctx0, model.type_embd, n_embd, 0);
+            inpL = ggml_add(ctx0, inpL, type_row0);
+        }
+        cb(inpL, "inp_embd", -1);
+
+        // 2) embeddings LayerNorm (embeddings.norm)
+        inpL = build_norm(inpL, model.tok_norm, model.tok_norm_b, LLM_NORM, -1);
+        cb(inpL, "inp_norm", -1);
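        // [editor's annotation, not part of the patch] The per-layer attention below
        // recovers Q/K/V from a single fused Wqkv matmul whose output rows are laid out
        // as [ Q : n_embd | K : n_embd_gqa | V : n_embd_gqa ] per token. With
        // illustrative numbers only (n_embd = 768, n_head = n_head_kv = 12,
        // n_embd_head = 64, hence n_embd_gqa = 768), each token row carries 2304 floats:
        // Q starts at byte offset 0, K at sizeof(float)*768, V at sizeof(float)*1536.
        // The sizeof(float) stride assumes F32 activations, the same convention the
        // existing llm_build_bert splitter relies on.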
"inp_norm", -1); + + auto * inp_attn = build_attn_inp_no_cache(); + ggml_tensor * inp_out_ids = build_inp_out_ids(); + + for (int il = 0; il < n_layer; ++il) { + ggml_tensor * x = inpL; + + // pre-attention norm (attn_norm). Layer 0 may be Identity() -> nullptr + ggml_tensor * x_attn_in = x; + if (model.layers[il].attn_norm) { + x_attn_in = build_norm(x, + model.layers[il].attn_norm, + model.layers[il].attn_norm_b, + LLM_NORM, il); + cb(x_attn_in, "attn_pre_norm", il); + } else { + cb(x_attn_in, "attn_pre_norm_identity", il); + } + + // Attention: fused Wqkv -> split -> heads -> RoPE(Q,K) -> attn -> Wo + ggml_tensor * qkv = nullptr; + ggml_tensor * Qcur; + ggml_tensor * Kcur; + ggml_tensor * Vcur; + + GGML_ASSERT(model.layers[il].wqkv); // ModernBERT uses fused QKV + qkv = build_lora_mm(model.layers[il].wqkv, x_attn_in); + cb(qkv, "wqkv", il); + + if (model.layers[il].bqkv) { + qkv = ggml_add(ctx0, qkv, model.layers[il].bqkv); + cb(qkv, "bqkv", il); + } + + // Fused layout: [ (n_embd + 2*n_embd_gqa), n_tokens ] + Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, qkv, n_embd, n_tokens, qkv->nb[1], 0*sizeof(float)*(n_embd))); + Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, qkv, n_embd_gqa, n_tokens, qkv->nb[1], 1*sizeof(float)*(n_embd))); + Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, qkv, n_embd_gqa, n_tokens, qkv->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); + + // Optional per Q/K + if (model.layers[il].attn_q_norm) { + Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, model.layers[il].attn_q_norm_b, LLM_NORM, il); + } + if (model.layers[il].attn_k_norm) { + Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, model.layers[il].attn_k_norm_b, LLM_NORM, il); + } + + // Heads + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); + Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); + + // RoPE (NEOX) on Q and K + Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow); + Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow); + + cb(Qcur, "Qcur_rope", il); + cb(Kcur, "Kcur_rope", il); + cb(Vcur, "Vcur", il); + + ggml_tensor * attn_out = build_attn( + inp_attn, + model.layers[il].wo, model.layers[il].bo, // Wo, optional bias + Qcur, Kcur, Vcur, + /*K_cache*/ nullptr, + /*V_cache*/ nullptr, + 1.0f / sqrtf(float(n_embd_head)), + il); + cb(attn_out, "attn_out", il); + + // Residual after attention + ggml_tensor * cur_attn = ggml_add(ctx0, attn_out, x); + + // If we subselect outputs, do it at the last layer after attn resid + if (il == n_layer - 1 && inp_out_ids) { + cur_attn = ggml_get_rows(ctx0, cur_attn, inp_out_ids); + x = ggml_get_rows(ctx0, x, inp_out_ids); + } + + // 5) pre-MLP norm (mlp_norm) + ggml_tensor * h = build_norm(cur_attn, + model.layers[il].ffn_norm, + model.layers[il].ffn_norm_b, + LLM_NORM, il); + cb(h, "mlp_pre_norm", il); + + // 6) MLP (prefer GEGLU if gate exists or up has 2*n_ff rows) + ggml_tensor * mlp_out = nullptr; + const bool has_gate_tensor = (model.layers[il].ffn_gate != nullptr); + const bool up_is_2x = (model.layers[il].ffn_up && model.layers[il].ffn_up->ne[0] == 2*hparams.n_ff()); + + if (has_gate_tensor || up_is_2x) { + mlp_out = build_ffn( + h, + model.layers[il].ffn_up, /*up_b*/ nullptr, /*up_shexp*/ nullptr, + model.layers[il].ffn_gate, /*gate_b*/ nullptr, 
+            // 6) MLP (prefer GEGLU if a gate tensor exists or ffn_up holds 2*n_ff rows)
+            ggml_tensor * mlp_out = nullptr;
+            const bool has_gate_tensor = (model.layers[il].ffn_gate != nullptr);
+            const bool up_is_2x = (model.layers[il].ffn_up && model.layers[il].ffn_up->ne[0] == 2*hparams.n_ff());
+
+            if (has_gate_tensor || up_is_2x) {
+                mlp_out = build_ffn(
+                        h,
+                        model.layers[il].ffn_up, /*up_b*/ nullptr, /*up_s*/ nullptr,
+                        model.layers[il].ffn_gate, /*gate_b*/ nullptr, /*gate_s*/ nullptr,
+                        model.layers[il].ffn_down, /*down_b*/ nullptr, /*down_s*/ nullptr,
+                        /*act_scales*/ nullptr,
+                        LLM_FFN_GEGLU, LLM_FFN_PAR, il);
+                cb(mlp_out, "ffn_out_geglu", il);
+            } else {
+                mlp_out = build_ffn(
+                        h,
+                        model.layers[il].ffn_up, /*up_b*/ nullptr, /*up_s*/ nullptr,
+                        /*gate*/ nullptr, /*gate_b*/ nullptr, /*gate_s*/ nullptr,
+                        model.layers[il].ffn_down, /*down_b*/ nullptr, /*down_s*/ nullptr,
+                        /*act_scales*/ nullptr,
+                        LLM_FFN_GELU, LLM_FFN_SEQ, il);
+                cb(mlp_out, "ffn_out_gelu", il);
+            }
+
+            // 7) residual after MLP
+            ggml_tensor * cur_layer = ggml_add(ctx0, mlp_out, cur_attn);
+
+            // 8) feed into the next layer
+            inpL = cur_layer;
+        }
+
+        // 9) final model norm (final_norm)
+        cur = build_norm(inpL, model.output_norm, model.output_norm_b, LLM_NORM, -1);
+        cb(cur, "final_norm", -1);
+
+        res->t_embd = cur;
+        ggml_build_forward_expand(gf, cur);
+    }
+};
+
+
struct llm_build_neo_bert : public llm_graph_context {
    llm_build_neo_bert(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
        const int64_t n_embd_head = hparams.n_embd_head_v;
@@ -18186,6 +18410,10 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
            {
                llm = std::make_unique<llm_build_bert>(*this, params);
            } break;
+        case LLM_ARCH_MODERN_BERT:
+            {
+                llm = std::make_unique<llm_build_modern_bert>(*this, params);
+            } break;
        case LLM_ARCH_NEO_BERT:
            {
                llm = std::make_unique<llm_build_neo_bert>(*this, params);
@@ -18666,6 +18894,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
        case LLM_ARCH_GROK:
        case LLM_ARCH_DBRX:
        case LLM_ARCH_BERT:
+        case LLM_ARCH_MODERN_BERT:
        case LLM_ARCH_NOMIC_BERT:
        case LLM_ARCH_NOMIC_BERT_MOE:
        case LLM_ARCH_STABLELM:
diff --git a/src/llama-model.h b/src/llama-model.h
index 6fcd74d57f..5ebe320e36 100644
--- a/src/llama-model.h
+++ b/src/llama-model.h
@@ -23,6 +23,7 @@ enum llm_type {
    LLM_TYPE_17M,
    LLM_TYPE_22M,
    LLM_TYPE_33M,
+    LLM_TYPE_47M,
    LLM_TYPE_60M,
    LLM_TYPE_70M,
    LLM_TYPE_80M,
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
index f7e03e702e..92a21b6426 100644
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@@ -1661,10 +1661,13 @@ private:
void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
    struct gguf_context * ctx = ml.meta.get();

+    LLAMA_LOG_INFO("Determining Vocab Type\n");
    // determine vocab type
    {
        ml.get_key(LLM_KV_TOKENIZER_MODEL, tokenizer_model);
        ml.get_key(LLM_KV_TOKENIZER_PRE, tokenizer_pre, false);
+        LLAMA_LOG_INFO("pre tokenizer model: %s\n", tokenizer_pre.c_str());
+        LLAMA_LOG_INFO("tokenizer model: %s\n", tokenizer_model.c_str());

        ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, n_token_types, false);
@@ -1813,7 +1816,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
            LLAMA_LOG_WARN("%s: ************************************ \n", __func__);
            LLAMA_LOG_WARN("%s: \n", __func__);
            pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
-        } else if (tokenizer_pre == "default") {
+        } else if (tokenizer_pre == "default" || tokenizer_pre == "modern-bert") {
            pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
        } else if (
                tokenizer_pre == "llama3" ||
diff --git a/src/llama.cpp b/src/llama.cpp
index 34906cdb62..024e142453 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -126,6 +126,7 @@ static int llama_model_load(const std::string & fname, std::vector<std::string> & splits, llama_model & model, llama_model_params & params) {
        if (!model.load_tensors(ml)) {
            return -2;
        }
+
    } catch (const std::exception & err) {
        LLAMA_LOG_ERROR("%s: error loading model: %s\n", __func__, err.what());
        return -1;
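Editor's note: to exercise the new modern-bert path end to end, an embedding smoke test against the public llama.h API could look like the sketch below. It is illustrative only: the GGUF filename is hypothetical, mean pooling is an assumed choice rather than something this patch mandates, and error handling is kept minimal.

// modern_bert_smoke.cpp -- illustrative sketch, not part of the patch above.
// Assumes a GGUF converted with general.architecture = "modern-bert";
// the model path is hypothetical.
#include "llama.h"

#include <cstdio>
#include <cstring>
#include <vector>

int main() {
    llama_backend_init();

    llama_model_params mparams = llama_model_default_params();
    llama_model * model = llama_model_load_from_file("granite-embedding-small.gguf", mparams);
    if (!model) {
        return 1;
    }

    llama_context_params cparams = llama_context_default_params();
    cparams.embeddings   = true;                    // we want t_embd, not logits
    cparams.pooling_type = LLAMA_POOLING_TYPE_MEAN; // assumed pooling; adjust as needed
    llama_context * ctx = llama_init_from_model(model, cparams);

    const llama_vocab * vocab = llama_model_get_vocab(model);

    const char * text = "hello world";
    std::vector<llama_token> tokens(64);
    const int32_t n_tokens = llama_tokenize(vocab, text, (int32_t) strlen(text),
            tokens.data(), (int32_t) tokens.size(),
            /*add_special*/ true, /*parse_special*/ true);
    if (n_tokens < 0) {
        return 1;
    }
    tokens.resize(n_tokens);

    // single-sequence batch; positions matter because modern-bert applies RoPE
    llama_batch batch = llama_batch_get_one(tokens.data(), (int32_t) tokens.size());
    if (llama_decode(ctx, batch) != 0) {
        return 1;
    }

    const float * emb = llama_get_embeddings_seq(ctx, 0);
    const int n_embd = llama_model_n_embd(model);
    printf("embd[0..2] = %f %f %f (n_embd = %d)\n", emb[0], emb[1], emb[2], n_embd);

    llama_free(ctx);
    llama_model_free(model);
    llama_backend_free();
    return 0;
}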