diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp
index 417552096f..a05de6e585 100644
--- a/src/llama-graph.cpp
+++ b/src/llama-graph.cpp
@@ -1377,7 +1377,7 @@ ggml_tensor * llm_graph_context::build_attn(
     // TODO: if ubatch.equal_seqs() == true, we can split the three tensors below into ubatch.n_seqs_unq streams
     LLAMA_LOG_INFO("ubatch.equal_seqs() = %d, n_seqs = %d\n", ubatch.equal_seqs(), ubatch.n_seqs);
 
-    //assert(!ubatch.equal_seqs());
+    assert(!ubatch.equal_seqs());
 
     ggml_tensor * q = q_cur;
     ggml_tensor * k = k_cur;
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 67fc2d003c..e31f5e5159 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -7578,7 +7578,7 @@ struct llm_build_modern_bert : public llm_graph_context {
         // ModernBERT needs positions for RoPE
         inp_pos = build_inp_pos();
 
-        // 1) embeddings (token + optional type), NO absolute pos embed
+        // embeddings (token + optional type), NO absolute pos embed
         inpL = build_inp_embd(model.tok_embd);
 
         if (model.type_embd) {
@@ -7587,7 +7587,7 @@ struct llm_build_modern_bert : public llm_graph_context {
         }
         cb(inpL, "inp_embd", -1);
 
-        // 2) embeddings LayerNorm (embeddings.norm)
+        // embeddings LayerNorm (embeddings.norm)
         inpL = build_norm(inpL, model.tok_norm, model.tok_norm_b, LLM_NORM, -1);
         cb(inpL, "inp_norm", -1);
 
@@ -7673,14 +7673,14 @@ struct llm_build_modern_bert : public llm_graph_context {
                 x = ggml_get_rows(ctx0, x, inp_out_ids);
             }
 
-            // 5) pre-MLP norm (mlp_norm)
+            // pre-MLP norm (mlp_norm)
             ggml_tensor * h = build_norm(cur_attn,
                     model.layers[il].ffn_norm,
                     model.layers[il].ffn_norm_b,
                     LLM_NORM, il);
             cb(h, "mlp_pre_norm", il);
 
-            // 6) MLP (prefer GEGLU if gate exists or up has 2*n_ff rows)
+            // MLP (prefer GEGLU if gate exists or up has 2*n_ff rows)
             ggml_tensor * mlp_out = nullptr;
             const bool has_gate_tensor = (model.layers[il].ffn_gate != nullptr);
             const bool up_is_2x = (model.layers[il].ffn_up && model.layers[il].ffn_up->ne[0] == 2*hparams.n_ff());
@@ -7705,14 +7705,14 @@ struct llm_build_modern_bert : public llm_graph_context {
                 cb(mlp_out, "ffn_out_gelu", il);
             }
 
-            // 7) Residual after MLP
+            // residual after MLP
             ggml_tensor * cur_layer = ggml_add(ctx0, mlp_out, cur_attn);
 
-            // 8) feed into next layer
+            // feed into next layer
             inpL = cur_layer;
         }
 
-        // 9) final model norm (final_norm)
+        // final model norm (final_norm)
         cur = build_norm(inpL, model.output_norm, model.output_norm_b, LLM_NORM, -1);
         cb(cur, "final_norm", -1);
 
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
index 92a21b6426..0b6c8c73e2 100644
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@@ -1816,7 +1816,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
            LLAMA_LOG_WARN("%s: ************************************ \n", __func__);
            LLAMA_LOG_WARN("%s: \n", __func__);
            pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
-        } else if (tokenizer_pre == "default" || tokenizer_pre == "modern-bert") {
+        } else if (tokenizer_pre == "default" || tokenizer_pre == "modern-bert") { // TODO: fix the modern-bert pre-tokenizer (falls back to default for now)
            pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
        } else if (
                tokenizer_pre == "llama3" ||