diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index 16c5acf346..f5a9341c61 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -10028,6 +10028,14 @@ class ModernBertModel(BertModel):
 
         if name.startswith("model."):
             name = name[6:]
+
+        if self.cls_out_labels:
+            # For BertForSequenceClassification (direct projection layer)
+            if name == "classifier.weight":
+                name = "classifier.out_proj.weight"
+
+            if name == "classifier.bias":
+                name = "classifier.out_proj.bias"
 
         return super().modify_tensors(data_torch, name, bid)
 
diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp
index dbdae5719e..5d6f313ede 100644
--- a/src/llama-graph.cpp
+++ b/src/llama-graph.cpp
@@ -2056,7 +2056,12 @@ void llm_graph_context::build_pooling(
                     if (cls_b) {
                         cur = ggml_add(ctx0, cur, cls_b);
                     }
-                    cur = ggml_tanh(ctx0, cur);
+                    // modernbert uses gelu
+                    if (arch == LLM_ARCH_MODERN_BERT) {
+                        cur = ggml_gelu(ctx0, cur);
+                    } else {
+                        cur = ggml_tanh(ctx0, cur);
+                    }
 
                     if (cls_norm) {
                         // head norm
diff --git a/src/models/modern-bert.cpp b/src/models/modern-bert.cpp
index c7809bdedf..299e188216 100644
--- a/src/models/modern-bert.cpp
+++ b/src/models/modern-bert.cpp
@@ -110,13 +110,6 @@ llm_build_modern_bert::llm_build_modern_bert(const llama_model & model, co
             LLM_NORM, -1);
     cb(cur, "final_norm_out", -1);
 
-    if (hparams.pooling_type == LLAMA_POOLING_TYPE_CLS) {
-        // extracting cls token
-        cur = ggml_view_1d(ctx0, cur, hparams.n_embd, 0);
-        cb(cur, "cls_pooled_embd", -1);
-    }
-
-    cb(cur, "res_embd", -1);
     res->t_embd = cur;
     ggml_build_forward_expand(gf, cur);
 }