From 6151592ea795a4bf5950a16b917b89a21e4131c4 Mon Sep 17 00:00:00 2001
From: ryan-mangeno
Date: Thu, 21 Aug 2025 12:38:04 -0400
Subject: [PATCH] constants and tensor mappings for modern bert support, model not supported yet but working on getting conversion to work for encoder only

---
 convert_hf_to_gguf.py          | 36 ++++++++++++++++++++++++++++++++++++
 gguf-py/gguf/constants.py      | 14 ++++++++++++++
 gguf-py/gguf/tensor_mapping.py |  9 +++++++++
 src/llama-arch.h               |  1 +
 4 files changed, 60 insertions(+)

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index b8c7d97a78..6251529e54 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -133,6 +133,7 @@ class ModelBase:
                 self.ftype = gguf.LlamaFileType.MOSTLY_BF16
 
         # Configure GGUF Writer
+        logger.debug(f"arch: {gguf.MODEL_ARCH_NAMES[self.model_arch]}")
         self.gguf_writer = gguf.GGUFWriter(path=None, arch=gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=self.use_temp_file,
                                            split_max_tensors=split_max_tensors, split_max_size=split_max_size, dry_run=dry_run, small_first_shard=small_first_shard)
 
@@ -465,6 +466,7 @@ class ModelBase:
     @classmethod
     def from_model_architecture(cls, arch: str, model_type = ModelType.TEXT) -> type[ModelBase]:
         try:
+            logger.debug(f"model_type: {model_type}, arch: {arch}")
             return cls._model_classes[model_type][arch]
         except KeyError:
             raise NotImplementedError(f'Architecture {arch!r} not supported!') from None
@@ -8303,6 +8305,40 @@ class SmallThinkerModel(TextModel):
         experts = [k for d in self._experts for k in d.keys()]
         if len(experts) > 0:
             raise ValueError(f"Unprocessed experts: {experts}")
+
+
+@ModelBase.register("ModernBertModel")
+class ModernBertModel(TextModel):
+    model_arch = gguf.MODEL_ARCH.MODERN_BERT
+
+    def set_gguf_parameters(self) -> None:
+        # Determine block count (number of hidden layers)
+        block_count = self.hparams.get("num_hidden_layers") or self.hparams.get("num_hidden_layers_alt")
+        if block_count is None:
+            raise ValueError("Could not determine number of hidden layers from hparams")
+
+        # Attention heads and dimensions
+        n_head = self.hparams.get("num_attention_heads")
+        if n_head is None:
+            raise ValueError("Missing 'num_attention_heads' in hparams")
+
+        hidden_size = self.hparams["hidden_size"]
+        head_dim = hidden_size // n_head
+        ffn_dim = self.hparams.get("intermediate_size", 4 * hidden_size)
+
+        # GGUF parameter assignment
+        self.gguf_writer.add_context_length(self.hparams.get("max_position_embeddings", 512))
+        self.gguf_writer.add_embedding_length(hidden_size)
+        self.gguf_writer.add_feed_forward_length(ffn_dim)
+        self.gguf_writer.add_block_count(block_count)
+        self.gguf_writer.add_head_count(n_head)
+        self.gguf_writer.add_layer_norm_eps(self.hparams.get("layer_norm_eps", 1e-12))
+        self.gguf_writer.add_file_type(self.ftype)
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        # Directly map tensor names without QKV splitting or reordering
+        return [(self.map_tensor_name(name), data_torch)]
+
 
 
 ###### CONVERSION LOGIC ######
diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py
index 911eea504a..1273ca31d5 100644
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -311,6 +311,7 @@ class MODEL_ARCH(IntEnum):
     STARCODER        = auto()
     REFACT           = auto()
     BERT             = auto()
+    MODERN_BERT      = auto()
     NOMIC_BERT       = auto()
     NOMIC_BERT_MOE   = auto()
     NEO_BERT         = auto()
@@ -642,6 +643,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
     MODEL_ARCH.STARCODER:        "starcoder",
     MODEL_ARCH.REFACT:           "refact",
     MODEL_ARCH.BERT:             "bert",
+    MODEL_ARCH.MODERN_BERT:      "modern-bert",
     MODEL_ARCH.NOMIC_BERT:       "nomic-bert",
     MODEL_ARCH.NOMIC_BERT_MOE:   "nomic-bert-moe",
     MODEL_ARCH.NEO_BERT:         "neo-bert",
@@ -1172,6 +1174,18 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
         MODEL_TENSOR.CLS,
         MODEL_TENSOR.CLS_OUT,
     ],
+    MODEL_ARCH.MODERN_BERT: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.TOKEN_EMBD_NORM,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.ATTN_QKV,
+        MODEL_TENSOR.POS_EMBD,
+        MODEL_TENSOR.FFN_UP,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_NORM,
+    ],
     MODEL_ARCH.NOMIC_BERT: [
         MODEL_TENSOR.TOKEN_EMBD,
         MODEL_TENSOR.TOKEN_EMBD_NORM,
diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py
index dc7c03b464..2d3c16ab84 100644
--- a/gguf-py/gguf/tensor_mapping.py
+++ b/gguf-py/gguf/tensor_mapping.py
@@ -16,6 +16,7 @@ class TensorNameMap:
             "model.embed_tokens",                        # llama-hf nemotron olmoe olmo2 rwkv6qwen2 glm4-0414 plamo2 granite-hybrid
             "tok_embeddings",                            # llama-pth
             "embeddings.word_embeddings",                # bert nomic-bert
+            "embeddings.tok_embeddings",                 # modern bert
             "language_model.embedding.word_embeddings",  # persimmon
             "wte",                                       # gpt2
             "transformer.embd.wte",                      # phi2
@@ -45,6 +46,7 @@
         MODEL_TENSOR.TOKEN_EMBD_NORM: (
             "word_embeddings_layernorm",  # bloom
             "embeddings.LayerNorm",       # bert
+            "embeddings.norm",            # modern bert
             "emb_ln",                     # nomic-bert
             "transformer.norm",           # openelm
             "rwkv.blocks.0.pre_ln",       # rwkv
@@ -98,6 +100,7 @@
             "backbone.final_layer_norm",  # wavtokenizer
             "model.norm",                 # llama4
             "model.transformer.ln_f",     # llada
+            "final_norm",                 # modern bert
         ),
 
         # Rope frequencies
@@ -142,9 +145,10 @@ class TensorNameMap:
             "model.layers.{bid}.ln1",                    # rwkv7
             "model.layers.{bid}.input_layernorm",        # llama4
             "transformer_encoder.{bid}.attention_norm",  # neobert
+            "layers.{bid}.attn_norm",                    # modern bert
             "model.layers.{bid}.operator_norm",          # lfm2
             "model.transformer.blocks.{bid}.attn_norm",  # llada
             "layers.{bid}.input_layernorm",              # qwen3-embedding
         ),
 
         # Attention norm 2
@@ -174,6 +178,7 @@
             "encoder.layers.{bid}.self_attention.query_key_value",  # chatglm
             "transformer.layers.{bid}.attn.qkv_proj",               # openelm
             "transformer_encoder.{bid}.qkv",                        # neobert
+            "layers.{bid}.attn.Wqkv",                               # modern bert
         ),
 
         # Attention query
@@ -240,6 +245,7 @@
             "model.layers.{bid}.self_attn.linear_attn",                  # deci
             "layers.{bid}.attention.wo",                                 # llama-pth
             "encoder.layer.{bid}.attention.output.dense",                # bert
+            "layers.{bid}.attn.Wo",                                      # modern bert
             "transformer.layer.{bid}.attention.out_lin",                 # distillbert
             "transformer.h.{bid}.attn.out_proj",                         # gpt-j
             "language_model.encoder.layers.{bid}.self_attention.dense",  # persimmon
@@ -311,6 +317,7 @@
             "model.layers.layers.{bid}.pre_mlp_norm",  # plamo2
             "model.transformer.blocks.{bid}.ff_norm",  # llada
             "layers.{bid}.post_attention_layernorm",   # qwen3-embedding
+            "layers.{bid}.mlp_norm",                   # modern bert
         ),
 
         # Post feed-forward norm
@@ -360,6 +367,7 @@
             "model.layers.{bid}.mlp.up_proj",          # llama-hf refact nemotron olmo2
             "layers.{bid}.feed_forward.w3",            # llama-pth
             "encoder.layer.{bid}.intermediate.dense",  # bert
+            "layers.{bid}.mlp.Wi",                     # modern bert
             "transformer.layer.{bid}.ffn.lin1",        # distillbert
             "transformer.h.{bid}.mlp.fc_in",           # gpt-j
             "transformer.h.{bid}.mlp.linear_3",        # refact
@@ -459,6 +467,7 @@
             "model.layers.{bid}.mlp.down_proj",                       # llama-hf nemotron olmo2
             "layers.{bid}.feed_forward.w2",                           # llama-pth
             "encoder.layer.{bid}.output.dense",                       # bert
+            "layers.{bid}.mlp.Wo",                                    # modern bert
             "transformer.layer.{bid}.ffn.lin2",                       # distillbert
             "transformer.h.{bid}.mlp.fc_out",                         # gpt-j
             "language_model.encoder.layers.{bid}.mlp.dense_4h_to_h",  # persimmon
diff --git a/src/llama-arch.h b/src/llama-arch.h
index 7af587e795..c99448e78f 100644
--- a/src/llama-arch.h
+++ b/src/llama-arch.h
@@ -22,6 +22,7 @@ enum llm_arch {
     LLM_ARCH_STARCODER,
     LLM_ARCH_REFACT,
    LLM_ARCH_BERT,
+    LLM_ARCH_MODERN_BERT,
     LLM_ARCH_NOMIC_BERT,
     LLM_ARCH_NOMIC_BERT_MOE,
     LLM_ARCH_NEO_BERT,