From 3d5acab3e774c3d30748d1e60093f19f0c80506e Mon Sep 17 00:00:00 2001 From: Pavel Zloi Date: Thu, 26 Mar 2026 18:49:09 +0300 Subject: [PATCH] convert : add RuGPT3XL (RuGPT3XLForCausalLM) support (#21011) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Support of ruGPT3XL model added * Update convert_hf_to_gguf.py Co-authored-by: Sigbjørn Skjæret * Update convert_hf_to_gguf.py Co-authored-by: Sigbjørn Skjæret * Update convert_hf_to_gguf.py Co-authored-by: Sigbjørn Skjæret * chkhsh for ruGPT3XL model added * Update convert_hf_to_gguf.py Co-authored-by: Sigbjørn Skjæret * Update convert_hf_to_gguf.py Co-authored-by: Sigbjørn Skjæret * Fixing chkhsh for ruGPT3XL, rerun updated and _qkv_parts in RuGPT3XLModel --------- Co-authored-by: Sigbjørn Skjæret --- convert_hf_to_gguf.py | 44 ++++++++++++++++++++++++++++++++++ convert_hf_to_gguf_update.py | 1 + gguf-py/gguf/tensor_mapping.py | 1 + 3 files changed, 46 insertions(+) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index f377738f8d..1e2fce7edd 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -1311,6 +1311,9 @@ class TextModel(ModelBase): if chkhsh == "b3d1dd861f1d4c5c0d2569ce36baf3f90fe8a102db3de50dd71ff860d91be3df": # ref: https://huggingface.co/aari1995/German_Semantic_V3 res = "jina-v2-de" + if chkhsh == "0fe1cf6eda062318a1af7270f3331a85c539a01778ff948e24388e949c5282f4": + # ref: https://huggingface.co/evilfreelancer/ruGPT3XL + res = "gpt-2" if chkhsh == "0ef9807a4087ebef797fc749390439009c3b9eda9ad1a097abbe738f486c01e5": # ref: https://huggingface.co/meta-llama/Meta-Llama-3-8B res = "llama-bpe" @@ -5100,6 +5103,47 @@ class GPT2Model(TextModel): yield from super().modify_tensors(data_torch, new_name, bid) +@ModelBase.register("RuGPT3XLForCausalLM") +class RuGPT3XLModel(TextModel): + model_arch = gguf.MODEL_ARCH.GPT2 + + _qkv_parts: list[dict[str, Tensor]] | None = None + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + # Fuse separate Q, K, V projections into a single QKV tensor + if ".self_attn.q_proj." in name or ".self_attn.k_proj." in name or ".self_attn.v_proj." in name: + suffix = "weight" if name.endswith(".weight") else "bias" + part = "q" if ".q_proj." in name else ("k" if ".k_proj." in name else "v") + key = f"{part}.{suffix}" + + assert bid is not None + if self._qkv_parts is None: + self._qkv_parts = [{} for _ in range(self.block_count)] + self._qkv_parts[bid][key] = data_torch + + q_key, k_key, v_key = f"q.{suffix}", f"k.{suffix}", f"v.{suffix}" + if all(k in self._qkv_parts[bid] for k in [q_key, k_key, v_key]): + q = self._qkv_parts[bid].pop(q_key) + k = self._qkv_parts[bid].pop(k_key) + v = self._qkv_parts[bid].pop(v_key) + data_torch = torch.cat([q, k, v], dim=0) + name = self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_QKV, bid, f".{suffix}") + logger.debug(f"Fused Q/K/V {suffix} for layer {bid} -> {name}") + else: + return + + yield from super().modify_tensors(data_torch, name, bid) + + def prepare_tensors(self): + super().prepare_tensors() + + if self._qkv_parts is not None: + # flatten `list[dict[str, Tensor]]` into `list[str]` + parts = [f"({i}){k}" for i, d in enumerate(self._qkv_parts) for k in d.keys()] + if len(parts) > 0: + raise ValueError(f"Unprocessed Q/K/V parts: {parts}") + + @ModelBase.register("PhiForCausalLM") class Phi2Model(TextModel): model_arch = gguf.MODEL_ARCH.PHI2 diff --git a/convert_hf_to_gguf_update.py b/convert_hf_to_gguf_update.py index 1e8b29fb2e..086f1c2286 100755 --- a/convert_hf_to_gguf_update.py +++ b/convert_hf_to_gguf_update.py @@ -178,6 +178,7 @@ pre_computed_hashes = [ {"name": "grok-2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/alvarobartt/grok-2-tokenizer", "chkhsh": "66b8d4e19ab16c3bfd89bce5d785fb7e0155e8648708a1f42077cb9fe002c273"}, # jina-v2-de variants {"name": "jina-v2-de", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/aari1995/German_Semantic_V3", "chkhsh": "b3d1dd861f1d4c5c0d2569ce36baf3f90fe8a102db3de50dd71ff860d91be3df"}, + {"name": "gpt-2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/evilfreelancer/ruGPT3XL", "chkhsh": "0fe1cf6eda062318a1af7270f3331a85c539a01778ff948e24388e949c5282f4"}, ] diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index 281c1a8304..df70577dbc 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -63,6 +63,7 @@ class TensorNameMap: "transformer.wpe", # gpt2 "embeddings.position_embeddings", # bert "wpe", # gpt2 + "model.embed_positions", # rugpt3xl ), # Output