diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index 0d77b83c06..27c8f8e5b0 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -6005,6 +6005,7 @@ class Gemma3VisionModel(MmprojModel):
 
         return [] # skip other tensors
 
+
 @ModelBase.register("DeepseekOCRForCausalLM")
 class DeepseekOCRVisionModel(MmprojModel):
     def set_gguf_parameters(self):
@@ -6044,7 +6045,6 @@ class DeepseekOCRVisionModel(MmprojModel):
 
         return vision_config
 
-
     def tensor_force_quant(self, name, new_name, bid, n_dims):
         # TODO: increase numercial stability. maybe delete later.
         return gguf.GGMLQuantizationType.F32
@@ -7244,7 +7244,7 @@ class DeepseekV2Model(TextModel):
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
         vision_config = self.hparams.get('vision_config', {}).get('width', {})
-        
+
         if 'clip-l-14-224' in vision_config and 'sam_vit_b' in vision_config:
             self.model_arch = gguf.MODEL_ARCH.DEEPSEEK2OCR
             self.gguf_writer.arch = gguf.MODEL_ARCH_NAMES[self.model_arch]
@@ -7354,8 +7354,12 @@ class DeepseekV2Model(TextModel):
 
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         # skip vision tensors and remove "language_model." for Kimi-VL
-        if "vision_" in name or "multi_modal_projector" in name \
-                or "image_newline" in name or "model.projector" in name or "sam_model" in name or "view_seperator" in name:
+        if ("vision_" in name
+                or "multi_modal_projector" in name
+                or "image_newline" in name
+                or "model.projector" in name
+                or "sam_model" in name
+                or "view_seperator" in name):
             return []
 
         if name.startswith("language_model."):
@@ -7435,6 +7439,7 @@ class DeepseekV2Model(TextModel):
         if len(experts) > 0:
             raise ValueError(f"Unprocessed experts: {experts}")
 
+
 @ModelBase.register("MiniMaxM2ForCausalLM")
 class MiniMaxM2Model(TextModel):
     model_arch = gguf.MODEL_ARCH.MINIMAXM2
diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py
index 15c318e11c..0c04e10c47 100644
--- a/gguf-py/gguf/gguf_writer.py
+++ b/gguf-py/gguf/gguf_writer.py
@@ -1129,7 +1129,7 @@ class GGUFWriter:
 
     def add_vision_sam_layers_count(self, value: int) -> None:
         self.add_uint32(Keys.ClipVision.SAM.BLOCK_COUNT, value)
-        
+
     def add_vision_sam_embedding_length(self, value: int) -> None:
         self.add_uint32(Keys.ClipVision.SAM.EMBEDDING_LENGTH, value)
 
diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py
index 90491b15da..9e9ab16dea 100644
--- a/gguf-py/gguf/tensor_mapping.py
+++ b/gguf-py/gguf/tensor_mapping.py
@@ -1238,15 +1238,15 @@ class TensorNameMap:
            "vision_model.positional_embedding_vlm", # llama 4
            "vision_tower.patch_embed.pos_emb", # kimi-vl
            "visual.pos_embed", # qwen3vl
-            "model.vision.patch_embedding.position_embedding", # cogvlm
+            "model.vision.patch_embedding.position_embedding",  # cogvlm
        ),
-        
+
        MODEL_TENSOR.V_ENC_EMBD_IMGNL: (
-            "model.image_newline", # Deepseek-OCR
+            "model.image_newline",  # Deepseek-OCR
        ),
-        
+
        MODEL_TENSOR.V_ENC_EMBD_VSEP: (
-            "model.view_seperator", # Deepseek-OCR
+            "model.view_seperator",  # Deepseek-OCR
        ),
 
        MODEL_TENSOR.V_ENC_ATTN_QKV: (