diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index d4929d6b6f..7ba6f6a742 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -11521,13 +11521,50 @@ class LLaDAMoEModel(TextModel):
                 raise ValueError(f"Unprocessed experts: {experts}")


-@ModelBase.register("HunYuanDenseV1ForCausalLM")
+@ModelBase.register("HunYuanDenseV1ForCausalLM", "HunYuanVLForConditionalGeneration")
 class HunYuanModel(TextModel):
     model_arch = gguf.MODEL_ARCH.HUNYUAN_DENSE

+    def _get_eod_token_id(self) -> int | None:
+        """Get the actual end-of-generation token from config (eod_token_id)."""
+        return self.hparams.get("eod_token_id")
+
+    def _get_eot_token_id(self) -> int | None:
+        """Get the end-of-turn token from generation_config.json.
+        This is the first entry in eos_token_id when it's a list."""
+        gen_cfg_path = self.dir_model / "generation_config.json"
+        if gen_cfg_path.is_file():
+            with open(gen_cfg_path, encoding="utf-8") as f:
+                gen_cfg = json.load(f)
+            eos = gen_cfg.get("eos_token_id")
+            if isinstance(eos, list) and len(eos) >= 2:
+                return eos[0]
+        return None
+
+    def _fix_special_tokens(self):
+        """Fix EOS/EOT tokens that are incorrect in upstream configs."""
+        eod_id = self._get_eod_token_id()
+        if eod_id is not None:
+            self.gguf_writer.add_eos_token_id(eod_id)
+        eot_id = self._get_eot_token_id()
+        if eot_id is not None:
+            self.gguf_writer.add_eot_token_id(eot_id)
+
     def set_vocab(self):
         if (self.dir_model / "tokenizer.json").is_file():
-            self._set_vocab_gpt2()
+            tokens, toktypes, tokpre = self.get_vocab_base()
+            self.gguf_writer.add_tokenizer_model("gpt2")
+            self.gguf_writer.add_tokenizer_pre(tokpre)
+            self.gguf_writer.add_token_list(tokens)
+            self.gguf_writer.add_token_types(toktypes)
+
+            # HunyuanOCR has pad_token_id=-1 in config.json; exclude pad from SpecialVocab
+            token_types = None
+            if (self.hparams.get("pad_token_id") or 0) < 0:
+                token_types = ('bos', 'eos', 'unk', 'sep', 'cls', 'mask')
+            special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True, special_token_types=token_types)
+            special_vocab.add_to_gguf(self.gguf_writer)
+            self._fix_special_tokens()
         else:
             from transformers import AutoTokenizer
             tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True)
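Annotation on the hunk above: `_fix_special_tokens()` deliberately overwrites the EOS/EOT ids that `SpecialVocab` derived from `config.json`, since the upstream configs mark the wrong token as end-of-generation. A quick way to sanity-check a converted file, sketched with gguf-py's `GGUFReader` (the output filename is hypothetical; `tokenizer.ggml.eos_token_id` / `tokenizer.ggml.eot_token_id` are the keys that `add_eos_token_id()` / `add_eot_token_id()` write):

```python
# Sketch: verify the EOS/EOT ids written by _fix_special_tokens() in a converted GGUF.
from gguf import GGUFReader

reader = GGUFReader("hunyuan-ocr.gguf")  # hypothetical output path
for key in ("tokenizer.ggml.eos_token_id", "tokenizer.ggml.eot_token_id"):
    field = reader.get_field(key)
    if field is not None:
        # scalar fields store a single part, indexed by data[0]
        print(key, int(field.parts[field.data[0]][0]))
```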
@@ -11579,13 +11616,18 @@ class HunYuanModel(TextModel):
             # FIX for BOS token: Overwrite incorrect id read from config.json
             if self.hparams['hidden_size'] == 4096:
                 self.gguf_writer.add_bos_token_id(127958)  # only for 7b dense, fix <|bos|> token
+            self._fix_special_tokens()

     def set_gguf_parameters(self):
+        # HunyuanOCR has num_experts=1 which is not MoE, prevent parent from writing it
+        saved_num_experts = self.hparams.pop("num_experts", None)
         super().set_gguf_parameters()
+        if saved_num_experts is not None and saved_num_experts > 1:
+            self.hparams["num_experts"] = saved_num_experts
         hparams = self.hparams

         # Rope
-        if self.rope_parameters.get("rope_type") == "dynamic":
+        if self.rope_parameters.get("rope_type") in ("dynamic", "xdrope"):
             # HunYuan uses NTK Aware Alpha based scaling. Original implementation: https://www.reddit.com/r/LocalLLaMA/comments/14lz7j5/ntkaware_scaled_rope_allows_llama_models_to_have/
             # 1000 corresponds to a usable context length of 256k (https://github.com/Tencent-Hunyuan/Hunyuan-A13B/blob/main/report/Hunyuan_A13B_Technical_Report.pdf)
             alpha = self.rope_parameters.get("alpha", 50)
@@ -11595,13 +11637,14 @@ class HunYuanModel(TextModel):
             self.gguf_writer.add_rope_freq_base(scaled_base)
             self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
             self.gguf_writer.add_rope_scaling_factor(1)
-            # There is no consistent way to calculate ctx from alpha, and the config is incorrectly set to 32k
-            self.gguf_writer.add_rope_scaling_orig_ctx_len(256 * 1024)  # 256k context length
-            self.gguf_writer.add_context_length(256 * 1024)  # 256k context length
+            if self.rope_parameters.get("rope_type") == "dynamic":
+                # There is no consistent way to calculate ctx from alpha, and the config is incorrectly set to 32k
+                self.gguf_writer.add_rope_scaling_orig_ctx_len(256 * 1024)  # 256k context length
+                self.gguf_writer.add_context_length(256 * 1024)  # 256k context length

-            # if any of our assumptions about the values are wrong, something has changed and this may need to be updated
-            assert base == 10000.0 and self.hparams["max_position_embeddings"] in [32 * 1024, 256 * 1024] , \
-                "HunYuan dynamic RoPE scaling assumptions changed, please update the logic or context length manually"
+                # if any of our assumptions about the values are wrong, something has changed and this may need to be updated
+                assert base == 10000.0 and self.hparams["max_position_embeddings"] in [32 * 1024, 256 * 1024] , \
+                    "HunYuan dynamic RoPE scaling assumptions changed, please update the logic or context length manually"

     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         if name == "lm_head.weight":
@@ -11609,9 +11652,48 @@ class HunYuanModel(TextModel):
             logger.info("Skipping tied output layer 'lm_head.weight'")
             return

+        # skip vision tensors for HunyuanVL models
+        if name.startswith("vit."):
+            return
+
         yield from super().modify_tensors(data_torch, name, bid)


+@ModelBase.register("HunYuanVLForConditionalGeneration")
+class HunyuanOCRVisionModel(MmprojModel):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        assert self.hparams_vision is not None
+        # HunyuanOCR uses max_image_size instead of image_size
+        if "image_size" not in self.hparams_vision:
+            self.hparams_vision["image_size"] = self.hparams_vision.get("max_image_size", 2048)
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        assert self.hparams_vision is not None
+        hparams = self.hparams_vision
+        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.HUNYUANOCR)
+        self.gguf_writer.add_vision_use_gelu(True)
+        self.gguf_writer.add_vision_attention_layernorm_eps(hparams.get("rms_norm_eps", 1e-5))
+        self.gguf_writer.add_vision_spatial_merge_size(hparams.get("spatial_merge_size", 2))
+        self.gguf_writer.add_vision_min_pixels(self.preprocessor_config["min_pixels"])
+        self.gguf_writer.add_vision_max_pixels(self.preprocessor_config["max_pixels"])
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        if not name.startswith("vit."):
+            return  # skip text tensors
+        # strip CLS token (row 0) from position embeddings so resize_position_embeddings works
+        if "position_embedding" in name:
+            data_torch = data_torch[1:]  # [n_patches+1, n_embd] -> [n_patches, n_embd]
+        yield from super().modify_tensors(data_torch, name, bid)
+
+    def tensor_force_quant(self, name, new_name, bid, n_dims):
+        # force conv weights to F32 or F16 to avoid BF16 IM2COL issues on Metal
+        if ("mm.0." in new_name or "mm.2." in new_name) and new_name.endswith(".weight"):
+            return gguf.GGMLQuantizationType.F16 if self.ftype == gguf.LlamaFileType.MOSTLY_F16 else gguf.GGMLQuantizationType.F32
+        return super().tensor_force_quant(name, new_name, bid, n_dims)
+
+
 @ModelBase.register("SmolLM3ForCausalLM")
 class SmolLM3Model(LlamaModel):
     model_arch = gguf.MODEL_ARCH.SMOLLM3
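For context, the NTK-aware alpha scaling applied in `set_gguf_parameters` reduces to a single expression: the RoPE base is inflated so that no separate scaling factor needs to be stored (which is why the scaling type is written as `NONE`). A minimal sketch, assuming `dim` is the per-head dimension as computed by the surrounding conversion code:

```python
# Sketch of NTK-aware alpha RoPE scaling: inflate the rope base instead of
# storing a scaling factor (hence RopeScalingType.NONE above).
def ntk_alpha_scaled_base(base: float, alpha: float, dim: int) -> float:
    return base * alpha ** (dim / (dim - 2))

# With the defaults visible in the diff (base=10000.0, alpha=50) and an assumed
# head dim of 128, the written freq base comes out around 5.3e5:
print(ntk_alpha_scaled_base(10000.0, 50, 128))
```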
diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py
index 3ebd9de5f6..6b1a19a309 100644
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -734,6 +734,7 @@ class MODEL_TENSOR(IntEnum):
     V_LAYER_OUT_SCALE          = auto()
     V_PRE_NORM                 = auto()
     V_POST_NORM                = auto()
+    V_MM_PRE_NORM              = auto() # hunyuanocr
     V_MM_POST_NORM             = auto()
     V_MM_INP_NORM              = auto()
     V_MM_INP_PROJ              = auto() # gemma3
@@ -769,6 +770,8 @@ class MODEL_TENSOR(IntEnum):
     V_MM_GATE                  = auto() # cogvlm
     V_TOK_BOI                  = auto() # cogvlm
     V_TOK_EOI                  = auto() # cogvlm
+    V_TOK_IMG_BEGIN            = auto() # hunyuanocr
+    V_TOK_IMG_END              = auto() # hunyuanocr
     V_STD_BIAS                 = auto() # gemma4
     V_STD_SCALE                = auto() # gemma4
     V_SAM_POS_EMBD             = auto() # Deepseek-OCR
@@ -1246,6 +1249,9 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
     MODEL_TENSOR.V_MM_GATE:                 "mm.gate",
     MODEL_TENSOR.V_TOK_BOI:                 "v.boi",
     MODEL_TENSOR.V_TOK_EOI:                 "v.eoi",
+    MODEL_TENSOR.V_MM_PRE_NORM:             "mm.pre_norm",
+    MODEL_TENSOR.V_TOK_IMG_BEGIN:           "mm.image_begin",
+    MODEL_TENSOR.V_TOK_IMG_END:             "mm.image_end",
     MODEL_TENSOR.V_STD_BIAS:                "v.std_bias", # gemma4
     MODEL_TENSOR.V_STD_SCALE:               "v.std_scale", # gemma4
     # DeepSeek-OCR SAM
@@ -1393,6 +1399,9 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
         MODEL_TENSOR.V_MM_GATE,
         MODEL_TENSOR.V_TOK_BOI,
         MODEL_TENSOR.V_TOK_EOI,
+        MODEL_TENSOR.V_MM_PRE_NORM,
+        MODEL_TENSOR.V_TOK_IMG_BEGIN,
+        MODEL_TENSOR.V_TOK_IMG_END,
         MODEL_TENSOR.V_STD_BIAS,
         MODEL_TENSOR.V_STD_SCALE,
         MODEL_TENSOR.V_SAM_POS_EMBD,
@@ -4113,6 +4122,7 @@ class VisionProjectorType:
     GLM4V = "glm4v"
    YOUTUVL = "youtuvl"
     NEMOTRON_V2_VL = "nemotron_v2_vl"
+    HUNYUANOCR = "hunyuanocr"


 # Items here are (block size, type size)
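The strings added here are contracts with the C++ side: `VisionProjectorType.HUNYUANOCR` must equal the `PROJECTOR_TYPE_NAMES` entry added in `tools/mtmd/clip-impl.h` later in this patch, and each `TENSOR_NAMES` value is the base name the loader formats with a `.weight`/`.bias` suffix (the `%s` in the `TN_*` macros). A small sketch of the names a converted mmproj file should therefore contain:

```python
# Sketch: GGUF tensor names implied by the constants above; the ".weight"
# suffix mirrors the "%s" in the C++ TN_MM_PRE_NORM macro ("mm.pre_norm.%s").
from gguf.constants import MODEL_TENSOR, TENSOR_NAMES, VisionProjectorType

assert VisionProjectorType.HUNYUANOCR == "hunyuanocr"  # must match clip-impl.h
print(TENSOR_NAMES[MODEL_TENSOR.V_MM_PRE_NORM] + ".weight")  # mm.pre_norm.weight
print(TENSOR_NAMES[MODEL_TENSOR.V_TOK_IMG_BEGIN])            # mm.image_begin (embedding, no suffix)
print(TENSOR_NAMES[MODEL_TENSOR.V_TOK_IMG_END])              # mm.image_end
```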
"vision_model.positional_embedding_vlm", # llama 4 "vision_tower.patch_embed.pos_emb", # kimi-vl "visual.pos_embed", # qwen3vl @@ -1425,10 +1429,12 @@ class TensorNameMap: MODEL_TENSOR.V_ENC_EMBD_IMGNL: ( "model.image_newline", # Deepseek-OCR + "vit.perceive.image_newline", # HunyuanOCR ), MODEL_TENSOR.V_ENC_EMBD_VSEP: ( "model.view_seperator", # Deepseek-OCR + "vit.perceive.image_sep", # HunyuanOCR ), MODEL_TENSOR.V_ENC_ATTN_QKV: ( @@ -1444,6 +1450,7 @@ class TensorNameMap: "model.vision_tower.encoder.layer.{bid}.attention.q_proj", # Intern-S1 "vpm.encoder.layers.{bid}.self_attn.q_proj", "model.vision_model.encoder.layers.{bid}.self_attn.q_proj", # SmolVLM + "vit.layers.{bid}.self_attn.q_proj", # HunyuanOCR "vision_model.model.layers.{bid}.self_attn.q_proj", # llama4 "vision_tower.transformer.layers.{bid}.attention.q_proj", # pixtral-hf "vision_encoder.transformer.layers.{bid}.attention.wq", # pixtral @@ -1466,6 +1473,7 @@ class TensorNameMap: "model.vision_tower.encoder.layer.{bid}.attention.k_proj", # Intern-S1 "vpm.encoder.layers.{bid}.self_attn.k_proj", "model.vision_model.encoder.layers.{bid}.self_attn.k_proj", # SmolVLM + "vit.layers.{bid}.self_attn.k_proj", # HunyuanOCR "vision_model.model.layers.{bid}.self_attn.k_proj", # llama4 "vision_tower.transformer.layers.{bid}.attention.k_proj", # pixtral-hf "vision_encoder.transformer.layers.{bid}.attention.wk", # pixtral @@ -1488,6 +1496,7 @@ class TensorNameMap: "model.vision_tower.encoder.layer.{bid}.attention.v_proj", # Intern-S1 "vpm.encoder.layers.{bid}.self_attn.v_proj", "model.vision_model.encoder.layers.{bid}.self_attn.v_proj", # SmolVLM + "vit.layers.{bid}.self_attn.v_proj", # HunyuanOCR "vision_model.model.layers.{bid}.self_attn.v_proj", # llama4 "vision_tower.transformer.layers.{bid}.attention.v_proj", # pixtral-hf "vision_encoder.transformer.layers.{bid}.attention.wv", # pixtral @@ -1504,6 +1513,7 @@ class TensorNameMap: "model.vision_tower.encoder.layer.{bid}.layernorm_before", # Intern-S1 "vpm.encoder.layers.{bid}.layer_norm1", "model.vision_model.encoder.layers.{bid}.layer_norm1", # SmolVLM + "vit.layers.{bid}.input_layernorm", # HunyuanOCR "vision_tower.transformer.layers.{bid}.attention_norm", # pixtral-hf "vision_encoder.transformer.layers.{bid}.attention_norm", # pixtral "vision_model.model.layers.{bid}.input_layernorm", # llama4, gemma4 @@ -1521,6 +1531,7 @@ class TensorNameMap: "model.vision_tower.encoder.layer.{bid}.attention.projection_layer", # Intern-S1 "vpm.encoder.layers.{bid}.self_attn.out_proj", "model.vision_model.encoder.layers.{bid}.self_attn.out_proj", # SmolVLM + "vit.layers.{bid}.self_attn.o_proj", # HunyuanOCR "model.vision_model.encoder.layers.{bid}.self_attn.projection_layer", # Janus Pro "vision_model.model.layers.{bid}.self_attn.o_proj", # llama4 "vision_tower.transformer.layers.{bid}.attention.o_proj", # pixtral-hf @@ -1540,6 +1551,7 @@ class TensorNameMap: "model.vision_tower.encoder.layer.{bid}.layernorm_after", # Intern-S1 "vpm.encoder.layers.{bid}.layer_norm2", "model.vision_model.encoder.layers.{bid}.layer_norm2", # SmolVLM + "vit.layers.{bid}.post_attention_layernorm", # HunyuanOCR "vision_model.model.layers.{bid}.post_attention_layernorm", # llama4 "vision_tower.transformer.layers.{bid}.ffn_norm", # pixtral-hf "vision_encoder.transformer.layers.{bid}.ffn_norm", # pixtral @@ -1557,6 +1569,7 @@ class TensorNameMap: "model.vision_tower.encoder.layer.{bid}.mlp.fc1", # Intern-S1 "vpm.encoder.layers.{bid}.mlp.fc1", "model.vision_model.encoder.layers.{bid}.mlp.fc1", # SmolVLM, gemma3 + 
"vit.layers.{bid}.mlp.dense_h_to_4h", # HunyuanOCR "vision_tower.transformer.layers.{bid}.feed_forward.up_proj", # pixtral-hf "vision_encoder.transformer.layers.{bid}.feed_forward.w3", # pixtral "vision_model.model.layers.{bid}.mlp.fc1", # llama4 @@ -1583,6 +1596,7 @@ class TensorNameMap: "model.vision_tower.encoder.layer.{bid}.mlp.fc2", # Intern-S1 "vpm.encoder.layers.{bid}.mlp.fc2", "model.vision_model.encoder.layers.{bid}.mlp.fc2", # SmolVLM, gemma3 + "vit.layers.{bid}.mlp.dense_4h_to_h", # HunyuanOCR "vision_tower.transformer.layers.{bid}.feed_forward.down_proj", # pixtral-hf "vision_encoder.transformer.layers.{bid}.feed_forward.w2", # pixtral "vision_model.model.layers.{bid}.mlp.fc2", # llama4 @@ -1639,6 +1653,7 @@ class TensorNameMap: MODEL_TENSOR.V_MM_POST_NORM: ( "visual.merger.post_projection_norm", # glm4v + "vit.perceive.after_rms", # HunyuanOCR ), MODEL_TENSOR.V_MM_INP_PROJ: ( @@ -1806,6 +1821,18 @@ class TensorNameMap: "model.vision.eoi", # cogvlm ), + MODEL_TENSOR.V_MM_PRE_NORM: ( + "vit.perceive.before_rms", # HunyuanOCR + ), + + MODEL_TENSOR.V_TOK_IMG_BEGIN: ( + "vit.perceive.image_begin", # HunyuanOCR + ), + + MODEL_TENSOR.V_TOK_IMG_END: ( + "vit.perceive.image_end", # HunyuanOCR + ), + MODEL_TENSOR.V_STD_BIAS: ( "model.vision_tower.std_bias", # gemma4 ), diff --git a/src/llama-chat.cpp b/src/llama-chat.cpp index 80a88fadec..6554a89b28 100644 --- a/src/llama-chat.cpp +++ b/src/llama-chat.cpp @@ -73,6 +73,7 @@ static const std::map LLM_CHAT_TEMPLATES = { { "hunyuan-moe", LLM_CHAT_TEMPLATE_HUNYUAN_MOE }, { "gpt-oss", LLM_CHAT_TEMPLATE_OPENAI_MOE }, { "hunyuan-dense", LLM_CHAT_TEMPLATE_HUNYUAN_DENSE }, + { "hunyuan-ocr", LLM_CHAT_TEMPLATE_HUNYUAN_OCR }, { "kimi-k2", LLM_CHAT_TEMPLATE_KIMI_K2 }, { "seed_oss", LLM_CHAT_TEMPLATE_SEED_OSS }, { "grok-2", LLM_CHAT_TEMPLATE_GROK_2 }, @@ -216,6 +217,8 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) { return LLM_CHAT_TEMPLATE_HUNYUAN_MOE; } else if (tmpl_contains("<|start|>") && tmpl_contains("<|channel|>")) { return LLM_CHAT_TEMPLATE_OPENAI_MOE; + } else if (tmpl_contains("<|hy_Assistant|>") && tmpl_contains("<|hy_begin▁of▁sentence|>")) { + return LLM_CHAT_TEMPLATE_HUNYUAN_OCR; } else if (tmpl_contains("<|hy_Assistant|>") && tmpl_contains("<|hy_place▁holder▁no▁3|>")) { return LLM_CHAT_TEMPLATE_HUNYUAN_DENSE; } else if (tmpl_contains("<|im_assistant|>assistant<|im_middle|>")) { @@ -822,6 +825,22 @@ int32_t llm_chat_apply_template( ss << "<|hy_User|>" << chat[i]->content << "<|hy_Assistant|>"; } } + } else if (tmpl == LLM_CHAT_TEMPLATE_HUNYUAN_OCR) { + // tencent/HunyuanOCR + ss << "<|hy_begin▁of▁sentence|>"; + for (size_t i = 0; i < chat.size(); i++) { + std::string role(chat[i]->role); + if (i == 0 && role == "system") { + ss << chat[i]->content << "<|hy_place▁holder▁no▁3|>"; + continue; + } + + if (role == "user") { + ss << chat[i]->content << "<|hy_User|>"; + } else if (role == "assistant") { + ss << chat[i]->content << "<|hy_Assistant|>"; + } + } } else if (tmpl == LLM_CHAT_TEMPLATE_KIMI_K2) { // moonshotai/Kimi-K2-Instruct for (auto message : chat) { diff --git a/src/llama-chat.h b/src/llama-chat.h index 2542f3cc86..13f936a946 100644 --- a/src/llama-chat.h +++ b/src/llama-chat.h @@ -53,6 +53,7 @@ enum llm_chat_template { LLM_CHAT_TEMPLATE_HUNYUAN_MOE, LLM_CHAT_TEMPLATE_OPENAI_MOE, LLM_CHAT_TEMPLATE_HUNYUAN_DENSE, + LLM_CHAT_TEMPLATE_HUNYUAN_OCR, LLM_CHAT_TEMPLATE_KIMI_K2, LLM_CHAT_TEMPLATE_SEED_OSS, LLM_CHAT_TEMPLATE_GROK_2, diff --git a/tools/mtmd/CMakeLists.txt b/tools/mtmd/CMakeLists.txt index 
diff --git a/tools/mtmd/CMakeLists.txt b/tools/mtmd/CMakeLists.txt
index 675464c6b5..6ffdb674de 100644
--- a/tools/mtmd/CMakeLists.txt
+++ b/tools/mtmd/CMakeLists.txt
@@ -19,6 +19,7 @@ add_library(mtmd
             models/conformer.cpp
             models/gemma4v.cpp
             models/glm4v.cpp
+            models/hunyuanocr.cpp
             models/internvl.cpp
             models/kimivl.cpp
             models/kimik25.cpp
diff --git a/tools/mtmd/clip-impl.h b/tools/mtmd/clip-impl.h
index 5fa487367c..1f2f7cfaac 100644
--- a/tools/mtmd/clip-impl.h
+++ b/tools/mtmd/clip-impl.h
@@ -148,6 +148,11 @@
 #define TN_TOK_BOI          "v.boi"
 #define TN_TOK_EOI          "v.eoi"

+// hunyuanocr
+#define TN_MM_PRE_NORM      "mm.pre_norm.%s"
+#define TN_TOK_IMG_BEGIN    "mm.image_begin"
+#define TN_TOK_IMG_END      "mm.image_end"
+
 // deepseek-ocr
 #define TN_SAM_POS_EMBD     "v.sam.pos_embd.%s"
 #define TN_SAM_PATCH_EMBD   "v.sam.patch_embd.%s"
@@ -266,6 +271,7 @@ enum projector_type {
     PROJECTOR_TYPE_YOUTUVL,
     PROJECTOR_TYPE_KIMIK25,
     PROJECTOR_TYPE_NEMOTRON_V2_VL,
+    PROJECTOR_TYPE_HUNYUANOCR,
     PROJECTOR_TYPE_UNKNOWN,
 };

@@ -306,6 +312,7 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
     { PROJECTOR_TYPE_YOUTUVL,        "youtuvl"},
     { PROJECTOR_TYPE_KIMIK25,        "kimik25"},
     { PROJECTOR_TYPE_NEMOTRON_V2_VL, "nemotron_v2_vl"},
+    { PROJECTOR_TYPE_HUNYUANOCR,     "hunyuanocr"},
 };

 static projector_type clip_projector_type_from_string(const std::string & str) {
diff --git a/tools/mtmd/clip-model.h b/tools/mtmd/clip-model.h
index 70270d6e76..b85c4122ed 100644
--- a/tools/mtmd/clip-model.h
+++ b/tools/mtmd/clip-model.h
@@ -358,7 +358,8 @@ struct clip_model {
     // MINICPMV projection
     ggml_tensor * mm_model_pos_embed_k = nullptr;
     ggml_tensor * mm_model_query = nullptr;
-    ggml_tensor * mm_model_proj = nullptr;
+    ggml_tensor * mm_model_proj   = nullptr;
+    ggml_tensor * mm_model_proj_b = nullptr;
     ggml_tensor * mm_model_kv_proj = nullptr;
     ggml_tensor * mm_model_attn_q_w = nullptr;
     ggml_tensor * mm_model_attn_q_b = nullptr;
@@ -419,6 +420,11 @@ struct clip_model {
     ggml_tensor * mm_boi = nullptr;
     ggml_tensor * mm_eoi = nullptr;

+    // hunyuanocr perceiver
+    ggml_tensor * mm_pre_norm_w = nullptr;
+    ggml_tensor * mm_img_begin = nullptr;
+    ggml_tensor * mm_img_end = nullptr;
+
     // deepseek ocr sam
     ggml_tensor * patch_embed_proj_w = nullptr;
     ggml_tensor * patch_embed_proj_b = nullptr;
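The `image_min_pixels`/`image_max_pixels` metadata written by the conversion script is read back out of the GGUF by the clip.cpp loader below. The preprocessor is then expected to resize images into that pixel budget while keeping both sides multiples of `patch_size * merge`, so the merged token grid stays integral. A rough sketch of that rule (this is the common min/max-pixels convention, not code from this PR; the 16px patch size is illustrative):

```python
# Sketch (assumed convention): fit an image into [min_pixels, max_pixels]
# while snapping both sides to the merge grid, so (nx / patch) / merge is integral.
import math

def fit_to_pixel_budget(w, h, min_pixels, max_pixels, patch_size=16, merge=2):
    grid = patch_size * merge
    scale = 1.0
    if w * h > max_pixels:
        scale = math.sqrt(max_pixels / (w * h))  # shrink into the budget
    elif w * h < min_pixels:
        scale = math.sqrt(min_pixels / (w * h))  # grow up to the minimum
    new_w = max(grid, round(w * scale / grid) * grid)
    new_h = max(grid, round(h * scale / grid) * grid)
    return new_w, new_h
```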
diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
index 12517123e7..2faf595a9f 100644
--- a/tools/mtmd/clip.cpp
+++ b/tools/mtmd/clip.cpp
@@ -902,6 +902,10 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
             {
                 builder = std::make_unique(ctx, img);
             } break;
+        case PROJECTOR_TYPE_HUNYUANOCR:
+            {
+                builder = std::make_unique<clip_graph_hunyuanocr>(ctx, img);
+            } break;
         case PROJECTOR_TYPE_MLP:
         case PROJECTOR_TYPE_MLP_NORM:
         case PROJECTOR_TYPE_LDP:
@@ -1408,6 +1412,14 @@ struct clip_model_loader {
                     get_u32(KEY_SAM_N_EMBD, hparams.sam_n_embd, true);
                     get_u32(KEY_ATTN_WINDOW_SIZE, hparams.attn_window_size, true);
                 } break;
+            case PROJECTOR_TYPE_HUNYUANOCR:
+                {
+                    hparams.n_merge = 2;
+                    get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.n_merge, false);
+                    get_u32(KEY_IMAGE_MIN_PIXELS, hparams.image_min_pixels);
+                    get_u32(KEY_IMAGE_MAX_PIXELS, hparams.image_max_pixels);
+                    hparams.set_warmup_n_tokens(28*28);
+                } break;
             case PROJECTOR_TYPE_LFM2A:
                 {
                     // audio preprocessing params
@@ -2035,6 +2047,22 @@ struct clip_model_loader {
                     model.mm_boi = get_tensor(TN_TOK_BOI);
                     model.mm_eoi = get_tensor(TN_TOK_EOI);
                 } break;
+            case PROJECTOR_TYPE_HUNYUANOCR:
+                {
+                    // proj.0 -> mm.0 (conv1), proj.2 -> mm.2 (conv2), mlp -> mm.model.fc (linear)
+                    model.mm_0_w = get_tensor(string_format(TN_LLAVA_PROJ, 0, "weight"));
+                    model.mm_0_b = get_tensor(string_format(TN_LLAVA_PROJ, 0, "bias"));
+                    model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight"));
+                    model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"));
+                    model.mm_model_proj = get_tensor(string_format(TN_MM_PROJECTOR, "weight"));
+                    model.mm_model_proj_b = get_tensor(string_format(TN_MM_PROJECTOR, "bias"));
+                    model.mm_pre_norm_w = get_tensor(string_format(TN_MM_PRE_NORM, "weight"));
+                    model.mm_post_norm_w = get_tensor(string_format(TN_MM_POST_NORM, "weight"));
+                    model.mm_img_begin = get_tensor(TN_TOK_IMG_BEGIN);
+                    model.mm_img_end = get_tensor(TN_TOK_IMG_END);
+                    model.image_newline = get_tensor(TN_IMAGE_NEWLINE);
+                    model.view_seperator = get_tensor(TN_IMAGE_SEPERATOR, false);
+                } break;
             case PROJECTOR_TYPE_JANUS_PRO:
                 {
                     model.mm_0_w = get_tensor(string_format(TN_LLAVA_PROJ, 0, "weight"));
@@ -2584,6 +2612,7 @@ int clip_n_output_tokens_x(const struct clip_ctx * ctx, struct clip_image_f32 *
         case PROJECTOR_TYPE_QWEN3VL:
         case PROJECTOR_TYPE_GLM4V:
         case PROJECTOR_TYPE_PADDLEOCR:
+        case PROJECTOR_TYPE_HUNYUANOCR:
         case PROJECTOR_TYPE_YOUTUVL:
             return (img->nx / params.patch_size) / 2;
         default:
@@ -2768,6 +2797,13 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
                 int h = static_cast<int>(std::sqrt(static_cast<float>(n_patches)));
                 n_patches = h * (h + 1) + 1;
             } break;
+        case PROJECTOR_TYPE_HUNYUANOCR:
+            {
+                int merge = ctx->model.hparams.n_merge;
+                int ow = (img->nx / patch_size) / merge;
+                int oh = (img->ny / patch_size) / merge;
+                n_patches = (ow + 1) * oh + 2;
+            } break;
         case PROJECTOR_TYPE_LFM2A:
             {
                 n_patches = ((((img->nx + 1) / 2) + 1) / 2 + 1) / 2;
@@ -3175,6 +3211,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
         case PROJECTOR_TYPE_JANUS_PRO:
         case PROJECTOR_TYPE_PHI4:
         case PROJECTOR_TYPE_COGVLM:
+        case PROJECTOR_TYPE_HUNYUANOCR:
             {
                 // do nothing
             } break;
@@ -3346,6 +3383,8 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
         case PROJECTOR_TYPE_PADDLEOCR:
         case PROJECTOR_TYPE_KIMIK25:
             return ctx->model.mm_2_w->ne[1];
+        case PROJECTOR_TYPE_HUNYUANOCR:
+            return ctx->model.mm_model_proj->ne[1];
         case PROJECTOR_TYPE_COGVLM:
             return ctx->model.mm_4h_to_h_w->ne[1];
         case PROJECTOR_TYPE_DEEPSEEKOCR:
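The `(ow + 1) * oh + 2` in `clip_n_output_tokens` above accounts for one `image_newline` embedding appended to each output row, plus the begin/end embeddings the graph wraps around the sequence (see `models/hunyuanocr.cpp` below). A worked example:

```python
# Sketch: embeddings produced per image by the HunyuanOCR projector.
def hunyuan_ocr_n_tokens(nx: int, ny: int, patch_size: int, merge: int = 2) -> int:
    ow = (nx // patch_size) // merge  # merged grid width
    oh = (ny // patch_size) // merge  # merged grid height
    return (ow + 1) * oh + 2          # +1 newline per row, +2 begin/end embeddings

# e.g. a 1024x512 input with an illustrative 16px patch:
print(hunyuan_ocr_n_tokens(1024, 512, 16))  # (32 + 1) * 16 + 2 = 530
```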
diff --git a/tools/mtmd/models/hunyuanocr.cpp b/tools/mtmd/models/hunyuanocr.cpp
new file mode 100644
index 0000000000..37d1e2b86a
--- /dev/null
+++ b/tools/mtmd/models/hunyuanocr.cpp
@@ -0,0 +1,59 @@
+#include "models.h"
+
+ggml_cgraph * clip_graph_hunyuanocr::build() {
+    const int merge = hparams.n_merge;
+    const int pw = n_patches_x;
+    const int ph = n_patches_y;
+
+    ggml_tensor * pos_embd = resize_position_embeddings(GGML_SCALE_MODE_BILINEAR);
+
+    ggml_tensor * inp = build_inp();
+    ggml_tensor * cur = build_vit(inp, n_patches, NORM_TYPE_NORMAL, hparams.ffn_op, pos_embd, nullptr);
+
+    // perceiver projector
+    cur = build_norm(cur, model.mm_pre_norm_w, nullptr, NORM_TYPE_RMS, eps, -1);
+
+    // [C, W*H] -> [W, H, C] for conv2d
+    cur = ggml_reshape_3d(ctx0, cur, n_embd, pw, ph);
+    cur = ggml_permute(ctx0, cur, 2, 0, 1, 3);
+    cur = ggml_cont(ctx0, cur);
+
+    // Conv2d(1152->2304, k=2, s=2) + GELU + Conv2d(2304->4608, k=1, s=1)
+    cur = ggml_conv_2d(ctx0, model.mm_0_w, cur, merge, merge, 0, 0, 1, 1);
+    if (model.mm_0_b) {
+        cur = ggml_add(ctx0, cur, ggml_reshape_3d(ctx0, model.mm_0_b, 1, 1, model.mm_0_b->ne[0]));
+    }
+    cur = ggml_gelu(ctx0, cur);
+    cur = ggml_conv_2d(ctx0, model.mm_1_w, cur, 1, 1, 0, 0, 1, 1);
+    if (model.mm_1_b) {
+        cur = ggml_add(ctx0, cur, ggml_reshape_3d(ctx0, model.mm_1_b, 1, 1, model.mm_1_b->ne[0]));
+    }
+
+    const int ow = pw / merge;
+    const int oh = ph / merge;
+    const int idim = (int)cur->ne[2]; // OC = 4608
+
+    // append newline along W (dim 0)
+    ggml_tensor * nl = ggml_reshape_4d(ctx0, model.image_newline, 1, 1, idim, 1);
+    nl = ggml_repeat_4d(ctx0, nl, 1, oh, idim, 1);
+    cur = ggml_concat(ctx0, cur, nl, 0);
+
+    // [OW+1, OH, OC] -> [OC, (OW+1)*OH]
+    cur = ggml_permute(ctx0, cur, 1, 2, 0, 3);
+    cur = ggml_cont_2d(ctx0, cur, idim, (ow + 1) * oh);
+
+    // project to LLM hidden size
+    cur = build_mm(model.mm_model_proj, cur);
+    if (model.mm_model_proj_b) {
+        cur = ggml_add(ctx0, cur, model.mm_model_proj_b);
+    }
+
+    // wrap with begin/end tokens
+    cur = ggml_concat(ctx0, ggml_reshape_2d(ctx0, model.mm_img_begin, model.mm_img_begin->ne[0], 1), cur, 1);
+    cur = ggml_concat(ctx0, cur, ggml_reshape_2d(ctx0, model.mm_img_end, model.mm_img_end->ne[0], 1), 1);
+
+    cur = build_norm(cur, model.mm_post_norm_w, nullptr, NORM_TYPE_RMS, eps, -1);
+
+    ggml_build_forward_expand(gf, cur);
+    return gf;
+}
diff --git a/tools/mtmd/models/models.h b/tools/mtmd/models/models.h
index 992eda04bb..6f9632b62a 100644
--- a/tools/mtmd/models/models.h
+++ b/tools/mtmd/models/models.h
@@ -98,6 +98,11 @@ struct clip_graph_glm4v : clip_graph {
     ggml_cgraph * build() override;
 };

+struct clip_graph_hunyuanocr : clip_graph {
+    clip_graph_hunyuanocr(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
+    ggml_cgraph * build() override;
+};
+
 struct clip_graph_mobilenetv5 : clip_graph {
     clip_graph_mobilenetv5(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
     ggml_cgraph * build() override;
diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp
index 35b4396fd8..4b6dd44f09 100644
--- a/tools/mtmd/mtmd.cpp
+++ b/tools/mtmd/mtmd.cpp
@@ -406,6 +406,13 @@ struct mtmd_context {
                     img_end = "\n"; // prevent empty batch on llama-server
                     image_preproc = std::make_unique(ctx_v);
                 } break;
+            case PROJECTOR_TYPE_HUNYUANOCR:
+                {
+                    // note: these use fullwidth | (U+FF5C) and ▁ (U+2581) to match the tokenizer vocabulary
+                    img_beg = "<|hy_place▁holder▁no▁100|>";
+                    img_end = "<|hy_place▁holder▁no▁101|>";
+                    image_preproc = std::make_unique(ctx_v);
+                } break;
             default:
                 throw std::runtime_error(string_format("%s: unexpected vision projector type %d\n", __func__, proj));
         }
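To close, a shape-only walk through the perceiver projector that `models/hunyuanocr.cpp` builds, as a readability aid (the channel widths 1152/2304/4608 come from the comments in the new file; the LLM width is whatever `mm.model.fc.weight` projects to — 4096 below is only a placeholder):

```python
# Sketch: tensor shapes through the HunyuanOCR perceiver projector.
def projector_shapes(pw, ph, merge=2, c_vit=1152, n_embd_llm=4096):
    ow, oh = pw // merge, ph // merge
    return [
        ("ViT output + RMS pre-norm", (pw * ph, c_vit)),
        ("reshaped to image grid",    (c_vit, ph, pw)),      # for conv2d
        ("conv 2x2 / s2 + GELU",      (2 * c_vit, oh, ow)),  # 1152 -> 2304
        ("conv 1x1",                  (4 * c_vit, oh, ow)),  # 2304 -> 4608
        ("newline column appended",   (4 * c_vit, oh, ow + 1)),
        ("rows flattened",            ((ow + 1) * oh, 4 * c_vit)),
        ("linear to LLM width",       ((ow + 1) * oh, n_embd_llm)),
        ("begin/end + RMS post-norm", ((ow + 1) * oh + 2, n_embd_llm)),
    ]

for step, shape in projector_shapes(64, 32):
    print(f"{step:28s} {shape}")
```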