model : add HunyuanOCR support (#21395)

* HunyuanOCR: add support for text and vision models

- Add HunyuanOCR vision projector (perceiver-based) with Conv2d merge
- Add separate HUNYUAN_OCR chat template (content-before-role format)
- Handle HunyuanOCR's invalid pad_token_id=-1 in converter
- Fix EOS/EOT token IDs from generation_config.json
- Support xdrope RoPE scaling type
- Add tensor mappings for perceiver projector (mm.before_rms, mm.after_rms, etc.)
- Register HunYuanVLForConditionalGeneration for both text and mmproj conversion

* fix proper mapping

* Update gguf-py/gguf/tensor_mapping.py

Co-authored-by: Xuan-Son Nguyen <thichthat@gmail.com>

* Update tools/mtmd/clip.cpp

Co-authored-by: Xuan-Son Nguyen <thichthat@gmail.com>

* address comments

* update

* Fix typecheck

* Update convert_hf_to_gguf.py

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Update convert_hf_to_gguf.py

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Update convert_hf_to_gguf.py

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Update convert_hf_to_gguf.py

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

---------

Co-authored-by: Xuan-Son Nguyen <thichthat@gmail.com>
Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
This commit is contained in:
Richard Davison 2026-04-05 23:32:14 +02:00 committed by GitHub
parent 761797ffdf
commit af76639f72
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
12 changed files with 273 additions and 10 deletions

View File

@ -11521,13 +11521,50 @@ class LLaDAMoEModel(TextModel):
raise ValueError(f"Unprocessed experts: {experts}")
@ModelBase.register("HunYuanDenseV1ForCausalLM")
@ModelBase.register("HunYuanDenseV1ForCausalLM", "HunYuanVLForConditionalGeneration")
class HunYuanModel(TextModel):
model_arch = gguf.MODEL_ARCH.HUNYUAN_DENSE
def _get_eod_token_id(self) -> int | None:
"""Get the actual end-of-generation token from config (eod_token_id)."""
return self.hparams.get("eod_token_id")
def _get_eot_token_id(self) -> int | None:
"""Get the end-of-turn token from generation_config.json.
This is the first entry in eos_token_id when it's a list."""
gen_cfg_path = self.dir_model / "generation_config.json"
if gen_cfg_path.is_file():
with open(gen_cfg_path, encoding="utf-8") as f:
gen_cfg = json.load(f)
eos = gen_cfg.get("eos_token_id")
if isinstance(eos, list) and len(eos) >= 2:
return eos[0]
return None
def _fix_special_tokens(self):
"""Fix EOS/EOT tokens that are incorrect in upstream configs."""
eod_id = self._get_eod_token_id()
if eod_id is not None:
self.gguf_writer.add_eos_token_id(eod_id)
eot_id = self._get_eot_token_id()
if eot_id is not None:
self.gguf_writer.add_eot_token_id(eot_id)
def set_vocab(self):
if (self.dir_model / "tokenizer.json").is_file():
self._set_vocab_gpt2()
tokens, toktypes, tokpre = self.get_vocab_base()
self.gguf_writer.add_tokenizer_model("gpt2")
self.gguf_writer.add_tokenizer_pre(tokpre)
self.gguf_writer.add_token_list(tokens)
self.gguf_writer.add_token_types(toktypes)
# HunyuanOCR has pad_token_id=-1 in config.json; exclude pad from SpecialVocab
token_types = None
if (self.hparams.get("pad_token_id") or 0) < 0:
token_types = ('bos', 'eos', 'unk', 'sep', 'cls', 'mask')
special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True, special_token_types=token_types)
special_vocab.add_to_gguf(self.gguf_writer)
self._fix_special_tokens()
else:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True)
@ -11579,13 +11616,18 @@ class HunYuanModel(TextModel):
# FIX for BOS token: Overwrite incorrect id read from config.json
if self.hparams['hidden_size'] == 4096:
self.gguf_writer.add_bos_token_id(127958) # only for 7b dense, fix <|bos|> token
self._fix_special_tokens()
def set_gguf_parameters(self):
# HunyuanOCR has num_experts=1 which is not MoE, prevent parent from writing it
saved_num_experts = self.hparams.pop("num_experts", None)
super().set_gguf_parameters()
if saved_num_experts is not None and saved_num_experts > 1:
self.hparams["num_experts"] = saved_num_experts
hparams = self.hparams
# Rope
if self.rope_parameters.get("rope_type") == "dynamic":
if self.rope_parameters.get("rope_type") in ("dynamic", "xdrope"):
# HunYuan uses NTK Aware Alpha based scaling. Original implementation: https://www.reddit.com/r/LocalLLaMA/comments/14lz7j5/ntkaware_scaled_rope_allows_llama_models_to_have/
# 1000 corresponds to a usable context length of 256k (https://github.com/Tencent-Hunyuan/Hunyuan-A13B/blob/main/report/Hunyuan_A13B_Technical_Report.pdf)
alpha = self.rope_parameters.get("alpha", 50)
@ -11595,13 +11637,14 @@ class HunYuanModel(TextModel):
self.gguf_writer.add_rope_freq_base(scaled_base)
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
self.gguf_writer.add_rope_scaling_factor(1)
# There is no consistent way to calculate ctx from alpha, and the config is incorrectly set to 32k
self.gguf_writer.add_rope_scaling_orig_ctx_len(256 * 1024) # 256k context length
self.gguf_writer.add_context_length(256 * 1024) # 256k context length
if self.rope_parameters.get("rope_type") == "dynamic":
# There is no consistent way to calculate ctx from alpha, and the config is incorrectly set to 32k
self.gguf_writer.add_rope_scaling_orig_ctx_len(256 * 1024) # 256k context length
self.gguf_writer.add_context_length(256 * 1024) # 256k context length
# if any of our assumptions about the values are wrong, something has changed and this may need to be updated
assert base == 10000.0 and self.hparams["max_position_embeddings"] in [32 * 1024, 256 * 1024] , \
"HunYuan dynamic RoPE scaling assumptions changed, please update the logic or context length manually"
# if any of our assumptions about the values are wrong, something has changed and this may need to be updated
assert base == 10000.0 and self.hparams["max_position_embeddings"] in [32 * 1024, 256 * 1024] , \
"HunYuan dynamic RoPE scaling assumptions changed, please update the logic or context length manually"
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
if name == "lm_head.weight":
@ -11609,9 +11652,48 @@ class HunYuanModel(TextModel):
logger.info("Skipping tied output layer 'lm_head.weight'")
return
# skip vision tensors for HunyuanVL models
if name.startswith("vit."):
return
yield from super().modify_tensors(data_torch, name, bid)
@ModelBase.register("HunYuanVLForConditionalGeneration")
class HunyuanOCRVisionModel(MmprojModel):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
assert self.hparams_vision is not None
# HunyuanOCR uses max_image_size instead of image_size
if "image_size" not in self.hparams_vision:
self.hparams_vision["image_size"] = self.hparams_vision.get("max_image_size", 2048)
def set_gguf_parameters(self):
super().set_gguf_parameters()
assert self.hparams_vision is not None
hparams = self.hparams_vision
self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.HUNYUANOCR)
self.gguf_writer.add_vision_use_gelu(True)
self.gguf_writer.add_vision_attention_layernorm_eps(hparams.get("rms_norm_eps", 1e-5))
self.gguf_writer.add_vision_spatial_merge_size(hparams.get("spatial_merge_size", 2))
self.gguf_writer.add_vision_min_pixels(self.preprocessor_config["min_pixels"])
self.gguf_writer.add_vision_max_pixels(self.preprocessor_config["max_pixels"])
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
if not name.startswith("vit."):
return # skip text tensors
# strip CLS token (row 0) from position embeddings so resize_position_embeddings works
if "position_embedding" in name:
data_torch = data_torch[1:] # [n_patches+1, n_embd] -> [n_patches, n_embd]
yield from super().modify_tensors(data_torch, name, bid)
def tensor_force_quant(self, name, new_name, bid, n_dims):
# force conv weights to F32 or F16 to avoid BF16 IM2COL issues on Metal
if ("mm.0." in new_name or "mm.2." in new_name) and new_name.endswith(".weight"):
return gguf.GGMLQuantizationType.F16 if self.ftype == gguf.LlamaFileType.MOSTLY_F16 else gguf.GGMLQuantizationType.F32
return super().tensor_force_quant(name, new_name, bid, n_dims)
@ModelBase.register("SmolLM3ForCausalLM")
class SmolLM3Model(LlamaModel):
model_arch = gguf.MODEL_ARCH.SMOLLM3

View File

@ -734,6 +734,7 @@ class MODEL_TENSOR(IntEnum):
V_LAYER_OUT_SCALE = auto()
V_PRE_NORM = auto()
V_POST_NORM = auto()
V_MM_PRE_NORM = auto() # hunyuanocr
V_MM_POST_NORM = auto()
V_MM_INP_NORM = auto()
V_MM_INP_PROJ = auto() # gemma3
@ -769,6 +770,8 @@ class MODEL_TENSOR(IntEnum):
V_MM_GATE = auto() # cogvlm
V_TOK_BOI = auto() # cogvlm
V_TOK_EOI = auto() # cogvlm
V_TOK_IMG_BEGIN = auto() # hunyuanocr
V_TOK_IMG_END = auto() # hunyuanocr
V_STD_BIAS = auto() # gemma4
V_STD_SCALE = auto() # gemma4
V_SAM_POS_EMBD = auto() # Deepseek-OCR
@ -1246,6 +1249,9 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
MODEL_TENSOR.V_MM_GATE: "mm.gate",
MODEL_TENSOR.V_TOK_BOI: "v.boi",
MODEL_TENSOR.V_TOK_EOI: "v.eoi",
MODEL_TENSOR.V_MM_PRE_NORM: "mm.pre_norm",
MODEL_TENSOR.V_TOK_IMG_BEGIN: "mm.image_begin",
MODEL_TENSOR.V_TOK_IMG_END: "mm.image_end",
MODEL_TENSOR.V_STD_BIAS: "v.std_bias", # gemma4
MODEL_TENSOR.V_STD_SCALE: "v.std_scale", # gemma4
# DeepSeek-OCR SAM
@ -1393,6 +1399,9 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
MODEL_TENSOR.V_MM_GATE,
MODEL_TENSOR.V_TOK_BOI,
MODEL_TENSOR.V_TOK_EOI,
MODEL_TENSOR.V_MM_PRE_NORM,
MODEL_TENSOR.V_TOK_IMG_BEGIN,
MODEL_TENSOR.V_TOK_IMG_END,
MODEL_TENSOR.V_STD_BIAS,
MODEL_TENSOR.V_STD_SCALE,
MODEL_TENSOR.V_SAM_POS_EMBD,
@ -4113,6 +4122,7 @@ class VisionProjectorType:
GLM4V = "glm4v"
YOUTUVL = "youtuvl"
NEMOTRON_V2_VL = "nemotron_v2_vl"
HUNYUANOCR = "hunyuanocr"
# Items here are (block size, type size)

View File

@ -1359,6 +1359,7 @@ class TensorNameMap:
"visual.merger.mlp.{bid}", # qwen2vl
"mlp_AR.linear_{bid}", # PaddleOCR-VL
"merger.mlp.{bid}",
"vit.perceive.proj.{bid}", # HunyuanOCR (proj.0 = conv1, proj.2 = conv2)
),
MODEL_TENSOR.V_MMPROJ_FC: (
@ -1366,6 +1367,7 @@ class TensorNameMap:
"model.vision.linear_proj.linear_proj", # cogvlm
"model.projector.layers", # Deepseek-OCR
"visual.merger.proj", # glm4v
"vit.perceive.mlp", # HunyuanOCR
),
MODEL_TENSOR.V_MMPROJ_MLP: (
@ -1393,6 +1395,7 @@ class TensorNameMap:
"model.vision_tower.embeddings.patch_embeddings.projection", # Intern-S1
"vpm.embeddings.patch_embedding",
"model.vision_model.embeddings.patch_embedding", # SmolVLM
"vit.embeddings.patch_embedding", # HunyuanOCR
"vision_tower.patch_conv", # pixtral-hf
"vision_encoder.patch_conv", # pixtral
"vision_model.patch_embedding.linear", # llama 4
@ -1414,6 +1417,7 @@ class TensorNameMap:
"model.vision_tower.embeddings.position_embeddings", # Intern-S1
"vpm.embeddings.position_embedding",
"model.vision_model.embeddings.position_embedding", # SmolVLM
"vit.embeddings.position_embedding", # HunyuanOCR
"vision_model.positional_embedding_vlm", # llama 4
"vision_tower.patch_embed.pos_emb", # kimi-vl
"visual.pos_embed", # qwen3vl
@ -1425,10 +1429,12 @@ class TensorNameMap:
MODEL_TENSOR.V_ENC_EMBD_IMGNL: (
"model.image_newline", # Deepseek-OCR
"vit.perceive.image_newline", # HunyuanOCR
),
MODEL_TENSOR.V_ENC_EMBD_VSEP: (
"model.view_seperator", # Deepseek-OCR
"vit.perceive.image_sep", # HunyuanOCR
),
MODEL_TENSOR.V_ENC_ATTN_QKV: (
@ -1444,6 +1450,7 @@ class TensorNameMap:
"model.vision_tower.encoder.layer.{bid}.attention.q_proj", # Intern-S1
"vpm.encoder.layers.{bid}.self_attn.q_proj",
"model.vision_model.encoder.layers.{bid}.self_attn.q_proj", # SmolVLM
"vit.layers.{bid}.self_attn.q_proj", # HunyuanOCR
"vision_model.model.layers.{bid}.self_attn.q_proj", # llama4
"vision_tower.transformer.layers.{bid}.attention.q_proj", # pixtral-hf
"vision_encoder.transformer.layers.{bid}.attention.wq", # pixtral
@ -1466,6 +1473,7 @@ class TensorNameMap:
"model.vision_tower.encoder.layer.{bid}.attention.k_proj", # Intern-S1
"vpm.encoder.layers.{bid}.self_attn.k_proj",
"model.vision_model.encoder.layers.{bid}.self_attn.k_proj", # SmolVLM
"vit.layers.{bid}.self_attn.k_proj", # HunyuanOCR
"vision_model.model.layers.{bid}.self_attn.k_proj", # llama4
"vision_tower.transformer.layers.{bid}.attention.k_proj", # pixtral-hf
"vision_encoder.transformer.layers.{bid}.attention.wk", # pixtral
@ -1488,6 +1496,7 @@ class TensorNameMap:
"model.vision_tower.encoder.layer.{bid}.attention.v_proj", # Intern-S1
"vpm.encoder.layers.{bid}.self_attn.v_proj",
"model.vision_model.encoder.layers.{bid}.self_attn.v_proj", # SmolVLM
"vit.layers.{bid}.self_attn.v_proj", # HunyuanOCR
"vision_model.model.layers.{bid}.self_attn.v_proj", # llama4
"vision_tower.transformer.layers.{bid}.attention.v_proj", # pixtral-hf
"vision_encoder.transformer.layers.{bid}.attention.wv", # pixtral
@ -1504,6 +1513,7 @@ class TensorNameMap:
"model.vision_tower.encoder.layer.{bid}.layernorm_before", # Intern-S1
"vpm.encoder.layers.{bid}.layer_norm1",
"model.vision_model.encoder.layers.{bid}.layer_norm1", # SmolVLM
"vit.layers.{bid}.input_layernorm", # HunyuanOCR
"vision_tower.transformer.layers.{bid}.attention_norm", # pixtral-hf
"vision_encoder.transformer.layers.{bid}.attention_norm", # pixtral
"vision_model.model.layers.{bid}.input_layernorm", # llama4, gemma4
@ -1521,6 +1531,7 @@ class TensorNameMap:
"model.vision_tower.encoder.layer.{bid}.attention.projection_layer", # Intern-S1
"vpm.encoder.layers.{bid}.self_attn.out_proj",
"model.vision_model.encoder.layers.{bid}.self_attn.out_proj", # SmolVLM
"vit.layers.{bid}.self_attn.o_proj", # HunyuanOCR
"model.vision_model.encoder.layers.{bid}.self_attn.projection_layer", # Janus Pro
"vision_model.model.layers.{bid}.self_attn.o_proj", # llama4
"vision_tower.transformer.layers.{bid}.attention.o_proj", # pixtral-hf
@ -1540,6 +1551,7 @@ class TensorNameMap:
"model.vision_tower.encoder.layer.{bid}.layernorm_after", # Intern-S1
"vpm.encoder.layers.{bid}.layer_norm2",
"model.vision_model.encoder.layers.{bid}.layer_norm2", # SmolVLM
"vit.layers.{bid}.post_attention_layernorm", # HunyuanOCR
"vision_model.model.layers.{bid}.post_attention_layernorm", # llama4
"vision_tower.transformer.layers.{bid}.ffn_norm", # pixtral-hf
"vision_encoder.transformer.layers.{bid}.ffn_norm", # pixtral
@ -1557,6 +1569,7 @@ class TensorNameMap:
"model.vision_tower.encoder.layer.{bid}.mlp.fc1", # Intern-S1
"vpm.encoder.layers.{bid}.mlp.fc1",
"model.vision_model.encoder.layers.{bid}.mlp.fc1", # SmolVLM, gemma3
"vit.layers.{bid}.mlp.dense_h_to_4h", # HunyuanOCR
"vision_tower.transformer.layers.{bid}.feed_forward.up_proj", # pixtral-hf
"vision_encoder.transformer.layers.{bid}.feed_forward.w3", # pixtral
"vision_model.model.layers.{bid}.mlp.fc1", # llama4
@ -1583,6 +1596,7 @@ class TensorNameMap:
"model.vision_tower.encoder.layer.{bid}.mlp.fc2", # Intern-S1
"vpm.encoder.layers.{bid}.mlp.fc2",
"model.vision_model.encoder.layers.{bid}.mlp.fc2", # SmolVLM, gemma3
"vit.layers.{bid}.mlp.dense_4h_to_h", # HunyuanOCR
"vision_tower.transformer.layers.{bid}.feed_forward.down_proj", # pixtral-hf
"vision_encoder.transformer.layers.{bid}.feed_forward.w2", # pixtral
"vision_model.model.layers.{bid}.mlp.fc2", # llama4
@ -1639,6 +1653,7 @@ class TensorNameMap:
MODEL_TENSOR.V_MM_POST_NORM: (
"visual.merger.post_projection_norm", # glm4v
"vit.perceive.after_rms", # HunyuanOCR
),
MODEL_TENSOR.V_MM_INP_PROJ: (
@ -1806,6 +1821,18 @@ class TensorNameMap:
"model.vision.eoi", # cogvlm
),
MODEL_TENSOR.V_MM_PRE_NORM: (
"vit.perceive.before_rms", # HunyuanOCR
),
MODEL_TENSOR.V_TOK_IMG_BEGIN: (
"vit.perceive.image_begin", # HunyuanOCR
),
MODEL_TENSOR.V_TOK_IMG_END: (
"vit.perceive.image_end", # HunyuanOCR
),
MODEL_TENSOR.V_STD_BIAS: (
"model.vision_tower.std_bias", # gemma4
),

View File

@ -73,6 +73,7 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
{ "hunyuan-moe", LLM_CHAT_TEMPLATE_HUNYUAN_MOE },
{ "gpt-oss", LLM_CHAT_TEMPLATE_OPENAI_MOE },
{ "hunyuan-dense", LLM_CHAT_TEMPLATE_HUNYUAN_DENSE },
{ "hunyuan-ocr", LLM_CHAT_TEMPLATE_HUNYUAN_OCR },
{ "kimi-k2", LLM_CHAT_TEMPLATE_KIMI_K2 },
{ "seed_oss", LLM_CHAT_TEMPLATE_SEED_OSS },
{ "grok-2", LLM_CHAT_TEMPLATE_GROK_2 },
@ -216,6 +217,8 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
return LLM_CHAT_TEMPLATE_HUNYUAN_MOE;
} else if (tmpl_contains("<|start|>") && tmpl_contains("<|channel|>")) {
return LLM_CHAT_TEMPLATE_OPENAI_MOE;
} else if (tmpl_contains("<hy_Assistant>") && tmpl_contains("<hy_begin▁of▁sentence>")) {
return LLM_CHAT_TEMPLATE_HUNYUAN_OCR;
} else if (tmpl_contains("<hy_Assistant>") && tmpl_contains("<hy_place▁holder▁no▁3>")) {
return LLM_CHAT_TEMPLATE_HUNYUAN_DENSE;
} else if (tmpl_contains("<|im_assistant|>assistant<|im_middle|>")) {
@ -822,6 +825,22 @@ int32_t llm_chat_apply_template(
ss << "<hy_User>" << chat[i]->content << "<hy_Assistant>";
}
}
} else if (tmpl == LLM_CHAT_TEMPLATE_HUNYUAN_OCR) {
// tencent/HunyuanOCR
ss << "<hy_begin▁of▁sentence>";
for (size_t i = 0; i < chat.size(); i++) {
std::string role(chat[i]->role);
if (i == 0 && role == "system") {
ss << chat[i]->content << "<hy_place▁holder▁no▁3>";
continue;
}
if (role == "user") {
ss << chat[i]->content << "<hy_User>";
} else if (role == "assistant") {
ss << chat[i]->content << "<hy_Assistant>";
}
}
} else if (tmpl == LLM_CHAT_TEMPLATE_KIMI_K2) {
// moonshotai/Kimi-K2-Instruct
for (auto message : chat) {

View File

@ -53,6 +53,7 @@ enum llm_chat_template {
LLM_CHAT_TEMPLATE_HUNYUAN_MOE,
LLM_CHAT_TEMPLATE_OPENAI_MOE,
LLM_CHAT_TEMPLATE_HUNYUAN_DENSE,
LLM_CHAT_TEMPLATE_HUNYUAN_OCR,
LLM_CHAT_TEMPLATE_KIMI_K2,
LLM_CHAT_TEMPLATE_SEED_OSS,
LLM_CHAT_TEMPLATE_GROK_2,

View File

@ -19,6 +19,7 @@ add_library(mtmd
models/conformer.cpp
models/gemma4v.cpp
models/glm4v.cpp
models/hunyuanocr.cpp
models/internvl.cpp
models/kimivl.cpp
models/kimik25.cpp

View File

@ -148,6 +148,11 @@
#define TN_TOK_BOI "v.boi"
#define TN_TOK_EOI "v.eoi"
// hunyuanocr
#define TN_MM_PRE_NORM "mm.pre_norm.%s"
#define TN_TOK_IMG_BEGIN "mm.image_begin"
#define TN_TOK_IMG_END "mm.image_end"
// deepseek-ocr
#define TN_SAM_POS_EMBD "v.sam.pos_embd.%s"
#define TN_SAM_PATCH_EMBD "v.sam.patch_embd.%s"
@ -266,6 +271,7 @@ enum projector_type {
PROJECTOR_TYPE_YOUTUVL,
PROJECTOR_TYPE_KIMIK25,
PROJECTOR_TYPE_NEMOTRON_V2_VL,
PROJECTOR_TYPE_HUNYUANOCR,
PROJECTOR_TYPE_UNKNOWN,
};
@ -306,6 +312,7 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
{ PROJECTOR_TYPE_YOUTUVL, "youtuvl"},
{ PROJECTOR_TYPE_KIMIK25, "kimik25"},
{ PROJECTOR_TYPE_NEMOTRON_V2_VL, "nemotron_v2_vl"},
{ PROJECTOR_TYPE_HUNYUANOCR, "hunyuanocr"},
};
static projector_type clip_projector_type_from_string(const std::string & str) {

View File

@ -358,7 +358,8 @@ struct clip_model {
// MINICPMV projection
ggml_tensor * mm_model_pos_embed_k = nullptr;
ggml_tensor * mm_model_query = nullptr;
ggml_tensor * mm_model_proj = nullptr;
ggml_tensor * mm_model_proj = nullptr;
ggml_tensor * mm_model_proj_b = nullptr;
ggml_tensor * mm_model_kv_proj = nullptr;
ggml_tensor * mm_model_attn_q_w = nullptr;
ggml_tensor * mm_model_attn_q_b = nullptr;
@ -419,6 +420,11 @@ struct clip_model {
ggml_tensor * mm_boi = nullptr;
ggml_tensor * mm_eoi = nullptr;
// hunyuanocr perceiver
ggml_tensor * mm_pre_norm_w = nullptr;
ggml_tensor * mm_img_begin = nullptr;
ggml_tensor * mm_img_end = nullptr;
// deepseek ocr sam
ggml_tensor * patch_embed_proj_w = nullptr;
ggml_tensor * patch_embed_proj_b = nullptr;

View File

@ -902,6 +902,10 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
{
builder = std::make_unique<clip_graph_cogvlm>(ctx, img);
} break;
case PROJECTOR_TYPE_HUNYUANOCR:
{
builder = std::make_unique<clip_graph_hunyuanocr>(ctx, img);
} break;
case PROJECTOR_TYPE_MLP:
case PROJECTOR_TYPE_MLP_NORM:
case PROJECTOR_TYPE_LDP:
@ -1408,6 +1412,14 @@ struct clip_model_loader {
get_u32(KEY_SAM_N_EMBD, hparams.sam_n_embd, true);
get_u32(KEY_ATTN_WINDOW_SIZE, hparams.attn_window_size, true);
} break;
case PROJECTOR_TYPE_HUNYUANOCR:
{
hparams.n_merge = 2;
get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.n_merge, false);
get_u32(KEY_IMAGE_MIN_PIXELS, hparams.image_min_pixels);
get_u32(KEY_IMAGE_MAX_PIXELS, hparams.image_max_pixels);
hparams.set_warmup_n_tokens(28*28);
} break;
case PROJECTOR_TYPE_LFM2A:
{
// audio preprocessing params
@ -2035,6 +2047,22 @@ struct clip_model_loader {
model.mm_boi = get_tensor(TN_TOK_BOI);
model.mm_eoi = get_tensor(TN_TOK_EOI);
} break;
case PROJECTOR_TYPE_HUNYUANOCR:
{
// proj.0 -> mm.0 (conv1), proj.2 -> mm.2 (conv2), mlp -> mm.model.fc (linear)
model.mm_0_w = get_tensor(string_format(TN_LLAVA_PROJ, 0, "weight"));
model.mm_0_b = get_tensor(string_format(TN_LLAVA_PROJ, 0, "bias"));
model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight"));
model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"));
model.mm_model_proj = get_tensor(string_format(TN_MM_PROJECTOR, "weight"));
model.mm_model_proj_b = get_tensor(string_format(TN_MM_PROJECTOR, "bias"));
model.mm_pre_norm_w = get_tensor(string_format(TN_MM_PRE_NORM, "weight"));
model.mm_post_norm_w = get_tensor(string_format(TN_MM_POST_NORM, "weight"));
model.mm_img_begin = get_tensor(TN_TOK_IMG_BEGIN);
model.mm_img_end = get_tensor(TN_TOK_IMG_END);
model.image_newline = get_tensor(TN_IMAGE_NEWLINE);
model.view_seperator = get_tensor(TN_IMAGE_SEPERATOR, false);
} break;
case PROJECTOR_TYPE_JANUS_PRO:
{
model.mm_0_w = get_tensor(string_format(TN_LLAVA_PROJ, 0, "weight"));
@ -2584,6 +2612,7 @@ int clip_n_output_tokens_x(const struct clip_ctx * ctx, struct clip_image_f32 *
case PROJECTOR_TYPE_QWEN3VL:
case PROJECTOR_TYPE_GLM4V:
case PROJECTOR_TYPE_PADDLEOCR:
case PROJECTOR_TYPE_HUNYUANOCR:
case PROJECTOR_TYPE_YOUTUVL:
return (img->nx / params.patch_size) / 2;
default:
@ -2768,6 +2797,13 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
int h = static_cast<int>(std::sqrt(static_cast<float>(n_patches)));
n_patches = h * (h + 1) + 1;
} break;
case PROJECTOR_TYPE_HUNYUANOCR:
{
int merge = ctx->model.hparams.n_merge;
int ow = (img->nx / patch_size) / merge;
int oh = (img->ny / patch_size) / merge;
n_patches = (ow + 1) * oh + 2;
} break;
case PROJECTOR_TYPE_LFM2A:
{
n_patches = ((((img->nx + 1) / 2) + 1) / 2 + 1) / 2;
@ -3175,6 +3211,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
case PROJECTOR_TYPE_JANUS_PRO:
case PROJECTOR_TYPE_PHI4:
case PROJECTOR_TYPE_COGVLM:
case PROJECTOR_TYPE_HUNYUANOCR:
{
// do nothing
} break;
@ -3346,6 +3383,8 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
case PROJECTOR_TYPE_PADDLEOCR:
case PROJECTOR_TYPE_KIMIK25:
return ctx->model.mm_2_w->ne[1];
case PROJECTOR_TYPE_HUNYUANOCR:
return ctx->model.mm_model_proj->ne[1];
case PROJECTOR_TYPE_COGVLM:
return ctx->model.mm_4h_to_h_w->ne[1];
case PROJECTOR_TYPE_DEEPSEEKOCR:

View File

@ -0,0 +1,59 @@
#include "models.h"
ggml_cgraph * clip_graph_hunyuanocr::build() {
const int merge = hparams.n_merge;
const int pw = n_patches_x;
const int ph = n_patches_y;
ggml_tensor * pos_embd = resize_position_embeddings(GGML_SCALE_MODE_BILINEAR);
ggml_tensor * inp = build_inp();
ggml_tensor * cur = build_vit(inp, n_patches, NORM_TYPE_NORMAL, hparams.ffn_op, pos_embd, nullptr);
// perceiver projector
cur = build_norm(cur, model.mm_pre_norm_w, nullptr, NORM_TYPE_RMS, eps, -1);
// [C, W*H] -> [W, H, C] for conv2d
cur = ggml_reshape_3d(ctx0, cur, n_embd, pw, ph);
cur = ggml_permute(ctx0, cur, 2, 0, 1, 3);
cur = ggml_cont(ctx0, cur);
// Conv2d(1152->2304, k=2, s=2) + GELU + Conv2d(2304->4608, k=1, s=1)
cur = ggml_conv_2d(ctx0, model.mm_0_w, cur, merge, merge, 0, 0, 1, 1);
if (model.mm_0_b) {
cur = ggml_add(ctx0, cur, ggml_reshape_3d(ctx0, model.mm_0_b, 1, 1, model.mm_0_b->ne[0]));
}
cur = ggml_gelu(ctx0, cur);
cur = ggml_conv_2d(ctx0, model.mm_1_w, cur, 1, 1, 0, 0, 1, 1);
if (model.mm_1_b) {
cur = ggml_add(ctx0, cur, ggml_reshape_3d(ctx0, model.mm_1_b, 1, 1, model.mm_1_b->ne[0]));
}
const int ow = pw / merge;
const int oh = ph / merge;
const int idim = (int)cur->ne[2]; // OC = 4608
// append newline along W (dim 0)
ggml_tensor * nl = ggml_reshape_4d(ctx0, model.image_newline, 1, 1, idim, 1);
nl = ggml_repeat_4d(ctx0, nl, 1, oh, idim, 1);
cur = ggml_concat(ctx0, cur, nl, 0);
// [OW+1, OH, OC] -> [OC, (OW+1)*OH]
cur = ggml_permute(ctx0, cur, 1, 2, 0, 3);
cur = ggml_cont_2d(ctx0, cur, idim, (ow + 1) * oh);
// project to LLM hidden size
cur = build_mm(model.mm_model_proj, cur);
if (model.mm_model_proj_b) {
cur = ggml_add(ctx0, cur, model.mm_model_proj_b);
}
// wrap with begin/end tokens
cur = ggml_concat(ctx0, ggml_reshape_2d(ctx0, model.mm_img_begin, model.mm_img_begin->ne[0], 1), cur, 1);
cur = ggml_concat(ctx0, cur, ggml_reshape_2d(ctx0, model.mm_img_end, model.mm_img_end->ne[0], 1), 1);
cur = build_norm(cur, model.mm_post_norm_w, nullptr, NORM_TYPE_RMS, eps, -1);
ggml_build_forward_expand(gf, cur);
return gf;
}

View File

@ -98,6 +98,11 @@ struct clip_graph_glm4v : clip_graph {
ggml_cgraph * build() override;
};
struct clip_graph_hunyuanocr : clip_graph {
clip_graph_hunyuanocr(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
ggml_cgraph * build() override;
};
struct clip_graph_mobilenetv5 : clip_graph {
clip_graph_mobilenetv5(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
ggml_cgraph * build() override;

View File

@ -406,6 +406,13 @@ struct mtmd_context {
img_end = "\n"; // prevent empty batch on llama-server
image_preproc = std::make_unique<mtmd_image_preprocessor_deepseekocr>(ctx_v);
} break;
case PROJECTOR_TYPE_HUNYUANOCR:
{
// note: these use fullwidth (U+FF5C) and ▁ (U+2581) to match the tokenizer vocabulary
img_beg = "<hy_place▁holder▁no▁100>";
img_end = "<hy_place▁holder▁no▁101>";
image_preproc = std::make_unique<mtmd_image_preprocessor_dyn_size>(ctx_v);
} break;
default:
throw std::runtime_error(string_format("%s: unexpected vision projector type %d\n", __func__, proj));
}