model : add HunyuanOCR support (#21395)
* HunyuanOCR: add support for text and vision models - Add HunyuanOCR vision projector (perceiver-based) with Conv2d merge - Add separate HUNYUAN_OCR chat template (content-before-role format) - Handle HunyuanOCR's invalid pad_token_id=-1 in converter - Fix EOS/EOT token IDs from generation_config.json - Support xdrope RoPE scaling type - Add tensor mappings for perceiver projector (mm.before_rms, mm.after_rms, etc.) - Register HunYuanVLForConditionalGeneration for both text and mmproj conversion * fix proper mapping * Update gguf-py/gguf/tensor_mapping.py Co-authored-by: Xuan-Son Nguyen <thichthat@gmail.com> * Update tools/mtmd/clip.cpp Co-authored-by: Xuan-Son Nguyen <thichthat@gmail.com> * address comments * update * Fix typecheck * Update convert_hf_to_gguf.py Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com> * Update convert_hf_to_gguf.py Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com> * Update convert_hf_to_gguf.py Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com> * Update convert_hf_to_gguf.py Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com> --------- Co-authored-by: Xuan-Son Nguyen <thichthat@gmail.com> Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
This commit is contained in:
parent
761797ffdf
commit
af76639f72
|
|
@ -11521,13 +11521,50 @@ class LLaDAMoEModel(TextModel):
|
|||
raise ValueError(f"Unprocessed experts: {experts}")
|
||||
|
||||
|
||||
@ModelBase.register("HunYuanDenseV1ForCausalLM")
|
||||
@ModelBase.register("HunYuanDenseV1ForCausalLM", "HunYuanVLForConditionalGeneration")
|
||||
class HunYuanModel(TextModel):
|
||||
model_arch = gguf.MODEL_ARCH.HUNYUAN_DENSE
|
||||
|
||||
def _get_eod_token_id(self) -> int | None:
|
||||
"""Get the actual end-of-generation token from config (eod_token_id)."""
|
||||
return self.hparams.get("eod_token_id")
|
||||
|
||||
def _get_eot_token_id(self) -> int | None:
|
||||
"""Get the end-of-turn token from generation_config.json.
|
||||
This is the first entry in eos_token_id when it's a list."""
|
||||
gen_cfg_path = self.dir_model / "generation_config.json"
|
||||
if gen_cfg_path.is_file():
|
||||
with open(gen_cfg_path, encoding="utf-8") as f:
|
||||
gen_cfg = json.load(f)
|
||||
eos = gen_cfg.get("eos_token_id")
|
||||
if isinstance(eos, list) and len(eos) >= 2:
|
||||
return eos[0]
|
||||
return None
|
||||
|
||||
def _fix_special_tokens(self):
|
||||
"""Fix EOS/EOT tokens that are incorrect in upstream configs."""
|
||||
eod_id = self._get_eod_token_id()
|
||||
if eod_id is not None:
|
||||
self.gguf_writer.add_eos_token_id(eod_id)
|
||||
eot_id = self._get_eot_token_id()
|
||||
if eot_id is not None:
|
||||
self.gguf_writer.add_eot_token_id(eot_id)
|
||||
|
||||
def set_vocab(self):
|
||||
if (self.dir_model / "tokenizer.json").is_file():
|
||||
self._set_vocab_gpt2()
|
||||
tokens, toktypes, tokpre = self.get_vocab_base()
|
||||
self.gguf_writer.add_tokenizer_model("gpt2")
|
||||
self.gguf_writer.add_tokenizer_pre(tokpre)
|
||||
self.gguf_writer.add_token_list(tokens)
|
||||
self.gguf_writer.add_token_types(toktypes)
|
||||
|
||||
# HunyuanOCR has pad_token_id=-1 in config.json; exclude pad from SpecialVocab
|
||||
token_types = None
|
||||
if (self.hparams.get("pad_token_id") or 0) < 0:
|
||||
token_types = ('bos', 'eos', 'unk', 'sep', 'cls', 'mask')
|
||||
special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True, special_token_types=token_types)
|
||||
special_vocab.add_to_gguf(self.gguf_writer)
|
||||
self._fix_special_tokens()
|
||||
else:
|
||||
from transformers import AutoTokenizer
|
||||
tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True)
|
||||
|
|
@ -11579,13 +11616,18 @@ class HunYuanModel(TextModel):
|
|||
# FIX for BOS token: Overwrite incorrect id read from config.json
|
||||
if self.hparams['hidden_size'] == 4096:
|
||||
self.gguf_writer.add_bos_token_id(127958) # only for 7b dense, fix <|bos|> token
|
||||
self._fix_special_tokens()
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
# HunyuanOCR has num_experts=1 which is not MoE, prevent parent from writing it
|
||||
saved_num_experts = self.hparams.pop("num_experts", None)
|
||||
super().set_gguf_parameters()
|
||||
if saved_num_experts is not None and saved_num_experts > 1:
|
||||
self.hparams["num_experts"] = saved_num_experts
|
||||
hparams = self.hparams
|
||||
|
||||
# Rope
|
||||
if self.rope_parameters.get("rope_type") == "dynamic":
|
||||
if self.rope_parameters.get("rope_type") in ("dynamic", "xdrope"):
|
||||
# HunYuan uses NTK Aware Alpha based scaling. Original implementation: https://www.reddit.com/r/LocalLLaMA/comments/14lz7j5/ntkaware_scaled_rope_allows_llama_models_to_have/
|
||||
# 1000 corresponds to a usable context length of 256k (https://github.com/Tencent-Hunyuan/Hunyuan-A13B/blob/main/report/Hunyuan_A13B_Technical_Report.pdf)
|
||||
alpha = self.rope_parameters.get("alpha", 50)
|
||||
|
|
@ -11595,13 +11637,14 @@ class HunYuanModel(TextModel):
|
|||
self.gguf_writer.add_rope_freq_base(scaled_base)
|
||||
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
|
||||
self.gguf_writer.add_rope_scaling_factor(1)
|
||||
# There is no consistent way to calculate ctx from alpha, and the config is incorrectly set to 32k
|
||||
self.gguf_writer.add_rope_scaling_orig_ctx_len(256 * 1024) # 256k context length
|
||||
self.gguf_writer.add_context_length(256 * 1024) # 256k context length
|
||||
if self.rope_parameters.get("rope_type") == "dynamic":
|
||||
# There is no consistent way to calculate ctx from alpha, and the config is incorrectly set to 32k
|
||||
self.gguf_writer.add_rope_scaling_orig_ctx_len(256 * 1024) # 256k context length
|
||||
self.gguf_writer.add_context_length(256 * 1024) # 256k context length
|
||||
|
||||
# if any of our assumptions about the values are wrong, something has changed and this may need to be updated
|
||||
assert base == 10000.0 and self.hparams["max_position_embeddings"] in [32 * 1024, 256 * 1024] , \
|
||||
"HunYuan dynamic RoPE scaling assumptions changed, please update the logic or context length manually"
|
||||
# if any of our assumptions about the values are wrong, something has changed and this may need to be updated
|
||||
assert base == 10000.0 and self.hparams["max_position_embeddings"] in [32 * 1024, 256 * 1024] , \
|
||||
"HunYuan dynamic RoPE scaling assumptions changed, please update the logic or context length manually"
|
||||
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||
if name == "lm_head.weight":
|
||||
|
|
@ -11609,9 +11652,48 @@ class HunYuanModel(TextModel):
|
|||
logger.info("Skipping tied output layer 'lm_head.weight'")
|
||||
return
|
||||
|
||||
# skip vision tensors for HunyuanVL models
|
||||
if name.startswith("vit."):
|
||||
return
|
||||
|
||||
yield from super().modify_tensors(data_torch, name, bid)
|
||||
|
||||
|
||||
@ModelBase.register("HunYuanVLForConditionalGeneration")
|
||||
class HunyuanOCRVisionModel(MmprojModel):
|
||||
def __init__(self, *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
assert self.hparams_vision is not None
|
||||
# HunyuanOCR uses max_image_size instead of image_size
|
||||
if "image_size" not in self.hparams_vision:
|
||||
self.hparams_vision["image_size"] = self.hparams_vision.get("max_image_size", 2048)
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
super().set_gguf_parameters()
|
||||
assert self.hparams_vision is not None
|
||||
hparams = self.hparams_vision
|
||||
self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.HUNYUANOCR)
|
||||
self.gguf_writer.add_vision_use_gelu(True)
|
||||
self.gguf_writer.add_vision_attention_layernorm_eps(hparams.get("rms_norm_eps", 1e-5))
|
||||
self.gguf_writer.add_vision_spatial_merge_size(hparams.get("spatial_merge_size", 2))
|
||||
self.gguf_writer.add_vision_min_pixels(self.preprocessor_config["min_pixels"])
|
||||
self.gguf_writer.add_vision_max_pixels(self.preprocessor_config["max_pixels"])
|
||||
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||
if not name.startswith("vit."):
|
||||
return # skip text tensors
|
||||
# strip CLS token (row 0) from position embeddings so resize_position_embeddings works
|
||||
if "position_embedding" in name:
|
||||
data_torch = data_torch[1:] # [n_patches+1, n_embd] -> [n_patches, n_embd]
|
||||
yield from super().modify_tensors(data_torch, name, bid)
|
||||
|
||||
def tensor_force_quant(self, name, new_name, bid, n_dims):
|
||||
# force conv weights to F32 or F16 to avoid BF16 IM2COL issues on Metal
|
||||
if ("mm.0." in new_name or "mm.2." in new_name) and new_name.endswith(".weight"):
|
||||
return gguf.GGMLQuantizationType.F16 if self.ftype == gguf.LlamaFileType.MOSTLY_F16 else gguf.GGMLQuantizationType.F32
|
||||
return super().tensor_force_quant(name, new_name, bid, n_dims)
|
||||
|
||||
|
||||
@ModelBase.register("SmolLM3ForCausalLM")
|
||||
class SmolLM3Model(LlamaModel):
|
||||
model_arch = gguf.MODEL_ARCH.SMOLLM3
|
||||
|
|
|
|||
|
|
@ -734,6 +734,7 @@ class MODEL_TENSOR(IntEnum):
|
|||
V_LAYER_OUT_SCALE = auto()
|
||||
V_PRE_NORM = auto()
|
||||
V_POST_NORM = auto()
|
||||
V_MM_PRE_NORM = auto() # hunyuanocr
|
||||
V_MM_POST_NORM = auto()
|
||||
V_MM_INP_NORM = auto()
|
||||
V_MM_INP_PROJ = auto() # gemma3
|
||||
|
|
@ -769,6 +770,8 @@ class MODEL_TENSOR(IntEnum):
|
|||
V_MM_GATE = auto() # cogvlm
|
||||
V_TOK_BOI = auto() # cogvlm
|
||||
V_TOK_EOI = auto() # cogvlm
|
||||
V_TOK_IMG_BEGIN = auto() # hunyuanocr
|
||||
V_TOK_IMG_END = auto() # hunyuanocr
|
||||
V_STD_BIAS = auto() # gemma4
|
||||
V_STD_SCALE = auto() # gemma4
|
||||
V_SAM_POS_EMBD = auto() # Deepseek-OCR
|
||||
|
|
@ -1246,6 +1249,9 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
|
|||
MODEL_TENSOR.V_MM_GATE: "mm.gate",
|
||||
MODEL_TENSOR.V_TOK_BOI: "v.boi",
|
||||
MODEL_TENSOR.V_TOK_EOI: "v.eoi",
|
||||
MODEL_TENSOR.V_MM_PRE_NORM: "mm.pre_norm",
|
||||
MODEL_TENSOR.V_TOK_IMG_BEGIN: "mm.image_begin",
|
||||
MODEL_TENSOR.V_TOK_IMG_END: "mm.image_end",
|
||||
MODEL_TENSOR.V_STD_BIAS: "v.std_bias", # gemma4
|
||||
MODEL_TENSOR.V_STD_SCALE: "v.std_scale", # gemma4
|
||||
# DeepSeek-OCR SAM
|
||||
|
|
@ -1393,6 +1399,9 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
|
|||
MODEL_TENSOR.V_MM_GATE,
|
||||
MODEL_TENSOR.V_TOK_BOI,
|
||||
MODEL_TENSOR.V_TOK_EOI,
|
||||
MODEL_TENSOR.V_MM_PRE_NORM,
|
||||
MODEL_TENSOR.V_TOK_IMG_BEGIN,
|
||||
MODEL_TENSOR.V_TOK_IMG_END,
|
||||
MODEL_TENSOR.V_STD_BIAS,
|
||||
MODEL_TENSOR.V_STD_SCALE,
|
||||
MODEL_TENSOR.V_SAM_POS_EMBD,
|
||||
|
|
@ -4113,6 +4122,7 @@ class VisionProjectorType:
|
|||
GLM4V = "glm4v"
|
||||
YOUTUVL = "youtuvl"
|
||||
NEMOTRON_V2_VL = "nemotron_v2_vl"
|
||||
HUNYUANOCR = "hunyuanocr"
|
||||
|
||||
|
||||
# Items here are (block size, type size)
|
||||
|
|
|
|||
|
|
@ -1359,6 +1359,7 @@ class TensorNameMap:
|
|||
"visual.merger.mlp.{bid}", # qwen2vl
|
||||
"mlp_AR.linear_{bid}", # PaddleOCR-VL
|
||||
"merger.mlp.{bid}",
|
||||
"vit.perceive.proj.{bid}", # HunyuanOCR (proj.0 = conv1, proj.2 = conv2)
|
||||
),
|
||||
|
||||
MODEL_TENSOR.V_MMPROJ_FC: (
|
||||
|
|
@ -1366,6 +1367,7 @@ class TensorNameMap:
|
|||
"model.vision.linear_proj.linear_proj", # cogvlm
|
||||
"model.projector.layers", # Deepseek-OCR
|
||||
"visual.merger.proj", # glm4v
|
||||
"vit.perceive.mlp", # HunyuanOCR
|
||||
),
|
||||
|
||||
MODEL_TENSOR.V_MMPROJ_MLP: (
|
||||
|
|
@ -1393,6 +1395,7 @@ class TensorNameMap:
|
|||
"model.vision_tower.embeddings.patch_embeddings.projection", # Intern-S1
|
||||
"vpm.embeddings.patch_embedding",
|
||||
"model.vision_model.embeddings.patch_embedding", # SmolVLM
|
||||
"vit.embeddings.patch_embedding", # HunyuanOCR
|
||||
"vision_tower.patch_conv", # pixtral-hf
|
||||
"vision_encoder.patch_conv", # pixtral
|
||||
"vision_model.patch_embedding.linear", # llama 4
|
||||
|
|
@ -1414,6 +1417,7 @@ class TensorNameMap:
|
|||
"model.vision_tower.embeddings.position_embeddings", # Intern-S1
|
||||
"vpm.embeddings.position_embedding",
|
||||
"model.vision_model.embeddings.position_embedding", # SmolVLM
|
||||
"vit.embeddings.position_embedding", # HunyuanOCR
|
||||
"vision_model.positional_embedding_vlm", # llama 4
|
||||
"vision_tower.patch_embed.pos_emb", # kimi-vl
|
||||
"visual.pos_embed", # qwen3vl
|
||||
|
|
@ -1425,10 +1429,12 @@ class TensorNameMap:
|
|||
|
||||
MODEL_TENSOR.V_ENC_EMBD_IMGNL: (
|
||||
"model.image_newline", # Deepseek-OCR
|
||||
"vit.perceive.image_newline", # HunyuanOCR
|
||||
),
|
||||
|
||||
MODEL_TENSOR.V_ENC_EMBD_VSEP: (
|
||||
"model.view_seperator", # Deepseek-OCR
|
||||
"vit.perceive.image_sep", # HunyuanOCR
|
||||
),
|
||||
|
||||
MODEL_TENSOR.V_ENC_ATTN_QKV: (
|
||||
|
|
@ -1444,6 +1450,7 @@ class TensorNameMap:
|
|||
"model.vision_tower.encoder.layer.{bid}.attention.q_proj", # Intern-S1
|
||||
"vpm.encoder.layers.{bid}.self_attn.q_proj",
|
||||
"model.vision_model.encoder.layers.{bid}.self_attn.q_proj", # SmolVLM
|
||||
"vit.layers.{bid}.self_attn.q_proj", # HunyuanOCR
|
||||
"vision_model.model.layers.{bid}.self_attn.q_proj", # llama4
|
||||
"vision_tower.transformer.layers.{bid}.attention.q_proj", # pixtral-hf
|
||||
"vision_encoder.transformer.layers.{bid}.attention.wq", # pixtral
|
||||
|
|
@ -1466,6 +1473,7 @@ class TensorNameMap:
|
|||
"model.vision_tower.encoder.layer.{bid}.attention.k_proj", # Intern-S1
|
||||
"vpm.encoder.layers.{bid}.self_attn.k_proj",
|
||||
"model.vision_model.encoder.layers.{bid}.self_attn.k_proj", # SmolVLM
|
||||
"vit.layers.{bid}.self_attn.k_proj", # HunyuanOCR
|
||||
"vision_model.model.layers.{bid}.self_attn.k_proj", # llama4
|
||||
"vision_tower.transformer.layers.{bid}.attention.k_proj", # pixtral-hf
|
||||
"vision_encoder.transformer.layers.{bid}.attention.wk", # pixtral
|
||||
|
|
@ -1488,6 +1496,7 @@ class TensorNameMap:
|
|||
"model.vision_tower.encoder.layer.{bid}.attention.v_proj", # Intern-S1
|
||||
"vpm.encoder.layers.{bid}.self_attn.v_proj",
|
||||
"model.vision_model.encoder.layers.{bid}.self_attn.v_proj", # SmolVLM
|
||||
"vit.layers.{bid}.self_attn.v_proj", # HunyuanOCR
|
||||
"vision_model.model.layers.{bid}.self_attn.v_proj", # llama4
|
||||
"vision_tower.transformer.layers.{bid}.attention.v_proj", # pixtral-hf
|
||||
"vision_encoder.transformer.layers.{bid}.attention.wv", # pixtral
|
||||
|
|
@ -1504,6 +1513,7 @@ class TensorNameMap:
|
|||
"model.vision_tower.encoder.layer.{bid}.layernorm_before", # Intern-S1
|
||||
"vpm.encoder.layers.{bid}.layer_norm1",
|
||||
"model.vision_model.encoder.layers.{bid}.layer_norm1", # SmolVLM
|
||||
"vit.layers.{bid}.input_layernorm", # HunyuanOCR
|
||||
"vision_tower.transformer.layers.{bid}.attention_norm", # pixtral-hf
|
||||
"vision_encoder.transformer.layers.{bid}.attention_norm", # pixtral
|
||||
"vision_model.model.layers.{bid}.input_layernorm", # llama4, gemma4
|
||||
|
|
@ -1521,6 +1531,7 @@ class TensorNameMap:
|
|||
"model.vision_tower.encoder.layer.{bid}.attention.projection_layer", # Intern-S1
|
||||
"vpm.encoder.layers.{bid}.self_attn.out_proj",
|
||||
"model.vision_model.encoder.layers.{bid}.self_attn.out_proj", # SmolVLM
|
||||
"vit.layers.{bid}.self_attn.o_proj", # HunyuanOCR
|
||||
"model.vision_model.encoder.layers.{bid}.self_attn.projection_layer", # Janus Pro
|
||||
"vision_model.model.layers.{bid}.self_attn.o_proj", # llama4
|
||||
"vision_tower.transformer.layers.{bid}.attention.o_proj", # pixtral-hf
|
||||
|
|
@ -1540,6 +1551,7 @@ class TensorNameMap:
|
|||
"model.vision_tower.encoder.layer.{bid}.layernorm_after", # Intern-S1
|
||||
"vpm.encoder.layers.{bid}.layer_norm2",
|
||||
"model.vision_model.encoder.layers.{bid}.layer_norm2", # SmolVLM
|
||||
"vit.layers.{bid}.post_attention_layernorm", # HunyuanOCR
|
||||
"vision_model.model.layers.{bid}.post_attention_layernorm", # llama4
|
||||
"vision_tower.transformer.layers.{bid}.ffn_norm", # pixtral-hf
|
||||
"vision_encoder.transformer.layers.{bid}.ffn_norm", # pixtral
|
||||
|
|
@ -1557,6 +1569,7 @@ class TensorNameMap:
|
|||
"model.vision_tower.encoder.layer.{bid}.mlp.fc1", # Intern-S1
|
||||
"vpm.encoder.layers.{bid}.mlp.fc1",
|
||||
"model.vision_model.encoder.layers.{bid}.mlp.fc1", # SmolVLM, gemma3
|
||||
"vit.layers.{bid}.mlp.dense_h_to_4h", # HunyuanOCR
|
||||
"vision_tower.transformer.layers.{bid}.feed_forward.up_proj", # pixtral-hf
|
||||
"vision_encoder.transformer.layers.{bid}.feed_forward.w3", # pixtral
|
||||
"vision_model.model.layers.{bid}.mlp.fc1", # llama4
|
||||
|
|
@ -1583,6 +1596,7 @@ class TensorNameMap:
|
|||
"model.vision_tower.encoder.layer.{bid}.mlp.fc2", # Intern-S1
|
||||
"vpm.encoder.layers.{bid}.mlp.fc2",
|
||||
"model.vision_model.encoder.layers.{bid}.mlp.fc2", # SmolVLM, gemma3
|
||||
"vit.layers.{bid}.mlp.dense_4h_to_h", # HunyuanOCR
|
||||
"vision_tower.transformer.layers.{bid}.feed_forward.down_proj", # pixtral-hf
|
||||
"vision_encoder.transformer.layers.{bid}.feed_forward.w2", # pixtral
|
||||
"vision_model.model.layers.{bid}.mlp.fc2", # llama4
|
||||
|
|
@ -1639,6 +1653,7 @@ class TensorNameMap:
|
|||
|
||||
MODEL_TENSOR.V_MM_POST_NORM: (
|
||||
"visual.merger.post_projection_norm", # glm4v
|
||||
"vit.perceive.after_rms", # HunyuanOCR
|
||||
),
|
||||
|
||||
MODEL_TENSOR.V_MM_INP_PROJ: (
|
||||
|
|
@ -1806,6 +1821,18 @@ class TensorNameMap:
|
|||
"model.vision.eoi", # cogvlm
|
||||
),
|
||||
|
||||
MODEL_TENSOR.V_MM_PRE_NORM: (
|
||||
"vit.perceive.before_rms", # HunyuanOCR
|
||||
),
|
||||
|
||||
MODEL_TENSOR.V_TOK_IMG_BEGIN: (
|
||||
"vit.perceive.image_begin", # HunyuanOCR
|
||||
),
|
||||
|
||||
MODEL_TENSOR.V_TOK_IMG_END: (
|
||||
"vit.perceive.image_end", # HunyuanOCR
|
||||
),
|
||||
|
||||
MODEL_TENSOR.V_STD_BIAS: (
|
||||
"model.vision_tower.std_bias", # gemma4
|
||||
),
|
||||
|
|
|
|||
|
|
@ -73,6 +73,7 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
|
|||
{ "hunyuan-moe", LLM_CHAT_TEMPLATE_HUNYUAN_MOE },
|
||||
{ "gpt-oss", LLM_CHAT_TEMPLATE_OPENAI_MOE },
|
||||
{ "hunyuan-dense", LLM_CHAT_TEMPLATE_HUNYUAN_DENSE },
|
||||
{ "hunyuan-ocr", LLM_CHAT_TEMPLATE_HUNYUAN_OCR },
|
||||
{ "kimi-k2", LLM_CHAT_TEMPLATE_KIMI_K2 },
|
||||
{ "seed_oss", LLM_CHAT_TEMPLATE_SEED_OSS },
|
||||
{ "grok-2", LLM_CHAT_TEMPLATE_GROK_2 },
|
||||
|
|
@ -216,6 +217,8 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
|
|||
return LLM_CHAT_TEMPLATE_HUNYUAN_MOE;
|
||||
} else if (tmpl_contains("<|start|>") && tmpl_contains("<|channel|>")) {
|
||||
return LLM_CHAT_TEMPLATE_OPENAI_MOE;
|
||||
} else if (tmpl_contains("<|hy_Assistant|>") && tmpl_contains("<|hy_begin▁of▁sentence|>")) {
|
||||
return LLM_CHAT_TEMPLATE_HUNYUAN_OCR;
|
||||
} else if (tmpl_contains("<|hy_Assistant|>") && tmpl_contains("<|hy_place▁holder▁no▁3|>")) {
|
||||
return LLM_CHAT_TEMPLATE_HUNYUAN_DENSE;
|
||||
} else if (tmpl_contains("<|im_assistant|>assistant<|im_middle|>")) {
|
||||
|
|
@ -822,6 +825,22 @@ int32_t llm_chat_apply_template(
|
|||
ss << "<|hy_User|>" << chat[i]->content << "<|hy_Assistant|>";
|
||||
}
|
||||
}
|
||||
} else if (tmpl == LLM_CHAT_TEMPLATE_HUNYUAN_OCR) {
|
||||
// tencent/HunyuanOCR
|
||||
ss << "<|hy_begin▁of▁sentence|>";
|
||||
for (size_t i = 0; i < chat.size(); i++) {
|
||||
std::string role(chat[i]->role);
|
||||
if (i == 0 && role == "system") {
|
||||
ss << chat[i]->content << "<|hy_place▁holder▁no▁3|>";
|
||||
continue;
|
||||
}
|
||||
|
||||
if (role == "user") {
|
||||
ss << chat[i]->content << "<|hy_User|>";
|
||||
} else if (role == "assistant") {
|
||||
ss << chat[i]->content << "<|hy_Assistant|>";
|
||||
}
|
||||
}
|
||||
} else if (tmpl == LLM_CHAT_TEMPLATE_KIMI_K2) {
|
||||
// moonshotai/Kimi-K2-Instruct
|
||||
for (auto message : chat) {
|
||||
|
|
|
|||
|
|
@ -53,6 +53,7 @@ enum llm_chat_template {
|
|||
LLM_CHAT_TEMPLATE_HUNYUAN_MOE,
|
||||
LLM_CHAT_TEMPLATE_OPENAI_MOE,
|
||||
LLM_CHAT_TEMPLATE_HUNYUAN_DENSE,
|
||||
LLM_CHAT_TEMPLATE_HUNYUAN_OCR,
|
||||
LLM_CHAT_TEMPLATE_KIMI_K2,
|
||||
LLM_CHAT_TEMPLATE_SEED_OSS,
|
||||
LLM_CHAT_TEMPLATE_GROK_2,
|
||||
|
|
|
|||
|
|
@ -19,6 +19,7 @@ add_library(mtmd
|
|||
models/conformer.cpp
|
||||
models/gemma4v.cpp
|
||||
models/glm4v.cpp
|
||||
models/hunyuanocr.cpp
|
||||
models/internvl.cpp
|
||||
models/kimivl.cpp
|
||||
models/kimik25.cpp
|
||||
|
|
|
|||
|
|
@ -148,6 +148,11 @@
|
|||
#define TN_TOK_BOI "v.boi"
|
||||
#define TN_TOK_EOI "v.eoi"
|
||||
|
||||
// hunyuanocr
|
||||
#define TN_MM_PRE_NORM "mm.pre_norm.%s"
|
||||
#define TN_TOK_IMG_BEGIN "mm.image_begin"
|
||||
#define TN_TOK_IMG_END "mm.image_end"
|
||||
|
||||
// deepseek-ocr
|
||||
#define TN_SAM_POS_EMBD "v.sam.pos_embd.%s"
|
||||
#define TN_SAM_PATCH_EMBD "v.sam.patch_embd.%s"
|
||||
|
|
@ -266,6 +271,7 @@ enum projector_type {
|
|||
PROJECTOR_TYPE_YOUTUVL,
|
||||
PROJECTOR_TYPE_KIMIK25,
|
||||
PROJECTOR_TYPE_NEMOTRON_V2_VL,
|
||||
PROJECTOR_TYPE_HUNYUANOCR,
|
||||
PROJECTOR_TYPE_UNKNOWN,
|
||||
};
|
||||
|
||||
|
|
@ -306,6 +312,7 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
|
|||
{ PROJECTOR_TYPE_YOUTUVL, "youtuvl"},
|
||||
{ PROJECTOR_TYPE_KIMIK25, "kimik25"},
|
||||
{ PROJECTOR_TYPE_NEMOTRON_V2_VL, "nemotron_v2_vl"},
|
||||
{ PROJECTOR_TYPE_HUNYUANOCR, "hunyuanocr"},
|
||||
};
|
||||
|
||||
static projector_type clip_projector_type_from_string(const std::string & str) {
|
||||
|
|
|
|||
|
|
@ -358,7 +358,8 @@ struct clip_model {
|
|||
// MINICPMV projection
|
||||
ggml_tensor * mm_model_pos_embed_k = nullptr;
|
||||
ggml_tensor * mm_model_query = nullptr;
|
||||
ggml_tensor * mm_model_proj = nullptr;
|
||||
ggml_tensor * mm_model_proj = nullptr;
|
||||
ggml_tensor * mm_model_proj_b = nullptr;
|
||||
ggml_tensor * mm_model_kv_proj = nullptr;
|
||||
ggml_tensor * mm_model_attn_q_w = nullptr;
|
||||
ggml_tensor * mm_model_attn_q_b = nullptr;
|
||||
|
|
@ -419,6 +420,11 @@ struct clip_model {
|
|||
ggml_tensor * mm_boi = nullptr;
|
||||
ggml_tensor * mm_eoi = nullptr;
|
||||
|
||||
// hunyuanocr perceiver
|
||||
ggml_tensor * mm_pre_norm_w = nullptr;
|
||||
ggml_tensor * mm_img_begin = nullptr;
|
||||
ggml_tensor * mm_img_end = nullptr;
|
||||
|
||||
// deepseek ocr sam
|
||||
ggml_tensor * patch_embed_proj_w = nullptr;
|
||||
ggml_tensor * patch_embed_proj_b = nullptr;
|
||||
|
|
|
|||
|
|
@ -902,6 +902,10 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
|
|||
{
|
||||
builder = std::make_unique<clip_graph_cogvlm>(ctx, img);
|
||||
} break;
|
||||
case PROJECTOR_TYPE_HUNYUANOCR:
|
||||
{
|
||||
builder = std::make_unique<clip_graph_hunyuanocr>(ctx, img);
|
||||
} break;
|
||||
case PROJECTOR_TYPE_MLP:
|
||||
case PROJECTOR_TYPE_MLP_NORM:
|
||||
case PROJECTOR_TYPE_LDP:
|
||||
|
|
@ -1408,6 +1412,14 @@ struct clip_model_loader {
|
|||
get_u32(KEY_SAM_N_EMBD, hparams.sam_n_embd, true);
|
||||
get_u32(KEY_ATTN_WINDOW_SIZE, hparams.attn_window_size, true);
|
||||
} break;
|
||||
case PROJECTOR_TYPE_HUNYUANOCR:
|
||||
{
|
||||
hparams.n_merge = 2;
|
||||
get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.n_merge, false);
|
||||
get_u32(KEY_IMAGE_MIN_PIXELS, hparams.image_min_pixels);
|
||||
get_u32(KEY_IMAGE_MAX_PIXELS, hparams.image_max_pixels);
|
||||
hparams.set_warmup_n_tokens(28*28);
|
||||
} break;
|
||||
case PROJECTOR_TYPE_LFM2A:
|
||||
{
|
||||
// audio preprocessing params
|
||||
|
|
@ -2035,6 +2047,22 @@ struct clip_model_loader {
|
|||
model.mm_boi = get_tensor(TN_TOK_BOI);
|
||||
model.mm_eoi = get_tensor(TN_TOK_EOI);
|
||||
} break;
|
||||
case PROJECTOR_TYPE_HUNYUANOCR:
|
||||
{
|
||||
// proj.0 -> mm.0 (conv1), proj.2 -> mm.2 (conv2), mlp -> mm.model.fc (linear)
|
||||
model.mm_0_w = get_tensor(string_format(TN_LLAVA_PROJ, 0, "weight"));
|
||||
model.mm_0_b = get_tensor(string_format(TN_LLAVA_PROJ, 0, "bias"));
|
||||
model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight"));
|
||||
model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"));
|
||||
model.mm_model_proj = get_tensor(string_format(TN_MM_PROJECTOR, "weight"));
|
||||
model.mm_model_proj_b = get_tensor(string_format(TN_MM_PROJECTOR, "bias"));
|
||||
model.mm_pre_norm_w = get_tensor(string_format(TN_MM_PRE_NORM, "weight"));
|
||||
model.mm_post_norm_w = get_tensor(string_format(TN_MM_POST_NORM, "weight"));
|
||||
model.mm_img_begin = get_tensor(TN_TOK_IMG_BEGIN);
|
||||
model.mm_img_end = get_tensor(TN_TOK_IMG_END);
|
||||
model.image_newline = get_tensor(TN_IMAGE_NEWLINE);
|
||||
model.view_seperator = get_tensor(TN_IMAGE_SEPERATOR, false);
|
||||
} break;
|
||||
case PROJECTOR_TYPE_JANUS_PRO:
|
||||
{
|
||||
model.mm_0_w = get_tensor(string_format(TN_LLAVA_PROJ, 0, "weight"));
|
||||
|
|
@ -2584,6 +2612,7 @@ int clip_n_output_tokens_x(const struct clip_ctx * ctx, struct clip_image_f32 *
|
|||
case PROJECTOR_TYPE_QWEN3VL:
|
||||
case PROJECTOR_TYPE_GLM4V:
|
||||
case PROJECTOR_TYPE_PADDLEOCR:
|
||||
case PROJECTOR_TYPE_HUNYUANOCR:
|
||||
case PROJECTOR_TYPE_YOUTUVL:
|
||||
return (img->nx / params.patch_size) / 2;
|
||||
default:
|
||||
|
|
@ -2768,6 +2797,13 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
|
|||
int h = static_cast<int>(std::sqrt(static_cast<float>(n_patches)));
|
||||
n_patches = h * (h + 1) + 1;
|
||||
} break;
|
||||
case PROJECTOR_TYPE_HUNYUANOCR:
|
||||
{
|
||||
int merge = ctx->model.hparams.n_merge;
|
||||
int ow = (img->nx / patch_size) / merge;
|
||||
int oh = (img->ny / patch_size) / merge;
|
||||
n_patches = (ow + 1) * oh + 2;
|
||||
} break;
|
||||
case PROJECTOR_TYPE_LFM2A:
|
||||
{
|
||||
n_patches = ((((img->nx + 1) / 2) + 1) / 2 + 1) / 2;
|
||||
|
|
@ -3175,6 +3211,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
|
|||
case PROJECTOR_TYPE_JANUS_PRO:
|
||||
case PROJECTOR_TYPE_PHI4:
|
||||
case PROJECTOR_TYPE_COGVLM:
|
||||
case PROJECTOR_TYPE_HUNYUANOCR:
|
||||
{
|
||||
// do nothing
|
||||
} break;
|
||||
|
|
@ -3346,6 +3383,8 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
|
|||
case PROJECTOR_TYPE_PADDLEOCR:
|
||||
case PROJECTOR_TYPE_KIMIK25:
|
||||
return ctx->model.mm_2_w->ne[1];
|
||||
case PROJECTOR_TYPE_HUNYUANOCR:
|
||||
return ctx->model.mm_model_proj->ne[1];
|
||||
case PROJECTOR_TYPE_COGVLM:
|
||||
return ctx->model.mm_4h_to_h_w->ne[1];
|
||||
case PROJECTOR_TYPE_DEEPSEEKOCR:
|
||||
|
|
|
|||
|
|
@ -0,0 +1,59 @@
|
|||
#include "models.h"
|
||||
|
||||
ggml_cgraph * clip_graph_hunyuanocr::build() {
|
||||
const int merge = hparams.n_merge;
|
||||
const int pw = n_patches_x;
|
||||
const int ph = n_patches_y;
|
||||
|
||||
ggml_tensor * pos_embd = resize_position_embeddings(GGML_SCALE_MODE_BILINEAR);
|
||||
|
||||
ggml_tensor * inp = build_inp();
|
||||
ggml_tensor * cur = build_vit(inp, n_patches, NORM_TYPE_NORMAL, hparams.ffn_op, pos_embd, nullptr);
|
||||
|
||||
// perceiver projector
|
||||
cur = build_norm(cur, model.mm_pre_norm_w, nullptr, NORM_TYPE_RMS, eps, -1);
|
||||
|
||||
// [C, W*H] -> [W, H, C] for conv2d
|
||||
cur = ggml_reshape_3d(ctx0, cur, n_embd, pw, ph);
|
||||
cur = ggml_permute(ctx0, cur, 2, 0, 1, 3);
|
||||
cur = ggml_cont(ctx0, cur);
|
||||
|
||||
// Conv2d(1152->2304, k=2, s=2) + GELU + Conv2d(2304->4608, k=1, s=1)
|
||||
cur = ggml_conv_2d(ctx0, model.mm_0_w, cur, merge, merge, 0, 0, 1, 1);
|
||||
if (model.mm_0_b) {
|
||||
cur = ggml_add(ctx0, cur, ggml_reshape_3d(ctx0, model.mm_0_b, 1, 1, model.mm_0_b->ne[0]));
|
||||
}
|
||||
cur = ggml_gelu(ctx0, cur);
|
||||
cur = ggml_conv_2d(ctx0, model.mm_1_w, cur, 1, 1, 0, 0, 1, 1);
|
||||
if (model.mm_1_b) {
|
||||
cur = ggml_add(ctx0, cur, ggml_reshape_3d(ctx0, model.mm_1_b, 1, 1, model.mm_1_b->ne[0]));
|
||||
}
|
||||
|
||||
const int ow = pw / merge;
|
||||
const int oh = ph / merge;
|
||||
const int idim = (int)cur->ne[2]; // OC = 4608
|
||||
|
||||
// append newline along W (dim 0)
|
||||
ggml_tensor * nl = ggml_reshape_4d(ctx0, model.image_newline, 1, 1, idim, 1);
|
||||
nl = ggml_repeat_4d(ctx0, nl, 1, oh, idim, 1);
|
||||
cur = ggml_concat(ctx0, cur, nl, 0);
|
||||
|
||||
// [OW+1, OH, OC] -> [OC, (OW+1)*OH]
|
||||
cur = ggml_permute(ctx0, cur, 1, 2, 0, 3);
|
||||
cur = ggml_cont_2d(ctx0, cur, idim, (ow + 1) * oh);
|
||||
|
||||
// project to LLM hidden size
|
||||
cur = build_mm(model.mm_model_proj, cur);
|
||||
if (model.mm_model_proj_b) {
|
||||
cur = ggml_add(ctx0, cur, model.mm_model_proj_b);
|
||||
}
|
||||
|
||||
// wrap with begin/end tokens
|
||||
cur = ggml_concat(ctx0, ggml_reshape_2d(ctx0, model.mm_img_begin, model.mm_img_begin->ne[0], 1), cur, 1);
|
||||
cur = ggml_concat(ctx0, cur, ggml_reshape_2d(ctx0, model.mm_img_end, model.mm_img_end->ne[0], 1), 1);
|
||||
|
||||
cur = build_norm(cur, model.mm_post_norm_w, nullptr, NORM_TYPE_RMS, eps, -1);
|
||||
|
||||
ggml_build_forward_expand(gf, cur);
|
||||
return gf;
|
||||
}
|
||||
|
|
@ -98,6 +98,11 @@ struct clip_graph_glm4v : clip_graph {
|
|||
ggml_cgraph * build() override;
|
||||
};
|
||||
|
||||
struct clip_graph_hunyuanocr : clip_graph {
|
||||
clip_graph_hunyuanocr(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
|
||||
ggml_cgraph * build() override;
|
||||
};
|
||||
|
||||
struct clip_graph_mobilenetv5 : clip_graph {
|
||||
clip_graph_mobilenetv5(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
|
||||
ggml_cgraph * build() override;
|
||||
|
|
|
|||
|
|
@ -406,6 +406,13 @@ struct mtmd_context {
|
|||
img_end = "\n"; // prevent empty batch on llama-server
|
||||
image_preproc = std::make_unique<mtmd_image_preprocessor_deepseekocr>(ctx_v);
|
||||
} break;
|
||||
case PROJECTOR_TYPE_HUNYUANOCR:
|
||||
{
|
||||
// note: these use fullwidth | (U+FF5C) and ▁ (U+2581) to match the tokenizer vocabulary
|
||||
img_beg = "<|hy_place▁holder▁no▁100|>";
|
||||
img_end = "<|hy_place▁holder▁no▁101|>";
|
||||
image_preproc = std::make_unique<mtmd_image_preprocessor_dyn_size>(ctx_v);
|
||||
} break;
|
||||
default:
|
||||
throw std::runtime_error(string_format("%s: unexpected vision projector type %d\n", __func__, proj));
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue