From fdb17643d379cd35bf6acf0f57cfaa500f88a145 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?DAN=E2=84=A2?=
Date: Wed, 11 Mar 2026 19:25:54 -0400
Subject: [PATCH] model : add support for Phi4ForCausalLMV (#20168)

* Add support for Phi4ForCausalLMV.

* Fix Phi-4 vision parity (correcting SigLIP2 patch-kernel export layout) and matching HF NaFlex resize behavior in mtmd.

* Rename constants + fix tokenizer label

* Clean-ups.

* Fix GGUF export.

* Set tokenizer.ggml.pre explicitly.

* Default vocab name rather than forcing it.

* Clean-ups.

* Fix indent.

* Fix subscriptable error.

* Remove overcomplicated code path

* Clean-ups.

---------

Co-authored-by: Xuan Son Nguyen
---
 convert_hf_to_gguf.py        | 125 ++++++++++++++++++++++++++++++++++-
 gguf-py/gguf/constants.py    |   1 +
 tools/mtmd/clip-impl.h       |   2 +
 tools/mtmd/clip.cpp          |  19 ++++
 tools/mtmd/models/siglip.cpp |  10 ++-
 tools/mtmd/mtmd.cpp          |   3 +
 6 files changed, 158 insertions(+), 2 deletions(-)

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index 5657ae8744..6e3d3ad1dc 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -5062,7 +5062,7 @@ class Phi2Model(TextModel):
         self.gguf_writer.add_add_bos_token(False)
 
 
-@ModelBase.register("Phi3ForCausalLM")
+@ModelBase.register("Phi3ForCausalLM", "Phi4ForCausalLMV")
 class Phi3MiniModel(TextModel):
     model_arch = gguf.MODEL_ARCH.PHI3
 
@@ -5237,6 +5237,129 @@ class Phi3MiniModel(TextModel):
         yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_LONG), torch.tensor(long_factors, dtype=torch.float32))
         yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT), torch.tensor(short_factors, dtype=torch.float32))
 
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        if name.startswith(("model.vision_tower.", "vision_tower.", "model.mm_projector.", "mm_projector.")):
+            return
+
+        yield from super().modify_tensors(data_torch, name, bid)
+
+
+@ModelBase.register("Phi4ForCausalLMV")
+class Phi4VisionMmprojModel(MmprojModel):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        assert self.hparams_vision is not None
+
+        self.vision_total_layers = int(self.find_vparam(self.n_block_keys))
+        if self.vision_total_layers < 2:
+            raise ValueError(
+                f"Phi-4 vision mmproj conversion requires at least 2 vision layers, got {self.vision_total_layers}"
+            )
+
+        # Phi-4 uses SigLIP2 hidden_states[-2], so export one fewer encoder block and
+        # drop post-layernorm/head weights. This makes the GGUF runtime output match
+        # the feature map consumed by the patched siglip.cpp Phi-4 projector path.
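+        # (Illustrative example: a checkpoint that reports 27 vision layers is
+        # exported with 26 encoder blocks, and the tensors of the dropped block,
+        # index 26, are skipped in modify_tensors below.)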
+ self.vision_export_layers = self.vision_total_layers - 1 + self.vision_last_layer_idx = self.vision_total_layers - 1 + + for key in self.n_block_keys: + if key in self.hparams_vision: + self.hparams_vision[key] = self.vision_export_layers + break + + self.block_count = self.vision_export_layers + self.tensor_map = gguf.get_tensor_name_map(gguf.MODEL_ARCH.MMPROJ, self.block_count) + + patch_size = self.preprocessor_config.get("patch_size") + if patch_size is None: + raise KeyError("Phi-4 vision mmproj conversion requires patch_size in preprocessor_config.json") + + self.hparams_vision["patch_size"] = patch_size + + pos_emb_name = next( + ( + name for name in self.model_tensors + if name.endswith("vision_model.embeddings.position_embedding.weight") + ), + None, + ) + if pos_emb_name is None: + raise KeyError("Phi-4 vision mmproj conversion could not find position_embedding.weight") + + pos_emb_shape = self.model_tensors[pos_emb_name]().shape + base_grid_tokens = int(pos_emb_shape[0]) + grid_side = math.isqrt(base_grid_tokens) + if grid_side * grid_side != base_grid_tokens: + raise ValueError(f"Unexpected Phi-4 position embedding shape: {tuple(pos_emb_shape)}") + + self.hparams_vision["image_size"] = grid_side * patch_size + + min_num_patches = self.preprocessor_config.get("min_num_patches", self.global_config.get("min_num_patches")) + max_num_patches = self.preprocessor_config.get("max_num_patches", self.global_config.get("max_num_patches")) + if min_num_patches is None or max_num_patches is None: + raise KeyError("Phi-4 vision mmproj conversion requires min_num_patches and max_num_patches") + + self.min_pixels = int(min_num_patches) * patch_size * patch_size + self.max_pixels = int(max_num_patches) * patch_size * patch_size + + def set_gguf_parameters(self): + super().set_gguf_parameters() + assert self.hparams_vision is not None + + self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.PHI4) + self.gguf_writer.add_vision_min_pixels(self.min_pixels) + self.gguf_writer.add_vision_max_pixels(self.max_pixels) + self.gguf_writer.add_vision_use_gelu(True) + self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams_vision.get("layer_norm_eps", 1e-6)) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + if name.startswith(("model.vision_tower.vision_tower.", "vision_tower.")): + if ".vision_model.head." in name: + return + + new_name = name.replace("model.vision_tower.vision_tower.", "vision_tower.") + + if ".vision_model.post_layernorm." 
in new_name: + return + + if bid is not None and bid == self.vision_last_layer_idx: + return + + if new_name.endswith("vision_model.embeddings.patch_embedding.weight"): + assert self.hparams_vision is not None + if data_torch.ndim != 2: + raise ValueError(f"Unexpected Phi-4 patch embedding shape: {tuple(data_torch.shape)}") + + patch_area = self.hparams_vision["patch_size"] ** 2 + in_features = data_torch.shape[1] + if in_features % patch_area != 0: + raise ValueError( + f"Phi-4 patch embedding input dim {in_features} is not divisible by patch area {patch_area}" + ) + + num_channels = in_features // patch_area + patch_size = self.hparams_vision["patch_size"] + data_torch = data_torch.view(data_torch.shape[0], patch_size, patch_size, num_channels) + data_torch = data_torch.permute(0, 3, 1, 2) + + yield from super().modify_tensors(data_torch, new_name, bid) + return + + if name.startswith(("model.mm_projector.", "mm_projector.")): + local_name = name + local_name = local_name.replace("model.mm_projector.", "") + local_name = local_name.replace("mm_projector.", "") + + if not (local_name.startswith("0.") or local_name.startswith("2.")): + return + + suffix = ".bias" if local_name.endswith(".bias") else ".weight" + mm_idx = int(local_name.split(".", maxsplit=1)[0]) + yield (self.format_tensor_name(gguf.MODEL_TENSOR.V_MMPROJ, mm_idx, suffix=suffix), data_torch) + return + + return + @ModelBase.register("PhiMoEForCausalLM") class PhiMoeModel(Phi3MiniModel): diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 6376ad0600..bf617382d0 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -3881,6 +3881,7 @@ class VisionProjectorType: GEMMA3 = "gemma3" GEMMA3NV = "gemma3nv" GEMMA3NA = "gemma3na" + PHI4 = "phi4" IDEFICS3 = "idefics3" PIXTRAL = "pixtral" LLAMA4 = "llama4" diff --git a/tools/mtmd/clip-impl.h b/tools/mtmd/clip-impl.h index 0c3cf8670a..06e1ffb7ca 100644 --- a/tools/mtmd/clip-impl.h +++ b/tools/mtmd/clip-impl.h @@ -216,6 +216,7 @@ enum projector_type { PROJECTOR_TYPE_GEMMA3, PROJECTOR_TYPE_GEMMA3NV, PROJECTOR_TYPE_GEMMA3NA, + PROJECTOR_TYPE_PHI4, PROJECTOR_TYPE_IDEFICS3, PROJECTOR_TYPE_PIXTRAL, PROJECTOR_TYPE_QWEN25VL, @@ -253,6 +254,7 @@ static std::map PROJECTOR_TYPE_NAMES = { { PROJECTOR_TYPE_GEMMA3, "gemma3"}, { PROJECTOR_TYPE_GEMMA3NV, "gemma3nv"}, { PROJECTOR_TYPE_GEMMA3NA, "gemma3na"}, + { PROJECTOR_TYPE_PHI4, "phi4"}, { PROJECTOR_TYPE_IDEFICS3, "idefics3"}, { PROJECTOR_TYPE_PIXTRAL, "pixtral"}, { PROJECTOR_TYPE_ULTRAVOX, "ultravox"}, diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index b70bad33b6..b6b31ae866 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -792,6 +792,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 case PROJECTOR_TYPE_IDEFICS3: case PROJECTOR_TYPE_LFM2: case PROJECTOR_TYPE_JANUS_PRO: + case PROJECTOR_TYPE_PHI4: { builder = std::make_unique(ctx, img); } break; @@ -1144,6 +1145,13 @@ struct clip_model_loader { // ref: https://huggingface.co/LiquidAI/LFM2.5-VL-1.6B/blob/main/processor_config.json hparams.set_limit_image_tokens(64, 256); } break; + case PROJECTOR_TYPE_PHI4: + { + hparams.n_merge = 1; + get_u32(KEY_IMAGE_MIN_PIXELS, hparams.image_min_pixels); + get_u32(KEY_IMAGE_MAX_PIXELS, hparams.image_max_pixels); + hparams.set_warmup_n_tokens(16*16); + } break; case PROJECTOR_TYPE_PIXTRAL: case PROJECTOR_TYPE_LIGHTONOCR: { @@ -1841,6 +1849,13 @@ struct clip_model_loader { model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 1, "weight")); model.mm_1_b = 
get_tensor(string_format(TN_LLAVA_PROJ, 1, "bias")); } break; + case PROJECTOR_TYPE_PHI4: + { + model.mm_0_w = get_tensor(string_format(TN_LLAVA_PROJ, 0, "weight")); + model.mm_0_b = get_tensor(string_format(TN_LLAVA_PROJ, 0, "bias")); + model.mm_2_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight")); + model.mm_2_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias")); + } break; case PROJECTOR_TYPE_LFM2A: { for (int i : {0, 2, 3, 5, 6}) { @@ -3157,6 +3172,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str res_imgs->entries.push_back(std::move(img_f32)); } break; + case PROJECTOR_TYPE_PHI4: case PROJECTOR_TYPE_PIXTRAL: case PROJECTOR_TYPE_LIGHTONOCR: { @@ -3383,6 +3399,7 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im case PROJECTOR_TYPE_MLP: case PROJECTOR_TYPE_MLP_NORM: case PROJECTOR_TYPE_JANUS_PRO: + case PROJECTOR_TYPE_PHI4: { // do nothing } break; @@ -3884,6 +3901,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima case PROJECTOR_TYPE_VOXTRAL: case PROJECTOR_TYPE_MUSIC_FLAMINGO: case PROJECTOR_TYPE_JANUS_PRO: + case PROJECTOR_TYPE_PHI4: case PROJECTOR_TYPE_COGVLM: { // do nothing @@ -4013,6 +4031,7 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) { case PROJECTOR_TYPE_LDPV2: return ctx->model.mm_model_peg_0_b->ne[0]; case PROJECTOR_TYPE_MLP: + case PROJECTOR_TYPE_PHI4: case PROJECTOR_TYPE_PIXTRAL: case PROJECTOR_TYPE_LIGHTONOCR: return ctx->model.mm_2_w->ne[1]; diff --git a/tools/mtmd/models/siglip.cpp b/tools/mtmd/models/siglip.cpp index b866a11c5a..75f9b4db44 100644 --- a/tools/mtmd/models/siglip.cpp +++ b/tools/mtmd/models/siglip.cpp @@ -4,7 +4,7 @@ ggml_cgraph * clip_graph_siglip::build() { ggml_tensor * inp = build_inp(); ggml_tensor * learned_pos_embd = model.position_embeddings; - if (proj_type == PROJECTOR_TYPE_LFM2) { + if (proj_type == PROJECTOR_TYPE_LFM2 || proj_type == PROJECTOR_TYPE_PHI4) { learned_pos_embd = resize_position_embeddings(); } @@ -75,6 +75,14 @@ ggml_cgraph * clip_graph_siglip::build() { hparams.ffn_op, -1); + } else if (proj_type == PROJECTOR_TYPE_PHI4) { + cur = build_ffn(cur, + model.mm_0_w, model.mm_0_b, + nullptr, nullptr, + model.mm_2_w, model.mm_2_b, + FFN_GELU, + -1); + } else { GGML_ABORT("SigLIP: Unsupported projector type"); } diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp index 8ca979c86c..ccafb80b2b 100644 --- a/tools/mtmd/mtmd.cpp +++ b/tools/mtmd/mtmd.cpp @@ -290,6 +290,9 @@ struct mtmd_context { img_beg = "<|vision_start|>"; img_end = "<|vision_end|>"; + } else if (proj == PROJECTOR_TYPE_PHI4) { + // Phi-4 uses media marker insertion only. Keep image boundary text empty. + } else if (proj == PROJECTOR_TYPE_LLAMA4) { // (more details in mtmd_context constructor) img_beg = "<|image_start|>";