mtmd: Add Gemma3n multimodal support with MobileNetV5 vision encoder (#18256)
* Add Gemma3nVisionModel (MobileNetV5 vision encoder converter) to convert_hf_to_gguf.py. Add gemma3n to vision projectors in gguf-py/gguf/constants.py.
* Add mobilenetv5 impl
* Fix comments, remove unused vars
* Fix permute and remove transpose of projection weights
* Fix comments, remove debugging prints from hf_to_gguf
* 1. Hard-code image_mean = 0 and image_std = 1
  2. Use available tensor mapping logic
  3. Remove redundant chat template replacement of soft tokens placeholder with media placeholder
* 1. Move mobilenetv5 helper declarations to the `clip_graph_mobilenetv5` struct and definitions to mobilenetv5.cpp
  2. Remove unused `clip_is_gemma3n` func declarations and definitions
  3. Remove redundant `rescale_image_u8_to_f32` func and use `normalize_image_u8_to_f32` with zero mean and unit std
  4. Calculate n_patches using image_size / patch_size
* Remove obsolete comments
* convert_hf_to_gguf.py & constants.py & tensor_mapping.py: use explicit mapping, with a custom map for double-indexed blocks and tensor_mapping.py for the rest
  - convert_hf_to_gguf.py: unsqueeze stem bias and layer-scale tensors to the correct shape while converting to gguf
  - mobilenetv5.cpp: remove explicit reshaping of stem bias and layer scale (now handled during conversion to gguf); replace fprintf with LOG_*
  - clip.cpp: remove unused embedding and hard_emb_norm tensor loading
* Rename tensors to v.conv..., v.blk..., v.msfa... to better align with existing terminology
* Fix stem conv bias name
* Remove explicit handling of bias term for stem conv
* Change order of addition in "project_per_layer_inputs" to support broadcasting of vision inp_per_layer
  - Simplify the vision embeddings path of "get_per_layer_inputs" to output [n_embd_altup, n_layer, 1], broadcastable
* Clean up conversion script
* Fix code style
* Also preserve audio tensors
* Trailing space
* Split arch A and V
* Remove unused gemma3 func
* Fix alignment

---------

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>
This commit is contained in:
parent
593da7fa49
commit
a61c8bc3bf
@ -528,7 +528,11 @@ class ModelBase:
|
|||
return ()
|
||||
|
||||
def prepare_tensors(self):
|
||||
max_name_len = max(len(s) for _, s in self.tensor_map.mapping.values()) + len(".weight,")
|
||||
# Handle empty tensor_map for models with block_count=0 (like MobileNetV5)
|
||||
if self.tensor_map.mapping:
|
||||
max_name_len = max(len(s) for _, s in self.tensor_map.mapping.values()) + len(".weight,")
|
||||
else:
|
||||
max_name_len = len("vision_encoder.weight,") # Default reasonable length
|
||||
|
||||
for name, data_torch in chain(self.generate_extra_tensors(), self.get_tensors()):
|
||||
# we don't need these
|
||||
|
|
@ -6038,7 +6042,175 @@ class Gemma3VisionModel(MmprojModel):
|
|||
return [] # skip other tensors
|
||||
|
||||
|
||||
class ConformerAudioModel(MmprojModel):
|
||||
_batch_norm_tensors: list[dict[str, Tensor]] | None = None
|
||||
|
||||
@staticmethod
|
||||
def is_audio_tensor(name: str):
|
||||
return any(p in name for p in ["audio", "codebook", "conformer", "depth_embedding", "depthformer", "depth_linear"])
|
||||
|
||||
def tensor_force_quant(self, name, new_name, bid, n_dims):
|
||||
if ConformerAudioModel.is_audio_tensor(name):
|
||||
if ".conv" in name or "_conv" in name and ".weight" in name:
|
||||
return gguf.GGMLQuantizationType.F32
|
||||
return super().tensor_force_quant(name, new_name, bid, n_dims)
|
||||
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||
# fold running_mean, running_var and eps into weight and bias for batch_norm
|
||||
if "batch_norm" in name:
|
||||
if self._batch_norm_tensors is None:
|
||||
self._batch_norm_tensors = [{} for _ in range(self.block_count)]
|
||||
assert bid is not None
|
||||
self._batch_norm_tensors[bid][name] = data_torch
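# PyTorch batch-norm layers expose 5 tensors each (weight, bias, running_mean, running_var, num_batches_tracked); wait until all of them for this block have been collected before folding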
|
||||
|
||||
if len(self._batch_norm_tensors[bid]) < 5:
|
||||
return []
|
||||
|
||||
weight = self._batch_norm_tensors[bid][f"conformer.layers.{bid}.conv.batch_norm.weight"]
|
||||
bias = self._batch_norm_tensors[bid][f"conformer.layers.{bid}.conv.batch_norm.bias"]
|
||||
running_mean = self._batch_norm_tensors[bid][f"conformer.layers.{bid}.conv.batch_norm.running_mean"]
|
||||
running_var = self._batch_norm_tensors[bid][f"conformer.layers.{bid}.conv.batch_norm.running_var"]
|
||||
eps = 1e-5 # default value
|
||||
|
||||
a = weight / torch.sqrt(running_var + eps)
|
||||
b = bias - running_mean * a
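# at inference time batch_norm(x) = (x - running_mean) / sqrt(running_var + eps) * weight + bias = a * x + b, so (a, b) reproduce the same affine transform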
|
||||
return [
|
||||
(self.map_tensor_name(f"conformer.layers.{bid}.conv.batch_norm.weight"), a),
|
||||
(self.map_tensor_name(f"conformer.layers.{bid}.conv.batch_norm.bias"), b),
|
||||
]
|
||||
|
||||
# reshape conv weights
|
||||
if name.startswith("conformer.pre_encode.conv.") and name.endswith(".bias"):
|
||||
data_torch = data_torch[:, None, None]
|
||||
if "conv.depthwise_conv" in name and name.endswith(".weight"):
|
||||
assert data_torch.shape[1] == 1
|
||||
data_torch = data_torch.reshape(data_torch.shape[0], data_torch.shape[2])
|
||||
if "conv.pointwise_conv" in name and name.endswith(".weight"):
|
||||
assert data_torch.shape[2] == 1
|
||||
data_torch = data_torch.reshape(data_torch.shape[0], data_torch.shape[1])
|
||||
|
||||
return [(self.map_tensor_name(name), data_torch)]
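For reference, the batch-norm folding above can be sanity-checked in isolation. The following is a minimal NumPy sketch with hypothetical shapes and values (it is not part of the conversion script), comparing the folded affine form against the full batch-norm expression:

import numpy as np

# hypothetical per-channel batch-norm parameters and running statistics
weight       = np.array([1.2, 0.8, 0.5])
bias         = np.array([0.1, -0.3, 0.0])
running_mean = np.array([0.05, -0.2, 0.4])
running_var  = np.array([0.9, 1.1, 0.7])
eps          = 1e-5

x = np.random.randn(16, 3)                # [frames, channels]

a = weight / np.sqrt(running_var + eps)   # folded scale
b = bias - running_mean * a               # folded shift

y_bn     = (x - running_mean) / np.sqrt(running_var + eps) * weight + bias
y_folded = a * x + b
assert np.allclose(y_bn, y_folded)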
@ModelBase.register("Gemma3nForConditionalGeneration")
|
||||
class Gemma3nVisionAudioModel(ConformerAudioModel):
|
||||
has_audio_encoder = True
|
||||
has_vision_encoder = True
|
||||
|
||||
# Double indexed mapping for MobileNetV5 blocks (not supported by tensor_mapping.py)
|
||||
# This is the only known model having this, so we prefer implementing it outside of tensor_mapping.py
|
||||
block_tensor_mapping = {
|
||||
"model.vision_tower.timm_model.blocks.{bid}.{sid}.conv_exp.weight": "v.blk.{bid}.{sid}.conv_exp.weight",
|
||||
"model.vision_tower.timm_model.blocks.{bid}.{sid}.bn1.weight": "v.blk.{bid}.{sid}.bn1.weight",
|
||||
"model.vision_tower.timm_model.blocks.{bid}.{sid}.conv_pwl.weight": "v.blk.{bid}.{sid}.conv_pwl.weight",
|
||||
"model.vision_tower.timm_model.blocks.{bid}.{sid}.bn2.weight": "v.blk.{bid}.{sid}.bn2.weight",
|
||||
"model.vision_tower.timm_model.blocks.{bid}.{sid}.dw_start.conv.weight": "v.blk.{bid}.{sid}.dw_start.conv.weight",
|
||||
"model.vision_tower.timm_model.blocks.{bid}.{sid}.dw_start.bn.weight": "v.blk.{bid}.{sid}.dw_start.bn.weight",
|
||||
"model.vision_tower.timm_model.blocks.{bid}.{sid}.dw_mid.conv.weight": "v.blk.{bid}.{sid}.dw_mid.conv.weight",
|
||||
"model.vision_tower.timm_model.blocks.{bid}.{sid}.dw_mid.bn.weight": "v.blk.{bid}.{sid}.dw_mid.bn.weight",
|
||||
"model.vision_tower.timm_model.blocks.{bid}.{sid}.pw_exp.conv.weight": "v.blk.{bid}.{sid}.pw_exp.conv.weight",
|
||||
"model.vision_tower.timm_model.blocks.{bid}.{sid}.pw_exp.bn.weight": "v.blk.{bid}.{sid}.pw_exp.bn.weight",
|
||||
"model.vision_tower.timm_model.blocks.{bid}.{sid}.pw_proj.conv.weight": "v.blk.{bid}.{sid}.pw_proj.conv.weight",
|
||||
"model.vision_tower.timm_model.blocks.{bid}.{sid}.pw_proj.bn.weight": "v.blk.{bid}.{sid}.pw_proj.bn.weight",
|
||||
"model.vision_tower.timm_model.blocks.{bid}.{sid}.layer_scale.gamma": "v.blk.{bid}.{sid}.layer_scale.gamma",
|
||||
"model.vision_tower.timm_model.blocks.{bid}.{sid}.attn.query.proj.weight": "v.blk.{bid}.{sid}.attn.query.proj.weight",
|
||||
"model.vision_tower.timm_model.blocks.{bid}.{sid}.attn.key.proj.weight": "v.blk.{bid}.{sid}.attn.key.proj.weight",
|
||||
"model.vision_tower.timm_model.blocks.{bid}.{sid}.attn.value.proj.weight": "v.blk.{bid}.{sid}.attn.value.proj.weight",
|
||||
"model.vision_tower.timm_model.blocks.{bid}.{sid}.attn.output.proj.weight": "v.blk.{bid}.{sid}.attn.output.proj.weight",
|
||||
"model.vision_tower.timm_model.blocks.{bid}.{sid}.attn.key.down_conv.weight": "v.blk.{bid}.{sid}.attn.key.down_conv.weight",
|
||||
"model.vision_tower.timm_model.blocks.{bid}.{sid}.attn.key.norm.weight": "v.blk.{bid}.{sid}.attn.key.norm.weight",
|
||||
"model.vision_tower.timm_model.blocks.{bid}.{sid}.attn.value.down_conv.weight": "v.blk.{bid}.{sid}.attn.value.down_conv.weight",
|
||||
"model.vision_tower.timm_model.blocks.{bid}.{sid}.attn.value.norm.weight": "v.blk.{bid}.{sid}.attn.value.norm.weight",
|
||||
"model.vision_tower.timm_model.blocks.{bid}.{sid}.norm.weight": "v.blk.{bid}.{sid}.norm.weight",
|
||||
}
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
# Parent init will call find_hparam which now returns 0 for empty keys
|
||||
super().__init__(*args, **kwargs)
|
||||
assert self.hparams_vision is not None
|
||||
self.hparams_vision["n_layers"] = 128 # fake value for audio encoder, vision encoder doesn't use it
|
||||
self.hparams_vision["intermediate_size"] = self.hparams_vision.get("intermediate_size", 2048) * 4
|
||||
self.hparams_vision["num_attention_heads"] = self.hparams_vision.get("num_attention_heads", 8)
|
||||
|
||||
# MobileNetV5 does not use image_mean/std
|
||||
self.preprocessor_config["image_mean"] = [0.0 ,0.0 , 0.0]
|
||||
self.preprocessor_config["image_std"] = [1.0 ,1.0 ,1.0]
|
||||
self.hparams_vision["image_size"] = self.preprocessor_config.get(
|
||||
"size", {"height": 768, "width": 768}
|
||||
)["height"]
|
||||
|
||||
# Image sequence length (256 tokens = 16x16 for Gemma3n)
|
||||
image_seq_length = self.preprocessor_config.get("image_seq_length", 256)
|
||||
image_size = self.hparams_vision["image_size"]
|
||||
self.hparams_vision["patch_size"] = image_size // image_seq_length
|
||||
|
||||
# remap audio hparams
|
||||
assert self.hparams_audio is not None
|
||||
self.hparams_audio["n_layers"] = self.hparams_audio["conf_num_hidden_layers"]
|
||||
self.hparams_audio["num_attention_heads"] = self.hparams_audio["conf_num_attention_heads"]
|
||||
self.hparams_audio["feat_in"] = self.hparams_audio["input_feat_size"]
|
||||
self.hparams_audio["intermediate_size"] = self.hparams_audio.get("intermediate_size", 6144)
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
super().set_gguf_parameters()
|
||||
|
||||
# vision params
|
||||
self.gguf_writer.add_clip_vision_projector_type(gguf.VisionProjectorType.GEMMA3NV)
|
||||
self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams.get("layer_norm_eps", 1e-6))
|
||||
|
||||
# audio params
|
||||
assert self.hparams_audio is not None
|
||||
self.gguf_writer.add_clip_audio_projector_type(gguf.VisionProjectorType.GEMMA3NA)
|
||||
self.gguf_writer.add_audio_num_mel_bins(self.hparams_audio["feat_in"])
|
||||
self.gguf_writer.add_audio_attention_layernorm_eps(1e-5)
|
||||
|
||||
def tensor_force_quant(self, name, new_name, bid, n_dims):
|
||||
# Force quantization settings for specific tensor types
|
||||
if "input_projection" in name or "input_proj" in name:
|
||||
return gguf.GGMLQuantizationType.F16
|
||||
if ".embeddings." in name or "stem" in name:
|
||||
return gguf.GGMLQuantizationType.F32
|
||||
return super().tensor_force_quant(name, new_name, bid, n_dims)
|
||||
|
||||
def custom_map(self, name: str) -> str:
|
||||
"""Parses names like model.vision_tower.timm_model.blocks.1.2.suffix and applies template mapping."""
|
||||
parts = name.split(".")
|
||||
# MobileNet blocks have at least 7 parts: model, vision_tower, timm_model, blocks, bid, sid, and suffix
|
||||
if len(parts) >= 7:
|
||||
bid, sid = parts[4], parts[5]
|
||||
suffix = ".".join(parts[6:])
|
||||
template = f"model.vision_tower.timm_model.blocks.{{bid}}.{{sid}}.{suffix}"
|
||||
if template in self.block_tensor_mapping:
|
||||
return self.block_tensor_mapping[template].format(bid=bid, sid=sid)
|
||||
|
||||
raise ValueError(f"Unknown name: {name}")
|
||||
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||
if ConformerAudioModel.is_audio_tensor(name):
|
||||
name = name.replace("model.audio_tower.conformer.", "conformer.layers.")
|
||||
return super().modify_tensors(data_torch, name, bid)
|
||||
|
||||
# Gemma3n uses
|
||||
# - model.embed_vision.* for projection layers
|
||||
# - model.vision_tower.* for vision encoder
|
||||
# Skip non-vision tensors
|
||||
if not (name.startswith("model.embed_vision.") or name.startswith("model.vision_tower.")):
|
||||
return []
|
||||
|
||||
if name.startswith("model.vision_tower.timm_model.blocks."):
|
||||
# Double-indexed block tensors through custom logic
|
||||
new_name = self.custom_map(name)
|
||||
else:
|
||||
# Route non-repeating tensors (conv_stem, msfa, embedding, etc.) and anything not caught above through tensor_mapping.py
|
||||
new_name = self.map_tensor_name(name)
|
||||
|
||||
if new_name.endswith("conv_stem.conv.bias") or new_name.endswith("layer_scale.gamma"):
|
||||
data_torch = data_torch.unsqueeze(0).unsqueeze(-1).unsqueeze(-1) # [1, C, 1, 1]
|
||||
|
||||
return [(new_name, data_torch)]
|
||||
|
||||
|
||||
@ModelBase.register("Gemma3nForCausalLM", "Gemma3nForConditionalGeneration")
|
||||
class Gemma3NModel(Gemma3Model):
|
||||
model_arch = gguf.MODEL_ARCH.GEMMA3N
|
||||
norm_shift = 0.0 # same value as Gemma3p5RMSNorm scale_shift in the Python code
|
||||
|
|
@ -6061,8 +6233,25 @@ class Gemma3NModel(Gemma3Model):
|
|||
]
|
||||
|
||||
def set_vocab(self):
|
||||
# For Gemma3n multimodal models, we need the FULL vocab_size (262400)
|
||||
# which includes special tokens from 262144-262399 for vision/audio.
|
||||
# The vocab_size_per_layer_input (262144) is only the embedding size per layer.
|
||||
# Temporarily override the hparams lookup order to prioritize vocab_size.
|
||||
|
||||
# Store original vocab_size_per_layer_input if it exists
|
||||
vocab_size_per_layer_input = self.hparams.get("vocab_size_per_layer_input")
|
||||
|
||||
# Temporarily remove vocab_size_per_layer_input to force using vocab_size
|
||||
if vocab_size_per_layer_input is not None:
|
||||
del self.hparams["vocab_size_per_layer_input"]
|
||||
|
||||
# Call parent set_vocab which will now use vocab_size (262400)
|
||||
super().set_vocab()
|
||||
|
||||
# Restore vocab_size_per_layer_input for later use
|
||||
if vocab_size_per_layer_input is not None:
|
||||
self.hparams["vocab_size_per_layer_input"] = vocab_size_per_layer_input
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
super().set_gguf_parameters()
|
||||
self.gguf_writer.add_altup_active_idx(self.hparams["altup_active_idx"])
|
||||
|
|
@ -6098,8 +6287,32 @@ class Gemma3NModel(Gemma3Model):
|
|||
if "language_model." not in name:
|
||||
return [] # skip non-language model tensors
|
||||
|
||||
# Pad token embeddings for vision/audio special tokens (262144-262399)
|
||||
if "embed_tokens.weight" in name or "embed_tokens_per_layer" in name:
|
||||
# Move to CPU to avoid meta device issues during padding
|
||||
data_torch = data_torch.to(device="cpu")
|
||||
|
||||
vocab_size = self.hparams.get("vocab_size", 262400)
|
||||
current_size = data_torch.shape[0] # First dimension is vocab_size
|
||||
|
||||
if current_size < vocab_size:
|
||||
# Pad with zeros for vision/audio tokens (they get embeddings from vision tower)
|
||||
padding_size = vocab_size - current_size
|
||||
tensor_type = "per-layer embeddings" if "per_layer" in name else "token embeddings"
|
||||
logger.info(f"Padding {tensor_type} shape {list(data_torch.shape)} from {current_size} to {vocab_size} (adding {padding_size} vision/audio token slots)")
|
||||
|
||||
# Create padding with zeros (vision tokens won't use these embeddings)
|
||||
padding = torch.zeros((padding_size, data_torch.shape[1]), dtype=data_torch.dtype, device=data_torch.device)
|
||||
data_torch = torch.cat([data_torch, padding], dim=0)
|
||||
|
||||
# Continue with normal processing
|
||||
name = name.replace("language_model.", "")
|
||||
return [(self.map_tensor_name(name), data_torch)]
|
||||
|
||||
if "altup_unembed_projections" in name:
|
||||
data_torch = data_torch.to(device="cpu")
|
||||
# altup_unembed matrices are [hidden_size, hidden_size], NOT vocab-based
|
||||
# They should NOT be padded
|
||||
if ".0." in name:
|
||||
self._altup_unembd[0] = data_torch
|
||||
elif ".1." in name:
|
||||
|
|
@ -9936,7 +10149,7 @@ class LFM2Model(TextModel):
|
|||
self._add_feed_forward_length()
|
||||
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||
if self._is_vision_tensor(name) or self._is_audio_tensor(name):
|
||||
if self._is_vision_tensor(name) or ConformerAudioModel.is_audio_tensor(name):
|
||||
# skip multimodal tensors
|
||||
return []
|
||||
|
||||
|
|
@ -9952,9 +10165,6 @@ class LFM2Model(TextModel):
|
|||
def _is_vision_tensor(self, name: str) -> bool:
|
||||
return "vision_tower" in name or "multi_modal_projector" in name
|
||||
|
||||
def _is_audio_tensor(self, name: str):
|
||||
return any(p in name for p in ["audio", "codebook", "conformer", "depth_embedding", "depthformer", "depth_linear"])
|
||||
|
||||
|
||||
@ModelBase.register("Lfm2Model")
|
||||
class LFM2ColBertModel(LFM2Model):
|
||||
|
|
@ -10082,13 +10292,11 @@ class LFM2VLModel(MmprojModel):
|
|||
|
||||
|
||||
@ModelBase.register("Lfm2AudioForConditionalGeneration")
|
||||
class LFM2AudioModel(MmprojModel):
|
||||
class LFM2AudioModel(ConformerAudioModel):
|
||||
has_vision_encoder = False
|
||||
has_audio_encoder = True
|
||||
model_name = "Lfm2AudioEncoder"
|
||||
|
||||
_batch_norm_tensors: list[dict[str, Tensor]] | None = None
|
||||
|
||||
def get_audio_config(self) -> dict[str, Any] | None:
|
||||
return self.global_config.get("encoder")
|
||||
|
||||
|
|
@ -10102,12 +10310,7 @@ class LFM2AudioModel(MmprojModel):
|
|||
self.gguf_writer.add_audio_num_mel_bins(self.hparams_audio["feat_in"])
|
||||
self.gguf_writer.add_audio_attention_layernorm_eps(1e-5)
|
||||
|
||||
def tensor_force_quant(self, name, new_name, bid, n_dims):
|
||||
if ".conv" in name and ".weight" in name:
|
||||
return gguf.GGMLQuantizationType.F32
|
||||
return super().tensor_force_quant(name, new_name, bid, n_dims)
|
||||
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||
def modify_tensors(self, data_torch, name, bid):
|
||||
# skip language model tensors
|
||||
if name.startswith("lfm."):
|
||||
return []
|
||||
|
|
@ -10120,40 +10323,7 @@ class LFM2AudioModel(MmprojModel):
|
|||
if any(p in name for p in ["codebook_offsets", "depth_embeddings", "depth_linear", "depthformer"]):
|
||||
return []
|
||||
|
||||
# fold running_mean, running_var and eps into weight and bias for batch_norm
|
||||
if "batch_norm" in name:
|
||||
if self._batch_norm_tensors is None:
|
||||
self._batch_norm_tensors = [{} for _ in range(self.block_count)]
|
||||
assert bid is not None
|
||||
self._batch_norm_tensors[bid][name] = data_torch
|
||||
|
||||
if len(self._batch_norm_tensors[bid]) < 5:
|
||||
return []
|
||||
|
||||
weight = self._batch_norm_tensors[bid][f"conformer.layers.{bid}.conv.batch_norm.weight"]
|
||||
bias = self._batch_norm_tensors[bid][f"conformer.layers.{bid}.conv.batch_norm.bias"]
|
||||
running_mean = self._batch_norm_tensors[bid][f"conformer.layers.{bid}.conv.batch_norm.running_mean"]
|
||||
running_var = self._batch_norm_tensors[bid][f"conformer.layers.{bid}.conv.batch_norm.running_var"]
|
||||
eps = 1e-5 # default value
|
||||
|
||||
a = weight / torch.sqrt(running_var + eps)
|
||||
b = bias - running_mean * a
|
||||
return [
|
||||
(self.map_tensor_name(f"conformer.layers.{bid}.conv.batch_norm.weight"), a),
|
||||
(self.map_tensor_name(f"conformer.layers.{bid}.conv.batch_norm.bias"), b),
|
||||
]
|
||||
|
||||
# reshape conv weights
|
||||
if name.startswith("conformer.pre_encode.conv.") and name.endswith(".bias"):
|
||||
data_torch = data_torch[:, None, None]
|
||||
if "conv.depthwise_conv" in name and name.endswith(".weight"):
|
||||
assert data_torch.shape[1] == 1
|
||||
data_torch = data_torch.reshape(data_torch.shape[0], data_torch.shape[2])
|
||||
if "conv.pointwise_conv" in name and name.endswith(".weight"):
|
||||
assert data_torch.shape[2] == 1
|
||||
data_torch = data_torch.reshape(data_torch.shape[0], data_torch.shape[1])
|
||||
|
||||
return [(self.map_tensor_name(name), data_torch)]
|
||||
return super().modify_tensors(data_torch, name, bid)
|
||||
|
||||
|
||||
@ModelBase.register("SmallThinkerForCausalLM")
|
||||
|
|
|
|||
|
|
@ -276,12 +276,13 @@ class Keys:
|
|||
DATASETS = "imatrix.datasets"
|
||||
|
||||
class Clip:
|
||||
PROJECTOR_TYPE = "clip.projector_type"
|
||||
HAS_VISION_ENCODER = "clip.has_vision_encoder"
|
||||
HAS_AUDIO_ENCODER = "clip.has_audio_encoder"
|
||||
HAS_LLAVA_PROJECTOR = "clip.has_llava_projector"
|
||||
PROJECTOR_TYPE = "clip.projector_type"
|
||||
HAS_VISION_ENCODER = "clip.has_vision_encoder"
|
||||
HAS_AUDIO_ENCODER = "clip.has_audio_encoder"
|
||||
HAS_LLAVA_PROJECTOR = "clip.has_llava_projector"
|
||||
|
||||
class ClipVision:
|
||||
PROJECTOR_TYPE = "clip.vision.projector_type" # for mixed modality models
|
||||
IMAGE_SIZE = "clip.vision.image_size"
|
||||
PREPROC_IMAGE_SIZE = "clip.vision.preproc_image_size"
|
||||
PATCH_SIZE = "clip.vision.patch_size"
|
||||
|
|
@ -307,6 +308,7 @@ class Keys:
|
|||
SCALE_FACTOR = "clip.vision.projector.scale_factor"
|
||||
|
||||
class ClipAudio:
|
||||
PROJECTOR_TYPE = "clip.audio.projector_type" # for mixed modality models
|
||||
NUM_MEL_BINS = "clip.audio.num_mel_bins"
|
||||
EMBEDDING_LENGTH = "clip.audio.embedding_length"
|
||||
FEED_FORWARD_LENGTH = "clip.audio.feed_forward_length"
|
||||
|
|
@ -465,6 +467,7 @@ class VISION_PROJECTOR_TYPE(IntEnum):
|
|||
RESAMPLER = auto()
|
||||
GLM_EDGE = auto()
|
||||
MERGER = auto()
|
||||
GEMMA3N = auto()
|
||||
GEMMA3 = auto()
|
||||
QWEN3VL = auto()
|
||||
COGVLM = auto()
|
||||
|
|
@ -675,6 +678,15 @@ class MODEL_TENSOR(IntEnum):
|
|||
V_MM_INP_NORM = auto()
|
||||
V_MM_INP_PROJ = auto() # gemma3
|
||||
V_MM_SOFT_EMB_NORM = auto() # gemma3
|
||||
V_MM_EMBEDDING = auto() # gemma3n
|
||||
V_MM_HARD_EMB_NORM = auto() # gemma3n
|
||||
V_ENC_CONV_STEM = auto() # gemma3n
|
||||
V_ENC_CONV_STEM_NORM = auto() # gemma3n
|
||||
V_ENC_MSFA_EXP = auto() # gemma3n
|
||||
V_ENC_MSFA_EXP_NORM = auto() # gemma3n
|
||||
V_ENC_MSFA_PROJ = auto() # gemma3n
|
||||
V_ENC_MSFA_PROJ_NORM = auto() # gemma3n
|
||||
V_ENC_MSFA_NORM = auto() # gemma3n
|
||||
V_RESMPL_POS_EMBD_K = auto() # minicpmv
|
||||
V_RESMPL_ATTN_Q = auto() # minicpmv
|
||||
V_RESMPL_ATTN_K = auto() # minicpmv
|
||||
|
|
@ -698,30 +710,41 @@ class MODEL_TENSOR(IntEnum):
|
|||
V_TOK_BOI = auto() # cogvlm
|
||||
V_TOK_EOI = auto() # cogvlm
|
||||
# audio (mtmd)
|
||||
A_ENC_EMBD_POS = auto()
|
||||
A_ENC_EMBD_NORM = auto()
|
||||
A_ENC_EMBD_TO_LOGITS = auto()
|
||||
A_ENC_CONV1D = auto()
|
||||
A_PRE_NORM = auto()
|
||||
A_POST_NORM = auto()
|
||||
A_ENC_ATTN_Q = auto()
|
||||
A_ENC_ATTN_K = auto()
|
||||
A_ENC_ATTN_V = auto()
|
||||
A_ENC_INPUT_NORM = auto()
|
||||
A_ENC_OUTPUT = auto()
|
||||
A_ENC_OUTPUT_NORM = auto()
|
||||
A_ENC_FFN_UP = auto()
|
||||
A_ENC_FFN_NORM = auto()
|
||||
A_ENC_FFN_GATE = auto()
|
||||
A_ENC_FFN_DOWN = auto()
|
||||
A_ENC_FFN_UP_1 = auto()
|
||||
A_ENC_FFN_NORM_1 = auto()
|
||||
A_ENC_FFN_GATE_1 = auto()
|
||||
A_ENC_FFN_DOWN_1 = auto()
|
||||
A_MMPROJ = auto()
|
||||
A_MMPROJ_FC = auto()
|
||||
A_MM_NORM_PRE = auto()
|
||||
A_MM_NORM_MID = auto()
|
||||
A_ENC_EMBD_POS = auto()
|
||||
A_ENC_EMBD_NORM = auto()
|
||||
A_ENC_EMBD_TO_LOGITS = auto() # lfm2
|
||||
A_ENC_CONV1D = auto()
|
||||
A_ENC_CONV1D_NORM = auto() # gemma3n
|
||||
A_PRE_NORM = auto()
|
||||
A_POST_NORM = auto()
|
||||
A_ENC_LAYER_PRE_NORM = auto() # gemma3n
|
||||
A_ENC_ATTN_Q = auto()
|
||||
A_ENC_ATTN_K = auto()
|
||||
A_ENC_ATTN_V = auto()
|
||||
A_ENC_PER_DIM_SCALE = auto() # gemma3n
|
||||
A_ENC_INPUT_NORM = auto()
|
||||
A_ENC_OUTPUT = auto()
|
||||
A_ENC_OUTPUT_NORM = auto()
|
||||
A_ENC_FFN_UP = auto()
|
||||
A_ENC_FFN_NORM = auto()
|
||||
A_ENC_FFN_POST_NORM = auto() # gemma3n
|
||||
A_ENC_FFN_SCALE = auto() # gemma3n
|
||||
A_ENC_FFN_GATE = auto()
|
||||
A_ENC_FFN_DOWN = auto()
|
||||
A_ENC_FFN_UP_1 = auto() # lfm2, gemma3n
|
||||
A_ENC_FFN_NORM_1 = auto() # lfm2, gemma3n (pre-norm)
|
||||
A_ENC_FFN_POST_NORM_1 = auto() # gemma3n
|
||||
A_ENC_FFN_SCALE_1 = auto() # gemma3n
|
||||
A_ENC_FFN_GATE_1 = auto() # lfm2, gemma3n
|
||||
A_ENC_FFN_DOWN_1 = auto() # lfm2, gemma3n
|
||||
A_MMPROJ = auto()
|
||||
A_MMPROJ_FC = auto()
|
||||
A_MM_NORM_PRE = auto()
|
||||
A_MM_NORM_MID = auto()
|
||||
A_MM_EMBEDDING = auto() # gemma3n
|
||||
A_MM_HARD_EMB_NORM = auto() # gemma3n
|
||||
A_MM_SOFT_EMB_NORM = auto() # gemma3n
|
||||
A_MM_INP_PROJ = auto() # gemma3n
|
||||
# nextn/mtp
|
||||
NEXTN_EH_PROJ = auto()
|
||||
NEXTN_EMBED_TOKENS = auto()
|
||||
|
|
@ -1071,7 +1094,16 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
|
|||
MODEL_TENSOR.V_MM_POST_NORM: "mm.post_norm",
|
||||
MODEL_TENSOR.V_MM_INP_PROJ: "mm.input_projection",
|
||||
MODEL_TENSOR.V_MM_INP_NORM: "mm.input_norm",
|
||||
MODEL_TENSOR.V_MM_SOFT_EMB_NORM: "mm.soft_emb_norm",
|
||||
MODEL_TENSOR.V_MM_SOFT_EMB_NORM: "mm.soft_emb_norm", # gemma3n
|
||||
MODEL_TENSOR.V_MM_EMBEDDING: "mm.embedding", # gemma3n
|
||||
MODEL_TENSOR.V_MM_HARD_EMB_NORM: "mm.hard_emb_norm", # gemma3n
|
||||
MODEL_TENSOR.V_ENC_CONV_STEM: "v.conv_stem.conv", # gemma3n
|
||||
MODEL_TENSOR.V_ENC_CONV_STEM_NORM: "v.conv_stem.bn", # gemma3n
|
||||
MODEL_TENSOR.V_ENC_MSFA_EXP: "v.msfa.ffn.pw_exp.conv", # gemma3n
|
||||
MODEL_TENSOR.V_ENC_MSFA_EXP_NORM: "v.msfa.ffn.pw_exp.bn", # gemma3n
|
||||
MODEL_TENSOR.V_ENC_MSFA_PROJ: "v.msfa.ffn.pw_proj.conv", # gemma3n
|
||||
MODEL_TENSOR.V_ENC_MSFA_PROJ_NORM: "v.msfa.ffn.pw_proj.bn", # gemma3n
|
||||
MODEL_TENSOR.V_ENC_MSFA_NORM: "v.msfa.norm", # gemma3n
|
||||
MODEL_TENSOR.V_RESMPL_POS_EMBD_K: "resampler.pos_embd_k",
|
||||
MODEL_TENSOR.V_RESMPL_ATTN_Q: "resampler.attn.q",
|
||||
MODEL_TENSOR.V_RESMPL_ATTN_K: "resampler.attn.k",
|
||||
|
|
@ -1100,19 +1132,26 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
|
|||
MODEL_TENSOR.A_ENC_EMBD_NORM: "a.position_embd_norm",
|
||||
MODEL_TENSOR.A_ENC_EMBD_TO_LOGITS: "a.embd_to_logits",
|
||||
MODEL_TENSOR.A_ENC_CONV1D: "a.conv1d.{bid}",
|
||||
MODEL_TENSOR.A_ENC_CONV1D_NORM: "a.conv1d.{bid}.norm",
|
||||
MODEL_TENSOR.A_PRE_NORM: "a.pre_ln",
|
||||
MODEL_TENSOR.A_POST_NORM: "a.post_ln",
|
||||
MODEL_TENSOR.A_ENC_LAYER_PRE_NORM: "a.blk.{bid}.layer_pre_norm",
|
||||
MODEL_TENSOR.A_ENC_ATTN_Q: "a.blk.{bid}.attn_q",
|
||||
MODEL_TENSOR.A_ENC_ATTN_K: "a.blk.{bid}.attn_k",
|
||||
MODEL_TENSOR.A_ENC_ATTN_V: "a.blk.{bid}.attn_v",
|
||||
MODEL_TENSOR.A_ENC_PER_DIM_SCALE: "a.blk.{bid}.per_dim_scale",
|
||||
MODEL_TENSOR.A_ENC_INPUT_NORM: "a.blk.{bid}.ln1",
|
||||
MODEL_TENSOR.A_ENC_OUTPUT: "a.blk.{bid}.attn_out",
|
||||
MODEL_TENSOR.A_ENC_OUTPUT_NORM: "a.blk.{bid}.ln2",
|
||||
MODEL_TENSOR.A_ENC_FFN_NORM: "a.blk.{bid}.ffn_norm",
|
||||
MODEL_TENSOR.A_ENC_FFN_POST_NORM: "a.blk.{bid}.ffn_post_norm",
|
||||
MODEL_TENSOR.A_ENC_FFN_SCALE: "a.blk.{bid}.ffn_scale",
|
||||
MODEL_TENSOR.A_ENC_FFN_UP: "a.blk.{bid}.ffn_up",
|
||||
MODEL_TENSOR.A_ENC_FFN_GATE: "a.blk.{bid}.ffn_gate",
|
||||
MODEL_TENSOR.A_ENC_FFN_DOWN: "a.blk.{bid}.ffn_down",
|
||||
MODEL_TENSOR.A_ENC_FFN_NORM_1: "a.blk.{bid}.ffn_norm_1",
|
||||
MODEL_TENSOR.A_ENC_FFN_POST_NORM_1: "a.blk.{bid}.ffn_post_norm_1",
|
||||
MODEL_TENSOR.A_ENC_FFN_SCALE_1: "a.blk.{bid}.ffn_scale_1",
|
||||
MODEL_TENSOR.A_ENC_FFN_UP_1: "a.blk.{bid}.ffn_up_1",
|
||||
MODEL_TENSOR.A_ENC_FFN_GATE_1: "a.blk.{bid}.ffn_gate_1",
|
||||
MODEL_TENSOR.A_ENC_FFN_DOWN_1: "a.blk.{bid}.ffn_down_1",
|
||||
|
|
@ -1120,6 +1159,10 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
|
|||
MODEL_TENSOR.A_MMPROJ_FC: "mm.a.fc",
|
||||
MODEL_TENSOR.A_MM_NORM_PRE: "mm.a.norm_pre",
|
||||
MODEL_TENSOR.A_MM_NORM_MID: "mm.a.norm_mid",
|
||||
MODEL_TENSOR.A_MM_INP_PROJ: "mm.a.input_projection", # gemma3n
|
||||
MODEL_TENSOR.A_MM_SOFT_EMB_NORM: "mm.a.soft_emb_norm", # gemma3n
|
||||
MODEL_TENSOR.A_MM_EMBEDDING: "mm.a.embedding", # gemma3n
|
||||
MODEL_TENSOR.A_MM_HARD_EMB_NORM: "mm.a.hard_emb_norm", # gemma3n
|
||||
# lfm2 audio
|
||||
MODEL_TENSOR.A_ENC_NORM_CONV: "a.blk.{bid}.norm_conv",
|
||||
MODEL_TENSOR.A_ENC_LINEAR_POS: "a.blk.{bid}.linear_pos",
|
||||
|
|
@ -1170,6 +1213,15 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
|
|||
MODEL_TENSOR.V_MM_INP_PROJ,
|
||||
MODEL_TENSOR.V_MM_INP_NORM,
|
||||
MODEL_TENSOR.V_MM_SOFT_EMB_NORM,
|
||||
MODEL_TENSOR.V_MM_EMBEDDING,
|
||||
MODEL_TENSOR.V_MM_HARD_EMB_NORM,
|
||||
MODEL_TENSOR.V_ENC_CONV_STEM,
|
||||
MODEL_TENSOR.V_ENC_CONV_STEM_NORM,
|
||||
MODEL_TENSOR.V_ENC_MSFA_EXP,
|
||||
MODEL_TENSOR.V_ENC_MSFA_EXP_NORM,
|
||||
MODEL_TENSOR.V_ENC_MSFA_PROJ,
|
||||
MODEL_TENSOR.V_ENC_MSFA_PROJ_NORM,
|
||||
MODEL_TENSOR.V_ENC_MSFA_NORM,
|
||||
MODEL_TENSOR.V_RESMPL_POS_EMBD_K,
|
||||
MODEL_TENSOR.V_RESMPL_ATTN_Q,
|
||||
MODEL_TENSOR.V_RESMPL_ATTN_K,
|
||||
|
|
@ -1197,19 +1249,26 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
|
|||
MODEL_TENSOR.A_ENC_EMBD_NORM,
|
||||
MODEL_TENSOR.A_ENC_EMBD_TO_LOGITS,
|
||||
MODEL_TENSOR.A_ENC_CONV1D,
|
||||
MODEL_TENSOR.A_ENC_CONV1D_NORM,
|
||||
MODEL_TENSOR.A_PRE_NORM,
|
||||
MODEL_TENSOR.A_POST_NORM,
|
||||
MODEL_TENSOR.A_ENC_LAYER_PRE_NORM,
|
||||
MODEL_TENSOR.A_ENC_ATTN_Q,
|
||||
MODEL_TENSOR.A_ENC_ATTN_K,
|
||||
MODEL_TENSOR.A_ENC_ATTN_V,
|
||||
MODEL_TENSOR.A_ENC_PER_DIM_SCALE,
|
||||
MODEL_TENSOR.A_ENC_INPUT_NORM,
|
||||
MODEL_TENSOR.A_ENC_OUTPUT,
|
||||
MODEL_TENSOR.A_ENC_OUTPUT_NORM,
|
||||
MODEL_TENSOR.A_ENC_FFN_NORM,
|
||||
MODEL_TENSOR.A_ENC_FFN_POST_NORM,
|
||||
MODEL_TENSOR.A_ENC_FFN_SCALE,
|
||||
MODEL_TENSOR.A_ENC_FFN_UP,
|
||||
MODEL_TENSOR.A_ENC_FFN_GATE,
|
||||
MODEL_TENSOR.A_ENC_FFN_DOWN,
|
||||
MODEL_TENSOR.A_ENC_FFN_NORM_1,
|
||||
MODEL_TENSOR.A_ENC_FFN_POST_NORM_1,
|
||||
MODEL_TENSOR.A_ENC_FFN_SCALE_1,
|
||||
MODEL_TENSOR.A_ENC_FFN_UP_1,
|
||||
MODEL_TENSOR.A_ENC_FFN_GATE_1,
|
||||
MODEL_TENSOR.A_ENC_FFN_DOWN_1,
|
||||
|
|
@ -1226,6 +1285,10 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
|
|||
MODEL_TENSOR.A_ENC_CONV_NORM,
|
||||
MODEL_TENSOR.A_ENC_CONV_PW1,
|
||||
MODEL_TENSOR.A_ENC_CONV_PW2,
|
||||
MODEL_TENSOR.A_MM_INP_PROJ,
|
||||
MODEL_TENSOR.A_MM_SOFT_EMB_NORM,
|
||||
MODEL_TENSOR.A_MM_EMBEDDING,
|
||||
MODEL_TENSOR.A_MM_HARD_EMB_NORM,
|
||||
],
|
||||
MODEL_ARCH.LLAMA: [
|
||||
MODEL_TENSOR.TOKEN_EMBD,
|
||||
|
|
@ -3496,6 +3559,8 @@ class GGUFValueType(IntEnum):
|
|||
|
||||
class VisionProjectorType:
|
||||
GEMMA3 = "gemma3"
|
||||
GEMMA3NV = "gemma3nv"
|
||||
GEMMA3NA = "gemma3na"
|
||||
IDEFICS3 = "idefics3"
|
||||
PIXTRAL = "pixtral"
|
||||
LLAMA4 = "llama4"
|
||||
|
|
|
|||
|
|
@ -1086,6 +1086,9 @@ class GGUFWriter:
|
|||
def add_clip_projector_type(self, value: str) -> None:
|
||||
self.add_string(Keys.Clip.PROJECTOR_TYPE, value)
|
||||
|
||||
def add_clip_vision_projector_type(self, value: str) -> None:
|
||||
self.add_string(Keys.ClipVision.PROJECTOR_TYPE, value)
|
||||
|
||||
def add_vision_projection_dim(self, value: int) -> None:
|
||||
self.add_uint32(Keys.ClipVision.PROJECTION_DIM, value)
|
||||
|
||||
|
|
@ -1168,6 +1171,9 @@ class GGUFWriter:
|
|||
|
||||
# audio models
|
||||
|
||||
def add_clip_audio_projector_type(self, value: str) -> None:
|
||||
self.add_string(Keys.ClipAudio.PROJECTOR_TYPE, value)
|
||||
|
||||
def add_audio_projection_dim(self, value: int) -> None:
|
||||
self.add_uint32(Keys.ClipAudio.PROJECTION_DIM, value)
|
||||
|
||||
|
|
|
|||
|
|
@ -123,6 +123,40 @@ class TensorNameMap:
|
|||
MODEL_TENSOR.CONV1D: (
|
||||
"backbone.embed", # roberta
|
||||
),
|
||||
|
||||
MODEL_TENSOR.V_MM_EMBEDDING: (
|
||||
"model.embed_vision.embedding", # gemma3n
|
||||
),
|
||||
MODEL_TENSOR.V_MM_HARD_EMB_NORM: (
|
||||
"model.embed_vision.hard_embedding_norm", # gemma3n
|
||||
),
|
||||
MODEL_TENSOR.V_MM_INP_PROJ: (
|
||||
"model.embed_vision.embedding_projection", # gemma3n
|
||||
),
|
||||
MODEL_TENSOR.V_MM_SOFT_EMB_NORM: (
|
||||
"model.embed_vision.soft_embedding_norm", # gemma3n
|
||||
),
|
||||
MODEL_TENSOR.V_ENC_CONV_STEM: (
|
||||
"model.vision_tower.timm_model.conv_stem.conv", # gemma3n
|
||||
),
|
||||
MODEL_TENSOR.V_ENC_CONV_STEM_NORM: (
|
||||
"model.vision_tower.timm_model.conv_stem.bn", # gemma3n
|
||||
),
|
||||
MODEL_TENSOR.V_ENC_MSFA_EXP: (
|
||||
"model.vision_tower.timm_model.msfa.ffn.pw_exp.conv", # gemma3n
|
||||
),
|
||||
MODEL_TENSOR.V_ENC_MSFA_EXP_NORM: (
|
||||
"model.vision_tower.timm_model.msfa.ffn.pw_exp.bn", # gemma3n
|
||||
),
|
||||
MODEL_TENSOR.V_ENC_MSFA_PROJ: (
|
||||
"model.vision_tower.timm_model.msfa.ffn.pw_proj.conv", # gemma3n
|
||||
),
|
||||
MODEL_TENSOR.V_ENC_MSFA_PROJ_NORM: (
|
||||
"model.vision_tower.timm_model.msfa.ffn.pw_proj.bn", # gemma3n
|
||||
),
|
||||
MODEL_TENSOR.V_ENC_MSFA_NORM: (
|
||||
"model.vision_tower.timm_model.msfa.norm", # gemma3n
|
||||
),
|
||||
}
|
||||
|
||||
block_mappings_cfg: dict[MODEL_TENSOR, tuple[str, ...]] = {
|
||||
|
|
@ -1575,6 +1609,11 @@ class TensorNameMap:
|
|||
MODEL_TENSOR.A_ENC_CONV1D: (
|
||||
"audio_tower.conv{bid}", # ultravox
|
||||
"conformer.pre_encode.conv.{bid}", # lfm2
|
||||
"model.audio_tower.subsample_conv_projection.conv_{bid}.conv", # gemma3n
|
||||
),
|
||||
|
||||
MODEL_TENSOR.A_ENC_CONV1D_NORM: (
|
||||
"model.audio_tower.subsample_conv_projection.conv_{bid}.norm", # gemma3n
|
||||
),
|
||||
|
||||
MODEL_TENSOR.A_PRE_NORM: (),
|
||||
|
|
@ -1587,40 +1626,64 @@ class TensorNameMap:
|
|||
MODEL_TENSOR.A_ENC_ATTN_Q: (
|
||||
"audio_tower.layers.{bid}.self_attn.q_proj", # ultravox
|
||||
"conformer.layers.{bid}.self_attn.linear_q", # lfm2
|
||||
"conformer.layers.{bid}.attention.attn.q_proj", # gemma3n
|
||||
),
|
||||
|
||||
MODEL_TENSOR.A_ENC_ATTN_K: (
|
||||
"audio_tower.layers.{bid}.self_attn.k_proj", # ultravox
|
||||
"conformer.layers.{bid}.self_attn.linear_k", # lfm2
|
||||
"conformer.layers.{bid}.attention.attn.k_proj", # gemma3n
|
||||
),
|
||||
|
||||
MODEL_TENSOR.A_ENC_ATTN_V: (
|
||||
"audio_tower.layers.{bid}.self_attn.v_proj", # ultravox
|
||||
"conformer.layers.{bid}.self_attn.linear_v", # lfm2
|
||||
"conformer.layers.{bid}.attention.attn.v_proj", # gemma3n
|
||||
),
|
||||
|
||||
MODEL_TENSOR.A_ENC_PER_DIM_SCALE: (
|
||||
"conformer.layers.{bid}.attention.attn.per_dim_scale", # gemma3n
|
||||
),
|
||||
|
||||
MODEL_TENSOR.A_ENC_LAYER_PRE_NORM: (
|
||||
"conformer.layers.{bid}.norm", # gemma3n
|
||||
),
|
||||
|
||||
MODEL_TENSOR.A_ENC_INPUT_NORM: (
|
||||
"audio_tower.layers.{bid}.self_attn_layer_norm", # ultravox
|
||||
"conformer.layers.{bid}.norm_self_att", # lfm2
|
||||
"conformer.layers.{bid}.attention.pre_attn_norm", # gemma3n
|
||||
),
|
||||
|
||||
MODEL_TENSOR.A_ENC_OUTPUT: (
|
||||
"audio_tower.layers.{bid}.self_attn.out_proj", # ultravox
|
||||
"conformer.layers.{bid}.self_attn.linear_out", # lfm2
|
||||
"conformer.layers.{bid}.attention.post", # gemma3n
|
||||
),
|
||||
|
||||
MODEL_TENSOR.A_ENC_OUTPUT_NORM: (
|
||||
"audio_tower.layers.{bid}.final_layer_norm", # ultravox
|
||||
"conformer.layers.{bid}.norm_out", # lfm2
|
||||
"conformer.layers.{bid}.attention.post_norm", # gemma3n
|
||||
),
|
||||
|
||||
MODEL_TENSOR.A_ENC_FFN_NORM: (
|
||||
"conformer.layers.{bid}.norm_feed_forward1", # lfm2
|
||||
"conformer.layers.{bid}.ffw_layer_start.pre_layer_norm", # gemma3n
|
||||
),
|
||||
|
||||
MODEL_TENSOR.A_ENC_FFN_POST_NORM: (
|
||||
"conformer.layers.{bid}.ffw_layer_start.post_layer_norm", # gemma3n
|
||||
),
|
||||
|
||||
MODEL_TENSOR.A_ENC_FFN_SCALE: (
|
||||
"conformer.layers.{bid}.ffw_layer_start.post_layer_scale", # gemma3n
|
||||
),
|
||||
|
||||
MODEL_TENSOR.A_ENC_FFN_UP: (
|
||||
"audio_tower.layers.{bid}.fc1", # ultravox
|
||||
"conformer.layers.{bid}.feed_forward1.linear1", # lfm2
|
||||
"conformer.layers.{bid}.ffw_layer_start.ffw_layer_1", # gemma3n
|
||||
),
|
||||
|
||||
MODEL_TENSOR.A_ENC_FFN_GATE: (),
|
||||
|
|
@ -1628,22 +1691,35 @@ class TensorNameMap:
|
|||
MODEL_TENSOR.A_ENC_FFN_DOWN: (
|
||||
"audio_tower.layers.{bid}.fc2", # ultravox
|
||||
"conformer.layers.{bid}.feed_forward1.linear2", # lfm2
|
||||
"conformer.layers.{bid}.ffw_layer_start.ffw_layer_2", # gemma3n
|
||||
),
|
||||
|
||||
MODEL_TENSOR.A_ENC_FFN_UP_1: (
|
||||
"conformer.layers.{bid}.feed_forward2.linear1", # lfm2
|
||||
"conformer.layers.{bid}.ffw_layer_end.ffw_layer_1", # gemma3n
|
||||
),
|
||||
|
||||
MODEL_TENSOR.A_ENC_FFN_DOWN_1: (
|
||||
"conformer.layers.{bid}.feed_forward2.linear2", # lfm2
|
||||
"conformer.layers.{bid}.ffw_layer_end.ffw_layer_2", # gemma3n
|
||||
),
|
||||
|
||||
MODEL_TENSOR.A_ENC_FFN_NORM_1: (
|
||||
"conformer.layers.{bid}.norm_feed_forward2", # lfm2
|
||||
"conformer.layers.{bid}.ffw_layer_end.pre_layer_norm", # gemma3n
|
||||
),
|
||||
|
||||
MODEL_TENSOR.A_ENC_FFN_POST_NORM_1: (
|
||||
"conformer.layers.{bid}.ffw_layer_end.post_layer_norm", # gemma3n
|
||||
),
|
||||
|
||||
MODEL_TENSOR.A_ENC_FFN_SCALE_1: (
|
||||
"conformer.layers.{bid}.ffw_layer_end.post_layer_scale", # gemma3n
|
||||
),
|
||||
|
||||
MODEL_TENSOR.A_ENC_LINEAR_POS: (
|
||||
"conformer.layers.{bid}.self_attn.linear_pos", # lfm2
|
||||
"conformer.layers.{bid}.attention.attn.relative_position_embedding.pos_proj", # gemma3n
|
||||
),
|
||||
|
||||
MODEL_TENSOR.A_ENC_POS_BIAS_U: (
|
||||
|
|
@ -1656,6 +1732,7 @@ class TensorNameMap:
|
|||
|
||||
MODEL_TENSOR.A_ENC_OUT: (
|
||||
"conformer.pre_encode.out", # lfm2
|
||||
"model.audio_tower.subsample_conv_projection.input_proj_linear", # gemma3n
|
||||
),
|
||||
|
||||
# note: some tensors below have an "audio." pseudo-prefix, to prevent conflicts with vision tensors
|
||||
|
|
@ -1681,22 +1758,40 @@ class TensorNameMap:
|
|||
|
||||
MODEL_TENSOR.A_ENC_CONV_DW: (
|
||||
"conformer.layers.{bid}.conv.depthwise_conv", # lfm2
|
||||
"conformer.layers.{bid}.lconv1d.depthwise_conv1d", # gemma3n
|
||||
),
|
||||
|
||||
MODEL_TENSOR.A_ENC_CONV_NORM: (
|
||||
"conformer.layers.{bid}.conv.batch_norm", # lfm2
|
||||
"conformer.layers.{bid}.lconv1d.pre_layer_norm", # gemma3n
|
||||
),
|
||||
|
||||
MODEL_TENSOR.A_ENC_CONV_PW1: (
|
||||
"conformer.layers.{bid}.conv.pointwise_conv1", # lfm2
|
||||
"conformer.layers.{bid}.lconv1d.linear_start", # gemma3n
|
||||
),
|
||||
|
||||
MODEL_TENSOR.A_ENC_CONV_PW2: (
|
||||
"conformer.layers.{bid}.conv.pointwise_conv2", # lfm2
|
||||
"conformer.layers.{bid}.lconv1d.linear_end", # gemma3n
|
||||
),
|
||||
|
||||
MODEL_TENSOR.A_ENC_NORM_CONV: (
|
||||
"conformer.layers.{bid}.norm_conv", # lfm2
|
||||
"conformer.layers.{bid}.lconv1d.conv_norm", # gemma3n
|
||||
),
|
||||
|
||||
MODEL_TENSOR.A_MM_EMBEDDING: (
|
||||
"model.embed_audio.embedding", # gemma3n
|
||||
),
|
||||
MODEL_TENSOR.A_MM_HARD_EMB_NORM: (
|
||||
"model.embed_audio.hard_embedding_norm", # gemma3n
|
||||
),
|
||||
MODEL_TENSOR.A_MM_INP_PROJ: (
|
||||
"model.embed_audio.embedding_projection", # gemma3n
|
||||
),
|
||||
MODEL_TENSOR.A_MM_SOFT_EMB_NORM: (
|
||||
"model.embed_audio.soft_embedding_norm", # gemma3n
|
||||
),
|
||||
|
||||
# NextN/MTP tensors for GLM4_MOE
|
||||
|
|
|
|||
|
|
@ -255,10 +255,20 @@ ggml_tensor * llm_build_gemma3n_iswa::get_per_layer_inputs() {
|
|||
inp_per_layer = ggml_reshape_3d(ctx0, inp_per_layer, n_embd_altup, n_layer, n_tokens);
|
||||
inp_per_layer = ggml_scale(ctx0, inp_per_layer, sqrtf((float) n_embd_altup));
|
||||
cb(inp_per_layer, "inp_per_layer_selected", -1);
|
||||
res->add_input(std::move(inp));
|
||||
} else {
|
||||
GGML_ABORT("TODO: support embd input");
|
||||
// Vision embedding path: use padding token (ID=0) embedding
|
||||
const int64_t embd_size = model.tok_embd_per_layer->ne[0]; // n_embd_altup * n_layer
|
||||
|
||||
// Extract and dequantize padding token embedding (column 0)
|
||||
ggml_tensor * padding_q = ggml_view_1d(ctx0, model.tok_embd_per_layer, embd_size, 0);
|
||||
ggml_tensor * padding_f32 = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, embd_size);
|
||||
inp_per_layer = ggml_cpy(ctx0, padding_q, padding_f32);
|
||||
|
||||
// Reshape to [n_embd_altup, n_layer, 1]
|
||||
inp_per_layer = ggml_reshape_3d(ctx0, inp_per_layer, n_embd_altup, n_layer, 1);
|
||||
cb(inp_per_layer, "inp_per_layer_vision", -1);
|
||||
}
|
||||
res->add_input(std::move(inp));
|
||||
return inp_per_layer;
|
||||
}
|
||||
|
||||
|
|
@ -276,7 +286,7 @@ ggml_tensor * llm_build_gemma3n_iswa::project_per_layer_inputs(ggml_tensor * inp
|
|||
-1); // [n_embd_altup, n_layer, n_tokens]
|
||||
cb(per_layer_proj, "per_layer_proj", -1);
|
||||
|
||||
inp_per_layer = ggml_add(ctx0, inp_per_layer, per_layer_proj);
|
||||
inp_per_layer = ggml_add(ctx0, per_layer_proj, inp_per_layer);
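// note: ggml_add broadcasts its second operand into the first, so the vision-path inp_per_layer ([n_embd_altup, n_layer, 1]) must be passed second to broadcast across n_tokens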
|
||||
inp_per_layer = ggml_scale(ctx0, inp_per_layer, per_layer_input_scale);
|
||||
cb(inp_per_layer, "inp_per_layer", -1);
|
||||
|
||||
|
|
|
|||
|
|
@ -27,6 +27,7 @@ add_library(mtmd
|
|||
models/qwen3vl.cpp
|
||||
models/siglip.cpp
|
||||
models/whisper-enc.cpp
|
||||
models/mobilenetv5.cpp
|
||||
models/youtuvl.cpp
|
||||
)
|
||||
|
||||
|
|
|
|||
|
|
@ -154,6 +154,47 @@
|
|||
#define TN_CONV_PW1 "%s.blk.%d.conv_pw1.%s"
|
||||
#define TN_CONV_PW2 "%s.blk.%d.conv_pw2.%s"
|
||||
|
||||
// mobilenetv5 (gemma3n) definitions
|
||||
#define TN_MNV5_STEM_CONV "v.conv_stem.conv.weight"
|
||||
#define TN_MNV5_STEM_BIAS "v.conv_stem.conv.bias"
|
||||
#define TN_MNV5_STEM_BN "v.conv_stem.bn.weight"
|
||||
|
||||
// Stage 0 Block (Edge Residual)
|
||||
#define TN_MNV5_BLK_S0_EXP_W "v.blk.%d.%d.conv_exp.weight"
|
||||
#define TN_MNV5_BLK_S0_BN1_W "v.blk.%d.%d.bn1.weight"
|
||||
#define TN_MNV5_BLK_S0_PWL_W "v.blk.%d.%d.conv_pwl.weight"
|
||||
#define TN_MNV5_BLK_S0_BN2_W "v.blk.%d.%d.bn2.weight"
|
||||
|
||||
// Stage 1+ Block (Universal Inverted Residual)
|
||||
#define TN_MNV5_BLK_DW_START_W "v.blk.%d.%d.dw_start.conv.weight"
|
||||
#define TN_MNV5_BLK_DW_START_BN "v.blk.%d.%d.dw_start.bn.weight"
|
||||
#define TN_MNV5_BLK_DW_MID_W "v.blk.%d.%d.dw_mid.conv.weight"
|
||||
#define TN_MNV5_BLK_DW_MID_BN "v.blk.%d.%d.dw_mid.bn.weight"
|
||||
#define TN_MNV5_BLK_PW_EXP_W "v.blk.%d.%d.pw_exp.conv.weight"
|
||||
#define TN_MNV5_BLK_PW_EXP_BN "v.blk.%d.%d.pw_exp.bn.weight"
|
||||
#define TN_MNV5_BLK_PW_PROJ_W "v.blk.%d.%d.pw_proj.conv.weight"
|
||||
#define TN_MNV5_BLK_PW_PROJ_BN "v.blk.%d.%d.pw_proj.bn.weight"
|
||||
#define TN_MNV5_BLK_LAYER_SCALE "v.blk.%d.%d.layer_scale.gamma"
|
||||
|
||||
// Attention Components
|
||||
#define TN_MNV5_ATTN_Q_W "v.blk.%d.%d.attn.query.proj.weight"
|
||||
#define TN_MNV5_ATTN_K_W "v.blk.%d.%d.attn.key.proj.weight"
|
||||
#define TN_MNV5_ATTN_V_W "v.blk.%d.%d.attn.value.proj.weight"
|
||||
#define TN_MNV5_ATTN_O_W "v.blk.%d.%d.attn.output.proj.weight"
|
||||
#define TN_MNV5_ATTN_K_DW "v.blk.%d.%d.attn.key.down_conv.weight"
|
||||
#define TN_MNV5_ATTN_K_NORM "v.blk.%d.%d.attn.key.norm.weight"
|
||||
#define TN_MNV5_ATTN_V_DW "v.blk.%d.%d.attn.value.down_conv.weight"
|
||||
#define TN_MNV5_ATTN_V_NORM "v.blk.%d.%d.attn.value.norm.weight"
|
||||
#define TN_MNV5_ATTN_NORM "v.blk.%d.%d.norm.weight" // Block norm used in attn blocks
|
||||
|
||||
// MSFA
|
||||
#define TN_MNV5_MSFA_FFN_EXP_W "v.msfa.ffn.pw_exp.conv.weight"
|
||||
#define TN_MNV5_MSFA_FFN_EXP_BN "v.msfa.ffn.pw_exp.bn.weight"
|
||||
#define TN_MNV5_MSFA_FFN_PROJ_W "v.msfa.ffn.pw_proj.conv.weight"
|
||||
#define TN_MNV5_MSFA_FFN_PROJ_BN "v.msfa.ffn.pw_proj.bn.weight"
|
||||
#define TN_MNV5_MSFA_NORM "v.msfa.norm.weight"
|
||||
|
||||
|
||||
// align x to upper multiple of n
|
||||
#define CLIP_ALIGN(x, n) ((((x) + (n) - 1) / (n)) * (n))
|
||||
|
||||
|
|
@ -171,6 +212,8 @@ enum projector_type {
|
|||
PROJECTOR_TYPE_QWEN2VL,
|
||||
PROJECTOR_TYPE_QWEN3VL,
|
||||
PROJECTOR_TYPE_GEMMA3,
|
||||
PROJECTOR_TYPE_GEMMA3NV,
|
||||
PROJECTOR_TYPE_GEMMA3NA,
|
||||
PROJECTOR_TYPE_IDEFICS3,
|
||||
PROJECTOR_TYPE_PIXTRAL,
|
||||
PROJECTOR_TYPE_QWEN25VL,
|
||||
|
|
@ -203,6 +246,8 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
|
|||
{ PROJECTOR_TYPE_QWEN25VL, "qwen2.5vl_merger"},
|
||||
{ PROJECTOR_TYPE_QWEN3VL, "qwen3vl_merger"},
|
||||
{ PROJECTOR_TYPE_GEMMA3, "gemma3"},
|
||||
{ PROJECTOR_TYPE_GEMMA3NV, "gemma3nv"},
|
||||
{ PROJECTOR_TYPE_GEMMA3NA, "gemma3na"},
|
||||
{ PROJECTOR_TYPE_IDEFICS3, "idefics3"},
|
||||
{ PROJECTOR_TYPE_PIXTRAL, "pixtral"},
|
||||
{ PROJECTOR_TYPE_ULTRAVOX, "ultravox"},
|
||||
|
|
|
|||
|
|
@ -173,6 +173,45 @@ struct clip_layer {
|
|||
}
|
||||
};
|
||||
|
||||
// Expanded MobileNetV5 block structure for Gemma3n vision encoder
|
||||
struct mobilenetv5_block {
|
||||
// Stage 0 (Edge Residual)
|
||||
ggml_tensor * s0_conv_exp_w = nullptr;
|
||||
ggml_tensor * s0_bn1_w = nullptr;
|
||||
ggml_tensor * s0_conv_pwl_w = nullptr;
|
||||
ggml_tensor * s0_bn2_w = nullptr;
|
||||
|
||||
// Stage 1+ (Universal Inverted Residual)
|
||||
ggml_tensor * dw_start_w = nullptr;
|
||||
ggml_tensor * dw_start_bn_w = nullptr;
|
||||
|
||||
ggml_tensor * pw_exp_w = nullptr;
|
||||
ggml_tensor * pw_exp_bn_w = nullptr;
|
||||
|
||||
ggml_tensor * dw_mid_w = nullptr;
|
||||
ggml_tensor * dw_mid_bn_w = nullptr;
|
||||
|
||||
ggml_tensor * pw_proj_w = nullptr;
|
||||
ggml_tensor * pw_proj_bn_w = nullptr;
|
||||
|
||||
ggml_tensor * layer_scale_w = nullptr;
|
||||
|
||||
// Attention (MQA) components
|
||||
ggml_tensor * attn_q_w = nullptr;
|
||||
ggml_tensor * attn_k_w = nullptr;
|
||||
ggml_tensor * attn_v_w = nullptr;
|
||||
ggml_tensor * attn_o_w = nullptr;
|
||||
|
||||
// Optional downsampling/norm in attention
|
||||
ggml_tensor * attn_k_dw_w = nullptr;
|
||||
ggml_tensor * attn_k_norm_w = nullptr;
|
||||
ggml_tensor * attn_v_dw_w = nullptr;
|
||||
ggml_tensor * attn_v_norm_w = nullptr;
|
||||
|
||||
// Block norm (often present in attention blocks)
|
||||
ggml_tensor * attn_norm_w = nullptr;
|
||||
};
|
||||
|
||||
struct clip_model {
|
||||
clip_modality modality = CLIP_MODALITY_VISION;
|
||||
projector_type proj_type = PROJECTOR_TYPE_MLP;
|
||||
|
|
@ -289,6 +328,23 @@ struct clip_model {
|
|||
ggml_tensor * mm_input_proj_w = nullptr;
|
||||
ggml_tensor * mm_soft_emb_norm_w = nullptr;
|
||||
|
||||
// mobilenetv5 for gemma3n
|
||||
std::vector<mobilenetv5_block> mobilenet_blocks;
|
||||
std::vector<int> mobilenet_stage_ends;
|
||||
ggml_tensor * mobilenet_stem_conv_w = nullptr;
|
||||
ggml_tensor * mobilenet_stem_conv_b = nullptr;
|
||||
ggml_tensor * mobilenet_stem_norm_w = nullptr;
|
||||
ggml_tensor * mm_post_proj_norm_w = nullptr;
|
||||
|
||||
// Multi-Scale Fusion Adapter (MSFA) components
|
||||
ggml_tensor * msfa_concat_conv_w = nullptr;
|
||||
ggml_tensor * msfa_concat_norm_w = nullptr;
|
||||
ggml_tensor * msfa_ffn_expand_w = nullptr;
|
||||
ggml_tensor * msfa_ffn_project_w = nullptr;
|
||||
ggml_tensor * msfa_ffn_expand_bn = nullptr;
|
||||
ggml_tensor * msfa_ffn_project_bn = nullptr;
|
||||
|
||||
|
||||
// pixtral, glm4v
|
||||
ggml_tensor * token_embd_img_break = nullptr;
|
||||
ggml_tensor * mm_patch_merger_w = nullptr;
|
||||
|
|
|
|||
|
|
@ -788,6 +788,10 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
|
|||
{
|
||||
builder = std::make_unique<clip_graph_siglip>(ctx, img);
|
||||
} break;
|
||||
case PROJECTOR_TYPE_GEMMA3NV:
|
||||
{
|
||||
builder = std::make_unique<clip_graph_mobilenetv5>(ctx, img);
|
||||
} break;
|
||||
case PROJECTOR_TYPE_PIXTRAL:
|
||||
case PROJECTOR_TYPE_LIGHTONOCR:
|
||||
{
|
||||
|
|
@ -1146,6 +1150,14 @@ struct clip_model_loader {
|
|||
// test model (tinygemma3) has a different value, we optionally read it
|
||||
get_u32(KEY_PROJ_SCALE_FACTOR, hparams.n_merge, false);
|
||||
} break;
|
||||
|
||||
case PROJECTOR_TYPE_GEMMA3NV:
|
||||
{
|
||||
// Gemma3n uses MobileNetV5 which produces 256 tokens (16x16)
|
||||
// Similar configuration to Gemma3
|
||||
hparams.n_merge = 1; // MobileNetV5 handles resizing internally
|
||||
get_u32(KEY_PROJ_SCALE_FACTOR, hparams.n_merge, false);
|
||||
} break;
|
||||
case PROJECTOR_TYPE_QWEN2VL:
|
||||
case PROJECTOR_TYPE_QWEN25VL:
|
||||
case PROJECTOR_TYPE_QWEN3VL:
|
||||
|
|
@ -1334,6 +1346,10 @@ struct clip_model_loader {
|
|||
|
||||
model.position_embeddings = get_tensor(string_format(TN_POS_EMBD, prefix), false);
|
||||
|
||||
if (model.proj_type == PROJECTOR_TYPE_GEMMA3NV) {
|
||||
hparams.n_layer = 0; // gemma3n does not use normal layer structure
|
||||
}
|
||||
|
||||
// layers
|
||||
model.layers.resize(hparams.n_layer);
|
||||
for (int il = 0; il < hparams.n_layer; ++il) {
|
||||
|
|
@ -1408,6 +1424,7 @@ struct clip_model_loader {
|
|||
}
|
||||
}
|
||||
|
||||
|
||||
switch (model.proj_type) {
|
||||
case PROJECTOR_TYPE_MLP:
|
||||
case PROJECTOR_TYPE_MLP_NORM:
|
||||
|
|
@ -1547,6 +1564,99 @@ struct clip_model_loader {
|
|||
model.mm_input_proj_w = get_tensor(TN_MM_INP_PROJ);
|
||||
model.mm_soft_emb_norm_w = get_tensor(TN_MM_SOFT_EMB_N);
|
||||
} break;
|
||||
case PROJECTOR_TYPE_GEMMA3NV:
|
||||
{
|
||||
model.mobilenet_stem_conv_w = get_tensor(TN_MNV5_STEM_CONV, false);
|
||||
model.mobilenet_stem_conv_b = get_tensor(TN_MNV5_STEM_BIAS, false);
|
||||
model.mobilenet_stem_norm_w = get_tensor(TN_MNV5_STEM_BN, false);
|
||||
|
||||
model.msfa_ffn_expand_w = get_tensor(TN_MNV5_MSFA_FFN_EXP_W, false);
|
||||
model.msfa_ffn_expand_bn = get_tensor(TN_MNV5_MSFA_FFN_EXP_BN, false); // Consume BN if present but likely folded
|
||||
model.msfa_ffn_project_w = get_tensor(TN_MNV5_MSFA_FFN_PROJ_W, false);
|
||||
model.msfa_ffn_project_bn = get_tensor(TN_MNV5_MSFA_FFN_PROJ_BN, false);
|
||||
|
||||
model.msfa_concat_norm_w = get_tensor(TN_MNV5_MSFA_NORM, false);
|
||||
|
||||
// Dynamically load blocks stage by stage
|
||||
for (int stage = 0; stage < 4; ++stage) {
|
||||
int blocks_found_in_stage = 0;
|
||||
|
||||
for (int blk_idx = 0; ; ++blk_idx) {
|
||||
bool found_block = false;
|
||||
mobilenetv5_block block;
|
||||
|
||||
// 1. Check for Edge Residual (S0)
|
||||
block.s0_conv_exp_w = get_tensor(string_format(TN_MNV5_BLK_S0_EXP_W, stage, blk_idx), false);
|
||||
if (block.s0_conv_exp_w) {
|
||||
found_block = true;
|
||||
block.s0_bn1_w = get_tensor(string_format(TN_MNV5_BLK_S0_BN1_W, stage, blk_idx), false);
|
||||
block.s0_conv_pwl_w = get_tensor(string_format(TN_MNV5_BLK_S0_PWL_W, stage, blk_idx), false);
|
||||
block.s0_bn2_w = get_tensor(string_format(TN_MNV5_BLK_S0_BN2_W, stage, blk_idx), false);
|
||||
}
|
||||
// 2. Check for UIR (Universal Inverted Residual)
|
||||
else {
|
||||
// Check for dw_start OR pw_exp (some UIR blocks skip dw_start)
|
||||
block.dw_start_w = get_tensor(string_format(TN_MNV5_BLK_DW_START_W, stage, blk_idx), false);
|
||||
block.pw_exp_w = get_tensor(string_format(TN_MNV5_BLK_PW_EXP_W, stage, blk_idx), false);
|
||||
|
||||
if (block.dw_start_w || block.pw_exp_w) {
|
||||
found_block = true;
|
||||
if (block.dw_start_w) {
|
||||
block.dw_start_bn_w = get_tensor(string_format(TN_MNV5_BLK_DW_START_BN, stage, blk_idx), false);
|
||||
}
|
||||
if (block.pw_exp_w) {
|
||||
block.pw_exp_bn_w = get_tensor(string_format(TN_MNV5_BLK_PW_EXP_BN, stage, blk_idx), false);
|
||||
}
|
||||
block.dw_mid_w = get_tensor(string_format(TN_MNV5_BLK_DW_MID_W, stage, blk_idx), false);
|
||||
if (block.dw_mid_w) {
|
||||
block.dw_mid_bn_w = get_tensor(string_format(TN_MNV5_BLK_DW_MID_BN, stage, blk_idx), false);
|
||||
}
|
||||
block.pw_proj_w = get_tensor(string_format(TN_MNV5_BLK_PW_PROJ_W, stage, blk_idx), false);
|
||||
if (block.pw_proj_w) {
|
||||
block.pw_proj_bn_w = get_tensor(string_format(TN_MNV5_BLK_PW_PROJ_BN, stage, blk_idx), false);
|
||||
}
|
||||
block.layer_scale_w = get_tensor(string_format(TN_MNV5_BLK_LAYER_SCALE, stage, blk_idx), false);
|
||||
}
|
||||
}
|
||||
|
||||
// 3. Check for Attention (MQA)
|
||||
// Even if UIR/Edge check failed, this might be a pure attention block
|
||||
ggml_tensor* attn_q_check = get_tensor(string_format(TN_MNV5_ATTN_Q_W, stage, blk_idx), false);
|
||||
if (attn_q_check) {
|
||||
found_block = true;
|
||||
block.attn_q_w = attn_q_check;
|
||||
block.attn_k_w = get_tensor(string_format(TN_MNV5_ATTN_K_W, stage, blk_idx), false);
|
||||
block.attn_v_w = get_tensor(string_format(TN_MNV5_ATTN_V_W, stage, blk_idx), false);
|
||||
block.attn_o_w = get_tensor(string_format(TN_MNV5_ATTN_O_W, stage, blk_idx), false);
|
||||
block.attn_k_dw_w = get_tensor(string_format(TN_MNV5_ATTN_K_DW, stage, blk_idx), false);
|
||||
block.attn_k_norm_w = get_tensor(string_format(TN_MNV5_ATTN_K_NORM, stage, blk_idx), false);
|
||||
block.attn_v_dw_w = get_tensor(string_format(TN_MNV5_ATTN_V_DW, stage, blk_idx), false);
|
||||
block.attn_v_norm_w = get_tensor(string_format(TN_MNV5_ATTN_V_NORM, stage, blk_idx), false);
|
||||
block.attn_norm_w = get_tensor(string_format(TN_MNV5_ATTN_NORM, stage, blk_idx), false);
|
||||
// Note: Attention blocks also have layer_scale, load it if not already loaded by UIR check
|
||||
if (!block.layer_scale_w) {
|
||||
block.layer_scale_w = get_tensor(string_format(TN_MNV5_BLK_LAYER_SCALE, stage, blk_idx), false);
|
||||
}
|
||||
}
|
||||
|
||||
if (found_block) {
|
||||
model.mobilenet_blocks.push_back(block);
|
||||
blocks_found_in_stage++;
|
||||
} else {
|
||||
// End of blocks for this stage
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// Track where this stage ends in the flat vector
|
||||
if (blocks_found_in_stage > 0) {
|
||||
model.mobilenet_stage_ends.push_back(model.mobilenet_blocks.size() - 1);
|
||||
LOG_INF("%s: Stage %d ended at global block index %zu\n", __func__, stage, model.mobilenet_blocks.size() - 1);
|
||||
}
|
||||
}
|
||||
model.mm_input_proj_w = get_tensor(TN_MM_INP_PROJ);
|
||||
model.mm_soft_emb_norm_w = get_tensor(TN_MM_SOFT_EMB_N);
|
||||
} break;
|
||||
case PROJECTOR_TYPE_IDEFICS3:
|
||||
{
|
||||
model.projection = get_tensor(TN_MM_PROJECTOR);
|
||||
|
|
@ -2002,6 +2112,7 @@ struct clip_init_result clip_init(const char * fname, struct clip_context_params
|
|||
|
||||
try {
|
||||
clip_model_loader loader(fname);
|
||||
bool skip_audio = false;
|
||||
|
||||
if (loader.has_vision) {
|
||||
ctx_vision = new clip_ctx(ctx_params);
|
||||
|
|
@ -2011,10 +2122,14 @@ struct clip_init_result clip_init(const char * fname, struct clip_context_params
|
|||
loader.warmup(*ctx_vision);
|
||||
}
|
||||
|
||||
// TODO: we don't support audio for Gemma 3N, but GGUF contains audio tensors
|
||||
// we can remove this check when we implement audio support for Gemma 3N
|
||||
skip_audio = ctx_vision->model.proj_type == PROJECTOR_TYPE_GEMMA3NV;
|
||||
|
||||
// clip_debug_encode(ctx_vision, 24*14, 24*14, 0.5f);
|
||||
}
|
||||
|
||||
if (loader.has_audio) {
|
||||
if (loader.has_audio && !skip_audio) {
|
||||
ctx_audio = new clip_ctx(ctx_params);
|
||||
loader.load_hparams(ctx_audio->model, CLIP_MODALITY_AUDIO);
|
||||
loader.load_tensors(*ctx_audio);
|
||||
|
|
@ -2852,6 +2967,16 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
|
|||
res_imgs->entries.push_back(std::move(img_f32));
|
||||
} break;
|
||||
|
||||
case PROJECTOR_TYPE_GEMMA3NV:
|
||||
{
|
||||
clip_image_u8 resized_image;
|
||||
int sz = params.image_size;
|
||||
img_tool::resize(*img, resized_image, {sz, sz}, img_tool::RESIZE_ALGO_BILINEAR, false);
|
||||
clip_image_f32_ptr img_f32(clip_image_f32_init());
|
||||
normalize_image_u8_to_f32(resized_image, *img_f32, params.image_mean, params.image_std);
|
||||
res_imgs->entries.push_back(std::move(img_f32));
|
||||
} break;
|
||||
|
||||
case PROJECTOR_TYPE_JANUS_PRO:
|
||||
{
|
||||
// Janus Pro preprocessing: pad to square with gray(127), resize to 384x384
|
||||
|
|
@ -3114,6 +3239,12 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
|
|||
int scale_factor = ctx->model.hparams.n_merge;
|
||||
n_patches /= (scale_factor * scale_factor);
|
||||
} break;
|
||||
case PROJECTOR_TYPE_GEMMA3NV:
|
||||
{
|
||||
// MobileNetV5 MSFA adapter always outputs fixed 16x16 resolution
|
||||
// regardless of input size (see architecture description)
|
||||
n_patches = ctx->model.hparams.image_size / ctx->model.hparams.patch_size;
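// with the values written by the converter (image_size 768, patch_size 3) this yields 256 tokens, i.e. the fixed 16x16 grid mentioned above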
|
||||
} break;
|
||||
case PROJECTOR_TYPE_LFM2:
|
||||
case PROJECTOR_TYPE_KIMIVL:
|
||||
{
|
||||
|
|
@ -3506,6 +3637,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
|
|||
set_input_i32("patches", patches);
|
||||
} break;
|
||||
case PROJECTOR_TYPE_GEMMA3:
|
||||
case PROJECTOR_TYPE_GEMMA3NV:
|
||||
case PROJECTOR_TYPE_IDEFICS3:
|
||||
case PROJECTOR_TYPE_INTERNVL:
|
||||
case PROJECTOR_TYPE_QWEN2A:
|
||||
|
|
@ -3633,6 +3765,7 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
|
|||
// main path + deepstack paths
|
||||
return ctx->model.mm_1_b->ne[0] * (1 + ctx->model.n_deepstack_layers);
|
||||
case PROJECTOR_TYPE_GEMMA3:
|
||||
case PROJECTOR_TYPE_GEMMA3NV:
|
||||
return ctx->model.mm_input_proj_w->ne[0];
|
||||
case PROJECTOR_TYPE_IDEFICS3:
|
||||
return ctx->model.projection->ne[1];
|
||||
|
|
@ -3663,6 +3796,7 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
|
|||
}
|
||||
|
||||
int clip_is_minicpmv(const struct clip_ctx * ctx) {
|
||||
// TODO: remove this function
|
||||
if (ctx->proj_type() == PROJECTOR_TYPE_MINICPMV) {
|
||||
return ctx->model.hparams.minicpmv_version;
|
||||
}
|
||||
|
|
@ -3670,24 +3804,26 @@ int clip_is_minicpmv(const struct clip_ctx * ctx) {
|
|||
}
|
||||
|
||||
bool clip_is_glm(const struct clip_ctx * ctx) {
|
||||
// TODO: remove this function
|
||||
return ctx->proj_type() == PROJECTOR_TYPE_GLM_EDGE;
|
||||
}
|
||||
|
||||
bool clip_is_mrope(const struct clip_ctx * ctx) {
|
||||
return ctx->proj_type() == PROJECTOR_TYPE_QWEN2VL
|
||||
|| ctx->proj_type() == PROJECTOR_TYPE_QWEN25VL
|
||||
|| ctx->proj_type() == PROJECTOR_TYPE_QWEN3VL
|
||||
|| ctx->proj_type() == PROJECTOR_TYPE_GLM4V;
|
||||
switch (ctx->proj_type()) {
|
||||
case PROJECTOR_TYPE_QWEN2VL:
|
||||
case PROJECTOR_TYPE_QWEN25VL:
|
||||
case PROJECTOR_TYPE_QWEN3VL:
|
||||
case PROJECTOR_TYPE_GLM4V:
|
||||
return true;
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
bool clip_is_llava(const struct clip_ctx * ctx) {
|
||||
return ctx->model.hparams.has_llava_projector;
|
||||
}
|
||||
|
||||
bool clip_is_gemma3(const struct clip_ctx * ctx) {
|
||||
return ctx->proj_type() == PROJECTOR_TYPE_GEMMA3;
|
||||
}
|
||||
|
||||
bool clip_has_vision_encoder(const struct clip_ctx * ctx) {
|
||||
return ctx->model.modality == CLIP_MODALITY_VISION;
|
||||
}
|
||||
|
|
@ -3697,11 +3833,16 @@ bool clip_has_audio_encoder(const struct clip_ctx * ctx) {
|
|||
}
|
||||
|
||||
bool clip_has_whisper_encoder(const struct clip_ctx * ctx) {
|
||||
return ctx->proj_type() == PROJECTOR_TYPE_ULTRAVOX
|
||||
|| ctx->proj_type() == PROJECTOR_TYPE_QWEN2A
|
||||
|| ctx->proj_type() == PROJECTOR_TYPE_GLMA
|
||||
|| ctx->proj_type() == PROJECTOR_TYPE_VOXTRAL
|
||||
|| ctx->proj_type() == PROJECTOR_TYPE_MUSIC_FLAMINGO;
|
||||
switch (ctx->proj_type()) {
|
||||
case PROJECTOR_TYPE_ULTRAVOX:
|
||||
case PROJECTOR_TYPE_QWEN2A:
|
||||
case PROJECTOR_TYPE_GLMA:
|
||||
case PROJECTOR_TYPE_VOXTRAL:
|
||||
case PROJECTOR_TYPE_MUSIC_FLAMINGO:
|
||||
return true;
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec) {
|
||||
|
|
|
|||
|
|
@ -106,7 +106,8 @@ int clip_is_minicpmv(const struct clip_ctx * ctx);
bool clip_is_glm(const struct clip_ctx * ctx);
bool clip_is_mrope(const struct clip_ctx * ctx);
bool clip_is_llava(const struct clip_ctx * ctx);
bool clip_is_gemma3(const struct clip_ctx * ctx);
// note for contributor: this clip_is_(model) pattern is deprecated
// do NOT add new functions like this

bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec);
@ -0,0 +1,451 @@
#include "models.h"

// Helpers for MobileNetV5 Blocks
// RMS Norm 2D - normalizes over channels for each spatial position
ggml_tensor * clip_graph_mobilenetv5::rms_norm_2d(ggml_tensor * inp, ggml_tensor * weight, float eps) {
    // inp: [W, H, C, B]
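    // ggml_rms_norm normalizes along dim 0, so move channels to dim 0,
    // normalize, then permute back to the original [W, H, C, B] layout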

    ggml_tensor * cur = ggml_permute(ctx0, inp, 2, 1, 0, 3);
    cur = ggml_cont(ctx0, cur);
    cur = ggml_rms_norm(ctx0, cur, eps);

    if (weight) {
        cur = ggml_mul(ctx0, cur, weight);
    }

    cur = ggml_permute(ctx0, cur, 2, 1, 0, 3);
    cur = ggml_cont(ctx0, cur);

    return cur;
}

// Conv2dSame padding - asymmetric SAME padding like PyTorch/TF
ggml_tensor* clip_graph_mobilenetv5::pad_same_2d(ggml_tensor* inp, int kernel_h, int kernel_w, int stride_h, int stride_w, int dilation_h, int dilation_w) {
    const int64_t ih = inp->ne[1]; // height
    const int64_t iw = inp->ne[0]; // width

    // Calculate output size (ceil division)
    const int64_t oh = (ih + stride_h - 1) / stride_h;
    const int64_t ow = (iw + stride_w - 1) / stride_w;

    // Calculate padding needed
    const int64_t pad_h = std::max((int64_t)0, (oh - 1) * stride_h + (kernel_h - 1) * dilation_h + 1 - ih);
    const int64_t pad_w = std::max((int64_t)0, (ow - 1) * stride_w + (kernel_w - 1) * dilation_w + 1 - iw);

    // Split padding asymmetrically
    const int pad_h_top    = pad_h / 2;
    const int pad_h_bottom = pad_h - pad_h_top;
    const int pad_w_left   = pad_w / 2;
    const int pad_w_right  = pad_w - pad_w_left;
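    // e.g. ih = iw = 768, kernel 3x3, stride 2, dilation 1:
    //   oh = ow = 384, pad = (384 - 1)*2 + (3 - 1) + 1 - 768 = 1
    //   -> pad_top/left = 0, pad_bottom/right = 1 (extra pixel goes to the bottom/right, as in Conv2dSame)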

    // Apply padding if needed
    // ggml_pad_ext: (ctx, tensor, lp0, rp0, lp1, rp1, lp2, rp2, lp3, rp3)
    // For [W, H, C, B]: p0=width, p1=height, p2=channels, p3=batch
    if (pad_h > 0 || pad_w > 0) {
        inp = ggml_pad_ext(ctx0, inp,
            pad_w_left, pad_w_right,  // width padding  (dim 0)
            pad_h_top,  pad_h_bottom, // height padding (dim 1)
            0, 0,                     // no channel padding (dim 2)
            0, 0);                    // no batch padding   (dim 3)
    }

    return inp;
}


// Edge Residual Block (Stage 0)
ggml_tensor * clip_graph_mobilenetv5::build_edge_residual(ggml_tensor * inp, const mobilenetv5_block & block, int stride) {
    ggml_tensor * cur = inp;

    // 1. Expansion Conv (3x3)
    if (stride == 2) {
        // Case: Downsampling (Block 0)
        // Replicates Conv2dSame(kernel=3, stride=2)
        cur = pad_same_2d(cur, 3, 3, stride, stride);
        cur = ggml_conv_2d_direct(ctx0, block.s0_conv_exp_w, cur, stride, stride, 0, 0, 1, 1);
    } else {
        // Case: Normal 3x3 Block (Block 1, 2)
        // Replicates Conv2d(kernel=3, stride=1, padding=1)
        cur = ggml_conv_2d_direct(ctx0, block.s0_conv_exp_w, cur, stride, stride, 1, 1, 1, 1);
    }

    // BN + Activation
    if (block.s0_bn1_w) cur = rms_norm_2d(cur, block.s0_bn1_w);
    cur = ggml_gelu(ctx0, cur);

    // 2. Pointwise Linear Conv (1x1)
    // 1x1 Convs usually have padding=0 and stride=1
    cur = ggml_conv_2d_direct(ctx0, block.s0_conv_pwl_w, cur, 1, 1, 0, 0, 1, 1);
    if (block.s0_bn2_w) cur = rms_norm_2d(cur, block.s0_bn2_w);

    // 3. Residual Connection
    // Only apply residual if spatial dimensions and channels match (stride 1)
    if (stride == 1 && inp->ne[2] == cur->ne[2] && inp->ne[0] == cur->ne[0]) {
        cur = ggml_add(ctx0, cur, inp);
    }

    return cur;
}

// Universal Inverted Residual Block (Stage 1+)
ggml_tensor * clip_graph_mobilenetv5::build_inverted_residual(ggml_tensor * inp, const mobilenetv5_block & block, int stride) {
    ggml_tensor * cur = inp;

    // 1. Depthwise Start (Optional)
    // NOTE: dw_start always has stride=1 (no downsampling here)
    if (block.dw_start_w) {
        int k = block.dw_start_w->ne[0]; // 3 or 5
        int p = k / 2;
        cur = ggml_conv_2d_dw(ctx0, block.dw_start_w, cur, 1, 1, p, p, 1, 1);
        if (block.dw_start_bn_w) cur = rms_norm_2d(cur, block.dw_start_bn_w);
    }

    // 2. Pointwise Expansion (1x1)
    if (block.pw_exp_w) {
        // Standard 1x1 conv, pad=0, stride=1
        cur = ggml_conv_2d_direct(ctx0, block.pw_exp_w, cur, 1, 1, 0, 0, 1, 1);
        if (block.pw_exp_bn_w) cur = rms_norm_2d(cur, block.pw_exp_bn_w);
        cur = ggml_gelu(ctx0, cur);
    }

    // 3. Depthwise Mid (Optional)
    // NOTE: dw_mid is where downsampling happens (stride=2 for first block of stage)
    if (block.dw_mid_w) {
        int k = block.dw_mid_w->ne[0]; // 3 or 5

        if (stride > 1) {
            // Case: Stride 2 (Downsample) -> Use Asymmetric "Same" Padding
            cur = pad_same_2d(cur, k, k, stride, stride);
            cur = ggml_conv_2d_dw(ctx0, block.dw_mid_w, cur, stride, stride, 0, 0, 1, 1); // pad=0
        } else {
            // Case: Stride 1 -> Use Standard Symmetric Padding
            int p = k / 2;
            cur = ggml_conv_2d_dw(ctx0, block.dw_mid_w, cur, stride, stride, p, p, 1, 1);
        }

        if (block.dw_mid_bn_w) cur = rms_norm_2d(cur, block.dw_mid_bn_w);
        cur = ggml_gelu(ctx0, cur);
    }

    // 4. Pointwise Projection (1x1)
    if (block.pw_proj_w) {
        cur = ggml_conv_2d_direct(ctx0, block.pw_proj_w, cur, 1, 1, 0, 0, 1, 1);
        if (block.pw_proj_bn_w) cur = rms_norm_2d(cur, block.pw_proj_bn_w);
    }

    // Apply Layer Scaling if present
    if (block.layer_scale_w) {
        cur = ggml_mul(ctx0, cur, block.layer_scale_w);
    }

    // 5. Residual Connection
    bool same_spatial = (inp->ne[0] == cur->ne[0]) && (inp->ne[1] == cur->ne[1]);
    bool same_channel = (inp->ne[2] == cur->ne[2]);
    if (same_spatial && same_channel) {
        cur = ggml_add(ctx0, cur, inp);
    }

    return cur;
}

// Attention Block (MQA)
ggml_tensor * clip_graph_mobilenetv5::build_mobilenet_attn(ggml_tensor * inp, const mobilenetv5_block & block) {
    ggml_tensor * cur = inp;

    // Norm
    if (block.attn_norm_w) {
        cur = rms_norm_2d(cur, block.attn_norm_w, 1e-6f);
    }

    // 1. Q Calculation
    ggml_tensor * q = ggml_conv_2d_direct(ctx0, block.attn_q_w, cur, 1, 1, 0, 0, 1, 1);

    // 2. K Calculation (Downsampled)
    // Uses Conv2dSame(640, 640, kernel_size=(3, 3), stride=(2, 2), groups=640)
    ggml_tensor * k_inp = cur;
    if (block.attn_k_dw_w) {
        int k_size = block.attn_k_dw_w->ne[0]; // Usually 3
        k_inp = pad_same_2d(cur, k_size, k_size, 2, 2); // Apply SAME padding
        k_inp = ggml_conv_2d_dw(ctx0, block.attn_k_dw_w, k_inp, 2, 2, 0, 0, 1, 1); // padding=0
        if (block.attn_k_norm_w) {
            k_inp = rms_norm_2d(k_inp, block.attn_k_norm_w, 1e-6f);
        }
    }
    ggml_tensor * k = ggml_conv_2d_direct(ctx0, block.attn_k_w, k_inp, 1, 1, 0, 0, 1, 1);

    // 3. V Calculation (Downsampled)
    // Uses Conv2dSame(640, 640, kernel_size=(3, 3), stride=(2, 2), groups=640)
    ggml_tensor * v_inp = cur;
    if (block.attn_v_dw_w) {
        int v_size = block.attn_v_dw_w->ne[0]; // Usually 3
        v_inp = pad_same_2d(cur, v_size, v_size, 2, 2); // Apply SAME padding
        v_inp = ggml_conv_2d_dw(ctx0, block.attn_v_dw_w, v_inp, 2, 2, 0, 0, 1, 1); // padding=0
        if (block.attn_v_norm_w) {
            v_inp = rms_norm_2d(v_inp, block.attn_v_norm_w, 1e-6f);
        }
    }
    ggml_tensor * v = ggml_conv_2d_direct(ctx0, block.attn_v_w, v_inp, 1, 1, 0, 0, 1, 1);

    const int W = cur->ne[0]; const int H = cur->ne[1]; const int B = cur->ne[3];
    const int D = k->ne[2]; // Head dimension
    const int n_head = q->ne[2] / D;
    const int N = W * H;
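    // Q has n_head * D channels while K/V have only D: a single shared KV head,
    // i.e. multi-query attention over the N = W*H spatial positions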

    // Process Q: [W, H, D*n_head, B] -> [D, N, n_head, B]
    q = ggml_reshape_3d(ctx0, q, N, D*n_head, B);
    q = ggml_reshape_4d(ctx0, q, N, D, n_head, B);
    q = ggml_permute(ctx0, q, 1, 0, 2, 3); // [D, N, n_head, B]
    q = ggml_cont(ctx0, q);

    const int Wk = k->ne[0]; const int Hk = k->ne[1];
    const int M = Wk * Hk;

    // Process K: [Wk, Hk, D, B] -> [D, M, 1, B]
    k = ggml_reshape_3d(ctx0, k, M, D, B);
    k = ggml_reshape_4d(ctx0, k, M, D, 1, B);
    k = ggml_permute(ctx0, k, 1, 0, 2, 3); // [D, M, 1, B]
    k = ggml_cont(ctx0, k);

    // Process V: [Wk, Hk, D, B] -> [M, D, 1, B]
    v = ggml_reshape_3d(ctx0, v, M, D, B);
    v = ggml_reshape_4d(ctx0, v, M, D, 1, B);
    v = ggml_cont(ctx0, v); // [M, D, 1, B]

    // Multi-Query Attention
    float scale = 1.0f / sqrtf((float)D);

    // Step 1: Compute Q @ K.T
    ggml_tensor * scores = ggml_mul_mat(ctx0, k, q);
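    // k is [D, M, 1, B] and q is [D, N, n_head, B]; ggml_mul_mat broadcasts the
    // single KV head across q's n_head dimension, giving scores of shape [M, N, n_head, B]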

    scores = ggml_scale(ctx0, scores, scale);

    scores = ggml_soft_max(ctx0, scores);

    ggml_tensor * kqv = ggml_mul_mat(ctx0, v, scores);
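    // v is [M, D, 1, B] and scores is [M, N, n_head, B] -> kqv is [D, N, n_head, B]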

    kqv = ggml_permute(ctx0, kqv, 1, 0, 2, 3);
    kqv = ggml_cont(ctx0, kqv);

    kqv = ggml_reshape_3d(ctx0, kqv, N, D * n_head, B);
    kqv = ggml_reshape_4d(ctx0, kqv, W, H, D * n_head, B);
    kqv = ggml_cont(ctx0, kqv);

    // Output projection
    cur = ggml_conv_2d_direct(ctx0, block.attn_o_w, kqv, 1, 1, 0, 0, 1, 1);

    // Residual & Layer Scale
    if (inp->ne[0] == cur->ne[0] && inp->ne[2] == cur->ne[2]) {
        if (block.layer_scale_w) {
            cur = ggml_mul(ctx0, cur, block.layer_scale_w);
        }
        cur = ggml_add(ctx0, cur, inp);
    }

    return cur;
}

ggml_cgraph * clip_graph_mobilenetv5::build() {
    ggml_tensor * inp = build_inp_raw();

    // 1. Stem - Conv2dSame(3, 64, kernel_size=(3, 3), stride=(2, 2))
    ggml_tensor * cur = pad_same_2d(inp, 3, 3, 2, 2); // Apply SAME padding

    cur = ggml_conv_2d_direct(ctx0, model.mobilenet_stem_conv_w, cur, 2, 2, 0, 0, 1, 1); // padding=0
    if (model.mobilenet_stem_conv_b) {
        cur = ggml_add(ctx0, cur, model.mobilenet_stem_conv_b);
    }
    if (model.mobilenet_stem_norm_w) cur = rms_norm_2d(cur, model.mobilenet_stem_norm_w);
    cur = ggml_gelu(ctx0, cur);

    // 2. Blocks
    std::vector<ggml_tensor*> intermediate_features;
    const int total_blocks = model.mobilenet_blocks.size();

    auto is_stage_start = [&](int i) {
        if (i == 0) return true;
        for (int end_idx : model.mobilenet_stage_ends) {
            if (i == end_idx + 1) return true;
        }
        return false;
    };

    auto is_fusion_point = [&](int i) {
        if (model.mobilenet_stage_ends.size() >= 4) {
            if (i == model.mobilenet_stage_ends[2]) return true; // End of Stage 2
            if (i == model.mobilenet_stage_ends[3]) return true; // End of Stage 3
        } else {
            if (i == total_blocks - 1) return true;
        }
        return false;
    };
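    // The MSFA below fuses the stage-2 and stage-3 outputs; if fewer stage ends
    // are recorded, fall back to fusing only the final block's output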

    for (int i = 0; i < total_blocks; i++) {
        const auto & block = model.mobilenet_blocks[i];
        int stride = is_stage_start(i) ? 2 : 1;

        if      (block.s0_conv_exp_w) cur = build_edge_residual(cur, block, stride);
        else if (block.attn_q_w)      cur = build_mobilenet_attn(cur, block);
        else                          cur = build_inverted_residual(cur, block, stride);

        if (is_fusion_point(i)) {
            intermediate_features.push_back(cur);
        }
    }

    // 3. Multi-Scale Fusion Adapter (MSFA)
    if (!intermediate_features.empty()) {

        // A. Reference Resolution: PyTorch implementation uses inputs[0]
        // We assume intermediate_features[0] is the "High Resolution" target.
        // In MobileNet designs, this is typically the feature map with the smallest stride (e.g. 32x32).
        ggml_tensor* target_feat = intermediate_features[0];
        int high_res_w = target_feat->ne[0];
        int high_res_h = target_feat->ne[1];

        std::vector<ggml_tensor*> resized_feats;

        // B. Resize inputs to match inputs[0] (High Resolution)
        for (auto feat : intermediate_features) {
            int feat_w = feat->ne[0];
            int feat_h = feat->ne[1];

            // PyTorch: if feat_size < high_resolution: interpolate
            if (feat_w < high_res_w || feat_h < high_res_h) {
                // Calculate scale factor.
                // Note: PyTorch 'nearest' interpolation works with arbitrary float scales,
                // while ggml_upscale takes an integer scale factor.
                // Assuming standard power-of-2 scaling (e.g. 16 -> 32 means scale=2).
                int scale_w = high_res_w / feat_w;
                // int scale_h = high_res_h / feat_h;

                // Safety check for non-integer scaling if strictly replicating
                GGML_ASSERT(high_res_w % feat_w == 0);

                // Upsample (Nearest Neighbor) by the integer factor scale_w
                feat = ggml_upscale(ctx0, feat, scale_w, ggml_scale_mode::GGML_SCALE_MODE_NEAREST);
            }
            resized_feats.push_back(feat);
        }

        // C. Concatenate at High Resolution (Channel Dim = 2 in ggml)
        cur = resized_feats[0];
        for (size_t k = 1; k < resized_feats.size(); ++k) {
            cur = ggml_concat(ctx0, cur, resized_feats[k], 2);
        }

        // D. FFN (UniversalInvertedResidual)
        // Structure: Expand Conv -> Norm -> GELU -> Project Conv -> Norm

        // 1. Expansion
        if (model.msfa_ffn_expand_w) {
            // 1x1 Conv
            cur = ggml_conv_2d_direct(ctx0, model.msfa_ffn_expand_w, cur, 1, 1, 0, 0, 1, 1);

            if (model.msfa_ffn_expand_bn) {
                cur = rms_norm_2d(cur, model.msfa_ffn_expand_bn);
            }

            cur = ggml_gelu(ctx0, cur);
        }

        // 2. Projection (No DW because kernel_size=0)
        if (model.msfa_ffn_project_w) {
            // 1x1 Conv
            cur = ggml_conv_2d_direct(ctx0, model.msfa_ffn_project_w, cur, 1, 1, 0, 0, 1, 1);

            // UniversalInvertedResidual typically has a norm after projection
            if (model.msfa_ffn_project_bn) {
                cur = rms_norm_2d(cur, model.msfa_ffn_project_bn);
            }
        }

        // E. Final Downsample to Target Resolution (Output Resolution)
        // PyTorch: matches self.output_resolution (e.g. 16x16)
        const int target_out_res = 16;
        int current_w = cur->ne[0];

        if (current_w > target_out_res) {
            int s = current_w / target_out_res;

            GGML_ASSERT(current_w % target_out_res == 0);

            // Avg Pool: Kernel=s, Stride=s
            cur = ggml_pool_2d(ctx0, cur, GGML_OP_POOL_AVG, s, s, s, s, 0, 0);
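            // e.g. a fused 32x32 map is pooled with s=2 down to 16x16, a 48x48 map with s=3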
        }

        // F. Final Norm
        if (model.msfa_concat_norm_w) {
            cur = rms_norm_2d(cur, model.msfa_concat_norm_w);
        }
    }

    // 4. Gemma 3n Multimodal Projection (Embedder)
    // Input: 'cur' is [Width, Height, Channels, Batch]
    int W = cur->ne[0];
    int H = cur->ne[1];
    int C = cur->ne[2];
    int B = cur->ne[3];

    GGML_ASSERT(C == hparams.n_embd);

    // 1. Permute and Flatten to [Channels, Tokens, Batch]
    // PyTorch expects (Batch, Seq, Hidden), GGML usually processes (Hidden, Seq, Batch)
    cur = ggml_permute(ctx0, cur, 2, 1, 0, 3); // -> [C, H, W, B]
    cur = ggml_permute(ctx0, cur, 0, 2, 1, 3); // -> [C, W, H, B]
    cur = ggml_cont(ctx0, cur);
    cur = ggml_reshape_3d(ctx0, cur, C, W*H, B);
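    // each spatial position becomes one token: W*H tokens of C channels
    // (16x16 = 256 tokens for the default output resolution)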
    cur = ggml_cont(ctx0, cur);

    // 2. FEATURE SCALING
    // PyTorch: vision_outputs *= self.config.vision_config.hidden_size**0.5
    const float scale_factor = sqrtf((float)C);
    cur = ggml_scale(ctx0, cur, scale_factor);

    // 3. SOFT EMBEDDING NORM
    // PyTorch: self._norm(x) * self.weight
    // We must normalize regardless, then multiply if weight exists.
    {
        const float eps = 1e-6f; // Gemma3n uses 1e-6
        cur = ggml_rms_norm(ctx0, cur, eps);

        if (model.mm_soft_emb_norm_w) {
            // Weight shape is (2048,) -> Element-wise broadcast multiply
            cur = ggml_mul(ctx0, cur, model.mm_soft_emb_norm_w);
        }
    }

    // 4. PROJECTION
    // PyTorch: embedding_projection = nn.Linear(vision_hidden, text_hidden, bias=False)
    // Weight stored as [out_features, in_features] = [text_hidden_size, vision_hidden_size]
    if (model.mm_input_proj_w) {
        cur = ggml_mul_mat(ctx0, model.mm_input_proj_w, cur);
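        // in ggml order the weight is [vision_hidden, text_hidden], so the result is
        // [text_hidden, W*H, B] - one text-space embedding per vision token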
    }

    // 5. POST PROJECTION NORM
    // PyTorch: embedding_post_projection_norm = Gemma3nRMSNorm(..., with_scale=False)
    // with_scale=False means weight is registered as buffer with value 1.0
    // So output = rms_norm(x) * 1.0 = rms_norm(x), magnitude ~1
    {
        const float eps = 1e-6f;
        cur = ggml_rms_norm(ctx0, cur, eps);

        if (model.mm_post_proj_norm_w) {
            // If weight is loaded, multiply (should be ~1.0 anyway)
            cur = ggml_mul(ctx0, cur, model.mm_post_proj_norm_w);
        }
    }

    ggml_build_forward_expand(gf, cur);
    return gf;
}

@ -76,3 +76,36 @@ struct clip_graph_glm4v : clip_graph {
    clip_graph_glm4v(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
    ggml_cgraph * build() override;
};

struct clip_graph_mobilenetv5 : clip_graph {
    clip_graph_mobilenetv5(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
    ggml_cgraph * build() override;

    ggml_tensor * rms_norm_2d(
        ggml_tensor * inp,
        ggml_tensor * weight,
        float eps = 1e-6f);

    ggml_tensor* pad_same_2d(
        ggml_tensor* inp,
        int kernel_h,
        int kernel_w,
        int stride_h,
        int stride_w,
        int dilation_h = 1,
        int dilation_w = 1);

    ggml_tensor * build_edge_residual(
        ggml_tensor * inp,
        const mobilenetv5_block & block,
        int stride);

    ggml_tensor * build_inverted_residual(
        ggml_tensor * inp,
        const mobilenetv5_block & block,
        int stride);

    ggml_tensor * build_mobilenet_attn(
        ggml_tensor * inp,
        const mobilenetv5_block & block);
};
@ -266,7 +266,7 @@ struct mtmd_context {
    }

    // set boi/eoi
    if (proj == PROJECTOR_TYPE_GEMMA3) {
    if (proj == PROJECTOR_TYPE_GEMMA3 || proj == PROJECTOR_TYPE_GEMMA3NV) {
        // <start_of_image> ... (image embeddings) ... <end_of_image>
        img_beg = "<start_of_image>";
        img_end = "<end_of_image>";

@ -862,10 +862,15 @@ float * mtmd_get_output_embd(mtmd_context * ctx) {
}

bool mtmd_decode_use_non_causal(mtmd_context * ctx) {
    if (ctx->ctx_v && clip_get_projector_type(ctx->ctx_v) == PROJECTOR_TYPE_GEMMA3) {
        return true;
    switch (ctx->proj_type_v()) {
        case PROJECTOR_TYPE_QWEN2VL:
        case PROJECTOR_TYPE_QWEN25VL:
        case PROJECTOR_TYPE_QWEN3VL:
        case PROJECTOR_TYPE_YOUTUVL:
            return true;
        default:
            return false;
    }
    return false;
}

bool mtmd_decode_use_mrope(mtmd_context * ctx) {