diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index 2afaf85fb8..64c10ec40d 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -4056,6 +4056,87 @@ class InternVisionModel(MmprojModel):
         yield from super().modify_tensors(data_torch, name, bid)
 
 
+@ModelBase.register(
+    "NemotronH_Nano_VL_V2",
+    "RADIOModel",
+)
+class NemotronNanoV2VLModel(MmprojModel):
+    # ViT-Huge architecture parameters for RADIO v2.5-h
+    _vit_hidden_size = 1280
+    _vit_intermediate_size = 5120
+    _vit_num_layers = 32
+    _vit_num_heads = 16
+
+    def get_vision_config(self) -> dict[str, Any] | None:
+        # the RADIO config doesn't carry the standard ViT parameters, so they need to be constructed manually
+        vision_config = self.global_config.get("vision_config")
+        if vision_config is None:
+            return None
+        # add the ViT-H parameters
+        vision_config = {
+            **vision_config,
+            "hidden_size": self._vit_hidden_size,
+            "intermediate_size": self._vit_intermediate_size,
+            "num_hidden_layers": self._vit_num_layers,
+            "num_attention_heads": self._vit_num_heads,
+            "image_size": self.global_config.get("force_image_size", 512),
+        }
+        return vision_config
+
+    def set_gguf_parameters(self):
+        if "image_mean" not in self.preprocessor_config:
+            self.preprocessor_config["image_mean"] = [0.485, 0.456, 0.406]
+        if "image_std" not in self.preprocessor_config:
+            self.preprocessor_config["image_std"] = [0.229, 0.224, 0.225]
+
+        super().set_gguf_parameters()
+        hparams = self.global_config
+        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.NEMOTRON_V2_VL)
+        self.gguf_writer.add_vision_attention_layernorm_eps(1e-6)
+        self.gguf_writer.add_vision_use_gelu(True)
+        downsample_ratio = hparams.get("downsample_ratio", 0.5)
+        self.gguf_writer.add_vision_projector_scale_factor(int(1.0 / downsample_ratio))
+
+    def tensor_force_quant(self, name, new_name, bid, n_dims):
+        if ".position_embd." in new_name or "pos_embed" in new_name:
+            return gguf.GGMLQuantizationType.F32
+        return super().tensor_force_quant(name, new_name, bid, n_dims)
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        if "input_conditioner" in name:
+            return
+
+        # RADIO's pos_embed doesn't have a .weight suffix, but clip.cpp expects one
+        if "patch_generator.pos_embed" in name:
+            if not name.endswith(".weight"):
+                name += ".weight"
+            # Downsample position embeddings for the fixed 512x512 image size
+            import torch.nn.functional as F
+            n_embd = self.hparams["hidden_size"]
+            image_size = self.global_config.get("force_image_size", 512)
+            patch_size = self.hparams["patch_size"]
+            target_patches_per_side = image_size // patch_size  # 32
+            max_patches_per_side = int((data_torch.shape[1]) ** 0.5)  # 128
+            if target_patches_per_side != max_patches_per_side:
+                # Reshape to a 2D grid, interpolate, flatten back
+                data_torch = data_torch.reshape(1, max_patches_per_side, max_patches_per_side, n_embd)
+                data_torch = data_torch.permute(0, 3, 1, 2).float()  # [1, n_embd, 128, 128]
+                data_torch = F.interpolate(data_torch, size=(target_patches_per_side, target_patches_per_side),
+                                           mode='bilinear', align_corners=True)
+                data_torch = data_torch.permute(0, 2, 3, 1)  # [1, 32, 32, n_embd]
+                data_torch = data_torch.reshape(1, target_patches_per_side * target_patches_per_side, n_embd)
+
+        # Reshape the linear patch embedding to conv2d format for ggml_conv_2d:
+        # from [n_embd, patch_size*patch_size*3] to [n_embd, 3, patch_size, patch_size]
+        if "patch_generator.embedder" in name:
+            patch_size = self.hparams["patch_size"]
+            n_embd = self.hparams["hidden_size"]
+            data_torch = data_torch.reshape(n_embd, 3, patch_size, patch_size)
+
+        if name.startswith("vision_model.radio_model.model.") or name.startswith("mlp1."):
+            yield from super().modify_tensors(data_torch, name, bid)
+
+
 @ModelBase.register("WavTokenizerDec")
 class WavTokenizerDecModel(TextModel):
     model_arch = gguf.MODEL_ARCH.WAVTOKENIZER_DEC
@@ -7037,6 +7118,8 @@ class Mamba2Model(TextModel):
         if hparams is None:
             with open(dir_model / "config.json", "r", encoding="utf-8") as f:
                 hparams = json.load(f)
+        if "llm_config" in hparams:
+            hparams["text_config"] = hparams["llm_config"]
         super().__init__(dir_model, *args, hparams=hparams, **kwargs)
         self.d_model = self.find_hparam(["hidden_size", "d_model", "dim"])
         self.d_inner = self.find_hparam(["mamba_d_ssm", "intermediate_size", "d_inner"], optional=True) or 2 * self.d_model
@@ -9525,6 +9608,14 @@ class NemotronHModel(GraniteHybridModel):
         self.gguf_writer.add_add_bos_token(True)
 
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        # Skip vision and projector tensors for VLM variants (e.g., Nemotron Nano 12B v2 VL); they are handled by the mmproj model
+        if name.startswith(("vision_model.", "mlp1.")):
+            return
+
+        # Strip the language_model. prefix for VLM variants (e.g., Nemotron Nano 12B v2 VL)
+        if name.startswith("language_model."):
+            name = name[len("language_model."):]
+
         if self.is_moe and bid is not None:
             if name.endswith("mixer.gate.e_score_correction_bias"):
                 new_name = name.replace("e_score_correction_bias", "e_score_correction.bias")
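For reference, the position-embedding downsampling in NemotronNanoV2VLModel.modify_tensors above can be sanity-checked standalone. A minimal sketch, assuming RADIO v2.5-h's pos_embed of shape [1, 128*128, 1280] and the fixed 512x512 input with patch size 16; the tensor is a random stand-in, not real weights:

import torch
import torch.nn.functional as F

pos_embed = torch.randn(1, 128 * 128, 1280)  # random stand-in for RADIO's pos_embed table
n_embd = pos_embed.shape[2]
source = int(pos_embed.shape[1] ** 0.5)      # 128 patches per side at max resolution
target = 512 // 16                           # 32 patches per side at the fixed 512x512 input

grid = pos_embed.reshape(1, source, source, n_embd).permute(0, 3, 1, 2).float()  # [1, 1280, 128, 128]
grid = F.interpolate(grid, size=(target, target), mode='bilinear', align_corners=True)
out = grid.permute(0, 2, 3, 1).reshape(1, target * target, n_embd)

assert out.shape == (1, 1024, 1280)  # 32*32 positions, matching the fixed-size graph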
"vision_tower.encoder.blocks.{bid}.mlp.fc1", # kimi-vl (fc0/fc1) "model.vision.transformer.layers.{bid}.mlp.fc2", # cogvlm "siglip2.vision_model.encoder.layers.{bid}.mlp.fc2", + "vision_model.radio_model.model.blocks.{bid}.mlp.fc2", # Nemotron Nano v2 VL ), MODEL_TENSOR.V_LAYER_SCALE_1: ( diff --git a/tools/mtmd/CMakeLists.txt b/tools/mtmd/CMakeLists.txt index 02d71f224e..755a3d4b00 100644 --- a/tools/mtmd/CMakeLists.txt +++ b/tools/mtmd/CMakeLists.txt @@ -20,6 +20,7 @@ add_library(mtmd models/internvl.cpp models/kimivl.cpp models/kimik25.cpp + models/nemotron-v2-vl.cpp models/llama4.cpp models/llava.cpp models/minicpmv.cpp diff --git a/tools/mtmd/clip-impl.h b/tools/mtmd/clip-impl.h index 3bc93ead86..03bedf9d3f 100644 --- a/tools/mtmd/clip-impl.h +++ b/tools/mtmd/clip-impl.h @@ -236,6 +236,7 @@ enum projector_type { PROJECTOR_TYPE_GLM4V, PROJECTOR_TYPE_YOUTUVL, PROJECTOR_TYPE_KIMIK25, + PROJECTOR_TYPE_NEMOTRON_V2_VL, PROJECTOR_TYPE_UNKNOWN, }; @@ -270,6 +271,7 @@ static std::map PROJECTOR_TYPE_NAMES = { { PROJECTOR_TYPE_GLM4V, "glm4v"}, { PROJECTOR_TYPE_YOUTUVL, "youtuvl"}, { PROJECTOR_TYPE_KIMIK25, "kimik25"}, + { PROJECTOR_TYPE_NEMOTRON_V2_VL, "nemotron_v2_vl"}, }; static projector_type clip_projector_type_from_string(const std::string & str) { diff --git a/tools/mtmd/clip-model.h b/tools/mtmd/clip-model.h index d4ff9151bb..e0eb9b32c8 100644 --- a/tools/mtmd/clip-model.h +++ b/tools/mtmd/clip-model.h @@ -15,6 +15,7 @@ enum ffn_op_type { FFN_GELU_ERF, FFN_SILU, FFN_GELU_QUICK, + FFN_RELU_SQR, }; enum norm_type { diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index eeccb4cda0..597289b7b6 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -559,6 +559,12 @@ ggml_tensor * clip_graph::build_ffn( cur = ggml_gelu_quick(ctx0, cur); cb(cur, "ffn_gelu_quick", il); } break; + case FFN_RELU_SQR: + { + cur = ggml_relu(ctx0, cur); + cur = ggml_sqr(ctx0, cur); + cb(cur, "ffn_relu_sqr", il); + } break; } if (down) { @@ -810,6 +816,10 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 { builder = std::make_unique(ctx, img); } break; + case PROJECTOR_TYPE_NEMOTRON_V2_VL: + { + builder = std::make_unique(ctx, img); + } break; case PROJECTOR_TYPE_LLAMA4: { builder = std::make_unique(ctx, img); @@ -1110,6 +1120,7 @@ struct clip_model_loader { } } break; case PROJECTOR_TYPE_INTERNVL: + case PROJECTOR_TYPE_NEMOTRON_V2_VL: { get_u32(KEY_PROJ_SCALE_FACTOR, hparams.n_merge, false); } break; @@ -1767,6 +1778,12 @@ struct clip_model_loader { model.mm_3_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 3, "weight")); model.mm_3_b = get_tensor(string_format(TN_MVLM_PROJ_MLP, 3, "bias")); } break; + case PROJECTOR_TYPE_NEMOTRON_V2_VL: + { + model.mm_0_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 0, "weight")); + model.mm_1_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 1, "weight")); + model.mm_3_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 3, "weight")); + } break; case PROJECTOR_TYPE_GLMA: { model.conv1d_1_w = get_tensor(string_format(TN_CONV1D, 1, "weight")); @@ -3088,6 +3105,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str case PROJECTOR_TYPE_GLM_EDGE: case PROJECTOR_TYPE_GEMMA3: case PROJECTOR_TYPE_INTERNVL: // TODO @ngxson : support dynamic resolution + case PROJECTOR_TYPE_NEMOTRON_V2_VL: { clip_image_u8 resized_image; int sz = params.image_size; @@ -3397,6 +3415,7 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im case PROJECTOR_TYPE_GEMMA3: case PROJECTOR_TYPE_IDEFICS3: case 
PROJECTOR_TYPE_INTERNVL: + case PROJECTOR_TYPE_NEMOTRON_V2_VL: case PROJECTOR_TYPE_LLAMA4: { // both X and Y are downscaled by the scale factor @@ -3805,6 +3824,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima case PROJECTOR_TYPE_GEMMA3NV: case PROJECTOR_TYPE_IDEFICS3: case PROJECTOR_TYPE_INTERNVL: + case PROJECTOR_TYPE_NEMOTRON_V2_VL: case PROJECTOR_TYPE_QWEN2A: case PROJECTOR_TYPE_GLMA: case PROJECTOR_TYPE_ULTRAVOX: @@ -3968,6 +3988,7 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) { case PROJECTOR_TYPE_MUSIC_FLAMINGO: return ctx->model.mm_2_w->ne[1]; case PROJECTOR_TYPE_INTERNVL: + case PROJECTOR_TYPE_NEMOTRON_V2_VL: return ctx->model.mm_3_w->ne[1]; case PROJECTOR_TYPE_LLAMA4: return ctx->model.mm_model_proj->ne[1]; diff --git a/tools/mtmd/models/models.h b/tools/mtmd/models/models.h index c4c67ace62..0beff16c5e 100644 --- a/tools/mtmd/models/models.h +++ b/tools/mtmd/models/models.h @@ -42,6 +42,11 @@ struct clip_graph_internvl : clip_graph { ggml_cgraph * build() override; }; +struct clip_graph_nemotron_v2_vl : clip_graph { + clip_graph_nemotron_v2_vl(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {} + ggml_cgraph * build() override; +}; + struct clip_graph_llama4 : clip_graph { clip_graph_llama4(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {} ggml_cgraph * build() override; diff --git a/tools/mtmd/models/nemotron-v2-vl.cpp b/tools/mtmd/models/nemotron-v2-vl.cpp new file mode 100644 index 0000000000..03094be1b2 --- /dev/null +++ b/tools/mtmd/models/nemotron-v2-vl.cpp @@ -0,0 +1,35 @@ +#include "models.h" + +ggml_cgraph * clip_graph_nemotron_v2_vl::build() { + GGML_ASSERT(model.class_embedding != nullptr); + GGML_ASSERT(model.position_embeddings != nullptr); + + const int n_registers = model.class_embedding->ne[1]; + const int n_pos = n_patches + n_registers; + + ggml_tensor * inp = build_inp(); + + // add position embeddings (pre-downsampled during GGUF conversion for fixed 512x512 input) + inp = ggml_add(ctx0, inp, model.position_embeddings); + cb(inp, "inp_pos", -1); + + inp = ggml_concat(ctx0, model.class_embedding, inp, 1); + + ggml_tensor * cur = build_vit(inp, n_pos, NORM_TYPE_NORMAL, hparams.ffn_op, nullptr, nullptr); + + cur = ggml_view_2d(ctx0, cur, + n_embd, n_patches, + ggml_row_size(cur->type, n_embd), + n_registers * ggml_row_size(cur->type, n_embd)); + + cur = build_patch_merge_permute(cur, model.hparams.n_merge); + + { + cur = build_norm(cur, model.mm_0_w, nullptr, NORM_TYPE_RMS, 1e-6, -1); + cur = build_ffn(cur, model.mm_1_w, nullptr, nullptr, nullptr, model.mm_3_w, nullptr, FFN_RELU_SQR, -1); + } + + ggml_build_forward_expand(gf, cur); + + return gf; +}
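For reference, a PyTorch sketch of what the projector stage at the end of clip_graph_nemotron_v2_vl::build() computes: a weight-only RMSNorm (mm_0_w) followed by an up-projection, squared ReLU (FFN_RELU_SQR), and down-projection (mm_1_w, mm_3_w). The widths below are hypothetical stand-ins, except 5120 = 1280 * 2 * 2, the embedding size after the 2x2 patch merge:

import torch

def projector(x, mm_0_w, mm_1_w, mm_3_w, eps=1e-6):
    # build_norm(..., NORM_TYPE_RMS) with no bias: weight-only RMSNorm
    x = x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + eps) * mm_0_w
    # build_ffn(..., FFN_RELU_SQR): up-projection, squared ReLU, down-projection (no biases, no gate)
    x = x @ mm_1_w.T
    x = torch.relu(x).square()  # ggml_relu followed by ggml_sqr
    return x @ mm_3_w.T

x = torch.randn(256, 5120)        # 256 merged patch tokens (32*32 / 4), 5120 = 1280 * 2 * 2
mm_0_w = torch.ones(5120)         # RMSNorm weight
mm_1_w = torch.randn(8192, 5120)  # hypothetical intermediate width
mm_3_w = torch.randn(4096, 8192)  # hypothetical LLM embedding width
print(projector(x, mm_0_w, mm_1_w, mm_3_w).shape)  # torch.Size([256, 4096])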