simplified code; addressed reviews
parent: a9f70e2048
commit: a565bbd1b4
@@ -772,9 +772,6 @@ class TextModel(ModelBase):
         if "text_config" in self.hparams:
             # move the text_config to the root level
             self.hparams = {**self.hparams, **self.hparams["text_config"]}
-        if "llm_config" in self.hparams:
-            # also handle llm_config for VLM models (e.g., Nemotron Nano 12B v2 VL)
-            self.hparams = {**self.hparams, **self.hparams["llm_config"]}
 
         self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer", "num_layers"])
         self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
@@ -4061,7 +4058,7 @@ class InternVisionModel(MmprojModel):
 
 @ModelBase.register(
     "NemotronH_Nano_VL_V2",
-    "RADIOModel"
+    "RADIOModel",
 )
 class NemotronNanoV2VLModel(MmprojModel):
     # ViT-Huge architecture parameters for RADIO v2.5-h
@@ -4109,20 +4106,19 @@ class NemotronNanoV2VLModel(MmprojModel):
         if "input_conditioner" in name:
            return
 
-        if name.startswith("vision_model.radio_model.model."):
-            if ".attn.qkv." in name:
-                wq, wk, wv = data_torch.chunk(3, dim=0)
-                yield from super().modify_tensors(wq, name.replace("attn.qkv", "attn.q"), bid)
-                yield from super().modify_tensors(wk, name.replace("attn.qkv", "attn.k"), bid)
-                yield from super().modify_tensors(wv, name.replace("attn.qkv", "attn.v"), bid)
-                return
-            yield from super().modify_tensors(data_torch, name, bid)
-            return
+        # RADIO's pos_embed doesn't have .weight suffix, but clip.cpp expects it
+        if "patch_generator.pos_embed" in name and not name.endswith(".weight"):
+            name += ".weight"
 
-        # Handle projector tensors (mlp1.*)
-        if name.startswith("mlp1."):
         # Reshape linear patch embedding to conv2d format for ggml_conv_2d
         # From [n_embd, patch_size*patch_size*3] to [n_embd, 3, patch_size, patch_size]
         if "patch_generator.embedder" in name:
             patch_size = self.hparams["patch_size"]
             n_embd = self.hparams["hidden_size"]
             data_torch = data_torch.reshape(n_embd, 3, patch_size, patch_size)
 
+        if name.startswith("vision_model.radio_model.model.") or name.startswith("mlp1."):
+            yield from super().modify_tensors(data_torch, name, bid)
+            return
 
 
 @ModelBase.register("WavTokenizerDec")
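
For reviewers: the fused RADIO qkv projection is no longer split on the Python side; the tensor is now kept whole and mapped through the V_ENC_ATTN_QKV entry (see the tensor_mapping changes below). A rough torch sketch of what the deleted branch used to do (shapes invented for illustration, not part of the commit):

    import torch

    n_embd = 8
    # a fake fused qkv projection weight of shape [3 * n_embd, n_embd]
    qkv = torch.arange(3 * n_embd * n_embd, dtype=torch.float32).reshape(3 * n_embd, n_embd)
    # the old code split it into separate q/k/v tensors before export
    wq, wk, wv = qkv.chunk(3, dim=0)
    assert wq.shape == wk.shape == wv.shape == (n_embd, n_embd)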
@@ -7106,6 +7102,8 @@ class Mamba2Model(TextModel):
         if hparams is None:
             with open(dir_model / "config.json", "r", encoding="utf-8") as f:
                 hparams = json.load(f)
+        if "llm_config" in hparams:
+            hparams["text_config"] = hparams["llm_config"]
         super().__init__(dir_model, *args, hparams=hparams, **kwargs)
         self.d_model = self.find_hparam(["hidden_size", "d_model", "dim"])
         self.d_inner = self.find_hparam(["mamba_d_ssm", "intermediate_size", "d_inner"], optional=True) or 2 * self.d_model
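
For reviewers: with the llm_config handling removed from TextModel (first hunk), VLM checkpoints that carry their text model under "llm_config" are now aliased to "text_config" here instead, so the existing flattening in TextModel.__init__ picks them up. A rough sketch of the resulting flow (config values invented for illustration, not part of the commit):

    hparams = {
        "model_type": "nemotron_vl",  # invented top-level config
        "llm_config": {"num_hidden_layers": 40, "hidden_size": 5120},
    }
    # Mamba2Model.__init__ aliases llm_config to text_config ...
    if "llm_config" in hparams:
        hparams["text_config"] = hparams["llm_config"]
    # ... and TextModel.__init__ then merges text_config into the root level
    hparams = {**hparams, **hparams["text_config"]}
    assert hparams["num_hidden_layers"] == 40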
@@ -1368,7 +1368,7 @@ class TensorNameMap:
         MODEL_TENSOR.V_ENC_ATTN_QKV: (
             "visual.blocks.{bid}.attn.qkv", # qwen3vl
             "model.vision.transformer.layers.{bid}.attention.query_key_value", # cogvlm
-            "vision_tower.encoder.blocks.{bid}.wqkv" # Kimi-K2.5
+            "vision_tower.encoder.blocks.{bid}.wqkv", # Kimi-K2.5
             "vision_model.radio_model.model.blocks.{bid}.attn.qkv", # Nemotron Nano v2 VL
         ),
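
For reviewers: the added trailing comma matters here. Without it, Python silently concatenates the two adjacent string literals inside the tuple, so the Kimi-K2.5 and Nemotron entries collapse into one bogus tensor name. A minimal illustration (not part of the commit):

    broken = (
        "vision_tower.encoder.blocks.{bid}.wqkv"  # no comma -> implicit string concatenation
        "vision_model.radio_model.model.blocks.{bid}.attn.qkv",
    )
    fixed = (
        "vision_tower.encoder.blocks.{bid}.wqkv",
        "vision_model.radio_model.model.blocks.{bid}.attn.qkv",
    )
    assert len(broken) == 1 and len(fixed) == 2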
@@ -1383,7 +1383,6 @@ class TensorNameMap:
             "visual.blocks.{bid}.attn.q", # qwen2vl, generated
             "vision_tower.encoder.blocks.{bid}.wq", # kimi-vl, generated
             "siglip2.vision_model.encoder.layers.{bid}.self_attn.q_proj", # youtuvl
-            "vision_model.radio_model.model.blocks.{bid}.attn.q", # Nemotron Nano v2 VL
         ),
 
         MODEL_TENSOR.V_ENC_ATTN_Q_NORM: (
@@ -1402,7 +1401,6 @@ class TensorNameMap:
             "visual.blocks.{bid}.attn.k", # qwen2vl, generated
             "vision_tower.encoder.blocks.{bid}.wk", # kimi-vl, generated
             "siglip2.vision_model.encoder.layers.{bid}.self_attn.k_proj",
-            "vision_model.radio_model.model.blocks.{bid}.attn.k", # Nemotron Nano v2 VL
         ),
 
         MODEL_TENSOR.V_ENC_ATTN_K_NORM: (
@@ -1420,7 +1418,6 @@ class TensorNameMap:
             "vision_encoder.transformer.layers.{bid}.attention.wv", # pixtral
             "visual.blocks.{bid}.attn.v", # qwen2vl, generated
             "vision_tower.encoder.blocks.{bid}.wv", # kimi-vl, generated
-            "vision_model.radio_model.model.blocks.{bid}.attn.v", # Nemotron Nano v2 VL
             "siglip2.vision_model.encoder.layers.{bid}.self_attn.v_proj",
         ),
@@ -1373,10 +1373,6 @@ struct clip_model_loader {
         model.norm_embd_b = get_tensor(string_format(TN_NORM_EMBD, "bias"), false);
 
         model.position_embeddings = get_tensor(string_format(TN_POS_EMBD, prefix), false);
-        if (!model.position_embeddings) {
-            // fallback for models using tensor name without .weight suffix
-            model.position_embeddings = get_tensor(string_format("%s.position_embd", prefix), false);
-        }
 
         if (model.proj_type == PROJECTOR_TYPE_GEMMA3NV) {
             hparams.n_layer = 0; // gemma3n does not use normal layer structure
@@ -7,36 +7,11 @@ ggml_cgraph * clip_graph_nemotron_v2_vl::build() {
     const int n_registers = model.class_embedding->ne[1];
     const int n_pos = n_patches + n_registers;
 
-    ggml_tensor * inp_raw = build_inp_raw();
-    ggml_tensor * inp;
-    {
-        ggml_tensor * kernel = ggml_reshape_4d(ctx0, model.patch_embeddings_0,
-                patch_size, patch_size, 3, n_embd);
-        inp = ggml_im2col(ctx0, kernel, inp_raw, patch_size, patch_size, 0, 0, 1, 1, true, inp_raw->type);
-        inp = ggml_mul_mat(ctx0, model.patch_embeddings_0, inp);
-        inp = ggml_reshape_2d(ctx0, inp, n_embd, n_patches);
-        cb(inp, "patch_embd", -1);
-    }
+    ggml_tensor * inp = build_inp();
 
-    {
-        const int max_patches_per_side = (int)std::sqrt((float)model.position_embeddings->ne[1]);
-
-        ggml_tensor * pos_embd = ggml_reshape_3d(ctx0, model.position_embeddings, n_embd, max_patches_per_side, max_patches_per_side);
-
-        const int pool_h = max_patches_per_side / n_patches_y;
-        const int pool_w = max_patches_per_side / n_patches_x;
-
-        if (pool_h > 1 || pool_w > 1) {
-            pos_embd = ggml_cont(ctx0, ggml_permute(ctx0, pos_embd, 1, 2, 0, 3));
-            pos_embd = ggml_pool_2d(ctx0, pos_embd, GGML_OP_POOL_AVG, pool_w, pool_h, pool_w, pool_h, 0, 0);
-            pos_embd = ggml_cont(ctx0, ggml_permute(ctx0, pos_embd, 2, 0, 1, 3));
-        }
-
-        pos_embd = ggml_reshape_2d(ctx0, pos_embd, n_embd, n_patches);
-
-        inp = ggml_add(ctx0, inp, pos_embd);
-        cb(inp, "inp_pos", -1);
-    }
+    ggml_tensor * pos_embd = resize_position_embeddings();
+    inp = ggml_add(ctx0, inp, pos_embd);
+    cb(inp, "inp_pos", -1);
 
     inp = ggml_concat(ctx0, model.class_embedding, inp, 1);
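
For reviewers: a rough numpy sketch of the position-embedding resize that the removed inline block performed and that the shared resize_position_embeddings() helper is assumed to encapsulate — the learned square grid of embeddings is average-pooled down to the current patch grid (shapes invented for illustration, not part of the commit):

    import numpy as np

    def resize_pos_embd(pos_embd, n_patches_x, n_patches_y):
        # pos_embd: [max_side * max_side, n_embd], row-major over the learned grid
        n_embd = pos_embd.shape[1]
        max_side = int(np.sqrt(pos_embd.shape[0]))
        grid = pos_embd.reshape(max_side, max_side, n_embd)
        pool_h = max_side // n_patches_y
        pool_w = max_side // n_patches_x
        if pool_h > 1 or pool_w > 1:
            # average-pool each pool_h x pool_w cell of the grid
            grid = grid[:n_patches_y * pool_h, :n_patches_x * pool_w]
            grid = grid.reshape(n_patches_y, pool_h, n_patches_x, pool_w, n_embd).mean(axis=(1, 3))
        return grid.reshape(n_patches_y * n_patches_x, n_embd)

    print(resize_pos_embd(np.zeros((32 * 32, 8)), 16, 16).shape)  # (256, 8)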
@@ -47,24 +22,7 @@ ggml_cgraph * clip_graph_nemotron_v2_vl::build() {
             ggml_row_size(cur->type, n_embd),
             n_registers * ggml_row_size(cur->type, n_embd));
 
-    {
-        const int scale_factor = model.hparams.n_merge;
-        const int bsz = 1;
-        const int height = n_patches_y;
-        const int width = n_patches_x;
-        GGML_ASSERT(scale_factor > 0);
-        cur = ggml_reshape_4d(ctx0, cur, n_embd * scale_factor, height / scale_factor, width, bsz);
-        cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
-        cur = ggml_cont_4d(ctx0, cur,
-                n_embd * scale_factor * scale_factor,
-                height / scale_factor,
-                width / scale_factor,
-                bsz);
-        cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
-        cur = ggml_cont_2d(ctx0, cur,
-                n_embd * scale_factor * scale_factor,
-                cur->ne[1] * cur->ne[2]);
-    }
+    cur = build_patch_merge_permute(cur, model.hparams.n_merge);
 
     {
         cur = build_norm(cur, model.mm_0_w, nullptr, NORM_TYPE_RMS, 1e-6, -1);
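
For reviewers: a rough numpy sketch of the n_merge x n_merge patch merge that the removed block implemented and that build_patch_merge_permute() is assumed to cover — every n_merge x n_merge neighbourhood of patch tokens is folded into the channel dimension before the projector (the exact channel ordering inside each merged block may differ; shapes invented for illustration, not part of the commit):

    import numpy as np

    def patch_merge(tokens, height, width, n_merge):
        # tokens: [height * width, n_embd], row-major over the patch grid
        n_embd = tokens.shape[1]
        grid = tokens.reshape(height // n_merge, n_merge, width // n_merge, n_merge, n_embd)
        grid = grid.transpose(0, 2, 1, 3, 4)  # group each n_merge x n_merge neighbourhood
        return grid.reshape(-1, n_embd * n_merge * n_merge)

    merged = patch_merge(np.zeros((24 * 24, 16)), 24, 24, 2)
    print(merged.shape)  # (144, 64): 4x fewer tokens, 4x wider embeddings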
@@ -75,4 +33,3 @@ ggml_cgraph * clip_graph_nemotron_v2_vl::build() {
 
     return gf;
 }
-