diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index d4269c4c0a..94b049d0de 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -7948,21 +7948,18 @@ class VaetkiVisionModel(MmprojModel): # Handle merger tensors with special index mapping # clip.cpp PROJECTOR_TYPE_VAETKI expects: - # mm.model.mlp.0.* -> ln_q (pre-norm) - # mm.model.mlp.1.* -> mlp.0 (up projection) - # mm.model.mlp.3.* -> mlp.2 (down projection) + # mm.input_norm.* -> ln_q (pre-norm) + # mm.up.* -> mlp.0 (up projection) + # mm.down.* -> mlp.2 (down projection) if "merger.ln_q" in name: - # ln_q -> mm.model.mlp.0 (used as norm in vaetki.cpp) - suffix = "weight" if name.endswith(".weight") else "bias" - return [(f"mm.model.mlp.0.{suffix}", data_torch)] + suffix = ".weight" if name.endswith(".weight") else ".bias" + return [(self.format_tensor_name(gguf.MODEL_TENSOR.V_MM_INP_NORM, suffix=suffix), data_torch)] elif "merger.mlp.0" in name: - # mlp.0 -> mm.model.mlp.1 (up projection) - suffix = "weight" if name.endswith(".weight") else "bias" - return [(f"mm.model.mlp.1.{suffix}", data_torch)] + suffix = ".weight" if name.endswith(".weight") else ".bias" + return [(self.format_tensor_name(gguf.MODEL_TENSOR.V_MM_UP, suffix=suffix), data_torch)] elif "merger.mlp.2" in name: - # mlp.2 -> mm.model.mlp.3 (down projection) - suffix = "weight" if name.endswith(".weight") else "bias" - return [(f"mm.model.mlp.3.{suffix}", data_torch)] + suffix = ".weight" if name.endswith(".weight") else ".bias" + return [(self.format_tensor_name(gguf.MODEL_TENSOR.V_MM_DOWN, suffix=suffix), data_torch)] # Handle class_embedding and class_pos_emb (keep model.visual. prefix for mapping) if "class_embedding" in name or "class_pos_emb" in name: diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index 74457c0148..59c3efaa6e 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -1555,12 +1555,12 @@ struct clip_model_loader { case PROJECTOR_TYPE_VAETKI: { model.class_pos_emb = get_tensor(TN_CLASS_POS_EMBD); - model.mm_0_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 0, "weight")); - model.mm_0_b = get_tensor(string_format(TN_MVLM_PROJ_MLP, 0, "bias")); - model.mm_1_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 1, "weight")); - model.mm_1_b = get_tensor(string_format(TN_MVLM_PROJ_MLP, 1, "bias")); - model.mm_3_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 3, "weight")); - model.mm_3_b = get_tensor(string_format(TN_MVLM_PROJ_MLP, 3, "bias")); + model.mm_input_norm_w = get_tensor(TN_MM_INP_NORM); + model.mm_input_norm_b = get_tensor(TN_MM_INP_NORM_B); + model.mm_ffn_up_w = get_tensor(string_format(TN_MM_UP, "weight")); + model.mm_ffn_up_b = get_tensor(string_format(TN_MM_UP, "bias")); + model.mm_ffn_down_w = get_tensor(string_format(TN_MM_DOWN, "weight")); + model.mm_ffn_down_b = get_tensor(string_format(TN_MM_DOWN, "bias")); } break; case PROJECTOR_TYPE_GLM4V: { diff --git a/tools/mtmd/models/vaetki.cpp b/tools/mtmd/models/vaetki.cpp index 0e176ec925..895d3c5622 100644 --- a/tools/mtmd/models/vaetki.cpp +++ b/tools/mtmd/models/vaetki.cpp @@ -81,7 +81,7 @@ ggml_cgraph * clip_graph_vaetki::build() { cb(embeddings, "patches_only", -1); // merger - embeddings = build_norm(embeddings, model.mm_0_w, model.mm_0_b, NORM_TYPE_NORMAL, 1e-5, -1); + embeddings = build_norm(embeddings, model.mm_input_norm_w, model.mm_input_norm_b, NORM_TYPE_NORMAL, 1e-5, -1); cb(embeddings, "merger_normed", -1); // pixel shuffle @@ -90,9 +90,9 @@ ggml_cgraph * clip_graph_vaetki::build() { cb(embeddings, "merger_reshaped", -1); embeddings = build_ffn(embeddings, - model.mm_1_w, model.mm_1_b, + model.mm_ffn_up_w, model.mm_ffn_up_b, nullptr, nullptr, - model.mm_3_w, model.mm_3_b, + model.mm_ffn_down_w, model.mm_ffn_down_b, FFN_GELU, -1); cb(embeddings, "merger_out", -1);