diff --git a/tools/mtmd/clip-graph.h b/tools/mtmd/clip-graph.h index 4c7f7504cf..3604bf77e8 100644 --- a/tools/mtmd/clip-graph.h +++ b/tools/mtmd/clip-graph.h @@ -41,6 +41,11 @@ struct clip_graph { virtual ~clip_graph() = default; virtual ggml_cgraph * build() = 0; + // wrapper around ggml_mul_mat; virtual so that a model-specific graph can hook the multiplication (e.g. LoRA, clamping) + // tensor w should be the weight matrix and tensor x the input, i.e. the same argument order as ggml_mul_mat(ctx, w, x) + virtual ggml_tensor * build_mm(ggml_tensor * w, ggml_tensor * x) const; + // TODO: build_mm(w, b, x) to support bias + // // utility functions // diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index 3d6cf6fd84..44a19189ea 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -255,6 +255,10 @@ clip_graph::clip_graph(clip_ctx * ctx, const clip_image_f32 & img) : gf = ggml_new_graph_custom(ctx0, ctx->max_nodes, false); } +ggml_tensor * clip_graph::build_mm(ggml_tensor * w, ggml_tensor * x) const { + return ggml_mul_mat(ctx0, w, x); +} + void clip_graph::cb(ggml_tensor * cur, const char * name, int il) const { if (il >= 0) { ggml_format_name(cur, "%s-%d", name, il); @@ -326,7 +330,7 @@ ggml_tensor * clip_graph::build_vit( ggml_tensor * Vcur = nullptr; if (layer.qkv_w != nullptr) { // fused qkv - cur = ggml_mul_mat(ctx0, layer.qkv_w, cur); + cur = build_mm(layer.qkv_w, cur); if (layer.qkv_b != nullptr) { cur = ggml_add(ctx0, cur, layer.qkv_b); } @@ -360,17 +364,17 @@ ggml_tensor * clip_graph::build_vit( } else { // separate q, k, v - Qcur = ggml_mul_mat(ctx0, layer.q_w, cur); + Qcur = build_mm(layer.q_w, cur); if (layer.q_b) { Qcur = ggml_add(ctx0, Qcur, layer.q_b); } - Kcur = ggml_mul_mat(ctx0, layer.k_w, cur); + Kcur = build_mm(layer.k_w, cur); if (layer.k_b) { Kcur = ggml_add(ctx0, Kcur, layer.k_b); } - Vcur = ggml_mul_mat(ctx0, layer.v_w, cur); + Vcur = build_mm(layer.v_w, cur); if (layer.v_b) { Vcur = ggml_add(ctx0, Vcur, layer.v_b); } @@ -517,7 +521,7 @@ ggml_tensor * clip_graph::build_ffn( ffn_op_type type_op, int il) 
const { - ggml_tensor * tmp = up ? ggml_mul_mat(ctx0, up, cur) : cur; + ggml_tensor * tmp = up ? build_mm(up, cur) : cur; cb(tmp, "ffn_up", il); if (up_b) { @@ -526,7 +530,7 @@ ggml_tensor * clip_graph::build_ffn( } if (gate) { - cur = ggml_mul_mat(ctx0, gate, cur); + cur = build_mm(gate, cur); cb(cur, "ffn_gate", il); if (gate_b) { @@ -580,7 +584,7 @@ ggml_tensor * clip_graph::build_ffn( } if (down) { - cur = ggml_mul_mat(ctx0, down, cur); + cur = build_mm(down, cur); } if (down_b) { @@ -646,7 +650,7 @@ ggml_tensor * clip_graph::build_attn( cb(cur, "kqv_out", il); if (wo) { - cur = ggml_mul_mat(ctx0, wo, cur); + cur = build_mm(wo, cur); } if (wo_b) { diff --git a/tools/mtmd/models/cogvlm.cpp b/tools/mtmd/models/cogvlm.cpp index d5b739c687..44bc884421 100644 --- a/tools/mtmd/models/cogvlm.cpp +++ b/tools/mtmd/models/cogvlm.cpp @@ -19,7 +19,7 @@ ggml_cgraph * clip_graph_cogvlm::build() { auto & layer = model.layers[il]; ggml_tensor * cur = inpL; - cur = ggml_mul_mat(ctx0, layer.qkv_w, cur); + cur = build_mm(layer.qkv_w, cur); cur = ggml_add(ctx0, cur, layer.qkv_b); @@ -67,7 +67,7 @@ ggml_cgraph * clip_graph_cogvlm::build() { ggml_row_size(inpL->type, n_embd), 0); // Multiply with mm_model_proj - cur = ggml_mul_mat(ctx0, model.mm_model_proj, cur); + cur = build_mm(model.mm_model_proj, cur); // Apply layernorm, weight, bias cur = build_norm(cur, model.mm_post_fc_norm_w, model.mm_post_fc_norm_b, NORM_TYPE_NORMAL, 1e-5, -1); @@ -76,16 +76,16 @@ ggml_cgraph * clip_graph_cogvlm::build() { cur = ggml_gelu_inplace(ctx0, cur); // Branch 1: multiply with mm_h_to_4h_w - ggml_tensor * h_to_4h = ggml_mul_mat(ctx0, model.mm_h_to_4h_w, cur); + ggml_tensor * h_to_4h = build_mm(model.mm_h_to_4h_w, cur); // Branch 2: multiply with mm_gate_w - ggml_tensor * gate = ggml_mul_mat(ctx0, model.mm_gate_w, cur); + ggml_tensor * gate = build_mm(model.mm_gate_w, cur); // Apply silu gate = ggml_swiglu_split(ctx0, gate, h_to_4h); // Apply mm_4h_to_h_w - cur = ggml_mul_mat(ctx0, 
model.mm_4h_to_h_w, gate); + cur = build_mm(model.mm_4h_to_h_w, gate); // Concatenate with boi and eoi cur = ggml_concat(ctx0, model.mm_boi, cur, 1); diff --git a/tools/mtmd/models/conformer.cpp b/tools/mtmd/models/conformer.cpp index 9b1fab4873..f58c5048f5 100644 --- a/tools/mtmd/models/conformer.cpp +++ b/tools/mtmd/models/conformer.cpp @@ -56,7 +56,7 @@ ggml_cgraph * clip_graph_conformer::build() { cur = ggml_reshape_2d(ctx0, cur, cur->ne[0] * cur->ne[1], cur->ne[2]); // calculate out - cur = ggml_mul_mat(ctx0, model.pre_encode_out_w, cur); + cur = build_mm(model.pre_encode_out_w, cur); cur = ggml_add(ctx0, cur, model.pre_encode_out_b); cb(cur, "conformer.pre_encode.out", -1); } @@ -87,7 +87,7 @@ ggml_cgraph * clip_graph_conformer::build() { cur = build_norm(residual, layer.ln_1_w, layer.ln_1_b, NORM_TYPE_NORMAL, 1e-5, il); cb(cur, "conformer.layers.{}.norm_self_att", il); - ggml_tensor * Qcur = ggml_mul_mat(ctx0, layer.q_w, cur); + ggml_tensor * Qcur = build_mm(layer.q_w, cur); Qcur = ggml_add(ctx0, Qcur, layer.q_b); Qcur = ggml_reshape_3d(ctx0, Qcur, d_head, n_head, Qcur->ne[1]); ggml_tensor * Q_bias_u = ggml_add(ctx0, Qcur, layer.pos_bias_u); @@ -96,12 +96,12 @@ ggml_cgraph * clip_graph_conformer::build() { Q_bias_v = ggml_permute(ctx0, Q_bias_v, 0, 2, 1, 3); // TODO @ngxson : some cont can/should be removed when ggml_mul_mat support these cases - ggml_tensor * Kcur = ggml_mul_mat(ctx0, layer.k_w, cur); + ggml_tensor * Kcur = build_mm(layer.k_w, cur); Kcur = ggml_add(ctx0, Kcur, layer.k_b); Kcur = ggml_reshape_3d(ctx0, Kcur, d_head, n_head, Kcur->ne[1]); Kcur = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 0, 2, 1, 3)); - ggml_tensor * Vcur = ggml_mul_mat(ctx0, layer.v_w, cur); + ggml_tensor * Vcur = build_mm(layer.v_w, cur); Vcur = ggml_add(ctx0, Vcur, layer.v_b); Vcur = ggml_reshape_3d(ctx0, Vcur, d_head, n_head, Vcur->ne[1]); Vcur = ggml_cont(ctx0, ggml_permute(ctx0, Vcur, 1, 2, 0, 3)); @@ -111,7 +111,7 @@ ggml_cgraph * clip_graph_conformer::build() { matrix_ac 
= ggml_cont(ctx0, ggml_permute(ctx0, matrix_ac, 1, 0, 2, 3)); cb(matrix_ac, "conformer.layers.{}.self_attn.id3", il); - auto * p = ggml_mul_mat(ctx0, layer.linear_pos_w, pos_emb); + auto * p = build_mm(layer.linear_pos_w, pos_emb); cb(p, "conformer.layers.{}.self_attn.linear_pos", il); p = ggml_reshape_3d(ctx0, p, d_head, n_head, p->ne[1]); p = ggml_permute(ctx0, p, 0, 2, 1, 3); @@ -143,7 +143,7 @@ ggml_cgraph * clip_graph_conformer::build() { x = ggml_permute(ctx0, x, 2, 0, 1, 3); x = ggml_cont_2d(ctx0, x, x->ne[0] * x->ne[1], x->ne[2]); - ggml_tensor * out = ggml_mul_mat(ctx0, layer.o_w, x); + ggml_tensor * out = build_mm(layer.o_w, x); out = ggml_add(ctx0, out, layer.o_b); cb(out, "conformer.layers.{}.self_attn.linear_out", il); @@ -157,7 +157,7 @@ ggml_cgraph * clip_graph_conformer::build() { // conv { auto * x = cur; - x = ggml_mul_mat(ctx0, layer.conv_pw1_w, x); + x = build_mm(layer.conv_pw1_w, x); x = ggml_add(ctx0, x, layer.conv_pw1_b); cb(x, "conformer.layers.{}.conv.pointwise_conv1", il); @@ -181,7 +181,7 @@ ggml_cgraph * clip_graph_conformer::build() { x = ggml_silu(ctx0, x); // pointwise_conv2 - x = ggml_mul_mat(ctx0, layer.conv_pw2_w, x); + x = build_mm(layer.conv_pw2_w, x); x = ggml_add(ctx0, x, layer.conv_pw2_b); cur = x; diff --git a/tools/mtmd/models/glm4v.cpp b/tools/mtmd/models/glm4v.cpp index 6f52df41ab..9dbb162c59 100644 --- a/tools/mtmd/models/glm4v.cpp +++ b/tools/mtmd/models/glm4v.cpp @@ -97,7 +97,7 @@ ggml_cgraph * clip_graph_glm4v::build() { // FC projector { - cur = ggml_mul_mat(ctx0, model.projection, cur); + cur = build_mm(model.projection, cur); // default LayerNorm (post_projection_norm) cur = build_norm(cur, model.mm_post_norm_w, model.mm_post_norm_b, NORM_TYPE_NORMAL, 1e-5, -1); cur = ggml_gelu_erf(ctx0, cur); diff --git a/tools/mtmd/models/llama4.cpp b/tools/mtmd/models/llama4.cpp index 30d1df5bcd..01af54bbab 100644 --- a/tools/mtmd/models/llama4.cpp +++ b/tools/mtmd/models/llama4.cpp @@ -22,7 +22,7 @@ ggml_cgraph * 
clip_graph_llama4::build() { ggml_tensor * kernel = ggml_reshape_4d(ctx0, model.patch_embeddings_0, patch_size, patch_size, 3, n_embd); inp = ggml_im2col(ctx0, kernel, inp, patch_size, patch_size, 0, 0, 1, 1, true, inp->type); - inp = ggml_mul_mat(ctx0, model.patch_embeddings_0, inp); + inp = build_mm(model.patch_embeddings_0, inp); inp = ggml_reshape_2d(ctx0, inp, n_embd, n_patches); cb(inp, "patch_conv", -1); } @@ -78,15 +78,15 @@ ggml_cgraph * clip_graph_llama4::build() { // based on Llama4VisionMLP2 (always uses GELU activation, no bias) { - cur = ggml_mul_mat(ctx0, model.mm_model_mlp_1_w, cur); + cur = build_mm(model.mm_model_mlp_1_w, cur); cur = ggml_gelu(ctx0, cur); - cur = ggml_mul_mat(ctx0, model.mm_model_mlp_2_w, cur); + cur = build_mm(model.mm_model_mlp_2_w, cur); cur = ggml_gelu(ctx0, cur); cb(cur, "adapter_mlp", -1); } // Llama4MultiModalProjector - cur = ggml_mul_mat(ctx0, model.mm_model_proj, cur); + cur = build_mm(model.mm_model_proj, cur); cb(cur, "projected", -1); // build the graph diff --git a/tools/mtmd/models/llava.cpp b/tools/mtmd/models/llava.cpp index 0bfb5f05f6..4af17ccfe8 100644 --- a/tools/mtmd/models/llava.cpp +++ b/tools/mtmd/models/llava.cpp @@ -70,17 +70,17 @@ ggml_cgraph * clip_graph_llava::build() { // self-attention { - ggml_tensor * Qcur = ggml_mul_mat(ctx0, layer.q_w, cur); + ggml_tensor * Qcur = build_mm(layer.q_w, cur); if (layer.q_b) { Qcur = ggml_add(ctx0, Qcur, layer.q_b); } - ggml_tensor * Kcur = ggml_mul_mat(ctx0, layer.k_w, cur); + ggml_tensor * Kcur = build_mm(layer.k_w, cur); if (layer.k_b) { Kcur = ggml_add(ctx0, Kcur, layer.k_b); } - ggml_tensor * Vcur = ggml_mul_mat(ctx0, layer.v_w, cur); + ggml_tensor * Vcur = build_mm(layer.v_w, cur); if (layer.v_b) { Vcur = ggml_add(ctx0, Vcur, layer.v_b); } @@ -164,17 +164,17 @@ ggml_cgraph * clip_graph_llava::build() { // llava projector if (proj_type == PROJECTOR_TYPE_MLP) { - embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings); + embeddings = build_mm(model.mm_0_w, 
embeddings); embeddings = ggml_add(ctx0, embeddings, model.mm_0_b); embeddings = ggml_gelu(ctx0, embeddings); if (model.mm_2_w) { - embeddings = ggml_mul_mat(ctx0, model.mm_2_w, embeddings); + embeddings = build_mm(model.mm_2_w, embeddings); embeddings = ggml_add(ctx0, embeddings, model.mm_2_b); } } else if (proj_type == PROJECTOR_TYPE_MLP_NORM) { - embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings); + embeddings = build_mm(model.mm_0_w, embeddings); embeddings = ggml_add(ctx0, embeddings, model.mm_0_b); // ggml_tensor_printf(embeddings, "mm_0_w",0,true,false); // First LayerNorm @@ -186,7 +186,7 @@ ggml_cgraph * clip_graph_llava::build() { embeddings = ggml_gelu(ctx0, embeddings); // Second linear layer - embeddings = ggml_mul_mat(ctx0, model.mm_3_w, embeddings); + embeddings = build_mm(model.mm_3_w, embeddings); embeddings = ggml_add(ctx0, embeddings, model.mm_3_b); // Second LayerNorm @@ -197,10 +197,10 @@ ggml_cgraph * clip_graph_llava::build() { else if (proj_type == PROJECTOR_TYPE_LDP) { // MobileVLM projector int n_patch = 24; - ggml_tensor * mlp_1 = ggml_mul_mat(ctx0, model.mm_model_mlp_1_w, embeddings); + ggml_tensor * mlp_1 = build_mm(model.mm_model_mlp_1_w, embeddings); mlp_1 = ggml_add(ctx0, mlp_1, model.mm_model_mlp_1_b); mlp_1 = ggml_gelu(ctx0, mlp_1); - ggml_tensor * mlp_3 = ggml_mul_mat(ctx0, model.mm_model_mlp_3_w, mlp_1); + ggml_tensor * mlp_3 = build_mm(model.mm_model_mlp_3_w, mlp_1); mlp_3 = ggml_add(ctx0, mlp_3, model.mm_model_mlp_3_b); // mlp_3 shape = [1, 576, 2048], ne = [2048, 576, 1, 1] @@ -229,10 +229,10 @@ ggml_cgraph * clip_graph_llava::build() { // block_1 shape = [1, 2048, 1, 1], ne = [1, 1, 2048, 1] // pointwise conv block_1 = ggml_reshape_2d(ctx0, block_1, block_1->ne[0]*block_1->ne[1]*block_1->ne[2], block_1->ne[3]); - block_1 = ggml_mul_mat(ctx0, model.mm_model_block_1_block_1_fc1_w, block_1); + block_1 = build_mm(model.mm_model_block_1_block_1_fc1_w, block_1); block_1 = ggml_add(ctx0, block_1, 
model.mm_model_block_1_block_1_fc1_b); block_1 = ggml_relu(ctx0, block_1); - block_1 = ggml_mul_mat(ctx0, model.mm_model_block_1_block_1_fc2_w, block_1); + block_1 = build_mm(model.mm_model_block_1_block_1_fc2_w, block_1); block_1 = ggml_add(ctx0, block_1, model.mm_model_block_1_block_1_fc2_b); block_1 = ggml_hardsigmoid(ctx0, block_1); // block_1_hw shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1], block_1 shape = [1, 2048], ne = [2048, 1, 1, 1] @@ -244,7 +244,7 @@ ggml_cgraph * clip_graph_llava::build() { block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 0, 2, 3)); // block_1 shape = [1, 24*24, 2048], ne = [24*24, 2048, 1] - block_1 = ggml_mul_mat(ctx0, model.mm_model_block_1_block_2_0_w, block_1); + block_1 = build_mm(model.mm_model_block_1_block_2_0_w, block_1); block_1 = ggml_reshape_4d(ctx0, block_1, block_1->ne[0], w, h, block_1->ne[3]); // block_1 shape = [1, 24, 24, 2048], ne = [2048, 24, 24, 1] @@ -277,10 +277,10 @@ ggml_cgraph * clip_graph_llava::build() { // block_1 shape = [1, 2048, 1, 1], ne = [1, 1, 2048, 1] // pointwise conv block_1 = ggml_reshape_2d(ctx0, block_1, block_1->ne[0]*block_1->ne[1]*block_1->ne[2], block_1->ne[3]); - block_1 = ggml_mul_mat(ctx0, model.mm_model_block_2_block_1_fc1_w, block_1); + block_1 = build_mm(model.mm_model_block_2_block_1_fc1_w, block_1); block_1 = ggml_add(ctx0, block_1, model.mm_model_block_2_block_1_fc1_b); block_1 = ggml_relu(ctx0, block_1); - block_1 = ggml_mul_mat(ctx0, model.mm_model_block_2_block_1_fc2_w, block_1); + block_1 = build_mm(model.mm_model_block_2_block_1_fc2_w, block_1); block_1 = ggml_add(ctx0, block_1, model.mm_model_block_2_block_1_fc2_b); block_1 = ggml_hardsigmoid(ctx0, block_1); @@ -292,7 +292,7 @@ ggml_cgraph * clip_graph_llava::build() { block_1 = ggml_reshape_3d(ctx0, block_1, w*h, block_1->ne[2], block_1->ne[3]); block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 0, 2, 3)); // block_1 shape = [1, 24*24, 2048], ne = [24*24, 2048, 1] - block_1 = ggml_mul_mat(ctx0, 
model.mm_model_block_2_block_2_0_w, block_1); + block_1 = build_mm(model.mm_model_block_2_block_2_0_w, block_1); block_1 = ggml_reshape_4d(ctx0, block_1, block_1->ne[0], w, h, block_1->ne[3]); @@ -307,10 +307,10 @@ ggml_cgraph * clip_graph_llava::build() { else if (proj_type == PROJECTOR_TYPE_LDPV2) { int n_patch = 24; - ggml_tensor * mlp_0 = ggml_mul_mat(ctx0, model.mm_model_mlp_0_w, embeddings); + ggml_tensor * mlp_0 = build_mm(model.mm_model_mlp_0_w, embeddings); mlp_0 = ggml_add(ctx0, mlp_0, model.mm_model_mlp_0_b); mlp_0 = ggml_gelu(ctx0, mlp_0); - ggml_tensor * mlp_2 = ggml_mul_mat(ctx0, model.mm_model_mlp_2_w, mlp_0); + ggml_tensor * mlp_2 = build_mm(model.mm_model_mlp_2_w, mlp_0); mlp_2 = ggml_add(ctx0, mlp_2, model.mm_model_mlp_2_b); // mlp_2 ne = [2048, 576, 1, 1] // // AVG Pool Layer 2*2, strides = 2 @@ -344,15 +344,15 @@ ggml_cgraph * clip_graph_llava::build() { embeddings = ggml_add(ctx0, embeddings, model.mm_model_adapter_conv_b); // GLU { - embeddings = ggml_mul_mat(ctx0, model.mm_model_mlp_0_w, embeddings); + embeddings = build_mm(model.mm_model_mlp_0_w, embeddings); embeddings = ggml_norm(ctx0, embeddings, eps); embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_model_ln_q_w), model.mm_model_ln_q_b); embeddings = ggml_gelu_inplace(ctx0, embeddings); ggml_tensor * x = embeddings; - embeddings = ggml_mul_mat(ctx0, model.mm_model_mlp_2_w, embeddings); - x = ggml_mul_mat(ctx0, model.mm_model_mlp_1_w,x); + embeddings = build_mm(model.mm_model_mlp_2_w, embeddings); + x = build_mm(model.mm_model_mlp_1_w,x); embeddings = ggml_swiglu_split(ctx0, embeddings, x); - embeddings = ggml_mul_mat(ctx0, model.mm_model_mlp_3_w, embeddings); + embeddings = build_mm(model.mm_model_mlp_3_w, embeddings); } // arrangement of BOI/EOI token embeddings // note: these embeddings are not present in text model, hence we cannot process them as text tokens diff --git a/tools/mtmd/models/minicpmv.cpp b/tools/mtmd/models/minicpmv.cpp index 3594ea29fa..924117ab2a 100644 
--- a/tools/mtmd/models/minicpmv.cpp +++ b/tools/mtmd/models/minicpmv.cpp @@ -38,7 +38,7 @@ ggml_cgraph * clip_graph_minicpmv::build() { // resampler projector (it is just another transformer) ggml_tensor * q = model.mm_model_query; - ggml_tensor * v = ggml_mul_mat(ctx0, model.mm_model_kv_proj, embeddings); + ggml_tensor * v = build_mm(model.mm_model_kv_proj, embeddings); // norm q = build_norm(q, model.mm_model_ln_q_w, model.mm_model_ln_q_b, NORM_TYPE_NORMAL, eps, -1); @@ -77,13 +77,13 @@ ggml_cgraph * clip_graph_minicpmv::build() { // Use actual config value if available, otherwise fall back to hardcoded values int num_query = hparams.minicpmv_query_num; ggml_tensor * Q = ggml_add(ctx0, - ggml_mul_mat(ctx0, model.mm_model_attn_q_w, q), + build_mm(model.mm_model_attn_q_w, q), model.mm_model_attn_q_b); ggml_tensor * K = ggml_add(ctx0, - ggml_mul_mat(ctx0, model.mm_model_attn_k_w, k), + build_mm(model.mm_model_attn_k_w, k), model.mm_model_attn_k_b); ggml_tensor * V = ggml_add(ctx0, - ggml_mul_mat(ctx0, model.mm_model_attn_v_w, v), + build_mm(model.mm_model_attn_v_w, v), model.mm_model_attn_v_b); Q = ggml_reshape_3d(ctx0, Q, d_head, n_head, num_query); @@ -105,7 +105,7 @@ ggml_cgraph * clip_graph_minicpmv::build() { embeddings = build_norm(embeddings, model.mm_model_ln_post_w, model.mm_model_ln_post_b, NORM_TYPE_NORMAL, eps, -1); // projection - embeddings = ggml_mul_mat(ctx0, model.mm_model_proj, embeddings); + embeddings = build_mm(model.mm_model_proj, embeddings); // build the graph ggml_build_forward_expand(gf, embeddings); diff --git a/tools/mtmd/models/mobilenetv5.cpp b/tools/mtmd/models/mobilenetv5.cpp index 593afa1ddc..1c42218d2a 100644 --- a/tools/mtmd/models/mobilenetv5.cpp +++ b/tools/mtmd/models/mobilenetv5.cpp @@ -429,7 +429,7 @@ ggml_cgraph * clip_graph_mobilenetv5::build() { // PyTorch: embedding_projection = nn.Linear(vision_hidden, text_hidden, bias=False) // Weight stored as [out_features, in_features] = [text_hidden_size, vision_hidden_size] if 
(model.mm_input_proj_w) { - cur = ggml_mul_mat(ctx0, model.mm_input_proj_w, cur); + cur = build_mm(model.mm_input_proj_w, cur); } // 5. POST PROJECTION NORM diff --git a/tools/mtmd/models/pixtral.cpp b/tools/mtmd/models/pixtral.cpp index a849210b53..d6d037b694 100644 --- a/tools/mtmd/models/pixtral.cpp +++ b/tools/mtmd/models/pixtral.cpp @@ -43,7 +43,7 @@ ggml_cgraph * clip_graph_pixtral::build() { // project to n_embd cur = ggml_reshape_2d(ctx0, cur, cur->ne[0], cur->ne[1] * cur->ne[2]); - cur = ggml_mul_mat(ctx0, model.mm_patch_merger_w, cur); + cur = build_mm(model.mm_patch_merger_w, cur); } // LlavaMultiModalProjector (always using GELU activation) diff --git a/tools/mtmd/models/qwen2vl.cpp b/tools/mtmd/models/qwen2vl.cpp index 85f158bb1c..ebf1075737 100644 --- a/tools/mtmd/models/qwen2vl.cpp +++ b/tools/mtmd/models/qwen2vl.cpp @@ -90,11 +90,11 @@ ggml_cgraph * clip_graph_qwen2vl::build() { // self-attention { ggml_tensor * Qcur = ggml_add(ctx0, - ggml_mul_mat(ctx0, layer.q_w, cur), layer.q_b); + build_mm(layer.q_w, cur), layer.q_b); ggml_tensor * Kcur = ggml_add(ctx0, - ggml_mul_mat(ctx0, layer.k_w, cur), layer.k_b); + build_mm(layer.k_w, cur), layer.k_b); ggml_tensor * Vcur = ggml_add(ctx0, - ggml_mul_mat(ctx0, layer.v_w, cur), layer.v_b); + build_mm(layer.v_w, cur), layer.v_b); Qcur = ggml_reshape_3d(ctx0, Qcur, d_head, n_head, n_patches); Kcur = ggml_reshape_3d(ctx0, Kcur, d_head, n_head, n_patches); diff --git a/tools/mtmd/models/qwen3vl.cpp b/tools/mtmd/models/qwen3vl.cpp index 5ecb10fe43..fa1100dda8 100644 --- a/tools/mtmd/models/qwen3vl.cpp +++ b/tools/mtmd/models/qwen3vl.cpp @@ -85,7 +85,7 @@ ggml_cgraph * clip_graph_qwen3vl::build() { // self-attention { - cur = ggml_mul_mat(ctx0, layer.qkv_w, cur); + cur = build_mm(layer.qkv_w, cur); cur = ggml_add(ctx0, cur, layer.qkv_b); ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos, diff --git a/tools/mtmd/models/siglip.cpp b/tools/mtmd/models/siglip.cpp index 75f9b4db44..9dafa35ea8 100644 --- 
a/tools/mtmd/models/siglip.cpp +++ b/tools/mtmd/models/siglip.cpp @@ -43,7 +43,7 @@ ggml_cgraph * clip_graph_siglip::build() { // https://github.com/huggingface/transformers/blob/0a950e0bbe1ed58d5401a6b547af19f15f0c195e/src/transformers/models/idefics3/modeling_idefics3.py#L578 const int scale_factor = model.hparams.n_merge; cur = build_patch_merge_permute(cur, scale_factor); - cur = ggml_mul_mat(ctx0, model.projection, cur); + cur = build_mm(model.projection, cur); } else if (proj_type == PROJECTOR_TYPE_LFM2) { // pixel unshuffle block diff --git a/tools/mtmd/models/whisper-enc.cpp b/tools/mtmd/models/whisper-enc.cpp index 2f2b127755..ed61bb05ba 100644 --- a/tools/mtmd/models/whisper-enc.cpp +++ b/tools/mtmd/models/whisper-enc.cpp @@ -59,7 +59,7 @@ ggml_cgraph * clip_graph_whisper_enc::build() { cur = ggml_mul(ctx0, cur, model.mm_norm_pre_w); // ffn in - cur = ggml_mul_mat(ctx0, model.mm_1_w, cur); + cur = build_mm(model.mm_1_w, cur); // swiglu // see SwiGLU in ultravox_model.py, the second half passed through is silu, not the first half @@ -70,11 +70,11 @@ ggml_cgraph * clip_graph_whisper_enc::build() { cur = ggml_mul(ctx0, cur, model.mm_norm_mid_w); // ffn out - cur = ggml_mul_mat(ctx0, model.mm_2_w, cur); + cur = build_mm(model.mm_2_w, cur); } else if (proj_type == PROJECTOR_TYPE_QWEN2A) { // projector - cur = ggml_mul_mat(ctx0, model.mm_fc_w, cur); + cur = build_mm(model.mm_fc_w, cur); cur = ggml_add(ctx0, cur, model.mm_fc_b); } else if (proj_type == PROJECTOR_TYPE_VOXTRAL) { diff --git a/tools/mtmd/models/youtuvl.cpp b/tools/mtmd/models/youtuvl.cpp index ffbf2be554..cd8f6d446f 100644 --- a/tools/mtmd/models/youtuvl.cpp +++ b/tools/mtmd/models/youtuvl.cpp @@ -43,7 +43,7 @@ ggml_cgraph * clip_graph_youtuvl::build() { ctx0, inp, 3*patch_size* patch_size, Hm * Wm * m * m, 1); } - inp = ggml_mul_mat(ctx0, model.patch_embeddings_0, inp); + inp = build_mm(model.patch_embeddings_0, inp); if (model.patch_bias) { inp = ggml_add(ctx0, inp, model.patch_bias); @@ -97,11 
+97,11 @@ ggml_cgraph * clip_graph_youtuvl::build() { // self-attention { ggml_tensor * Qcur = ggml_add(ctx0, - ggml_mul_mat(ctx0, layer.q_w, cur), layer.q_b); + build_mm(layer.q_w, cur), layer.q_b); ggml_tensor * Kcur = ggml_add(ctx0, - ggml_mul_mat(ctx0, layer.k_w, cur), layer.k_b); + build_mm(layer.k_w, cur), layer.k_b); ggml_tensor * Vcur = ggml_add(ctx0, - ggml_mul_mat(ctx0, layer.v_w, cur), layer.v_b); + build_mm(layer.v_w, cur), layer.v_b); Qcur = ggml_reshape_3d(ctx0, Qcur, d_head, n_head, n_patches); Kcur = ggml_reshape_3d(ctx0, Kcur, d_head, n_head, n_patches);