// llama.cpp/tools/mtmd/models/step3vl.cpp

#include "models.h"

// Builds the compute graph for the step3vl vision tower:
//   ViT encoder (learned position embeddings + 2D RoPE)
//   -> two stride-2 convolutional downsamplers
//   -> linear projection.
// The final tensor is expanded into `gf`, which is returned.
ggml_cgraph * clip_graph_step3vl::build() {
    // This graph expects no class token, and requires the patch and position embeddings.
    GGML_ASSERT(model.class_embedding == nullptr);
    GGML_ASSERT(model.patch_embeddings_0 != nullptr);
    GGML_ASSERT(model.position_embeddings != nullptr);

    // Creates a named I32 graph input with one entry per patch.
    const auto make_patch_index_input = [&](const char * name) -> ggml_tensor * {
        ggml_tensor * indices = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches);
        ggml_set_name(indices, name);
        ggml_set_input(indices);
        return indices;
    };

    // Per-patch row / column indices consumed by the 2D RoPE below.
    ggml_tensor * pos_h = make_patch_index_input("pos_h");
    ggml_tensor * pos_w = make_patch_index_input("pos_w");

    ggml_tensor * patch_input = build_inp();
    ggml_tensor * pos_embd    = resize_position_embeddings();

    // Position hook handed to build_vit: applies 2D RoPE; the layer argument is unused.
    const auto apply_rope = [&](ggml_tensor * x, const clip_layer &) {
        return build_rope_2d(ctx0, x, pos_w, pos_h, hparams.rope_theta, false);
    };

    // Adds an optional bias to a [w, h, c] feature map. The map is flattened and
    // transposed to [c, w*h] for the add, then restored to its original layout.
    const auto add_channel_bias = [&](ggml_tensor * fmap, ggml_tensor * bias) -> ggml_tensor * {
        if (!bias) {
            return fmap;
        }

        const int64_t w = fmap->ne[0];
        const int64_t h = fmap->ne[1];
        const int64_t c = fmap->ne[2];

        ggml_tensor * flat         = ggml_reshape_2d(ctx0, fmap, w * h, c);
        ggml_tensor * channel_major = ggml_cont(ctx0, ggml_transpose(ctx0, flat));
        ggml_tensor * biased       = ggml_add(ctx0, channel_major, bias);
        ggml_tensor * spatial_major = ggml_cont(ctx0, ggml_transpose(ctx0, biased));

        return ggml_reshape_3d(ctx0, spatial_major, w, h, c);
    };

    ggml_tensor * cur = build_vit(
        patch_input,
        n_patches,
        NORM_TYPE_NORMAL,
        hparams.ffn_op,
        pos_embd,
        apply_rope);
    cb(cur, "vit_out", -1);

    // Rearrange [n_embd, n_patches] into a [w, h, n_embd] feature map so the
    // convolutional downsamplers can operate on the patch grid.
    cur = ggml_cont_3d(ctx0, ggml_permute(ctx0, cur, 1, 0, 2, 3), n_patches_x, n_patches_y, n_embd);

    // Downsampler 0: Conv2d(1536 -> 3072, k=3, s=2, p=1), dilation 1.
    cur = add_channel_bias(ggml_conv_2d(ctx0, model.mm_0_w, cur, 2, 2, 1, 1, 1, 1), model.mm_0_b);
    cb(cur, "downsample_0", -1);

    // Downsampler 1: Conv2d(3072 -> 6144, k=3, s=2, p=1), dilation 1.
    cur = add_channel_bias(ggml_conv_2d(ctx0, model.mm_1_w, cur, 2, 2, 1, 1, 1, 1), model.mm_1_b);
    cb(cur, "downsample_1", -1);

    // Flatten the spatial grid and make channels the leading dim: [w, h, c] -> [c, w*h].
    {
        const int64_t n_spatial = cur->ne[0] * cur->ne[1];
        ggml_tensor * flat = ggml_reshape_3d(ctx0, cur, n_spatial, cur->ne[2], cur->ne[3]);
        cur = ggml_cont(ctx0, ggml_permute(ctx0, flat, 1, 0, 2, 3));
    }
    cb(cur, "downsample_flatten", -1);

    // Final projector: Linear(6144 -> projection_dim).
    cur = ggml_mul_mat(ctx0, model.mm_model_proj, cur);
    cb(cur, "projector_out", -1);

    ggml_build_forward_expand(gf, cur);
    return gf;
}