llama.cpp/tools/mtmd/models/dotsocr.cpp

#include "models.h"

ggml_cgraph * clip_graph_dotsocr::build() {
    const int n_pos            = n_patches;
    const int num_position_ids = n_pos * 4; // m-rope requires 4 dim per position

    // note: similar to PaddleOCR
    int mrope_sections[4] = {d_head/4, d_head/4, d_head/4, d_head/4};

    ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_position_ids);
    ggml_set_name(positions, "positions");
    ggml_set_input(positions);

    auto add_pos = [&](ggml_tensor * cur, const clip_layer &) {
        return ggml_rope_multi(
                    ctx0, cur, positions, nullptr,
                    d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION,
                    32768, 10000, 1, 0, 1, 32, 1);
    };

    ggml_tensor * inp = build_inp();
    ggml_tensor * cur = build_vit(
                            inp, n_patches,
                            NORM_TYPE_RMS,
                            hparams.ffn_op,
                            nullptr,
                            add_pos);

    cb(cur, "vit_out", -1);

    // dots.ocr patch merger + projector
    {
        GGML_ASSERT(hparams.n_merge > 0);
        cur = build_norm(cur, model.mm_input_norm_w, model.mm_input_norm_b, NORM_TYPE_NORMAL, 1e-6, -1);
        cur = build_patch_merge_permute(cur, hparams.n_merge);
        cb(cur, "after_patch_merger", -1);
        cur = build_ffn(cur,
            model.mm_0_w, model.mm_0_b,
            nullptr, nullptr, // no gate
            model.mm_2_w, model.mm_2_b,
            FFN_GELU_ERF, -1); // nn.GELU() defaults to exact erf-based GELU
        cb(cur, "after_projector", -1);
    }

    // build the graph
    ggml_build_forward_expand(gf, cur);

    return gf;
}