refactoring: one builder function and static helpers

Saba Fallah 2025-12-13 17:13:32 +01:00
parent f95a6fe9f3
commit f7736f23ef
2 changed files with 295 additions and 340 deletions


@@ -1,21 +1,115 @@
#include "models.h"
ggml_tensor* clip_graph_deepseekocr::build_sam(ggml_tensor* inp_raw)
{
// Implementation based on approach suggested by Acly
// See: https://github.com/ggml-org/llama.cpp/pull/17383#issuecomment-3554227091
static ggml_tensor * window_partition(ggml_context * ctx0, ggml_tensor * x, const int window) {
auto [c, w, h, b] = x->ne;
// same as
// x = ggml_win_part(m, x, window);
// x = ggml_reshape_3d(m, x, c, window * window, x->ne[3]);
const int64_t px = (window - w % window) % window;
const int64_t py = (window - h % window) % window;
const int64_t npw = (w + px) / window;
const int64_t nph = (h + py) / window;
ggml_tensor * cur = x;
if (px > 0 || py > 0) {
cur = ggml_pad(ctx0, cur, 0, static_cast<int>(px), static_cast<int>(py), 0);
}
cur = ggml_reshape_4d(ctx0, cur, c * window, npw, window, nph * b);
cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 0, 2, 1, 3));
cur = ggml_reshape_4d(ctx0, cur, c, window, window, npw * nph * b);
return cur;
}
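// Illustrative shape walk-through (window and input sizes assumed, not taken from the model config):
// with window = 14 and x = [c, 64, 64, 1], px = py = (14 - 64 % 14) % 14 = 6, the grid is padded
// to 70x70, npw = nph = 70 / 14 = 5, and the result is 25 windows stacked along the batch
// dimension: [c, 14, 14, 25].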
// Implementation based on approach suggested by Acly
// See: https://github.com/ggml-org/llama.cpp/pull/17383#issuecomment-3554227091
static ggml_tensor * window_unpartition(ggml_context * ctx0,
ggml_tensor * x,
const int w,
const int h,
const int window) {
const int64_t c = x->ne[0];
// same as
// x = ggml_reshape_4d(m, x, c, window, window, x->ne[2]);
// x = ggml_win_unpart(m, x, w, h, window);
const int64_t px = (window - w % window) % window;
const int64_t py = (window - h % window) % window;
const int64_t npw = (w + px) / window;
const int64_t nph = (h + py) / window;
const int64_t b = x->ne[3] / (npw * nph);
ggml_tensor * cur = x;
cur = ggml_reshape_4d(ctx0, cur, c * window, window, npw, nph * b);
cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 0, 2, 1, 3));
cur = ggml_reshape_4d(ctx0, cur, c, w + px, h + py, b);
cur = ggml_view_4d(ctx0, cur, cur->ne[0], w, h, cur->ne[3], cur->nb[1], cur->nb[2], cur->nb[3], 0);
cur = ggml_cont(ctx0, cur);
return cur;
}
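// Illustrative reverse of the example above: x = [c, 14, 14, 25] with w = h = 64 and window = 14
// is reassembled into the padded [c, 70, 70, 1] grid, then the view drops the px/py padding rows
// and columns to return [c, 64, 64, 1].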
static ggml_tensor * get_rel_pos(ggml_context * ctx0,
ggml_tensor * rel_pos, // [L, C]
ggml_tensor * indices, // [q_size, k_size]
const int q_size,
const int k_size) {
const int64_t C = rel_pos->ne[0]; // channels
const int64_t L = rel_pos->ne[1]; // length
GGML_ASSERT(indices != nullptr);
GGML_ASSERT(indices->type == GGML_TYPE_I32);
GGML_ASSERT(indices->ne[0] == k_size);
GGML_ASSERT(indices->ne[1] == q_size);
const auto max_rel_dist = 2 * std::max(q_size, k_size) - 1;
ggml_tensor * cur = rel_pos;
if (max_rel_dist != L) {
// Linear interpolation
const int64_t ne0 = cur->ne[0];
const int64_t ne1 = cur->ne[1];
const int64_t ne2 = cur->ne[2];
const int64_t ne3 = cur->ne[3];
cur = ggml_reshape_3d(ctx0, ggml_cont(ctx0, ggml_permute(ctx0, cur, 1, 0, 2, 3)), ne1, 1, ne0 * ne2 * ne3);
cur = ggml_reshape_4d(
ctx0, ggml_interpolate(ctx0, cur, max_rel_dist, 1, ne0 * ne2 * ne3, 1, GGML_SCALE_MODE_BILINEAR),
max_rel_dist, ne0, ne2, ne3);
cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 1, 0, 2, 3));
}
// Flatten indices to 1D for ggml_get_rows
const int qk = q_size * k_size;
cur = ggml_reshape_3d(ctx0, ggml_get_rows(ctx0, cur, ggml_reshape_1d(ctx0, indices, qk)), C, k_size, q_size);
return cur; // [C, k_size, q_size]
}
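// Illustrative example (sizes assumed): for a 14x14 window, q_size = k_size = 14 and
// max_rel_dist = 2*14 - 1 = 27; rel_pos is bilinearly resized to 27 rows if it was trained for a
// different length, then each precomputed index (a value in [0, 26]) gathers one [C] row,
// giving the [C, k_size, q_size] tensor used as the relative-position bias in attention.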
ggml_cgraph * clip_graph_deepseekocr::build() {
//patch embedding
ggml_tensor * inp_raw = build_inp_raw();
//ggml_tensor * sam_out = build_sam(inp_raw);
ggml_tensor * sam_out;
// Building SAM
{
const int n_embd = hparams.sam_n_embd;
const int n_layer = hparams.sam_n_layer;
const int n_heads = hparams.sam_n_head;
const int d_heads = n_embd / n_heads;
const int window = hparams.attn_window_size;
ggml_tensor* inpL;
ggml_tensor * inpL;
inpL = ggml_conv_2d_sk_p0(ctx0, model.patch_embed_proj_w, inp_raw);
inpL = ggml_add(ctx0, inpL, ggml_reshape_3d(ctx0, model.patch_embed_proj_b, 1, 1, n_embd));
inpL = ggml_cont(ctx0, ggml_permute(ctx0, inpL, 1, 2, 0, 3));
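// Shape note (assuming the usual SAM setup of 16x16 patches on a 1024x1024 input):
// ggml_conv_2d_sk_p0 uses stride = kernel size with no padding, so the projection yields a
// 64x64 patch grid; after the bias add and permute, inpL is laid out as [n_embd, 64, 64, 1].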
ggml_tensor* rel_pos_indices_local;
ggml_tensor* rel_pos_indices_global;
ggml_tensor * rel_pos_indices_local;
ggml_tensor * rel_pos_indices_global;
rel_pos_indices_local = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, window, window);
rel_pos_indices_global = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, inpL->ne[1], inpL->ne[2]);
@@ -24,36 +118,25 @@ ggml_tensor* clip_graph_deepseekocr::build_sam(ggml_tensor* inp_raw)
ggml_set_input(rel_pos_indices_local);
ggml_set_input(rel_pos_indices_global);
ggml_tensor* cur;
ggml_tensor * cur;
const auto tgt_size = inpL->ne[1];
const auto str_size = model.pos_embed->ne[1];
if (str_size != tgt_size)
{
ggml_tensor* old_pos_embed = nullptr;
if (str_size != tgt_size) {
ggml_tensor * old_pos_embed = nullptr;
old_pos_embed = ggml_cont(ctx0, ggml_permute(ctx0, model.pos_embed, 2, 0, 1, 3));
ggml_tensor* new_pos_embed = ggml_interpolate(
ctx0,
old_pos_embed,
tgt_size,
tgt_size,
n_embd,
1,
ggml_scale_mode::GGML_SCALE_MODE_BICUBIC
);
ggml_tensor * new_pos_embed =
ggml_interpolate(ctx0, old_pos_embed, tgt_size, tgt_size, n_embd, 1, GGML_SCALE_MODE_BICUBIC);
new_pos_embed = ggml_cont(ctx0, ggml_permute(ctx0, new_pos_embed, 1, 2, 0, 3));
cur = ggml_add(ctx0, inpL, new_pos_embed);
}
else
{
} else {
cur = ggml_add(ctx0, inpL, model.pos_embed);
}
// loop over layers
for (int il = 0; il < n_layer; il++)
{
auto& layer = model.sam_layers[il];
ggml_tensor* shortcut = cur;
for (int il = 0; il < n_layer; il++) {
auto & layer = model.sam_layers[il];
ggml_tensor * shortcut = cur;
// layernorm1
cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, NORM_TYPE_NORMAL, eps, il);
@@ -61,16 +144,13 @@ ggml_tensor* clip_graph_deepseekocr::build_sam(ggml_tensor* inp_raw)
const int64_t w0 = cur->ne[1];
const int64_t h0 = cur->ne[2];
ggml_tensor* indices;
ggml_tensor * indices;
if (hparams.is_global_attn(il))
{
if (hparams.is_global_attn(il)) {
indices = rel_pos_indices_global;
}
else
{
} else {
// local attention layer - apply window partition
cur = window_partition(cur, window);
cur = window_partition(ctx0, cur, window);
indices = rel_pos_indices_local;
}
@@ -85,9 +165,9 @@ ggml_tensor* clip_graph_deepseekocr::build_sam(ggml_tensor* inp_raw)
cur = ggml_cont(ctx0, cur); // Ensure tensor is contiguous before reshape
cur = ggml_reshape_4d(ctx0, cur, n_embd, 3, W * H, B);
ggml_tensor* Q;
ggml_tensor* K;
ggml_tensor* V;
ggml_tensor * Q;
ggml_tensor * K;
ggml_tensor * V;
Q = ggml_view_3d(ctx0, cur, n_embd, W * H, B, cur->nb[2], cur->nb[3], 0 * cur->nb[1]);
Q = ggml_reshape_4d(ctx0, ggml_cont(ctx0, Q), d_heads, n_heads, W * H, B);
@@ -98,18 +178,18 @@ ggml_tensor* clip_graph_deepseekocr::build_sam(ggml_tensor* inp_raw)
V = ggml_view_3d(ctx0, cur, n_embd, W * H, B, cur->nb[2], cur->nb[3], 2 * cur->nb[1]);
V = ggml_reshape_4d(ctx0, ggml_cont(ctx0, V), d_heads, n_heads, W * H, B);
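// The fused qkv projection above produces a [n_embd, 3, W*H, B] tensor; the three strided views
// select Q, K and V at offsets 0, 1 and 2 along dim 1, and each is reshaped to
// [d_heads, n_heads, W*H, B] for multi-head attention.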
ggml_tensor* mask;
ggml_tensor* rw;
ggml_tensor* rh;
ggml_tensor* qr;
ggml_tensor * mask;
ggml_tensor * rw;
ggml_tensor * rh;
ggml_tensor * qr;
rw = get_rel_pos(layer.rel_pos_w, indices, W, W); // [W, W, C]
rh = get_rel_pos(layer.rel_pos_h, indices, H, H); // [H, H, C]
rw = get_rel_pos(ctx0, layer.rel_pos_w, indices, W, W); // [W, W, C]
rh = get_rel_pos(ctx0, layer.rel_pos_h, indices, H, H); // [H, H, C]
qr = ggml_permute(ctx0, Q, 0, 2, 1, 3);
qr = ggml_reshape_4d(ctx0, ggml_cont(ctx0, qr), d_heads, W, H, B * n_heads);
rw = ggml_mul_mat(ctx0, rw, ggml_cont(ctx0, ggml_permute(ctx0, qr, 0, 2, 1, 3))); // [B*n_heads, W, H, W]
rw = ggml_mul_mat(ctx0, rw,
ggml_cont(ctx0, ggml_permute(ctx0, qr, 0, 2, 1, 3))); // [B*n_heads, W, H, W]
rw = ggml_cont(ctx0, ggml_permute(ctx0, rw, 0, 2, 1, 3)); // [B*n_heads, H, W, W]
rw = ggml_reshape_4d(ctx0, rw, W, 1, W * H, n_heads * B);
rw = ggml_repeat_4d(ctx0, rw, W, H, W * H, n_heads * B);
@@ -119,30 +199,29 @@ ggml_tensor* clip_graph_deepseekocr::build_sam(ggml_tensor* inp_raw)
mask = ggml_reshape_4d(ctx0, mask, W * H, W * H, n_heads, B);
mask = ggml_cast(ctx0, mask, GGML_TYPE_F16);
float scale = 1.0f / sqrtf((float)d_heads);
const float scale = 1.0f / sqrtf(static_cast<float>(d_heads));
cur = build_attn(layer.o_w, layer.o_b, Q, K, V, mask, scale,
il); // [B, H*W, n_embd]
cur = ggml_reshape_4d(ctx0, ggml_cont(ctx0, cur), n_embd, W, H, B);
}
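// The rw/rh terms implement SAM's decomposed relative positional attention: the per-axis biases
// are expanded to a per-head [W*H, W*H] map, combined into `mask`, cast to f16 and passed to
// build_attn as an additive bias together with the 1/sqrt(d_heads) scale.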
if (hparams.is_global_attn(il) == false)
{
if (hparams.is_global_attn(il) == false) {
// local attention layer - reverse window partition
cur = window_unpartition(cur, w0, h0, window);
cur = window_unpartition(ctx0, cur, w0, h0, window);
}
// re-add the layer input, e.g., residual
cur = ggml_add(ctx0, cur, shortcut);
ggml_tensor* inpFF = cur;
ggml_tensor * inpFF = cur;
// layernorm2
cur = build_norm(inpFF, layer.ln_2_w, layer.ln_2_b, NORM_TYPE_NORMAL, eps, il);
// ffn
cur = build_ffn(cur, layer.ff_up_w, layer.ff_up_b, nullptr, nullptr, layer.ff_down_w,
layer.ff_down_b, hparams.ffn_op, il);
cur = build_ffn(cur, layer.ff_up_w, layer.ff_up_b, nullptr, nullptr, layer.ff_down_w, layer.ff_down_b,
hparams.ffn_op, il);
// residual 2
cur = ggml_add(ctx0, cur, inpFF);
@@ -166,24 +245,60 @@ ggml_tensor* clip_graph_deepseekocr::build_sam(ggml_tensor* inp_raw)
cb(cur, "sam_output", -1);
ggml_build_forward_expand(gf, cur);
return cur;
}
sam_out = cur;
}
//ggml_tensor * clip_out = build_dsocr_clip(sam_out);
ggml_tensor * clip_out;
// Building DS-OCR CLIP
{
ggml_tensor * inp;
inp = ggml_cpy(ctx0, sam_out, ggml_dup_tensor(ctx0, sam_out));
inp = ggml_reshape_2d(ctx0, inp, inp->ne[0] * inp->ne[1], inp->ne[2]);
inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 1, 0, 2, 3));
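// sam_out is laid out spatial-first, so the reshape flattens its two spatial dims into a single
// patch axis and the permute moves the channel dim to the front; inp then holds one
// C-dimensional vector per patch, the layout build_vit expects.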
ggml_cgraph* clip_graph_deepseekocr::build()
{
//patch embedding
ggml_tensor* inp_raw = build_inp_raw();
ggml_tensor* sam_out = build_sam(inp_raw);
ggml_tensor* clip_out = build_dsocr_clip(sam_out);
ggml_tensor * new_pos_embd =
ggml_cpy(ctx0, model.position_embeddings, ggml_dup_tensor(ctx0, model.position_embeddings));
int clip_n_patches = sam_out->ne[0] * sam_out->ne[1];
int n_pos = new_pos_embd->ne[1]; // +1 for [CLS]
const auto tgt_size = static_cast<int>(std::sqrt(inp->ne[1]));
const auto src_size = static_cast<int>(std::sqrt(n_pos - 1));
if (tgt_size != src_size) {
ggml_tensor * old_pos_embd;
ggml_tensor * cls_tok;
old_pos_embd = ggml_view_2d(ctx0, new_pos_embd, new_pos_embd->ne[0], src_size * src_size,
ggml_row_size(new_pos_embd->type, new_pos_embd->ne[0]), 0);
cls_tok = ggml_view_2d(ctx0, new_pos_embd, new_pos_embd->ne[0], 1,
ggml_row_size(new_pos_embd->type, new_pos_embd->ne[0]), src_size * src_size);
new_pos_embd = ggml_interpolate(ctx0, old_pos_embd, tgt_size, tgt_size, new_pos_embd->ne[0], 1,
GGML_SCALE_MODE_BICUBIC);
new_pos_embd = ggml_reshape_3d(ctx0, new_pos_embd, n_embd, tgt_size * tgt_size, 1);
new_pos_embd = ggml_concat(ctx0, new_pos_embd, cls_tok, 1);
n_pos = tgt_size * tgt_size + 1;
}
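// Illustrative sizes (assumed): if the stored CLIP table covers a 24x24 grid plus the [CLS]
// entry (577 rows) and the incoming SAM grid is 16x16, the 24x24 block is bicubically resized
// to 16x16 and the [CLS] row re-appended, so n_pos becomes 16*16 + 1 = 257.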
// add CLS token
inp = ggml_concat(ctx0, model.class_embedding, inp, 1);
// for selecting learned pos embd, used by ViT
ggml_tensor * positions = ggml_cast(ctx0, ggml_arange(ctx0, 0, n_pos, 1), GGML_TYPE_I32);
ggml_tensor * learned_pos_embd = ggml_get_rows(ctx0, new_pos_embd, positions);
ggml_tensor * cur = build_vit(inp, n_pos, NORM_TYPE_NORMAL, FFN_GELU_QUICK, learned_pos_embd, nullptr);
ggml_build_forward_expand(gf, cur);
clip_out = cur;
}
const int clip_n_patches = sam_out->ne[0] * sam_out->ne[1];
sam_out = ggml_cont(ctx0, ggml_permute(ctx0, sam_out, 1, 2, 0, 3));
sam_out = ggml_reshape_2d(ctx0, sam_out, sam_out->ne[0], clip_n_patches);
clip_out = ggml_view_2d(ctx0, clip_out, n_embd, clip_n_patches, clip_out->nb[1], clip_out->nb[1]);
ggml_tensor* cur;
ggml_tensor * cur;
cur = ggml_concat(ctx0, clip_out, sam_out, 0);
cur = ggml_reshape_2d(ctx0, cur, 2 * n_embd, clip_n_patches);
cur = ggml_cont(ctx0, cur);
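// Per patch, the CLIP feature and the corresponding SAM feature are concatenated along the
// channel dim, so each row of cur is a single 2*n_embd vector (CLIP part first, SAM part second).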
@@ -194,8 +309,8 @@ ggml_cgraph* clip_graph_deepseekocr::build()
const auto w = h;
const auto n_dim = cur->ne[0];
ggml_tensor* imgnl;
ggml_tensor* vs;
ggml_tensor * imgnl;
ggml_tensor * vs;
imgnl = ggml_repeat_4d(ctx0, model.image_newline, n_dim, 1, h, 1);
vs = ggml_reshape_2d(ctx0, model.view_seperator, n_dim, 1); // (n_dim, 1)
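// imgnl tiles the learned image-newline embedding once per row of the h x w patch grid
// ([n_dim, 1, h, 1]), and vs reshapes the view separator to a single [n_dim] token.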
@@ -208,157 +323,3 @@ ggml_cgraph* clip_graph_deepseekocr::build()
ggml_build_forward_expand(gf, cur);
return gf;
}
ggml_tensor* clip_graph_deepseekocr::build_dsocr_clip(ggml_tensor* patch_embeds)
{
ggml_tensor* inp;
inp = ggml_cpy(ctx0, patch_embeds, ggml_dup_tensor(ctx0, patch_embeds));
inp = ggml_reshape_2d(ctx0, inp, inp->ne[0] * inp->ne[1], inp->ne[2]);
inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 1, 0, 2, 3));
ggml_tensor* new_pos_embd = ggml_cpy(ctx0, model.position_embeddings,
ggml_dup_tensor(ctx0, model.position_embeddings));
int n_pos = new_pos_embd->ne[1]; // +1 for [CLS]
const auto tgt_size = static_cast<int>(std::sqrt(inp->ne[1]));
const auto src_size = static_cast<int>(std::sqrt(n_pos - 1));
if (tgt_size != src_size)
{
ggml_tensor* old_pos_embd;
ggml_tensor* cls_tok;
old_pos_embd = ggml_view_2d(
ctx0, new_pos_embd,
new_pos_embd->ne[0], src_size * src_size,
ggml_row_size(new_pos_embd->type, new_pos_embd->ne[0]), 0
);
cls_tok = ggml_view_2d(
ctx0, new_pos_embd,
new_pos_embd->ne[0], 1,
ggml_row_size(new_pos_embd->type, new_pos_embd->ne[0]), src_size * src_size
);
new_pos_embd = ggml_interpolate(ctx0,
old_pos_embd,
tgt_size,
tgt_size,
new_pos_embd->ne[0], 1, GGML_SCALE_MODE_BICUBIC
);
new_pos_embd = ggml_reshape_3d(ctx0, new_pos_embd, n_embd, tgt_size * tgt_size, 1);
new_pos_embd = ggml_concat(ctx0, new_pos_embd, cls_tok, 1);
n_pos = tgt_size * tgt_size + 1;
}
// add CLS token
inp = ggml_concat(ctx0, model.class_embedding, inp, 1);
// for selecting learned pos embd, used by ViT
ggml_tensor* positions = ggml_cast(ctx0, ggml_arange(ctx0, 0, n_pos, 1), GGML_TYPE_I32);
ggml_tensor* learned_pos_embd = ggml_get_rows(ctx0, new_pos_embd, positions);
ggml_tensor* cur = build_vit(inp, n_pos, NORM_TYPE_NORMAL, ffn_op_type::FFN_GELU_QUICK,
learned_pos_embd, nullptr);
ggml_build_forward_expand(gf, cur);
return cur;
}
ggml_tensor * clip_graph_deepseekocr::get_rel_pos(
ggml_tensor * rel_pos, // [L, C]
ggml_tensor * indices, // [q_size, k_size]
int q_size,
int k_size
) {
const int64_t C = rel_pos->ne[0]; // channels
const int64_t L = rel_pos->ne[1]; // length
GGML_ASSERT(indices != nullptr);
GGML_ASSERT(indices->type == GGML_TYPE_I32);
GGML_ASSERT(indices->ne[0] == k_size);
GGML_ASSERT(indices->ne[1] == q_size);
const auto max_rel_dist = 2*std::max(q_size, k_size) - 1;
ggml_tensor * cur = rel_pos;
if (max_rel_dist != L) {
// Linear interpolation
int64_t ne0 = cur->ne[0];
int64_t ne1 = cur->ne[1];
int64_t ne2 = cur->ne[2];
int64_t ne3 = cur->ne[3];
cur = ggml_reshape_3d(
ctx0,
ggml_cont(ctx0, ggml_permute(ctx0, cur, 1, 0, 2, 3)),
ne1, 1, ne0*ne2*ne3
);
cur = ggml_reshape_4d(
ctx0,
ggml_interpolate(
ctx0,
cur,
max_rel_dist, 1, ne0*ne2*ne3, 1,
ggml_scale_mode::GGML_SCALE_MODE_BILINEAR
),
max_rel_dist, ne0, ne2, ne3
);
cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 1, 0, 2, 3));
}
// Flatten indices to 1D for ggml_get_rows
int qk = q_size * k_size;
cur = ggml_reshape_3d(
ctx0,
ggml_get_rows(ctx0, cur, ggml_reshape_1d(ctx0, indices, qk)),
C, k_size, q_size
);
return cur; // [C, k_size, q_size]
}
// Implementation based on approach suggested by Acly
// See: https://github.com/ggml-org/llama.cpp/pull/17383#issuecomment-3554227091
ggml_tensor* clip_graph_deepseekocr::window_partition(ggml_tensor* x, int window) {
auto [c, w, h, b] = x->ne;
// same as
// x = ggml_win_part(m, x, window);
// x = ggml_reshape_3d(m, x, c, window * window, x->ne[3]);
int64_t px = (window - w % window) % window;
int64_t py = (window - h % window) % window;
int64_t npw = (w + px) / window;
int64_t nph = (h + py) / window;
if (px > 0 || py > 0) {
x = ggml_pad(ctx0, x, 0, int(px), int(py), 0);
}
x = ggml_reshape_4d(ctx0, x, c * window, npw, window, nph * b);
x = ggml_cont(ctx0, ggml_permute(ctx0, x, 0, 2, 1, 3));
x = ggml_reshape_4d(ctx0, x, c, window, window, npw * nph * b);
return x;
}
// Implementation based on approach suggested by Acly
// See: https://github.com/ggml-org/llama.cpp/pull/17383#issuecomment-3554227091
ggml_tensor* clip_graph_deepseekocr::window_unpartition(ggml_tensor* x, int w, int h, int window) {
int64_t c = x->ne[0];
// same as
// x = ggml_reshape_4d(m, x, c, window, window, x->ne[2]);
// x = ggml_win_unpart(m, x, w, h, window);
int64_t px = (window - w % window) % window;
int64_t py = (window - h % window) % window;
int64_t npw = (w + px) / window;
int64_t nph = (h + py) / window;
int64_t b = x->ne[3] / (npw * nph);
x = ggml_reshape_4d(ctx0, x, c * window, window, npw, nph * b);
x = ggml_cont(ctx0, ggml_permute(ctx0, x, 0, 2, 1, 3));
x = ggml_reshape_4d(ctx0, x, c, w + px, h + py, b);
x = ggml_view_4d(ctx0, x, x->ne[0], w, h, x->ne[3], x->nb[1], x->nb[2], x->nb[3], 0);
x = ggml_cont(ctx0, x);
return x;
}


@@ -60,10 +60,4 @@ struct clip_graph_whisper_enc : clip_graph {
struct clip_graph_deepseekocr : clip_graph {
clip_graph_deepseekocr(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
ggml_cgraph * build() override;
ggml_tensor * build_sam(ggml_tensor * inp_raw);
ggml_tensor * build_dsocr_clip(ggml_tensor * patch_embeds);
ggml_tensor * get_rel_pos(ggml_tensor * rel_pos, ggml_tensor * indices, int q_size, int k_size);
ggml_tensor * window_partition(ggml_tensor * x, int window);
ggml_tensor * window_unpartition(ggml_tensor * x, int w, int h, int window);
};