Merge branch 'sf/deepseek-ocr' of github.com:sfallah/llama.cpp into sf/deepseek-ocr

bluebread 2025-11-17 08:46:27 +00:00
commit e8b2610227
2 changed files with 12 additions and 43 deletions


@@ -141,10 +141,7 @@
 #define TN_SAM_FFN_UP   "sam.blk.%d.mlp.lin1.%s"
 #define TN_SAM_FFN_DOWN "sam.blk.%d.mlp.lin2.%s"
 #define TN_SAM_NECK     "sam.neck.%d.%s"
 #define TN_SAM_NET      "sam.net_%d.%s"
-#define TN_SAM_ATTN_OUT "sam.blk.%d.attn_out"
 
 // align x to upper multiple of n
 #define CLIP_ALIGN(x, n) ((((x) + (n) - 1) / (n)) * (n))


@@ -1558,48 +1558,20 @@ struct clip_graph {
         // add CLS token
         inp = ggml_concat(ctx0, inp, model.class_embedding, 1);
 
-        // The larger models use a different ViT, which uses RMS norm instead of layer norm
-        // ref: https://github.com/ggml-org/llama.cpp/pull/13443#issuecomment-2869786188
-        norm_type norm_t = (hparams.n_embd == 3200 && hparams.n_layer == 45)
-            ? NORM_TYPE_RMS    // 6B ViT (Used by InternVL 2.5/3 - 26B, 38B, 78B)
-            : NORM_TYPE_NORMAL; // 300M ViT (Used by all smaller InternVL models)
+        //TODO : check norm type for dp-ocr-clip
+        norm_type norm_t = NORM_TYPE_NORMAL;
 
-        ggml_tensor * cur = build_vit(inp, n_pos, norm_t, hparams.ffn_op, model.position_embeddings,
-                                      nullptr); // shape [1024, 16, 16]
+        // for selecting learned pos embd, used by ViT
+        struct ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos);
+        ggml_set_name(positions, "positions");
+        ggml_set_input(positions);
+
+        ggml_tensor * learned_pos_embd = ggml_get_rows(ctx0, model.position_embeddings, positions);
+
+        ggml_tensor * cur = build_vit(inp, n_pos, norm_t, hparams.ffn_op, learned_pos_embd,
+                                      nullptr); // shape [1024, 16, 16]
 
-        // remove CLS token
-        cur = ggml_view_2d(ctx0, cur, n_embd, n_patches, ggml_row_size(cur->type, n_embd), 0);
-
-        // pixel shuffle
-        {
-            const int scale_factor = model.hparams.n_merge;
-            const int bsz = 1; // batch size, always 1 for now since we don't support batching
-            const int height = n_patches_y;
-            const int width = n_patches_x;
-            GGML_ASSERT(scale_factor > 0);
-            cur = ggml_reshape_4d(ctx0, cur, n_embd * scale_factor, height / scale_factor, width, bsz);
-            cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
-            cur = ggml_cont_4d(ctx0, cur, n_embd * scale_factor * scale_factor, height / scale_factor,
-                               width / scale_factor, bsz);
-            cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
-            // flatten to 2D
-            cur = ggml_cont_2d(ctx0, cur, n_embd * scale_factor * scale_factor, cur->ne[1] * cur->ne[2]);
-        }
-
-        // projector (always using GELU activation)
-        {
-            // projector LayerNorm uses pytorch's default eps = 1e-5
-            // ref: https://huggingface.co/OpenGVLab/InternVL3-8B-Instruct/blob/a34d3e4e129a5856abfd6aa6de79776484caa14e/modeling_internvl_chat.py#L79
-            cur = build_norm(cur, model.mm_0_w, model.mm_0_b, NORM_TYPE_NORMAL, 1e-5, -1);
-            cur = ggml_mul_mat(ctx0, model.mm_1_w, cur);
-            cur = ggml_add(ctx0, cur, model.mm_1_b);
-            cur = ggml_gelu(ctx0, cur);
-            cur = ggml_mul_mat(ctx0, model.mm_3_w, cur);
-            cur = ggml_add(ctx0, cur, model.mm_3_b);
-        }
-
-        // build the graph
+        ggml_build_forward_expand(gf, cur);
+
         return cur;
     }
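
For context (not part of the commit): the new path no longer hands build_vit the full model.position_embeddings table; it first gathers one learned embedding row per patch position with ggml_get_rows, driven by the "positions" input tensor. Below is a minimal, self-contained sketch of that gather on the CPU backend. The sizes, index values, and the ggml-cpu.h include are assumptions for illustration; older ggml versions declare ggml_graph_compute_with_ctx in ggml.h instead.

// sketch: row gather with ggml_get_rows, analogous to selecting learned pos embd
#include "ggml.h"
#include "ggml-cpu.h" // ggml_graph_compute_with_ctx lives here on recent ggml
#include <cstdint>
#include <cstdio>

int main() {
    ggml_init_params params = {
        /*.mem_size   =*/ 16 * 1024 * 1024,
        /*.mem_buffer =*/ nullptr,
        /*.no_alloc   =*/ false,
    };
    ggml_context * ctx0 = ggml_init(params);

    const int64_t n_embd = 4; // toy embedding width
    const int64_t n_rows = 8; // toy number of learned positions in the table
    const int64_t n_pos  = 3; // number of positions selected for this image

    // learned position-embedding table: ne = [n_embd, n_rows]
    ggml_tensor * position_embeddings = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_rows);
    for (int64_t i = 0; i < n_embd * n_rows; ++i) {
        ((float *) position_embeddings->data)[i] = (float) i;
    }

    // integer indices, analogous to the "positions" input tensor in the graph
    ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos);
    ((int32_t *) positions->data)[0] = 0;
    ((int32_t *) positions->data)[1] = 2;
    ((int32_t *) positions->data)[2] = 7;

    // gather the selected rows: result ne = [n_embd, n_pos]
    ggml_tensor * learned_pos_embd = ggml_get_rows(ctx0, position_embeddings, positions);

    ggml_cgraph * gf = ggml_new_graph(ctx0);
    ggml_build_forward_expand(gf, learned_pos_embd);
    ggml_graph_compute_with_ctx(ctx0, gf, /*n_threads =*/ 1);

    // prints rows 0, 2 and 7 of the table
    for (int64_t p = 0; p < n_pos; ++p) {
        for (int64_t e = 0; e < n_embd; ++e) {
            printf("%5.1f ", ((float *) learned_pos_embd->data)[p * n_embd + e]);
        }
        printf("\n");
    }

    ggml_free(ctx0);
    return 0;
}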