From 2aab52e2c43a886e4b231c13e1ad6d27b0ae7fc0 Mon Sep 17 00:00:00 2001 From: Saba Fallah <10401143+sfallah@users.noreply.github.com> Date: Sat, 15 Nov 2025 15:30:07 +0100 Subject: [PATCH] deepseek-ocr clip-vit model impl --- tools/mtmd/clip-impl.h | 5 +---- tools/mtmd/clip.cpp | 50 ++++++++++-------------------------------- 2 files changed, 12 insertions(+), 43 deletions(-) diff --git a/tools/mtmd/clip-impl.h b/tools/mtmd/clip-impl.h index 88535df55f..4cb2808c26 100644 --- a/tools/mtmd/clip-impl.h +++ b/tools/mtmd/clip-impl.h @@ -141,10 +141,7 @@ #define TN_SAM_FFN_UP "sam.blk.%d.mlp.lin1.%s" #define TN_SAM_FFN_DOWN "sam.blk.%d.mlp.lin2.%s" #define TN_SAM_NECK "sam.neck.%d.%s" -#define TN_SAM_NET "sam.net_%d.%s" - - -#define TN_SAM_ATTN_OUT "sam.blk.%d.attn_out" +#define TN_SAM_NET "sam.net_%d.%s" // align x to upper multiple of n #define CLIP_ALIGN(x, n) ((((x) + (n) - 1) / (n)) * (n)) diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index 039644b688..d94d05b2f2 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -1558,48 +1558,20 @@ struct clip_graph { // add CLS token inp = ggml_concat(ctx0, inp, model.class_embedding, 1); - // The larger models use a different ViT, which uses RMS norm instead of layer norm - // ref: https://github.com/ggml-org/llama.cpp/pull/13443#issuecomment-2869786188 - norm_type norm_t = (hparams.n_embd == 3200 && hparams.n_layer == 45) ? 
- NORM_TYPE_RMS // 6B ViT (Used by InternVL 2.5/3 - 26B, 38B, 78B) - : - NORM_TYPE_NORMAL; // 300M ViT (Used by all smaller InternVL models) + // TODO: check norm type for the deepseek-ocr clip-vit + norm_type norm_t = NORM_TYPE_NORMAL; - ggml_tensor * cur = build_vit(inp, n_pos, norm_t, hparams.ffn_op, model.position_embeddings, + // for selecting learned pos embd, used by ViT + struct ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos); + ggml_set_name(positions, "positions"); + ggml_set_input(positions); + ggml_tensor * learned_pos_embd = ggml_get_rows(ctx0, model.position_embeddings, positions); + + + ggml_tensor * cur = build_vit(inp, n_pos, norm_t, hparams.ffn_op, learned_pos_embd, nullptr); // shape [1024, 16, 16] - // remove CLS token - cur = ggml_view_2d(ctx0, cur, n_embd, n_patches, ggml_row_size(cur->type, n_embd), 0); - - // pixel shuffle - { - const int scale_factor = model.hparams.n_merge; - const int bsz = 1; // batch size, always 1 for now since we don't support batching - const int height = n_patches_y; - const int width = n_patches_x; - GGML_ASSERT(scale_factor > 0); - cur = ggml_reshape_4d(ctx0, cur, n_embd * scale_factor, height / scale_factor, width, bsz); - cur = ggml_permute(ctx0, cur, 0, 2, 1, 3); - cur = ggml_cont_4d(ctx0, cur, n_embd * scale_factor * scale_factor, height / scale_factor, - width / scale_factor, bsz); - cur = ggml_permute(ctx0, cur, 0, 2, 1, 3); - // flatten to 2D - cur = ggml_cont_2d(ctx0, cur, n_embd * scale_factor * scale_factor, cur->ne[1] * cur->ne[2]); - } - - // projector (always using GELU activation) - { - // projector LayerNorm uses pytorch's default eps = 1e-5 - // ref: https://huggingface.co/OpenGVLab/InternVL3-8B-Instruct/blob/a34d3e4e129a5856abfd6aa6de79776484caa14e/modeling_internvl_chat.py#L79 - cur = build_norm(cur, model.mm_0_w, model.mm_0_b, NORM_TYPE_NORMAL, 1e-5, -1); - cur = ggml_mul_mat(ctx0, model.mm_1_w, cur); - cur = ggml_add(ctx0, cur, model.mm_1_b); - cur = ggml_gelu(ctx0, cur); - cur = 
ggml_mul_mat(ctx0, model.mm_3_w, cur); - cur = ggml_add(ctx0, cur, model.mm_3_b); - } - - // build the graph + ggml_build_forward_expand(gf, cur); return cur; }