Merge branch 'sf/deepseek-ocr' of github.com:sfallah/llama.cpp into sf/deepseek-ocr

bluebread 2025-11-17 08:46:27 +00:00
commit e8b2610227
2 changed files with 12 additions and 43 deletions


@@ -141,10 +141,7 @@
 #define TN_SAM_FFN_UP   "sam.blk.%d.mlp.lin1.%s"
 #define TN_SAM_FFN_DOWN "sam.blk.%d.mlp.lin2.%s"
 #define TN_SAM_NECK     "sam.neck.%d.%s"
 #define TN_SAM_NET      "sam.net_%d.%s"
-#define TN_SAM_ATTN_OUT "sam.blk.%d.attn_out"
 
 // align x to upper multiple of n
 #define CLIP_ALIGN(x, n) ((((x) + (n) - 1) / (n)) * (n))


@@ -1558,48 +1558,20 @@ struct clip_graph {
         // add CLS token
         inp = ggml_concat(ctx0, inp, model.class_embedding, 1);
 
-        // The larger models use a different ViT, which uses RMS norm instead of layer norm
-        // ref: https://github.com/ggml-org/llama.cpp/pull/13443#issuecomment-2869786188
-        norm_type norm_t = (hparams.n_embd == 3200 && hparams.n_layer == 45)
-            ? NORM_TYPE_RMS    // 6B ViT (Used by InternVL 2.5/3 - 26B, 38B, 78B)
-            : NORM_TYPE_NORMAL; // 300M ViT (Used by all smaller InternVL models)
+        //TODO : check norm type for dp-ocr-clip
+        norm_type norm_t = NORM_TYPE_NORMAL;
 
-        ggml_tensor * cur = build_vit(inp, n_pos, norm_t, hparams.ffn_op, model.position_embeddings,
-                                      nullptr); // shape [1024, 16, 16]
+        // for selecting learned pos embd, used by ViT
+        struct ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos);
+        ggml_set_name(positions, "positions");
+        ggml_set_input(positions);
+
+        ggml_tensor * learned_pos_embd = ggml_get_rows(ctx0, model.position_embeddings, positions);
+
+        ggml_tensor * cur = build_vit(inp, n_pos, norm_t, hparams.ffn_op, learned_pos_embd,
+                                      nullptr); // shape [1024, 16, 16]
 
-        // remove CLS token
-        cur = ggml_view_2d(ctx0, cur, n_embd, n_patches, ggml_row_size(cur->type, n_embd), 0);
-
-        // pixel shuffle
-        {
-            const int scale_factor = model.hparams.n_merge;
-            const int bsz = 1; // batch size, always 1 for now since we don't support batching
-            const int height = n_patches_y;
-            const int width = n_patches_x;
-            GGML_ASSERT(scale_factor > 0);
-            cur = ggml_reshape_4d(ctx0, cur, n_embd * scale_factor, height / scale_factor, width, bsz);
-            cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
-            cur = ggml_cont_4d(ctx0, cur, n_embd * scale_factor * scale_factor, height / scale_factor,
-                               width / scale_factor, bsz);
-            cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
-            // flatten to 2D
-            cur = ggml_cont_2d(ctx0, cur, n_embd * scale_factor * scale_factor, cur->ne[1] * cur->ne[2]);
-        }
-
-        // projector (always using GELU activation)
-        {
-            // projector LayerNorm uses pytorch's default eps = 1e-5
-            // ref: https://huggingface.co/OpenGVLab/InternVL3-8B-Instruct/blob/a34d3e4e129a5856abfd6aa6de79776484caa14e/modeling_internvl_chat.py#L79
-            cur = build_norm(cur, model.mm_0_w, model.mm_0_b, NORM_TYPE_NORMAL, 1e-5, -1);
-            cur = ggml_mul_mat(ctx0, model.mm_1_w, cur);
-            cur = ggml_add(ctx0, cur, model.mm_1_b);
-            cur = ggml_gelu(ctx0, cur);
-            cur = ggml_mul_mat(ctx0, model.mm_3_w, cur);
-            cur = ggml_add(ctx0, cur, model.mm_3_b);
-        }
-
-        // build the graph
+        ggml_build_forward_expand(gf, cur);
+
         return cur;
     }
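
For context (not part of the commit): the new path no longer hands build_vit the full model.position_embeddings table; it first gathers one learned embedding row per patch position with ggml_get_rows, driven by the "positions" input tensor. Below is a minimal, self-contained sketch of that gather on the CPU backend. The sizes, index values, and the ggml-cpu.h include are assumptions for illustration; older ggml versions declare ggml_graph_compute_with_ctx in ggml.h instead.

// sketch: row gather with ggml_get_rows, analogous to selecting learned pos embd
#include "ggml.h"
#include "ggml-cpu.h" // ggml_graph_compute_with_ctx lives here on recent ggml
#include <cstdint>
#include <cstdio>

int main() {
    ggml_init_params params = {
        /*.mem_size   =*/ 16 * 1024 * 1024,
        /*.mem_buffer =*/ nullptr,
        /*.no_alloc   =*/ false,
    };
    ggml_context * ctx0 = ggml_init(params);

    const int64_t n_embd = 4; // toy embedding width
    const int64_t n_rows = 8; // toy number of learned positions in the table
    const int64_t n_pos  = 3; // number of positions selected for this image

    // learned position-embedding table: ne = [n_embd, n_rows]
    ggml_tensor * position_embeddings = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_rows);
    for (int64_t i = 0; i < n_embd * n_rows; ++i) {
        ((float *) position_embeddings->data)[i] = (float) i;
    }

    // integer indices, analogous to the "positions" input tensor in the graph
    ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos);
    ((int32_t *) positions->data)[0] = 0;
    ((int32_t *) positions->data)[1] = 2;
    ((int32_t *) positions->data)[2] = 7;

    // gather the selected rows: result ne = [n_embd, n_pos]
    ggml_tensor * learned_pos_embd = ggml_get_rows(ctx0, position_embeddings, positions);

    ggml_cgraph * gf = ggml_new_graph(ctx0);
    ggml_build_forward_expand(gf, learned_pos_embd);
    ggml_graph_compute_with_ctx(ctx0, gf, /*n_threads =*/ 1);

    // prints rows 0, 2 and 7 of the table
    for (int64_t p = 0; p < n_pos; ++p) {
        for (int64_t e = 0; e < n_embd; ++e) {
            printf("%5.1f ", ((float *) learned_pos_embd->data)[p * n_embd + e]);
        }
        printf("\n");
    }

    ggml_free(ctx0);
    return 0;
}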