Merge branch 'sf/deepseek-ocr' of github.com:sfallah/llama.cpp into sf/deepseek-ocr

commit e8b2610227
@@ -141,10 +141,7 @@
 #define TN_SAM_FFN_UP "sam.blk.%d.mlp.lin1.%s"
 #define TN_SAM_FFN_DOWN "sam.blk.%d.mlp.lin2.%s"
 #define TN_SAM_NECK "sam.neck.%d.%s"
 #define TN_SAM_NET "sam.net_%d.%s"
-
-#define TN_SAM_ATTN_OUT "sam.blk.%d.attn_out"
-
 // align x to upper multiple of n
 #define CLIP_ALIGN(x, n) ((((x) + (n) - 1) / (n)) * (n))
 
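For reference, CLIP_ALIGN rounds x up to the next multiple of n using integer arithmetic, e.g. CLIP_ALIGN(10, 4) == 12. A minimal standalone check (hypothetical test program, not part of this commit):

// hypothetical test, not part of this commit: checks the rounding behaviour of CLIP_ALIGN
#include <cassert>

#define CLIP_ALIGN(x, n) ((((x) + (n) - 1) / (n)) * (n))

int main() {
    assert(CLIP_ALIGN(10,  4) == 12); // 10 rounded up to the next multiple of 4
    assert(CLIP_ALIGN(12,  4) == 12); // already aligned values are unchanged
    assert(CLIP_ALIGN( 1, 16) == 16);
    return 0;
}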
@@ -1558,48 +1558,20 @@ struct clip_graph {
         // add CLS token
         inp = ggml_concat(ctx0, inp, model.class_embedding, 1);
 
-        // The larger models use a different ViT, which uses RMS norm instead of layer norm
-        // ref: https://github.com/ggml-org/llama.cpp/pull/13443#issuecomment-2869786188
-        norm_type norm_t = (hparams.n_embd == 3200 && hparams.n_layer == 45) ?
-            NORM_TYPE_RMS // 6B ViT (Used by InternVL 2.5/3 - 26B, 38B, 78B)
-            :
-            NORM_TYPE_NORMAL; // 300M ViT (Used by all smaller InternVL models)
+        //TODO : check norm type for dp-ocr-clip
+        norm_type norm_t = NORM_TYPE_NORMAL;
 
-        ggml_tensor * cur = build_vit(inp, n_pos, norm_t, hparams.ffn_op, model.position_embeddings,
+        // for selecting learned pos embd, used by ViT
+        struct ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos);
+        ggml_set_name(positions, "positions");
+        ggml_set_input(positions);
+        ggml_tensor * learned_pos_embd = ggml_get_rows(ctx0, model.position_embeddings, positions);
+
+
+        ggml_tensor * cur = build_vit(inp, n_pos, norm_t, hparams.ffn_op, learned_pos_embd,
             nullptr); // shape [1024, 16, 16]
 
-        // remove CLS token
-        cur = ggml_view_2d(ctx0, cur, n_embd, n_patches, ggml_row_size(cur->type, n_embd), 0);
-
-        // pixel shuffle
-        {
-            const int scale_factor = model.hparams.n_merge;
-            const int bsz = 1; // batch size, always 1 for now since we don't support batching
-            const int height = n_patches_y;
-            const int width = n_patches_x;
-            GGML_ASSERT(scale_factor > 0);
-            cur = ggml_reshape_4d(ctx0, cur, n_embd * scale_factor, height / scale_factor, width, bsz);
-            cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
-            cur = ggml_cont_4d(ctx0, cur, n_embd * scale_factor * scale_factor, height / scale_factor,
-                width / scale_factor, bsz);
-            cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
-            // flatten to 2D
-            cur = ggml_cont_2d(ctx0, cur, n_embd * scale_factor * scale_factor, cur->ne[1] * cur->ne[2]);
-        }
-
-        // projector (always using GELU activation)
-        {
-            // projector LayerNorm uses pytorch's default eps = 1e-5
-            // ref: https://huggingface.co/OpenGVLab/InternVL3-8B-Instruct/blob/a34d3e4e129a5856abfd6aa6de79776484caa14e/modeling_internvl_chat.py#L79
-            cur = build_norm(cur, model.mm_0_w, model.mm_0_b, NORM_TYPE_NORMAL, 1e-5, -1);
-            cur = ggml_mul_mat(ctx0, model.mm_1_w, cur);
-            cur = ggml_add(ctx0, cur, model.mm_1_b);
-            cur = ggml_gelu(ctx0, cur);
-            cur = ggml_mul_mat(ctx0, model.mm_3_w, cur);
-            cur = ggml_add(ctx0, cur, model.mm_3_b);
-        }
-
-        // build the graph
+        ggml_build_forward_expand(gf, cur);
 
         return cur;
     }
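The new code no longer hands model.position_embeddings to build_vit directly; instead an I32 "positions" input tensor selects one row of the embedding table per position via ggml_get_rows, and the gathered rows are passed in. A minimal sketch of the same pattern against the public ggml API, with toy sizes, standalone, and not taken from this commit:

// minimal sketch: selecting learned position embeddings with ggml_get_rows (toy sizes)
#include "ggml.h"
#include <cstdio>

int main() {
    struct ggml_init_params params = {
        /*.mem_size   =*/ 16*1024*1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx = ggml_init(params);

    const int n_embd = 8; // toy embedding width (the real ViT uses e.g. 1024)
    const int n_pos  = 4; // toy number of positions

    // learned embedding table, shape [n_embd, n_pos]
    struct ggml_tensor * pos_embd  = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_pos);
    // index tensor: which row of the table to use for each position
    struct ggml_tensor * positions = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_pos);
    ggml_set_name(positions, "positions");
    ggml_set_input(positions); // filled by the caller at eval time, as in the patch

    // one embedding row is gathered per index
    struct ggml_tensor * learned_pos_embd = ggml_get_rows(ctx, pos_embd, positions);

    printf("selected embeddings shape: [%lld, %lld]\n",
           (long long) learned_pos_embd->ne[0], (long long) learned_pos_embd->ne[1]);

    ggml_free(ctx);
    return 0;
}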
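The removed pixel-shuffle block is the InternVL-style space-to-depth step: it folds each scale_factor x scale_factor neighbourhood of patches into the channel dimension, so the token count shrinks by scale_factor^2 while the per-token width grows by the same factor. A plain C++ sketch of that rearrangement on a flat buffer (illustrative only, independent of ggml; the exact channel ordering produced by the removed reshape/permute sequence may differ):

// illustrative pixel-shuffle (space-to-depth) sketch on a flat buffer, toy sizes
#include <cstdio>
#include <vector>

int main() {
    const int n_embd = 2, height = 4, width = 4, s = 2; // s = scale_factor

    // input: height*width patch embeddings of width n_embd, row-major over the patch grid
    std::vector<float> in(height * width * n_embd);
    for (size_t i = 0; i < in.size(); ++i) in[i] = (float) i;

    // output: (height/s)*(width/s) tokens of width n_embd*s*s
    std::vector<float> out((height/s) * (width/s) * n_embd * s * s);

    for (int i = 0; i < height/s; ++i) {         // output grid row
        for (int j = 0; j < width/s; ++j) {      // output grid column
            float * dst = &out[(i*(width/s) + j) * n_embd*s*s];
            for (int di = 0; di < s; ++di) {     // rows of the s x s neighbourhood
                for (int dj = 0; dj < s; ++dj) { // columns of the neighbourhood
                    const float * src = &in[((i*s + di)*width + (j*s + dj)) * n_embd];
                    for (int c = 0; c < n_embd; ++c) {
                        *dst++ = src[c];         // fold the patch into the channel dimension
                    }
                }
            }
        }
    }

    printf("tokens: %d -> %d, channels per token: %d -> %d\n",
           height*width, (height/s)*(width/s), n_embd, n_embd*s*s);
    return 0;
}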
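The removed projector is a small MLP: LayerNorm with eps = 1e-5 (mm_0), a linear layer (mm_1), GELU, then a second linear layer (mm_3). A plain C++ sketch of that sequence on a single token vector, with toy dimensions and weights and illustrative helper names, not taken from this commit:

// illustrative LayerNorm -> Linear -> GELU -> Linear projector sketch (toy weights)
#include <cmath>
#include <cstdio>
#include <vector>

using vec = std::vector<float>;

// LayerNorm as in build_norm; learned scale/shift (mm_0_w/mm_0_b) omitted for brevity
static vec layer_norm(const vec & x, float eps = 1e-5f) {
    float mean = 0.0f, var = 0.0f;
    for (float v : x) mean += v;
    mean /= x.size();
    for (float v : x) var += (v - mean) * (v - mean);
    var /= x.size();
    vec y(x.size());
    for (size_t i = 0; i < x.size(); ++i) y[i] = (x[i] - mean) / std::sqrt(var + eps);
    return y;
}

static vec linear(const vec & x, const std::vector<vec> & w, const vec & b) {
    vec y(b); // one output per weight row
    for (size_t o = 0; o < w.size(); ++o)
        for (size_t i = 0; i < x.size(); ++i)
            y[o] += w[o][i] * x[i];
    return y;
}

static vec gelu(const vec & x) {
    vec y(x.size());
    for (size_t i = 0; i < x.size(); ++i)
        y[i] = 0.5f * x[i] * (1.0f + std::erf(x[i] / std::sqrt(2.0f)));
    return y;
}

int main() {
    const vec x = {0.5f, -1.0f, 2.0f, 0.0f};                        // one pixel-shuffled token
    const std::vector<vec> w1(3, vec(4, 0.1f)), w2(2, vec(3, 0.2f)); // toy weights
    const vec b1(3, 0.0f), b2(2, 0.0f);

    vec cur = layer_norm(x);          // mm_0: LayerNorm, eps = 1e-5
    cur = gelu(linear(cur, w1, b1));  // mm_1 + GELU
    cur = linear(cur, w2, b2);        // mm_3: final projection to the LLM width
    printf("projected %zu -> %zu dims\n", x.size(), cur.size());
    return 0;
}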