Merge branch 'sf/deepseek-ocr' into sf/deepseek-ocr

This commit is contained in:
Saba Fallah 2025-11-23 12:29:37 +01:00 committed by GitHub
commit 6dfda99c69
5 changed files with 61 additions and 25 deletions

src/llama-vocab.cpp

@@ -2347,6 +2347,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                 || t.first == "_<EOT>"
                 || t.first == "<|end_of_text|>"
                 || t.first == "<end_of_utterance>" // smoldocling
+                || t.first == "<end▁of▁sentence>" // deepseek-ocr
                ) {
             special_eog_ids.insert(t.second);
             if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
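Note: every id inserted into special_eog_ids is treated as end-of-generation, so registering "<end▁of▁sentence>" here is what lets DeepSeek-OCR output terminate cleanly. A minimal sketch of the consumer side, assuming a loaded vocab and the public llama_vocab_is_eog() helper:

    // sketch: stop decoding once the model emits any EOG token,
    // including the newly registered "<end▁of▁sentence>"
    bool should_stop(const llama_vocab * vocab, llama_token tok) {
        return llama_vocab_is_eog(vocab, tok); // covers special_eog_ids
    }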

tools/mtmd/clip.cpp

@@ -803,9 +803,8 @@ struct clip_graph {
         cur = sam_layer_norm_2d(ctx0, cur, 256, model.neck_3_w, model.neck_3_b, hparams.eps);
-        // TODO: check conv padding
-        cur = ggml_conv_2d_s1_ph(ctx0, model.net_2, cur);
-        cur = ggml_conv_2d_s1_ph(ctx0, model.net_3, cur);
+        cur = ggml_conv_2d(ctx0, model.net_2, cur, 2, 2, 1, 1, 1, 1);
+        cur = ggml_conv_2d(ctx0, model.net_3, cur, 2, 2, 1, 1, 1, 1);
         ggml_build_forward_expand(gf, cur);
         return cur;
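The replaced ggml_conv_2d_s1_ph() calls were stride-1/half-padding, which preserves spatial size; the new ggml_conv_2d(..., 2, 2, 1, 1, 1, 1) calls use stride 2, padding 1, dilation 1. Assuming 3x3 kernels (not shown in this hunk), each call halves the SAM feature map, 64x64 -> 32x32 -> 16x16, which is where the 256 patches below come from:

    // sketch: ggml_conv_2d output size, out = (in + 2*p - d*(k-1) - 1) / s + 1
    int conv_out(int in, int k, int s, int p, int d) {
        return (in + 2*p - d*(k - 1) - 1) / s + 1;
    }
    // conv_out(64, 3, 2, 1, 1) == 32; conv_out(32, 3, 2, 1, 1) == 16
    // -> 16 * 16 = 256 tokens, matching clip_n_patches in the next hunk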
@@ -840,22 +839,27 @@ struct clip_graph {
         ggml_tensor * global_features_2 = build_dp_ocr_clip(global_features_1);
+        // FIXME: n_patches is hardcoded, remove this
+        int clip_n_patches = 256; // FIXME: hardcoded for sam 1024x1024 with 16x16 patches
         // torch: global_features = torch.cat((global_features_2[:, 1:], global_features_1.flatten(2).permute(0, 2, 1)), dim=-1)
         global_features_1 = ggml_cont(ctx0, ggml_permute(ctx0, global_features_1, 2, 1, 0, 3));
-        global_features_1 = ggml_reshape_2d(ctx0, global_features_1, n_embd, n_patches);
+        // flatten 2nd and 3rd dims
+        global_features_1 = ggml_reshape_2d(ctx0, global_features_1, global_features_1->ne[0], clip_n_patches);
         // remove CLS token
         global_features_2 = ggml_view_2d(ctx0, global_features_2,
-                n_embd, n_patches,
+                n_embd, clip_n_patches,
                 ggml_row_size(global_features_2->type, n_embd), 0);
         ggml_tensor * global_features = ggml_concat(ctx0, global_features_2, global_features_1, 1);
-        global_features = ggml_reshape_2d(ctx0, global_features, 2 * n_embd, n_patches);
+        global_features = ggml_reshape_2d(ctx0, global_features, 2 * n_embd, clip_n_patches);
         global_features = ggml_cont(ctx0, global_features);
         global_features = ggml_mul_mat(ctx0, model.fc_w, global_features);
         global_features = ggml_add(ctx0, global_features, model.fc_b);
         global_features = build_global_local_features(ctx0, global_features);
+        global_features = ggml_cont(ctx0, ggml_permute(ctx0, global_features, 1, 0, 2, 3));
         ggml_build_forward_expand(gf, global_features);
         return gf;
     }
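The added ggml_permute(ctx0, global_features, 1, 0, 2, 3) swaps the first two dimensions, i.e. it is the same transpose that ggml_transpose() performs, and the ggml_cont() materializes the transposed layout so downstream ops see contiguous memory. A sketch of the idiom:

    // sketch: transpose a [n_embd, n_tokens] tensor to [n_tokens, n_embd]
    // and make it contiguous; same as ggml_cont(ctx, ggml_transpose(ctx, t))
    ggml_tensor * transpose_cont(ggml_context * ctx, ggml_tensor * t) {
        return ggml_cont(ctx, ggml_permute(ctx, t, 1, 0, 2, 3));
    }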
@@ -870,16 +874,16 @@ struct clip_graph {
         GGML_ASSERT(model.view_seperator != nullptr);
         // 1) global_features: [n_dim, h*w] -> [n_dim, w, h] -> [h, w, n_dim]
-        ggml_tensor * t = ggml_reshape_4d(ctx0, global_features, 1280, 64, 64, 1); // (n_dim, w, h)
+        ggml_tensor * t = ggml_reshape_4d(ctx0, global_features, 1280, 16, 16, 1); // (n_dim, w, h)
         t = ggml_cont(ctx0, ggml_permute(ctx0, t, 2, 1, 0, 3)); // (h, w, n_dim)
         ggml_tensor * nl = ggml_cont(ctx0, ggml_permute(ctx0, model.image_newline, 2, 1, 0, 3));
-        nl = ggml_repeat_4d(ctx0, nl, 64, 1, 1280, 1); // n_pos rows
+        nl = ggml_repeat_4d(ctx0, nl, 16, 1, 1280, 1); // n_pos rows
         // 2) image_newline: [n_dim] -> [1, 1, n_dim] -> repeat to [h, 1, n_dim]
         t = ggml_concat(ctx0, t, nl, 1); // (h, w+1, n_dim)
-        t = ggml_reshape_2d(ctx0, t, 1280, 64 * (64 + 1)); // (n_dim, h*(w+1))
+        t = ggml_reshape_2d(ctx0, t, 1280, 16 * (16 + 1)); // (n_dim, h*(w+1))
         // 5) append view_separator as an extra "token":
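With the corrected 16x16 grid, the token count of the global view works out as 16 rows x (16 + 1) columns = 272 embeddings once the image_newline embedding is appended to each row, plus the single view_seperator token appended afterwards, i.e. 273 positions total, rather than the 64 x (64 + 1) + 1 = 4161 the old hardcoded sizes implied.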
@@ -1540,9 +1544,12 @@ struct clip_graph {
         GGML_ASSERT(model.class_embedding != nullptr);
         GGML_ASSERT(model.position_embeddings != nullptr);
-        const int n_pos = n_patches + 1;
-        ggml_tensor * inp = ggml_cont(ctx0, ggml_permute(ctx0, patch_embeds, 2, 1, 0, 3));
-        inp = ggml_reshape_2d(ctx0, inp, n_embd, n_patches);
+        ggml_tensor * inp = ggml_cpy(ctx0, patch_embeds, ggml_dup_tensor(ctx0, patch_embeds));
+        const int n_pos = 257; // +1 for [CLS]
+        inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 2, 1, 0, 3));
+        inp = ggml_reshape_2d(ctx0, inp, n_embd, inp->ne[1]*inp->ne[2]*inp->ne[3]);
@@ -1554,7 +1561,9 @@ struct clip_graph {
         // for selecting learned pos embd, used by ViT
         ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos);
-        cb(positions, "positions", -1);
+        ggml_set_name(positions, "positions");
+        ggml_set_input(positions);
         ggml_tensor * learned_pos_embd = ggml_get_rows(ctx0, model.position_embeddings, positions);
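ggml_get_rows(ctx0, a, b) gathers the rows of a indexed by the I32 tensor b, so model.position_embeddings acts as a lookup table of learned absolute position embeddings. Naming the index tensor and marking it with ggml_set_input() (instead of routing it through the cb() callback) is what allows it to be located and filled at eval time, as the clip_image_batch_encode() hunk below does. A sketch of the pattern:

    // sketch: learned absolute position embeddings via row gather;
    // "positions" holds 0..n_pos-1 and is filled per batch
    ggml_tensor * add_pos_embd(ggml_context * ctx,
                               ggml_tensor * inp,       // [n_embd, n_pos]
                               ggml_tensor * pos_embd,  // lookup table
                               ggml_tensor * positions) // I32 [n_pos]
    {
        return ggml_add(ctx, inp, ggml_get_rows(ctx, pos_embd, positions));
    }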
@@ -5676,10 +5685,20 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
         case PROJECTOR_TYPE_VOXTRAL:
         case PROJECTOR_TYPE_JANUS_PRO:
         case PROJECTOR_TYPE_COGVLM:
-        case PROJECTOR_TYPE_DEEPSEEKOCR:
             {
                 // do nothing
             } break;
+        case PROJECTOR_TYPE_DEEPSEEKOCR:
+            {
+                // FIXME: we need to correct this when all model configs are set correctly
+                // n_patches is not correct right now
+                int32_t n_pos = 16 * 16 + 1; // hardcoded for now
+                std::vector<int32_t> positions(n_pos);
+                for (int i = 0; i < n_pos; i++) {
+                    positions[i] = i;
+                }
+                set_input_i32("positions", positions);
+            } break;
         case PROJECTOR_TYPE_LLAMA4:
             {
                 // set the 2D positions
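set_input_i32() is the clip.cpp helper that copies host data into the graph input of the given name; the underlying mechanism is roughly the following (a sketch using the public ggml API, not the helper's actual body):

    // sketch: locate the named input in the built graph and upload the indices
    void fill_positions(ggml_cgraph * gf, const std::vector<int32_t> & pos) {
        ggml_tensor * t = ggml_graph_get_tensor(gf, "positions");
        ggml_backend_tensor_set(t, pos.data(), 0, pos.size() * sizeof(int32_t));
    }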

tools/mtmd/mtmd-cli.cpp

@@ -222,14 +222,18 @@ static std::string chat_add_and_format(mtmd_cli_context & ctx, common_chat_msg &
 static int eval_message(mtmd_cli_context & ctx, common_chat_msg & msg) {
     bool add_bos = ctx.chat_history.empty();
-    auto formatted_chat = chat_add_and_format(ctx, msg);
-    LOG_DBG("formatted_chat.prompt: %s\n", formatted_chat.c_str());
     mtmd_input_text text;
-    text.text          = formatted_chat.c_str();
+    text.text          = msg.content.c_str();
     text.add_special   = add_bos;
     text.parse_special = true;
+    if (!mtmd_is_deepseekocr(ctx.ctx_vision.get())) {
+        auto formatted_chat = chat_add_and_format(ctx, msg);
+        LOG_DBG("formatted_chat.prompt: %s\n", formatted_chat.c_str());
+        text.text = formatted_chat.c_str();
+    }
     if (g_is_interrupted) return 0;
     mtmd::input_chunks chunks(mtmd_input_chunks_init());
@@ -332,6 +336,11 @@ int main(int argc, char ** argv) {
             }
         } else {
+            if (mtmd_is_deepseekocr(ctx.ctx_vision.get())) {
+                LOG_ERR("\n DeepSeek-OCR doesn't support chat mode.");
+                return 1;
+            }
             LOG("\n Running in chat mode, available commands:");
             if (mtmd_support_vision(ctx.ctx_vision.get())) {
                 LOG("\n   /image <path>    load an image");

tools/mtmd/mtmd.cpp

@@ -864,6 +864,10 @@ int mtmd_get_audio_bitrate(mtmd_context * ctx) {
     return 16000; // 16kHz
 }
+bool mtmd_is_deepseekocr(mtmd_context * ctx) {
+    return ctx->ctx_v && clip_is_deepseekocr(ctx->ctx_v);
+}
 //
 // public API functions
 //

tools/mtmd/mtmd.h

@@ -117,6 +117,9 @@ MTMD_API bool mtmd_support_audio(mtmd_context * ctx);
 // return -1 if audio is not supported
 MTMD_API int mtmd_get_audio_bitrate(mtmd_context * ctx);
+// whether the current model is DeepSeek-OCR
+MTMD_API bool mtmd_is_deepseekocr(mtmd_context * ctx);
 // mtmd_bitmap
 //
 // if bitmap is image: