From 4cfa15fcd718700f7cee0c8c619238d5b50d0348 Mon Sep 17 00:00:00 2001 From: Saba Fallah <10401143+sfallah@users.noreply.github.com> Date: Sat, 22 Nov 2025 16:57:34 +0100 Subject: [PATCH 1/2] - image encoding debugged - issues fixed mainly related wrong config like n_patches etc. - configs need to be corrected in the converter --- tools/mtmd/clip.cpp | 67 ++++++++++++++++++++++++++++----------------- 1 file changed, 42 insertions(+), 25 deletions(-) diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index 22441d0f69..37e6e2a106 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -739,8 +739,8 @@ struct clip_graph { struct ggml_tensor * q_r = ggml_reshape_4d(ctx0, Qcur, enc_d_heads, W, H, B * enc_n_heads); - struct ggml_tensor * rel_w = ggml_cont(ctx0,ggml_permute(ctx0, - ggml_mul_mat(ctx0, + struct ggml_tensor * rel_w = ggml_cont(ctx0,ggml_permute(ctx0, + ggml_mul_mat(ctx0, rw, ggml_cont(ctx0, ggml_permute(ctx0, q_r, 0, 2, 1, 3))), 0, 2, 1, 3)); @@ -801,9 +801,8 @@ struct clip_graph { cur = sam_layer_norm_2d(ctx0, cur, 256, model.neck_3_w, model.neck_3_b, hparams.eps); - //TODO : check conv padding - cur = ggml_conv_2d_s1_ph(ctx0, model.net_2, cur); - cur = ggml_conv_2d_s1_ph(ctx0, model.net_3, cur); + cur = ggml_conv_2d(ctx0, model.net_2, cur, 2,2,1,1, 1,1); + cur = ggml_conv_2d(ctx0, model.net_3, cur, 2,2,1,1, 1,1); ggml_build_forward_expand(gf, cur); return cur; @@ -838,22 +837,27 @@ struct clip_graph { ggml_tensor * global_features_2 = build_dp_ocr_clip(global_features_1); + // FIXME remove n_patches is hardcoded + int clip_n_patches = 256; // FIXME hardcoded for sam 1024x1024 with 16x16 patches + // torch global_features = torch.cat((global_features_2[:, 1:], global_features_1.flatten(2).permute(0, 2, 1)), dim=-1) global_features_1 = ggml_cont(ctx0,ggml_permute(ctx0, global_features_1,2,1,0,3)); - global_features_1 = ggml_reshape_2d(ctx0, global_features_1, n_embd, n_patches); + // flatten 2nd and 3rd dims + global_features_1 = ggml_reshape_2d(ctx0, global_features_1, global_features_1->ne[0], clip_n_patches); // remove CLS token global_features_2 = ggml_view_2d(ctx0, global_features_2, - n_embd, n_patches, + n_embd, clip_n_patches, ggml_row_size(global_features_2->type, n_embd), 0); ggml_tensor * global_features = ggml_concat(ctx0, global_features_2, global_features_1, 1); - global_features = ggml_reshape_2d(ctx0, global_features, 2* n_embd, n_patches); + global_features = ggml_reshape_2d(ctx0, global_features, 2* n_embd,clip_n_patches); global_features = ggml_cont(ctx0, global_features); global_features = ggml_mul_mat(ctx0, model.fc_w, global_features); global_features = ggml_add(ctx0, global_features, model.fc_b); global_features = build_global_local_features(ctx0,global_features); + global_features = ggml_cont(ctx0, ggml_permute(ctx0, global_features, 1, 0, 2, 3)); ggml_build_forward_expand(gf, global_features); return gf; } @@ -868,16 +872,16 @@ struct clip_graph { GGML_ASSERT(model.view_seperator != nullptr); // 1) global_features: [n_dim, h*w] -> [n_dim, w, h] -> [h, w, n_dim] - ggml_tensor * t = ggml_reshape_4d(ctx0, global_features, 1280, 64, 64, 1); // (n_dim, w, h) + ggml_tensor * t = ggml_reshape_4d(ctx0, global_features, 1280, 16, 16, 1); // (n_dim, w, h) t = ggml_cont(ctx0, ggml_permute(ctx0, t, 2, 1, 0, 3)); // (h, w, n_dim) ggml_tensor * nl = ggml_cont(ctx0,ggml_permute(ctx0, model.image_newline, 2, 1, 0, 3)); - nl = ggml_repeat_4d(ctx0, nl, 64, 1, 1280, 1); // n_pos rows + nl = ggml_repeat_4d(ctx0, nl, 16, 1, 1280, 1); // n_pos rows // 2) image_newline: [n_dim] -> [1, 1, n_dim] -> repeat to [h, 1, n_dim] t = ggml_concat(ctx0, t, nl, 1); // (h, w+1, n_dim) - t = ggml_reshape_2d(ctx0, t, 1280, 64 * (64 + 1)); // (n_dim, h*(w+1)) + t = ggml_reshape_2d(ctx0, t, 1280, 16 * (16 + 1)); // (n_dim, h*(w+1)) // 5) append view_separator as an extra "token": @@ -1538,9 +1542,12 @@ struct clip_graph { GGML_ASSERT(model.class_embedding != nullptr); GGML_ASSERT(model.position_embeddings != nullptr); - const int n_pos = n_patches + 1; - ggml_tensor * inp = ggml_cont(ctx0,ggml_permute(ctx0, patch_embeds,2,1,0,3)); - inp = ggml_reshape_2d(ctx0, inp, n_embd, n_patches); + ggml_tensor * inp = ggml_cpy(ctx0, patch_embeds, ggml_dup_tensor(ctx0, patch_embeds)); + + + const int n_pos = 257; // +1 for [CLS] + inp = ggml_cont(ctx0,ggml_permute(ctx0, inp,2,1,0,3)); + inp = ggml_reshape_2d(ctx0, inp, n_embd, inp->ne[1]*inp->ne[2]*inp->ne[3]); @@ -1552,7 +1559,9 @@ struct clip_graph { // for selecting learned pos embd, used by ViT ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos); - cb(positions, "positions", -1); + ggml_set_name(positions, "positions"); + ggml_set_input(positions); + ggml_tensor * learned_pos_embd = ggml_get_rows(ctx0, model.position_embeddings, positions); @@ -2525,7 +2534,7 @@ private: ggml_tensor * q_coord = ggml_arange(ctx, 0.0f, static_cast(q_size), 1.0f); // [q_size] ggml_tensor * k_coord = ggml_arange(ctx, 0.0f, static_cast(k_size), 1.0f); // [k_size] ggml_tensor * rel = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, k_size, q_size); - + // broadcast reshape: q_coord = ggml_cont(ctx, ggml_repeat(ctx, @@ -2538,8 +2547,8 @@ private: float q_scale = std::max((float)k_size/q_size, 1.0f); float k_scale = std::max((float)q_size/k_size, 1.0f); - // This wouldn't be triggered in DeepSeek-OCR. Just for compatibility with - // the original implementation. + // This wouldn't be triggered in DeepSeek-OCR. Just for compatibility with + // the original implementation. if (q_size != k_size) { q_coord = ggml_scale_inplace(ctx, q_coord, q_scale); k_coord = ggml_scale_inplace(ctx, k_coord, k_scale); @@ -2548,7 +2557,7 @@ private: // ------------------------------------------------- // relative_coords = q - k + (k_size - 1) // SAME as PyTorch when no scaling // ------------------------------------------------- - + rel = ggml_sub(ctx, q_coord, k_coord); // [q_size, k_size] rel = ggml_scale_bias(ctx, rel, 1.0f, (k_size - 1.0f)*k_scale); // [q_size, k_size] // Clamp to [0, L-1] range for valid indexing @@ -2559,10 +2568,10 @@ private: // ------------------------------------------------- ggml_tensor * idx_2d = ggml_cast(ctx, rel, GGML_TYPE_I32); // [q_size, k_size] - + // Gather from rel_pos → [qk, C] // ------------------------------------------------- - + // flatten to 1D for ggml_get_rows int qk = q_size * k_size; ggml_tensor * idx_flat = ggml_reshape_1d(ctx, idx_2d, qk); // [qk] @@ -5237,9 +5246,7 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im } break; case PROJECTOR_TYPE_DEEPSEEKOCR: { - int x_patch = img->nx / (params.patch_size); - - n_patches += x_patch + 1; + n_patches = 1280; } break; default: @@ -5573,10 +5580,20 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima case PROJECTOR_TYPE_VOXTRAL: case PROJECTOR_TYPE_JANUS_PRO: case PROJECTOR_TYPE_COGVLM: - case PROJECTOR_TYPE_DEEPSEEKOCR: { // do nothing } break; + case PROJECTOR_TYPE_DEEPSEEKOCR: + { + //FIXME we need correct this when all model configs are set correctly + //n_patch is not correct right now + int32_t n_pos = 16 * 16 + 1; //hardcode for now + std::vector positions(n_pos); + for (int i = 0; i < n_pos; i++) { + positions[i] = i; + } + set_input_i32("positions", positions); + } break; case PROJECTOR_TYPE_LLAMA4: { // set the 2D positions From 3f71188303d9bdab9b1b51b786a7b3ecf55ee944 Mon Sep 17 00:00:00 2001 From: bluebread Date: Sun, 23 Nov 2025 09:22:00 +0000 Subject: [PATCH 2/2] mtmd: correct token order --- src/llama-vocab.cpp | 1 + tools/mtmd/mtmd-cli.cpp | 15 ++++++++++++--- tools/mtmd/mtmd.cpp | 4 ++++ tools/mtmd/mtmd.h | 3 +++ 4 files changed, 20 insertions(+), 3 deletions(-) diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp index 735c5d547f..2634ab7c5e 100644 --- a/src/llama-vocab.cpp +++ b/src/llama-vocab.cpp @@ -2347,6 +2347,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { || t.first == "_" || t.first == "<|end_of_text|>" || t.first == "" // smoldocling + || t.first == "<|end▁of▁sentence|>" // deepseek-ocr ) { special_eog_ids.insert(t.second); if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) { diff --git a/tools/mtmd/mtmd-cli.cpp b/tools/mtmd/mtmd-cli.cpp index 3e19e95958..8ff93f08b9 100644 --- a/tools/mtmd/mtmd-cli.cpp +++ b/tools/mtmd/mtmd-cli.cpp @@ -222,14 +222,18 @@ static std::string chat_add_and_format(mtmd_cli_context & ctx, common_chat_msg & static int eval_message(mtmd_cli_context & ctx, common_chat_msg & msg) { bool add_bos = ctx.chat_history.empty(); - auto formatted_chat = chat_add_and_format(ctx, msg); - LOG_DBG("formatted_chat.prompt: %s\n", formatted_chat.c_str()); mtmd_input_text text; - text.text = formatted_chat.c_str(); + text.text = msg.content.c_str(); text.add_special = add_bos; text.parse_special = true; + if (!mtmd_is_deepseekocr(ctx.ctx_vision.get())) { + auto formatted_chat = chat_add_and_format(ctx, msg); + LOG_DBG("formatted_chat.prompt: %s\n", formatted_chat.c_str()); + text.text = formatted_chat.c_str(); + } + if (g_is_interrupted) return 0; mtmd::input_chunks chunks(mtmd_input_chunks_init()); @@ -332,6 +336,11 @@ int main(int argc, char ** argv) { } } else { + if (mtmd_is_deepseekocr(ctx.ctx_vision.get())) { + LOG_ERR("\n DeepSeek-OCR doesn't support chat mode."); + return 1; + } + LOG("\n Running in chat mode, available commands:"); if (mtmd_support_vision(ctx.ctx_vision.get())) { LOG("\n /image load an image"); diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp index 16349e8f40..994013bea9 100644 --- a/tools/mtmd/mtmd.cpp +++ b/tools/mtmd/mtmd.cpp @@ -864,6 +864,10 @@ int mtmd_get_audio_bitrate(mtmd_context * ctx) { return 16000; // 16kHz } +bool mtmd_is_deepseekocr(mtmd_context * ctx) { + return ctx->ctx_v && clip_is_deepseekocr(ctx->ctx_v); +} + // // public API functions // diff --git a/tools/mtmd/mtmd.h b/tools/mtmd/mtmd.h index 775fba6215..99fdcd4650 100644 --- a/tools/mtmd/mtmd.h +++ b/tools/mtmd/mtmd.h @@ -117,6 +117,9 @@ MTMD_API bool mtmd_support_audio(mtmd_context * ctx); // return -1 if audio is not supported MTMD_API int mtmd_get_audio_bitrate(mtmd_context * ctx); +// whether the current model is DeepSeek-OCR +MTMD_API bool mtmd_is_deepseekocr(mtmd_context * ctx); + // mtmd_bitmap // // if bitmap is image: