From 4cfa15fcd718700f7cee0c8c619238d5b50d0348 Mon Sep 17 00:00:00 2001
From: Saba Fallah <10401143+sfallah@users.noreply.github.com>
Date: Sat, 22 Nov 2025 16:57:34 +0100
Subject: [PATCH 1/2] - image encoding debugged - issues fixed, mainly related
 to wrong config like n_patches etc. - configs need to be corrected in the
 converter

---
 tools/mtmd/clip.cpp | 67 ++++++++++++++++++++++++++++-----------------
 1 file changed, 42 insertions(+), 25 deletions(-)

diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
index 22441d0f69..37e6e2a106 100644
--- a/tools/mtmd/clip.cpp
+++ b/tools/mtmd/clip.cpp
@@ -739,8 +739,8 @@ struct clip_graph {
 
                 struct ggml_tensor * q_r = ggml_reshape_4d(ctx0, Qcur, enc_d_heads, W, H, B * enc_n_heads);
 
-                struct ggml_tensor * rel_w = ggml_cont(ctx0,ggml_permute(ctx0, 
-                            ggml_mul_mat(ctx0, 
+                struct ggml_tensor * rel_w = ggml_cont(ctx0,ggml_permute(ctx0,
+                            ggml_mul_mat(ctx0,
                                 rw,
                                 ggml_cont(ctx0, ggml_permute(ctx0, q_r, 0, 2, 1, 3))),
                             0, 2, 1, 3));
@@ -801,9 +801,8 @@ struct clip_graph {
 
         cur = sam_layer_norm_2d(ctx0, cur, 256, model.neck_3_w, model.neck_3_b, hparams.eps);
 
-        //TODO : check conv padding
-        cur = ggml_conv_2d_s1_ph(ctx0, model.net_2, cur);
-        cur = ggml_conv_2d_s1_ph(ctx0, model.net_3, cur);
+        cur = ggml_conv_2d(ctx0, model.net_2, cur, 2,2,1,1, 1,1);
+        cur = ggml_conv_2d(ctx0, model.net_3, cur, 2,2,1,1, 1,1);
 
         ggml_build_forward_expand(gf, cur);
         return cur;
@@ -838,22 +837,27 @@ struct clip_graph {
 
         ggml_tensor * global_features_2 = build_dp_ocr_clip(global_features_1);
 
+        // FIXME: remove this override once n_patches comes from the model config (currently hardcoded)
+        int clip_n_patches = 256; // FIXME hardcoded for sam 1024x1024 with 16x16 patches
+
         // torch global_features = torch.cat((global_features_2[:, 1:], global_features_1.flatten(2).permute(0, 2, 1)), dim=-1)
         global_features_1 = ggml_cont(ctx0,ggml_permute(ctx0, global_features_1,2,1,0,3));
-        global_features_1 = ggml_reshape_2d(ctx0, global_features_1, n_embd, n_patches);
+        // flatten 2nd and 3rd dims
+        global_features_1 = ggml_reshape_2d(ctx0, global_features_1, global_features_1->ne[0], clip_n_patches);
 
         // remove CLS token
         global_features_2 = ggml_view_2d(ctx0, global_features_2,
-            n_embd, n_patches,
+            n_embd, clip_n_patches,
             ggml_row_size(global_features_2->type, n_embd), 0);
 
         ggml_tensor * global_features = ggml_concat(ctx0, global_features_2, global_features_1, 1);
-        global_features = ggml_reshape_2d(ctx0, global_features, 2* n_embd, n_patches);
+        global_features = ggml_reshape_2d(ctx0, global_features, 2* n_embd,clip_n_patches);
         global_features = ggml_cont(ctx0, global_features);
         global_features = ggml_mul_mat(ctx0, model.fc_w, global_features);
         global_features = ggml_add(ctx0, global_features, model.fc_b);
 
         global_features = build_global_local_features(ctx0,global_features);
+        global_features = ggml_cont(ctx0, ggml_permute(ctx0, global_features, 1, 0, 2, 3));
         ggml_build_forward_expand(gf, global_features);
         return gf;
     }
@@ -868,16 +872,16 @@ struct clip_graph {
         GGML_ASSERT(model.view_seperator != nullptr);
 
         // 1) global_features: [n_dim, h*w] -> [n_dim, w, h] -> [h, w, n_dim]
-        ggml_tensor * t = ggml_reshape_4d(ctx0, global_features, 1280, 64, 64, 1);  // (n_dim, w, h)
+        ggml_tensor * t = ggml_reshape_4d(ctx0, global_features, 1280, 16, 16, 1);  // (n_dim, w, h)
         t               = ggml_cont(ctx0, ggml_permute(ctx0, t, 2, 1, 0, 3)); // (h, w, n_dim)
         ggml_tensor * nl = ggml_cont(ctx0,ggml_permute(ctx0, model.image_newline, 2, 1, 0, 3));
-        nl = ggml_repeat_4d(ctx0, nl, 64, 1, 1280, 1); // n_pos rows
+        nl = ggml_repeat_4d(ctx0, nl, 16, 1, 1280, 1); // n_pos rows
 
 
         // 2) image_newline: [n_dim] -> [1, 1, n_dim] -> repeat to [h, 1, n_dim]
         t = ggml_concat(ctx0, t, nl, 1);  // (h, w+1, n_dim)
 
-        t = ggml_reshape_2d(ctx0, t, 1280, 64 * (64 + 1));  // (n_dim, h*(w+1))
+        t = ggml_reshape_2d(ctx0, t, 1280, 16 * (16 + 1));  // (n_dim, h*(w+1))
 
 
         // 5) append view_separator as an extra "token":
@@ -1538,9 +1542,12 @@ struct clip_graph {
         GGML_ASSERT(model.class_embedding != nullptr);
         GGML_ASSERT(model.position_embeddings != nullptr);
 
-        const int     n_pos = n_patches + 1;
-        ggml_tensor * inp = ggml_cont(ctx0,ggml_permute(ctx0, patch_embeds,2,1,0,3));
-        inp = ggml_reshape_2d(ctx0, inp, n_embd, n_patches);
+        ggml_tensor * inp = ggml_cpy(ctx0, patch_embeds, ggml_dup_tensor(ctx0, patch_embeds));
+
+
+        const int n_pos = 257; // +1 for [CLS]
+        inp = ggml_cont(ctx0,ggml_permute(ctx0, inp,2,1,0,3));
+        inp = ggml_reshape_2d(ctx0, inp, n_embd, inp->ne[1]*inp->ne[2]*inp->ne[3]);
 
 
 
@@ -1552,7 +1559,9 @@ struct clip_graph {
 
         // for selecting learned pos embd, used by ViT
         ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos);
-        cb(positions, "positions", -1);
+        ggml_set_name(positions, "positions");
+        ggml_set_input(positions);
+
         ggml_tensor * learned_pos_embd = ggml_get_rows(ctx0, model.position_embeddings, positions);
 
 
@@ -2525,7 +2534,7 @@ private:
         ggml_tensor * q_coord = ggml_arange(ctx, 0.0f, static_cast<float>(q_size), 1.0f); // [q_size]
         ggml_tensor * k_coord = ggml_arange(ctx, 0.0f, static_cast<float>(k_size), 1.0f); // [k_size]
         ggml_tensor * rel = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, k_size, q_size);
-        
+
         // broadcast reshape:
         q_coord = ggml_cont(ctx,
             ggml_repeat(ctx,
@@ -2538,8 +2547,8 @@ private:
         float q_scale = std::max((float)k_size/q_size, 1.0f);
         float k_scale = std::max((float)q_size/k_size, 1.0f);
 
-        // This wouldn't be triggered in DeepSeek-OCR. Just for compatibility with 
-        // the original implementation. 
+        // This wouldn't be triggered in DeepSeek-OCR. Just for compatibility with
+        // the original implementation.
         if (q_size != k_size) {
             q_coord = ggml_scale_inplace(ctx, q_coord, q_scale);
             k_coord = ggml_scale_inplace(ctx, k_coord, k_scale);
@@ -2548,7 +2557,7 @@ private:
         // -------------------------------------------------
         // relative_coords = q - k + (k_size - 1)    // SAME as PyTorch when no scaling
         // -------------------------------------------------
-        
+
         rel = ggml_sub(ctx, q_coord, k_coord); // [q_size, k_size]
         rel = ggml_scale_bias(ctx, rel, 1.0f, (k_size - 1.0f)*k_scale); // [q_size, k_size]
         // Clamp to [0, L-1] range for valid indexing
@@ -2559,10 +2568,10 @@ private:
         // -------------------------------------------------
 
         ggml_tensor * idx_2d = ggml_cast(ctx, rel, GGML_TYPE_I32); // [q_size, k_size]
-        
+
         // Gather from rel_pos  → [qk, C]
         // -------------------------------------------------
-        
+
         // flatten to 1D for ggml_get_rows
         int qk = q_size * k_size;
         ggml_tensor * idx_flat = ggml_reshape_1d(ctx, idx_2d, qk);          // [qk]
@@ -5237,9 +5246,7 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
             } break;
         case PROJECTOR_TYPE_DEEPSEEKOCR:
         {
-            int x_patch = img->nx / (params.patch_size);
-
-            n_patches += x_patch + 1;
+            n_patches = 1280;
 
         } break;
         default:
@@ -5573,10 +5580,20 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
         case PROJECTOR_TYPE_VOXTRAL:
         case PROJECTOR_TYPE_JANUS_PRO:
         case PROJECTOR_TYPE_COGVLM:
-        case PROJECTOR_TYPE_DEEPSEEKOCR:
             {
                 // do nothing
             } break;
+        case PROJECTOR_TYPE_DEEPSEEKOCR:
+        {
+            // FIXME: we need to correct this once all model configs are set correctly;
+            // n_patches is not correct right now
+            int32_t n_pos = 16 * 16 + 1; //hardcode for now
+            std::vector<int32_t> positions(n_pos);
+            for (int i = 0; i < n_pos; i++) {
+                positions[i] = i;
+            }
+            set_input_i32("positions", positions);
+        } break;
         case PROJECTOR_TYPE_LLAMA4:
             {
                 // set the 2D positions

From 3f71188303d9bdab9b1b51b786a7b3ecf55ee944 Mon Sep 17 00:00:00 2001
From: bluebread <hotbread70127@gmail.com>
Date: Sun, 23 Nov 2025 09:22:00 +0000
Subject: [PATCH 2/2] mtmd: correct token order

---
 src/llama-vocab.cpp     |  1 +
 tools/mtmd/mtmd-cli.cpp | 15 ++++++++++++---
 tools/mtmd/mtmd.cpp     |  4 ++++
 tools/mtmd/mtmd.h       |  3 +++
 4 files changed, 20 insertions(+), 3 deletions(-)

diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
index 735c5d547f..2634ab7c5e 100644
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@@ -2347,6 +2347,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                     || t.first == "_<EOT>"
                     || t.first == "<|end_of_text|>"
                     || t.first == "<end_of_utterance>" // smoldocling
+                    || t.first == "<｜end▁of▁sentence｜>" // deepseek-ocr
                ) {
                 special_eog_ids.insert(t.second);
                 if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
diff --git a/tools/mtmd/mtmd-cli.cpp b/tools/mtmd/mtmd-cli.cpp
index 3e19e95958..8ff93f08b9 100644
--- a/tools/mtmd/mtmd-cli.cpp
+++ b/tools/mtmd/mtmd-cli.cpp
@@ -222,14 +222,18 @@ static std::string chat_add_and_format(mtmd_cli_context & ctx, common_chat_msg &
 
 static int eval_message(mtmd_cli_context & ctx, common_chat_msg & msg) {
     bool add_bos = ctx.chat_history.empty();
-    auto formatted_chat = chat_add_and_format(ctx, msg);
-    LOG_DBG("formatted_chat.prompt: %s\n", formatted_chat.c_str());
 
     mtmd_input_text text;
-    text.text          = formatted_chat.c_str();
+    text.text          = msg.content.c_str();
     text.add_special   = add_bos;
     text.parse_special = true;
 
+    std::string formatted_chat; // function scope: text.text points into this buffer
+    if (!mtmd_is_deepseekocr(ctx.ctx_vision.get())) {
+        formatted_chat = chat_add_and_format(ctx, msg);
+        LOG_DBG("formatted_chat.prompt: %s\n", formatted_chat.c_str());
+        text.text = formatted_chat.c_str();
+    }
     if (g_is_interrupted) return 0;
 
     mtmd::input_chunks chunks(mtmd_input_chunks_init());
@@ -332,6 +336,11 @@ int main(int argc, char ** argv) {
         }
 
     } else {
+        if (mtmd_is_deepseekocr(ctx.ctx_vision.get())) {
+            LOG_ERR("\n DeepSeek-OCR doesn't support chat mode.");
+            return 1;
+        }
+        
         LOG("\n Running in chat mode, available commands:");
         if (mtmd_support_vision(ctx.ctx_vision.get())) {
             LOG("\n   /image <path>    load an image");
diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp
index 16349e8f40..994013bea9 100644
--- a/tools/mtmd/mtmd.cpp
+++ b/tools/mtmd/mtmd.cpp
@@ -864,6 +864,10 @@ int mtmd_get_audio_bitrate(mtmd_context * ctx) {
     return 16000; // 16kHz
 }
 
+bool mtmd_is_deepseekocr(mtmd_context * ctx) {
+    return ctx->ctx_v && clip_is_deepseekocr(ctx->ctx_v);
+}
+
 //
 // public API functions
 //
diff --git a/tools/mtmd/mtmd.h b/tools/mtmd/mtmd.h
index 775fba6215..99fdcd4650 100644
--- a/tools/mtmd/mtmd.h
+++ b/tools/mtmd/mtmd.h
@@ -117,6 +117,9 @@ MTMD_API bool mtmd_support_audio(mtmd_context * ctx);
 // return -1 if audio is not supported
 MTMD_API int mtmd_get_audio_bitrate(mtmd_context * ctx);
 
+// whether the current model is DeepSeek-OCR
+MTMD_API bool mtmd_is_deepseekocr(mtmd_context * ctx);
+
 // mtmd_bitmap
 //
 // if bitmap is image: