From cec9a5c6e0d0fc949ecc92e0eadccb2195174f3b Mon Sep 17 00:00:00 2001
From: Saba Fallah <10401143+sfallah@users.noreply.github.com>
Date: Mon, 17 Nov 2025 18:59:40 +0100
Subject: [PATCH] sam: remove erroneous early return

---
 tools/mtmd/clip.cpp | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
index f4dc48e442..1d29bc8afe 100644
--- a/tools/mtmd/clip.cpp
+++ b/tools/mtmd/clip.cpp
@@ -786,8 +786,6 @@ struct clip_graph {
             // residual 2
             cur = ggml_add(ctx0, cur, inpFF);
             cb(cur, "layer_out", il);
-
-            return cur;  // B, 1024, 16, 16
         }
 
         cur = ggml_cont(ctx0, ggml_permute(ctx0, inpL, 2, 0, 1, 3));
@@ -1538,12 +1536,17 @@ struct clip_graph {
     ggml_tensor * build_dp_ocr_clip(ggml_tensor * inpL, ggml_tensor * patch_embeds) {
         GGML_ASSERT(model.class_embedding != nullptr);
         GGML_ASSERT(model.position_embeddings != nullptr);
-        auto n_embd_vit_clip = 1024;
 
         const int     n_pos = n_patches + 1;
         ggml_tensor * inp =
             ggml_cont_3d(ctx0, ggml_dup_tensor(ctx0, patch_embeds), patch_embeds->ne[0], n_patches_x, n_patches_y);
-        //ggml_tensor * inp = ggml_cpy(ctx0, inpL, ggml_dup_tensor(ctx0, inpL));
+
+        auto inp_n_elems = ggml_nelements(inp);
+        GGML_ASSERT(inp_n_elems == inp->ne[0] * inp->ne[1] * inp->ne[2]);
+        inp = ggml_permute(ctx0, inp, 2, 1,0,3); // [n_patches, n_embd]
+        inp = ggml_cont(ctx0, inp);
+        GGML_ASSERT(ggml_nelements(inp) == n_patches_x*patch_size*4*768);
+        inp= ggml_reshape_2d(ctx0,inp,n_patches_x*patch_size, 4*768);
 
         // add CLS token
         inp = ggml_concat(ctx0, inp, model.class_embedding, 1);