diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
index 96f7f5b1a3..b163078a6c 100644
--- a/tools/mtmd/clip.cpp
+++ b/tools/mtmd/clip.cpp
@@ -3533,8 +3533,8 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
                 const int ph = image_size_height / patch_size;
                 std::vector<int> positions(n_pos * 4);
                 int ptr = 0;
-                for (int dy = 0; dy < 2; dy++) {
-                    for (int y = 0; y < ph; y += merge_ratio) {
+                for (int y = 0; y < ph; y += merge_ratio) {
+                    for (int dy = 0; dy < 2; dy++) {
                         for (int x = 0; x < pw; x += merge_ratio) {
                             for (int dx = 0; dx < 2; dx++) {
                                 positions[                  ptr] = y + dy;
diff --git a/tools/mtmd/models/paddleocr.cpp b/tools/mtmd/models/paddleocr.cpp
index 92356f154b..5d3a13fb57 100644
--- a/tools/mtmd/models/paddleocr.cpp
+++ b/tools/mtmd/models/paddleocr.cpp
@@ -36,10 +36,7 @@ ggml_cgraph * clip_graph_paddleocr::build() {
                     NORM_TYPE_NORMAL, proj_norm_eps, -1);
 
         const int scale_factor = model.hparams.n_merge;
-        int width  = img.nx / patch_size;
-        int height = img.ny / patch_size;
-        cur = ggml_reshape_3d(ctx0, cur, n_embd * scale_factor * scale_factor, width / scale_factor * height / scale_factor, 1);
-
+        cur = build_patch_merge_permute(cur, scale_factor);
         cur = build_ffn(cur,
                     model.mm_1_w, model.mm_1_b,
                     nullptr, nullptr,