diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index 96f7f5b1a3..b163078a6c 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -3533,8 +3533,8 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima const int ph = image_size_height / patch_size; std::vector positions(n_pos * 4); int ptr = 0; - for (int dy = 0; dy < 2; dy++) { - for (int y = 0; y < ph; y += merge_ratio) { + for (int y = 0; y < ph; y += merge_ratio) { + for (int dy = 0; dy < 2; dy++) { for (int x = 0; x < pw; x += merge_ratio) { for (int dx = 0; dx < 2; dx++) { positions[ ptr] = y + dy; diff --git a/tools/mtmd/models/paddleocr.cpp b/tools/mtmd/models/paddleocr.cpp index 92356f154b..5d3a13fb57 100644 --- a/tools/mtmd/models/paddleocr.cpp +++ b/tools/mtmd/models/paddleocr.cpp @@ -36,10 +36,7 @@ ggml_cgraph * clip_graph_paddleocr::build() { NORM_TYPE_NORMAL, proj_norm_eps, -1); const int scale_factor = model.hparams.n_merge; - int width = img.nx / patch_size; - int height = img.ny / patch_size; - cur = ggml_reshape_3d(ctx0, cur, n_embd * scale_factor * scale_factor, width / scale_factor * height / scale_factor, 1); - + cur = build_patch_merge_permute(cur, scale_factor); cur = build_ffn(cur, model.mm_1_w, model.mm_1_b, nullptr, nullptr,