refactor: merge VAETKI positions case with QWEN2VL

2026-01-10 20:35:39 +09:00 · 2026-01-10 20:35:39 +09:00 · c5e9eac8c5
parent d8e8b77c44
commit c5e9eac8c5
1 changed file with 4 additions and 26 deletions
--- a/tools/mtmd/clip.cpp
+++ b/tools/mtmd/clip.cpp
@@ -3499,11 +3499,14 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
        case PROJECTOR_TYPE_QWEN2VL:
        case PROJECTOR_TYPE_QWEN3VL:
        case PROJECTOR_TYPE_GLM4V:
+        case PROJECTOR_TYPE_VAETKI:
            {
                const int merge_ratio = hparams.n_merge;
                const int pw = image_size_width  / patch_size;
                const int ph = image_size_height / patch_size;
-                std::vector<int> positions(n_pos * 4);
+
+                const int pos_size = (model.proj_type == PROJECTOR_TYPE_VAETKI) ? num_patches : n_pos;
+                std::vector<int> positions(pos_size * 4);
                int ptr = 0;
                for (int y = 0; y < ph; y += merge_ratio) {
                    for (int x = 0; x < pw; x += merge_ratio) {
@@ -3519,31 +3522,6 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
                    }
                }

-                set_input_i32("positions", positions);
-            } break;
-        case PROJECTOR_TYPE_VAETKI:
-            {
-                const int merge_ratio = 2;
-                const int ipw = image_size_width  / patch_size;
-                const int iph = image_size_height / patch_size;
-
-                std::vector<int> positions(num_patches * 4);
-
-                int ptr = 0;
-                for (int y = 0; y < iph; y += merge_ratio) {
-                    for (int x = 0; x < ipw; x += merge_ratio) {
-                        for (int dy = 0; dy < 2; dy++) {
-                            for (int dx = 0; dx < 2; dx++) {
-                                positions[                  ptr] = y + dy;
-                                positions[    num_patches + ptr] = x + dx;
-                                positions[2 * num_patches + ptr] = y + dy;
-                                positions[3 * num_patches + ptr] = x + dx;
-                                ptr++;
-                            }
-                        }
-                    }
-                }
-
                set_input_i32("positions", positions);
            } break;
        case PROJECTOR_TYPE_QWEN25VL: