merge with changes from https://github.com/ggml-org/llama.cpp/pull/17909

added new opt to tests.sh to disable flash-attn
2025-12-11 10:11:27 +01:00 · 2025-12-11 10:11:27 +01:00 · d70f171fac
parent 33fabf0bd8
commit d70f171fac
2 changed files with 15 additions and 9 deletions
--- a/tools/mtmd/clip.cpp
+++ b/tools/mtmd/clip.cpp
@ -2562,8 +2562,7 @@ private:

            ggml_tensor * kqv = ggml_mul_mat(ctx0, v, kq);
            cur = ggml_permute(ctx0, kqv, 0, 2, 1, 3);
-            cur = ggml_reshape_2d(ctx0, ggml_cont(ctx0, cur), cur->ne[0] * cur->ne[1], cur->ne[2] * cur->ne[3]);
-
+            cur = ggml_cont_2d(ctx0, cur, cur->ne[0] * cur->ne[1], cur->ne[2] * cur->ne[3]);
        }

        cb(cur, "kqv_out", il);
@ -2782,7 +2781,6 @@ private:
                qr = ggml_permute(ctx0, Q, 0, 2, 1, 3);
                qr = ggml_reshape_4d(ctx0, ggml_cont(ctx0, qr), d_heads, W, H, B * n_heads);

-                const int WH_pad = GGML_PAD(W*H, GGML_KQ_MASK_PAD) - W*H;

                rw   = ggml_mul_mat   (ctx0, rw, ggml_cont(ctx0, ggml_permute(ctx0, qr, 0, 2, 1, 3)));  // [B*n_heads, W, H, W]
                rw   = ggml_cont      (ctx0, ggml_permute(ctx0, rw, 0, 2, 1, 3)); // [B*n_heads, H, W, W]
@ -2792,7 +2790,6 @@ private:
                rh   = ggml_reshape_4d(ctx0, rh, 1, H, W*H, n_heads*B);
                mask = ggml_add       (ctx0, rw, rh); // [B*n_heads, H*W, H, W]
                mask = ggml_reshape_4d(ctx0, mask, W*H, W*H, n_heads, B);
-                mask = ggml_pad       (ctx0, mask, 0, WH_pad, 0, 0);
                mask = ggml_cast      (ctx0, mask, GGML_TYPE_F16);

                float scale = 1.0f / sqrtf((float)d_heads);
@ -5213,8 +5210,8 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
            } break;
    case PROJECTOR_TYPE_DEEPSEEKOCR:
        {
-            const int native_resolutions[] = {
-                /* 512 tiny ,640  small ,*/ 1024 /* base */, 1280 /* large */
+            const std::vector native_resolutions = {
+                /*512 tiny , 640 small, */ 1024 /* base */, 1280 /* large */
            };
            // original image size
            const int orig_w = original_size.width;
@ -5226,10 +5223,10 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
                color[i] = (int)(255 * params.image_mean[i]);
            }

-            int mode_i = 0;
+            size_t mode_i = 0;
            int min_diff = orig_area;

-            for (int i = 0; i < 2; i++) {
+            for (size_t i = 0; i < native_resolutions.size(); i++) {
                int r = native_resolutions[i];
                if (std::abs(orig_area - r * r) < min_diff) {
                    mode_i = i;
--- a/tools/mtmd/tests.sh
+++ b/tools/mtmd/tests.sh
@ -28,6 +28,14 @@ if [ "${1:-}" = "huge" ]; then
    echo "Include BIG and HUGE models..."
 fi

+# Flash attention is on by default; if the first or second argument is "flash_off", disable it
+# This is useful to test that running with flash attention off works correctly
+FLASH_ATTN="on"
+if [ "${2:-}" = "flash_off" ] || [ "${1:-}" = "flash_off" ]; then
+    FLASH_ATTN="off"
+    echo "Flash attention disabled..."
+fi
+
 ###############

 arr_prefix=()
@ -142,6 +150,7 @@ for i in "${!arr_hf[@]}"; do
        -hf $(printf %q "$hf") \
        --image $(printf %q "$SCRIPT_DIR/$inp_file") \
        --temp 0 -n 128 \
+        --flash-attn $(printf %q "$FLASH_ATTN") \
        ${extra_args}" 

    # if extra_args does not contain -p, we add a default prompt