From e6267a935901313dc727ec74d159fc66e206e9c4 Mon Sep 17 00:00:00 2001 From: Saba Fallah <10401143+sfallah@users.noreply.github.com> Date: Thu, 19 Feb 2026 13:50:29 +0100 Subject: [PATCH] mtmd: build_attn modified, flash_attn on/off via ctx_params (#19729) --- tools/mtmd/clip.cpp | 5 +---- tools/mtmd/mtmd.cpp | 2 +- tools/mtmd/tests.sh | 9 +++++++++ 3 files changed, 11 insertions(+), 5 deletions(-) diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index 0796644f08..57f6dd00a3 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -628,9 +628,6 @@ ggml_tensor * clip_graph::build_attn( ggml_tensor * v = ggml_permute(ctx0, v_cur, 1, 2, 0, 3); v = ggml_cont(ctx0, v); - const auto n_tokens = q->ne[1]; - const auto n_head = q->ne[2]; - ggml_tensor * kq = ggml_mul_mat(ctx0, k, q); // F32 may not needed for vision encoders? // ggml_mul_mat_set_prec(kq, GGML_PREC_F32); @@ -639,7 +636,7 @@ ggml_tensor * clip_graph::build_attn( ggml_tensor * kqv = ggml_mul_mat(ctx0, v, kq); cur = ggml_permute(ctx0, kqv, 0, 2, 1, 3); - cur = ggml_cont_2d(ctx0, cur, cur->ne[0]*n_head, n_tokens); + cur = ggml_cont_2d(ctx0, cur, cur->ne[0] * cur->ne[1], cur->ne[2] * cur->ne[3]); } cb(cur, "kqv_out", il); diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp index b7636279cb..af733d97d5 100644 --- a/tools/mtmd/mtmd.cpp +++ b/tools/mtmd/mtmd.cpp @@ -175,7 +175,7 @@ struct mtmd_context { clip_context_params ctx_clip_params { /* use_gpu */ ctx_params.use_gpu, - /* flash_attn_type */ CLIP_FLASH_ATTN_TYPE_AUTO, + /* flash_attn_type */ mtmd_get_clip_flash_attn_type(ctx_params.flash_attn_type), /* image_min_tokens */ ctx_params.image_min_tokens, /* image_max_tokens */ ctx_params.image_max_tokens, /* warmup */ ctx_params.warmup, diff --git a/tools/mtmd/tests.sh b/tools/mtmd/tests.sh index 012958e0e0..d2b7e684af 100755 --- a/tools/mtmd/tests.sh +++ b/tools/mtmd/tests.sh @@ -28,6 +28,14 @@ if [ "${1:-}" = "huge" ]; then echo "Include BIG and HUGE models..." fi +# Check if the second argument is "flash", then enable flash attention +# This is useful to test if flash attention off works correctly +FLASH_ATTN="on" +if [ "${2:-}" = "flash_off" ] || [ "${1:-}" = "flash_off" ]; then + FLASH_ATTN="off" + echo "Flash attention disabled..." +fi + ############### arr_prefix=() @@ -143,6 +151,7 @@ for i in "${!arr_hf[@]}"; do -hf $(printf %q "$hf") \ --image $(printf %q "$SCRIPT_DIR/$inp_file") \ --temp 0 -n 128 \ + --flash-attn $(printf %q "$FLASH_ATTN") \ ${extra_args}" # if extra_args does not contain -p, we add a default prompt