diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index 88dab8517c..10324e165a 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -2562,8 +2562,7 @@ private: ggml_tensor * kqv = ggml_mul_mat(ctx0, v, kq); cur = ggml_permute(ctx0, kqv, 0, 2, 1, 3); - cur = ggml_reshape_2d(ctx0, ggml_cont(ctx0, cur), cur->ne[0] * cur->ne[1], cur->ne[2] * cur->ne[3]); - + cur = ggml_cont_2d(ctx0, cur, cur->ne[0] * cur->ne[1], cur->ne[2] * cur->ne[3]); } cb(cur, "kqv_out", il); @@ -2782,7 +2781,6 @@ private: qr = ggml_permute(ctx0, Q, 0, 2, 1, 3); qr = ggml_reshape_4d(ctx0, ggml_cont(ctx0, qr), d_heads, W, H, B * n_heads); - const int WH_pad = GGML_PAD(W*H, GGML_KQ_MASK_PAD) - W*H; rw = ggml_mul_mat (ctx0, rw, ggml_cont(ctx0, ggml_permute(ctx0, qr, 0, 2, 1, 3))); // [B*n_heads, W, H, W] rw = ggml_cont (ctx0, ggml_permute(ctx0, rw, 0, 2, 1, 3)); // [B*n_heads, H, W, W] @@ -2792,7 +2790,6 @@ private: rh = ggml_reshape_4d(ctx0, rh, 1, H, W*H, n_heads*B); mask = ggml_add (ctx0, rw, rh); // [B*n_heads, H*W, H, W] mask = ggml_reshape_4d(ctx0, mask, W*H, W*H, n_heads, B); - mask = ggml_pad (ctx0, mask, 0, WH_pad, 0, 0); mask = ggml_cast (ctx0, mask, GGML_TYPE_F16); float scale = 1.0f / sqrtf((float)d_heads); @@ -5213,8 +5210,8 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str } break; case PROJECTOR_TYPE_DEEPSEEKOCR: { - const int native_resolutions[] = { - /* 512 tiny ,640 small ,*/ 1024 /* base */, 1280 /* large */ + const std::vector native_resolutions = { + /*512 tiny , 640 small, */ 1024 /* base */, 1280 /* large */ }; // original image size const int orig_w = original_size.width; @@ -5226,10 +5223,10 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str color[i] = (int)(255 * params.image_mean[i]); } - int mode_i = 0; + size_t mode_i = 0; int min_diff = orig_area; - for (int i = 0; i < 2; i++) { + for (size_t i = 0; i < native_resolutions.size(); i++) { int r = native_resolutions[i]; if (std::abs(orig_area - 
r * r) < min_diff) { mode_i = i; diff --git a/tools/mtmd/tests.sh b/tools/mtmd/tests.sh index 82b486ec93..765779b4bb 100755 --- a/tools/mtmd/tests.sh +++ b/tools/mtmd/tests.sh @@ -28,6 +28,14 @@ if [ "${1:-}" = "huge" ]; then echo "Include BIG and HUGE models..." fi +# If the first or second argument is "flash_off", disable flash attention (it is enabled by default) +# This is useful to test that everything still works correctly with flash attention turned off +FLASH_ATTN="on" +if [ "${2:-}" = "flash_off" ] || [ "${1:-}" = "flash_off" ]; then + FLASH_ATTN="off" + echo "Flash attention disabled..." +fi + ############### arr_prefix=() @@ -142,7 +150,8 @@ for i in "${!arr_hf[@]}"; do -hf $(printf %q "$hf") \ --image $(printf %q "$SCRIPT_DIR/$inp_file") \ --temp 0 -n 128 \ - ${extra_args}" + --flash-attn $(printf %q "$FLASH_ATTN") \ + ${extra_args}" # if extra_args does not contain -p, we add a default prompt if ! [[ "$extra_args" =~ "-p" ]]; then