merge with changes from https://github.com/ggml-org/llama.cpp/pull/17909
added new opt to tests.sh to disable flash-attn
This commit is contained in:
parent
33fabf0bd8
commit
d70f171fac
|
|
@ -2562,8 +2562,7 @@ private:
|
||||||
|
|
||||||
ggml_tensor * kqv = ggml_mul_mat(ctx0, v, kq);
|
ggml_tensor * kqv = ggml_mul_mat(ctx0, v, kq);
|
||||||
cur = ggml_permute(ctx0, kqv, 0, 2, 1, 3);
|
cur = ggml_permute(ctx0, kqv, 0, 2, 1, 3);
|
||||||
cur = ggml_reshape_2d(ctx0, ggml_cont(ctx0, cur), cur->ne[0] * cur->ne[1], cur->ne[2] * cur->ne[3]);
|
cur = ggml_cont_2d(ctx0, cur, cur->ne[0] * cur->ne[1], cur->ne[2] * cur->ne[3]);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
cb(cur, "kqv_out", il);
|
cb(cur, "kqv_out", il);
|
||||||
|
|
@ -2782,7 +2781,6 @@ private:
|
||||||
qr = ggml_permute(ctx0, Q, 0, 2, 1, 3);
|
qr = ggml_permute(ctx0, Q, 0, 2, 1, 3);
|
||||||
qr = ggml_reshape_4d(ctx0, ggml_cont(ctx0, qr), d_heads, W, H, B * n_heads);
|
qr = ggml_reshape_4d(ctx0, ggml_cont(ctx0, qr), d_heads, W, H, B * n_heads);
|
||||||
|
|
||||||
const int WH_pad = GGML_PAD(W*H, GGML_KQ_MASK_PAD) - W*H;
|
|
||||||
|
|
||||||
rw = ggml_mul_mat (ctx0, rw, ggml_cont(ctx0, ggml_permute(ctx0, qr, 0, 2, 1, 3))); // [B*n_heads, W, H, W]
|
rw = ggml_mul_mat (ctx0, rw, ggml_cont(ctx0, ggml_permute(ctx0, qr, 0, 2, 1, 3))); // [B*n_heads, W, H, W]
|
||||||
rw = ggml_cont (ctx0, ggml_permute(ctx0, rw, 0, 2, 1, 3)); // [B*n_heads, H, W, W]
|
rw = ggml_cont (ctx0, ggml_permute(ctx0, rw, 0, 2, 1, 3)); // [B*n_heads, H, W, W]
|
||||||
|
|
@ -2792,7 +2790,6 @@ private:
|
||||||
rh = ggml_reshape_4d(ctx0, rh, 1, H, W*H, n_heads*B);
|
rh = ggml_reshape_4d(ctx0, rh, 1, H, W*H, n_heads*B);
|
||||||
mask = ggml_add (ctx0, rw, rh); // [B*n_heads, H*W, H, W]
|
mask = ggml_add (ctx0, rw, rh); // [B*n_heads, H*W, H, W]
|
||||||
mask = ggml_reshape_4d(ctx0, mask, W*H, W*H, n_heads, B);
|
mask = ggml_reshape_4d(ctx0, mask, W*H, W*H, n_heads, B);
|
||||||
mask = ggml_pad (ctx0, mask, 0, WH_pad, 0, 0);
|
|
||||||
mask = ggml_cast (ctx0, mask, GGML_TYPE_F16);
|
mask = ggml_cast (ctx0, mask, GGML_TYPE_F16);
|
||||||
|
|
||||||
float scale = 1.0f / sqrtf((float)d_heads);
|
float scale = 1.0f / sqrtf((float)d_heads);
|
||||||
|
|
@ -5213,8 +5210,8 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
|
||||||
} break;
|
} break;
|
||||||
case PROJECTOR_TYPE_DEEPSEEKOCR:
|
case PROJECTOR_TYPE_DEEPSEEKOCR:
|
||||||
{
|
{
|
||||||
const int native_resolutions[] = {
|
const std::vector native_resolutions = {
|
||||||
/* 512 tiny ,640 small ,*/ 1024 /* base */, 1280 /* large */
|
/*512 tiny , 640 small, */ 1024 /* base */, 1280 /* large */
|
||||||
};
|
};
|
||||||
// original image size
|
// original image size
|
||||||
const int orig_w = original_size.width;
|
const int orig_w = original_size.width;
|
||||||
|
|
@ -5226,10 +5223,10 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
|
||||||
color[i] = (int)(255 * params.image_mean[i]);
|
color[i] = (int)(255 * params.image_mean[i]);
|
||||||
}
|
}
|
||||||
|
|
||||||
int mode_i = 0;
|
size_t mode_i = 0;
|
||||||
int min_diff = orig_area;
|
int min_diff = orig_area;
|
||||||
|
|
||||||
for (int i = 0; i < 2; i++) {
|
for (size_t i = 0; i < native_resolutions.size(); i++) {
|
||||||
int r = native_resolutions[i];
|
int r = native_resolutions[i];
|
||||||
if (std::abs(orig_area - r * r) < min_diff) {
|
if (std::abs(orig_area - r * r) < min_diff) {
|
||||||
mode_i = i;
|
mode_i = i;
|
||||||
|
|
|
||||||
|
|
@ -28,6 +28,14 @@ if [ "${1:-}" = "huge" ]; then
|
||||||
echo "Include BIG and HUGE models..."
|
echo "Include BIG and HUGE models..."
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
# Check if the first or second argument is "flash_off"; if so, disable flash attention (it is enabled by default)
|
||||||
|
# This is useful for testing that the code path with flash attention turned off works correctly
|
||||||
|
FLASH_ATTN="on"
|
||||||
|
if [ "${2:-}" = "flash_off" ] || [ "${1:-}" = "flash_off" ]; then
|
||||||
|
FLASH_ATTN="off"
|
||||||
|
echo "Flash attention disabled..."
|
||||||
|
fi
|
||||||
|
|
||||||
###############
|
###############
|
||||||
|
|
||||||
arr_prefix=()
|
arr_prefix=()
|
||||||
|
|
@ -142,7 +150,8 @@ for i in "${!arr_hf[@]}"; do
|
||||||
-hf $(printf %q "$hf") \
|
-hf $(printf %q "$hf") \
|
||||||
--image $(printf %q "$SCRIPT_DIR/$inp_file") \
|
--image $(printf %q "$SCRIPT_DIR/$inp_file") \
|
||||||
--temp 0 -n 128 \
|
--temp 0 -n 128 \
|
||||||
${extra_args}"
|
--flash-attn $(printf %q "$FLASH_ATTN") \
|
||||||
|
${extra_args}"
|
||||||
|
|
||||||
# if extra_args does not contain -p, we add a default prompt
|
# if extra_args does not contain -p, we add a default prompt
|
||||||
if ! [[ "$extra_args" =~ "-p" ]]; then
|
if ! [[ "$extra_args" =~ "-p" ]]; then
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue