mtmd: some small clean up (#17909)
* clip: add support for fused qkv in build_vit
* use build_ffn whenever possible
* fix internvl
* mtmd-cli: move image marker to beginning of the prompt
* test script: support custom args
commit c6b2c9310c
parent 34a6d86982
tools/mtmd/clip.cpp

@@ -595,11 +595,12 @@ struct clip_graph {
             cur = ggml_mul(ctx0, cur, model.mm_input_norm_w);
             cur = ggml_add(ctx0, cur, model.mm_input_norm_b);
 
-            cur = ggml_mul_mat(ctx0, model.mm_1_w, cur);
-            cur = ggml_add(ctx0, cur, model.mm_1_b);
-            cur = ggml_gelu(ctx0, cur);
-            cur = ggml_mul_mat(ctx0, model.mm_2_w, cur);
-            cur = ggml_add(ctx0, cur, model.mm_2_b);
+            cur = build_ffn(cur,
+                model.mm_1_w, model.mm_1_b,
+                nullptr, nullptr,
+                model.mm_2_w, model.mm_2_b,
+                FFN_GELU,
+                -1);
 
         } else if (ctx->proj_type() == PROJECTOR_TYPE_JANUS_PRO) {
             cur = build_ffn(cur,
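This and the following hunks replace hand-rolled projector MLPs with the existing build_ffn helper. As a rough mental model, inferred from the call sites in this commit rather than the verbatim clip.cpp implementation (the function name, enum values, and gate handling below are assumptions), the helper behaves like:

    #include "ggml.h"

    // hypothetical mirror of clip.cpp's FFN activation selector
    enum ffn_op_type_sketch { FFN_GELU_SKETCH, FFN_GELU_ERF_SKETCH };

    // two-layer MLP: up projection -> activation -> down projection;
    // null biases are skipped, which is why call sites that used to guard
    // their ggml_add with `if (model.mm_x_b)` can now pass the bias directly
    static ggml_tensor * build_ffn_sketch(
            ggml_context * ctx0, ggml_tensor * cur,
            ggml_tensor * up_w,   ggml_tensor * up_b,
            ggml_tensor * gate_w, ggml_tensor * gate_b,
            ggml_tensor * down_w, ggml_tensor * down_b,
            ffn_op_type_sketch op) {
        // every call site in this commit passes nullptr for the gate
        GGML_ASSERT(gate_w == nullptr && gate_b == nullptr);
        cur = ggml_mul_mat(ctx0, up_w, cur);
        if (up_b) {
            cur = ggml_add(ctx0, cur, up_b);
        }
        cur = op == FFN_GELU_SKETCH ? ggml_gelu(ctx0, cur) : ggml_gelu_erf(ctx0, cur);
        cur = ggml_mul_mat(ctx0, down_w, cur);
        if (down_b) {
            cur = ggml_add(ctx0, cur, down_b);
        }
        return cur;
    }

The trailing -1 at the real call sites looks like the layer index used for debug callbacks (cb), matching the -1 passed to build_norm elsewhere in the diff.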
@@ -667,16 +668,12 @@ struct clip_graph {
 
         // LlavaMultiModalProjector (always using GELU activation)
         {
-            cur = ggml_mul_mat(ctx0, model.mm_1_w, cur);
-            if (model.mm_1_b) {
-                cur = ggml_add(ctx0, cur, model.mm_1_b);
-            }
-
-            cur = ggml_gelu(ctx0, cur);
-            cur = ggml_mul_mat(ctx0, model.mm_2_w, cur);
-            if (model.mm_2_b) {
-                cur = ggml_add(ctx0, cur, model.mm_2_b);
-            }
+            cur = build_ffn(cur,
+                model.mm_1_w, model.mm_1_b,
+                nullptr, nullptr,
+                model.mm_2_w, model.mm_2_b,
+                FFN_GELU,
+                -1);
         }
 
         // arrangement of the [IMG_BREAK] token
@@ -866,16 +863,12 @@ struct clip_graph {
         // multimodal projection
         ggml_tensor * embeddings = inpL;
         embeddings = ggml_reshape_3d(ctx0, embeddings, n_embd * 4, n_pos / 4, batch_size);
 
-        embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings);
-        embeddings = ggml_add(ctx0, embeddings, model.mm_0_b);
-
-        // GELU activation
-        embeddings = ggml_gelu(ctx0, embeddings);
-
-        // Second linear layer
-        embeddings = ggml_mul_mat(ctx0, model.mm_1_w, embeddings);
-        embeddings = ggml_add(ctx0, embeddings, model.mm_1_b);
+        embeddings = build_ffn(embeddings,
+            model.mm_0_w, model.mm_0_b,
+            nullptr, nullptr,
+            model.mm_1_w, model.mm_1_b,
+            FFN_GELU,
+            -1);
 
         if (use_window_attn) {
             window_idx = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos / 4);
@@ -1253,11 +1246,12 @@ struct clip_graph {
            // projector LayerNorm uses pytorch's default eps = 1e-5
            // ref: https://huggingface.co/OpenGVLab/InternVL3-8B-Instruct/blob/a34d3e4e129a5856abfd6aa6de79776484caa14e/modeling_internvl_chat.py#L79
            cur = build_norm(cur, model.mm_0_w, model.mm_0_b, NORM_TYPE_NORMAL, 1e-5, -1);
-           cur = ggml_mul_mat(ctx0, model.mm_1_w, cur);
-           cur = ggml_add(ctx0, cur, model.mm_1_b);
-           cur = ggml_gelu(ctx0, cur);
-           cur = ggml_mul_mat(ctx0, model.mm_3_w, cur);
-           cur = ggml_add(ctx0, cur, model.mm_3_b);
+           cur = build_ffn(cur,
+               model.mm_1_w, model.mm_1_b,
+               nullptr, nullptr,
+               model.mm_3_w, model.mm_3_b,
+               FFN_GELU,
+               -1);
        }
 
        // build the graph
@@ -1408,11 +1402,12 @@ struct clip_graph {
        cb(cur, "proj_inp_normed", -1);
 
        // projection mlp
-       cur = ggml_mul_mat(ctx0, model.mm_1_w, cur);
-       cur = ggml_add(ctx0, cur, model.mm_1_b);
-       cur = ggml_gelu(ctx0, cur);
-       cur = ggml_mul_mat(ctx0, model.mm_2_w, cur);
-       cur = ggml_add(ctx0, cur, model.mm_2_b);
+       cur = build_ffn(cur,
+           model.mm_1_w, model.mm_1_b,
+           nullptr, nullptr,
+           model.mm_2_w, model.mm_2_b,
+           FFN_GELU,
+           -1);
        cb(cur, "proj_out", -1);
    }
 
@@ -1883,9 +1878,12 @@ struct clip_graph {
 
        } else if (ctx->proj_type() == PROJECTOR_TYPE_VOXTRAL) {
            // projector
-           cur = ggml_mul_mat(ctx0, model.mm_1_w, cur);
-           cur = ggml_gelu_erf(ctx0, cur);
-           cur = ggml_mul_mat(ctx0, model.mm_2_w, cur);
+           cur = build_ffn(cur,
+               model.mm_1_w, model.mm_1_b,
+               nullptr, nullptr,
+               model.mm_2_w, model.mm_2_b,
+               FFN_GELU_ERF,
+               -1);
 
        } else {
            GGML_ABORT("%s: unknown projector type", __func__);
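One subtlety in the Voxtral hunk above: the old code never added mm_1_b or mm_2_b, while the new build_ffn call passes both. Assuming the helper skips null bias tensors (consistent with the call sites that used to guard their bias adds), the computed graph is unchanged for checkpoints that ship without these biases.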
@@ -2070,17 +2068,48 @@ private:
 
            // self-attention
            {
-               ggml_tensor * Qcur = ggml_mul_mat(ctx0, layer.q_w, cur);
+               ggml_tensor * Qcur = nullptr;
+               ggml_tensor * Kcur = nullptr;
+               ggml_tensor * Vcur = nullptr;
+               if (layer.qkv_w != nullptr) {
+                   // fused qkv
+                   cur = ggml_mul_mat(ctx0, layer.qkv_w, cur);
+                   if (layer.qkv_b != nullptr) {
+                       cur = ggml_add(ctx0, cur, layer.qkv_b);
+                   }
+
+                   Qcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos,
+                       /* nb1    */ ggml_row_size(cur->type, d_head),
+                       /* nb2    */ cur->nb[1],
+                       /* offset */ 0);
+
+                   Kcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos,
+                       /* nb1    */ ggml_row_size(cur->type, d_head),
+                       /* nb2    */ cur->nb[1],
+                       /* offset */ ggml_row_size(cur->type, n_embd));
+
+                   Vcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos,
+                       /* nb1    */ ggml_row_size(cur->type, d_head),
+                       /* nb2    */ cur->nb[1],
+                       /* offset */ ggml_row_size(cur->type, 2 * n_embd));
+
+                   // TODO: q/k norm requires row size == n_embd, while here it's d_head
+                   // we can add support in the future if needed
+                   GGML_ASSERT(layer.q_norm == nullptr && layer.k_norm == nullptr);
+
+               } else {
+                   // separate q, k, v
+                   Qcur = ggml_mul_mat(ctx0, layer.q_w, cur);
                if (layer.q_b) {
                    Qcur = ggml_add(ctx0, Qcur, layer.q_b);
                }
 
-               ggml_tensor * Kcur = ggml_mul_mat(ctx0, layer.k_w, cur);
+               Kcur = ggml_mul_mat(ctx0, layer.k_w, cur);
                if (layer.k_b) {
                    Kcur = ggml_add(ctx0, Kcur, layer.k_b);
                }
 
-               ggml_tensor * Vcur = ggml_mul_mat(ctx0, layer.v_w, cur);
+               Vcur = ggml_mul_mat(ctx0, layer.v_w, cur);
                if (layer.v_b) {
                    Vcur = ggml_add(ctx0, Vcur, layer.v_b);
                }
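The fused path multiplies once by qkv_w, producing rows laid out as [Q | K | V], then carves three heads-first views out of the same buffer. A small self-contained program with hypothetical dimensions (n_embd = 1024, n_head = 16, so d_head = 64, F32 activations; none of these numbers come from the commit) makes the nb1/nb2/offset arithmetic concrete:

    #include <cstddef>
    #include <cstdio>

    int main() {
        const size_t n_embd = 1024, n_head = 16;
        const size_t d_head = n_embd / n_head;  // 64
        const size_t f32    = sizeof(float);

        // strides of the ggml_view_3d calls above
        const size_t nb1 = d_head * f32;        // 256 B between heads (ggml_row_size(type, d_head))
        const size_t nb2 = 3 * n_embd * f32;    // 12288 B between positions (cur->nb[1], the full fused row)

        printf("Q offset = %zu B\n", (size_t) 0);        // start of the row
        printf("K offset = %zu B\n", n_embd * f32);      // 4096 B, one n_embd block in
        printf("V offset = %zu B\n", 2 * n_embd * f32);  // 8192 B, two blocks in
        printf("nb1 = %zu B, nb2 = %zu B\n", nb1, nb2);
        return 0;
    }

This layout also explains the new GGML_ASSERT: q/k norm would have to operate on the d_head-sized rows of these views, while the existing norm path expects n_embd-sized rows.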
@@ -2098,6 +2127,7 @@ private:
                Qcur = ggml_reshape_3d(ctx0, Qcur, d_head, n_head, n_pos);
                Kcur = ggml_reshape_3d(ctx0, Kcur, d_head, n_head, n_pos);
                Vcur = ggml_reshape_3d(ctx0, Vcur, d_head, n_head, n_pos);
+               }
 
                cb(Qcur, "Qcur", il);
                cb(Kcur, "Kcur", il);
tools/mtmd/mtmd-cli.cpp

@@ -318,7 +318,9 @@ int main(int argc, char ** argv) {
        g_is_generating = true;
        if (params.prompt.find(mtmd_default_marker()) == std::string::npos) {
            for (size_t i = 0; i < params.image.size(); i++) {
-               params.prompt += mtmd_default_marker();
+               // most models require the marker before each image
+               // ref: https://github.com/ggml-org/llama.cpp/pull/17616
+               params.prompt = mtmd_default_marker() + params.prompt;
            }
        }
        common_chat_msg msg;
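A standalone illustration of the new marker placement (the <__media__> literal is the default marker as seen in the test script below; the two input images are a made-up example):

    #include <iostream>
    #include <string>
    #include <vector>

    int main() {
        const std::string marker = "<__media__>";
        std::vector<std::string> images = { "a.jpg", "b.jpg" };
        std::string prompt = "describe the images";

        // one marker per image, now prepended instead of appended
        for (size_t i = 0; i < images.size(); i++) {
            prompt = marker + prompt;  // was: prompt += marker;
        }
        std::cout << prompt << "\n";   // <__media__><__media__>describe the images
        return 0;
    }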
tools/mtmd/tests.sh

@@ -32,23 +32,32 @@ fi
 
 arr_prefix=()
 arr_hf=()
-arr_tmpl=() # chat template
+arr_extra_args=()
 arr_file=()
 
 add_test_vision() {
     local hf=$1
-    local tmpl=${2:-""} # default to empty string if not provided
+    shift
+    local extra_args=""
+    if [ $# -gt 0 ]; then
+        extra_args=$(printf " %q" "$@")
+    fi
     arr_prefix+=("[vision]")
     arr_hf+=("$hf")
-    arr_tmpl+=("$tmpl")
+    arr_extra_args+=("$extra_args")
     arr_file+=("test-1.jpeg")
 }
 
 add_test_audio() {
     local hf=$1
+    shift
+    local extra_args=""
+    if [ $# -gt 0 ]; then
+        extra_args=$(printf " %q" "$@")
+    fi
     arr_prefix+=("[audio] ")
     arr_hf+=("$hf")
-    arr_tmpl+=("") # no need for chat tmpl
+    arr_extra_args+=("$extra_args")
     arr_file+=("test-2.mp3")
 }
 
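The helpers now accept arbitrary extra CLI arguments after the model name: each argument is shell-quoted with printf %q and the results are joined into a single string per test entry, which the run loop later re-expands via eval so the original word boundaries survive. The entries below use this both for custom prompts (the glm-edge entry passes -p "name of the newspaper?<__media__>") and for chat templates, replacing the old positional tmpl parameter.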
@@ -56,9 +65,9 @@ add_test_vision "ggml-org/SmolVLM-500M-Instruct-GGUF:Q8_0"
 add_test_vision "ggml-org/SmolVLM2-2.2B-Instruct-GGUF:Q4_K_M"
 add_test_vision "ggml-org/SmolVLM2-500M-Video-Instruct-GGUF:Q8_0"
 add_test_vision "ggml-org/gemma-3-4b-it-GGUF:Q4_K_M"
-add_test_vision "THUDM/glm-edge-v-5b-gguf:Q4_K_M"
-add_test_vision "second-state/Llava-v1.5-7B-GGUF:Q2_K" "vicuna"
-add_test_vision "cjpais/llava-1.6-mistral-7b-gguf:Q3_K_M" "vicuna"
+add_test_vision "THUDM/glm-edge-v-5b-gguf:Q4_K_M" -p "name of the newspaper?<__media__>"
+add_test_vision "second-state/Llava-v1.5-7B-GGUF:Q2_K" --chat-template vicuna
+add_test_vision "cjpais/llava-1.6-mistral-7b-gguf:Q3_K_M" --chat-template vicuna
 add_test_vision "ibm-research/granite-vision-3.2-2b-GGUF:Q4_K_M"
 add_test_vision "second-state/MiniCPM-Llama3-V-2_5-GGUF:Q2_K" # model from openbmb is corrupted
 add_test_vision "openbmb/MiniCPM-V-2_6-gguf:Q2_K"
@@ -79,7 +88,7 @@ add_test_audio "ggml-org/Voxtral-Mini-3B-2507-GGUF:Q4_K_M"
 # to test the big models, run: ./tests.sh big
 if [ "$RUN_BIG_TESTS" = true ]; then
     add_test_vision "ggml-org/pixtral-12b-GGUF:Q4_K_M"
-    add_test_vision "ggml-org/Mistral-Small-3.1-24B-Instruct-2503-GGUF" "mistral-v7"
+    add_test_vision "ggml-org/Mistral-Small-3.1-24B-Instruct-2503-GGUF" --chat-template mistral-v7
     add_test_vision "ggml-org/Qwen2-VL-2B-Instruct-GGUF:Q4_K_M"
     add_test_vision "ggml-org/Qwen2-VL-7B-Instruct-GGUF:Q4_K_M"
     add_test_vision "ggml-org/Qwen2.5-VL-3B-Instruct-GGUF:Q4_K_M"
@@ -89,7 +98,7 @@ if [ "$RUN_BIG_TESTS" = true ]; then
     add_test_vision "ggml-org/InternVL3-14B-Instruct-GGUF:Q4_K_M"
     add_test_vision "ggml-org/Qwen2.5-Omni-7B-GGUF:Q4_K_M"
     # add_test_vision "ggml-org/Qwen2.5-VL-32B-Instruct-GGUF:Q4_K_M" # does not work on my mac M3 Ultra
-    add_test_vision "ggml-org/Kimi-VL-A3B-Thinking-2506-GGUF:Q4_K_M"
+    # add_test_vision "ggml-org/Kimi-VL-A3B-Thinking-2506-GGUF:Q4_K_M" # not always working
 
     add_test_audio "ggml-org/ultravox-v0_5-llama-3_1-8b-GGUF:Q4_K_M"
     add_test_audio "ggml-org/Qwen2.5-Omni-7B-GGUF:Q4_K_M"
@@ -122,21 +131,25 @@ for i in "${!arr_hf[@]}"; do
     bin="llama-mtmd-cli"
     prefix="${arr_prefix[$i]}"
     hf="${arr_hf[$i]}"
-    tmpl="${arr_tmpl[$i]}"
+    extra_args="${arr_extra_args[$i]}"
     inp_file="${arr_file[$i]}"
 
     echo "Running test with binary: $bin and HF model: $hf"
     echo ""
     echo ""
 
-    output=$(\
-        "$PROJ_ROOT/build/bin/$bin" \
-        -hf "$hf" \
-        --image $SCRIPT_DIR/$inp_file \
-        -p "what is the publisher name of the newspaper?" \
+    cmd="$(printf %q "$PROJ_ROOT/build/bin/$bin") \
+        -hf $(printf %q "$hf") \
+        --image $(printf %q "$SCRIPT_DIR/$inp_file") \
         --temp 0 -n 128 \
-        ${tmpl:+--chat-template "$tmpl"} \
-        2>&1 | tee /dev/tty)
+        ${extra_args}"
+
+    # if extra_args does not contain -p, we add a default prompt
+    if ! [[ "$extra_args" =~ "-p" ]]; then
+        cmd+=" -p \"what is the publisher name of the newspaper?\""
+    fi
+
+    output=$(eval "$cmd" 2>&1 | tee /dev/tty)
 
     echo "$output" > $SCRIPT_DIR/output/$bin-$(echo "$hf" | tr '/' '-').log
 
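Because the command line is now assembled as a single string, execution goes through eval, which re-splits the %q-quoted pieces exactly as they were passed to add_test_vision/add_test_audio; the substring test on "-p" keeps the default newspaper prompt from overriding an entry that already supplies its own prompt.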
@@ -144,9 +157,9 @@ for i in "${!arr_hf[@]}"; do
     if echo "$output" | grep -iq "new york" \
         || (echo "$output" | grep -iq "men" && echo "$output" | grep -iq "walk")
     then
-        result="$prefix \033[32mOK\033[0m: $bin $hf"
+        result="$prefix \033[32mOK\033[0m: $hf"
     else
-        result="$prefix \033[31mFAIL\033[0m: $bin $hf"
+        result="$prefix \033[31mFAIL\033[0m: $hf"
     fi
     echo -e "$result"
     arr_res+=("$result")