mtmd: some small clean up (#17909)

* clip: add support for fused qkv in build_vit

* use build_ffn whenever possible

* fix internvl

* mtmd-cli: move image to beginning

* test script: support custom args
Xuan-Son Nguyen, 2025-12-10 22:20:06 +01:00 (committed by GitHub)
parent 34a6d86982
commit c6b2c9310c
3 changed files with 126 additions and 81 deletions


@@ -595,11 +595,12 @@ struct clip_graph {
             cur = ggml_mul(ctx0, cur, model.mm_input_norm_w);
             cur = ggml_add(ctx0, cur, model.mm_input_norm_b);
-            cur = ggml_mul_mat(ctx0, model.mm_1_w, cur);
-            cur = ggml_add(ctx0, cur, model.mm_1_b);
-            cur = ggml_gelu(ctx0, cur);
-            cur = ggml_mul_mat(ctx0, model.mm_2_w, cur);
-            cur = ggml_add(ctx0, cur, model.mm_2_b);
+            cur = build_ffn(cur,
+                model.mm_1_w, model.mm_1_b,
+                nullptr, nullptr,
+                model.mm_2_w, model.mm_2_b,
+                FFN_GELU,
+                -1);
         } else if (ctx->proj_type() == PROJECTOR_TYPE_JANUS_PRO) {
             cur = build_ffn(cur,
@@ -667,16 +668,12 @@ struct clip_graph {
         // LlavaMultiModalProjector (always using GELU activation)
         {
-            cur = ggml_mul_mat(ctx0, model.mm_1_w, cur);
-            if (model.mm_1_b) {
-                cur = ggml_add(ctx0, cur, model.mm_1_b);
-            }
-
-            cur = ggml_gelu(ctx0, cur);
-            cur = ggml_mul_mat(ctx0, model.mm_2_w, cur);
-            if (model.mm_2_b) {
-                cur = ggml_add(ctx0, cur, model.mm_2_b);
-            }
+            cur = build_ffn(cur,
+                model.mm_1_w, model.mm_1_b,
+                nullptr, nullptr,
+                model.mm_2_w, model.mm_2_b,
+                FFN_GELU,
+                -1);
         }

         // arrangement of the [IMG_BREAK] token
@@ -866,16 +863,12 @@ struct clip_graph {
         // multimodal projection
         ggml_tensor * embeddings = inpL;
         embeddings = ggml_reshape_3d(ctx0, embeddings, n_embd * 4, n_pos / 4, batch_size);
-
-        embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings);
-        embeddings = ggml_add(ctx0, embeddings, model.mm_0_b);
-
-        // GELU activation
-        embeddings = ggml_gelu(ctx0, embeddings);
-
-        // Second linear layer
-        embeddings = ggml_mul_mat(ctx0, model.mm_1_w, embeddings);
-        embeddings = ggml_add(ctx0, embeddings, model.mm_1_b);
+        embeddings = build_ffn(embeddings,
+            model.mm_0_w, model.mm_0_b,
+            nullptr, nullptr,
+            model.mm_1_w, model.mm_1_b,
+            FFN_GELU,
+            -1);

         if (use_window_attn) {
             window_idx = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos / 4);
@@ -1253,11 +1246,12 @@ struct clip_graph {
         // projector LayerNorm uses pytorch's default eps = 1e-5
         // ref: https://huggingface.co/OpenGVLab/InternVL3-8B-Instruct/blob/a34d3e4e129a5856abfd6aa6de79776484caa14e/modeling_internvl_chat.py#L79
         cur = build_norm(cur, model.mm_0_w, model.mm_0_b, NORM_TYPE_NORMAL, 1e-5, -1);
-        cur = ggml_mul_mat(ctx0, model.mm_1_w, cur);
-        cur = ggml_add(ctx0, cur, model.mm_1_b);
-        cur = ggml_gelu(ctx0, cur);
-        cur = ggml_mul_mat(ctx0, model.mm_3_w, cur);
-        cur = ggml_add(ctx0, cur, model.mm_3_b);
+        cur = build_ffn(cur,
+            model.mm_1_w, model.mm_1_b,
+            nullptr, nullptr,
+            model.mm_3_w, model.mm_3_b,
+            FFN_GELU,
+            -1);
     }

     // build the graph
@@ -1408,11 +1402,12 @@ struct clip_graph {
         cb(cur, "proj_inp_normed", -1);

         // projection mlp
-        cur = ggml_mul_mat(ctx0, model.mm_1_w, cur);
-        cur = ggml_add(ctx0, cur, model.mm_1_b);
-        cur = ggml_gelu(ctx0, cur);
-        cur = ggml_mul_mat(ctx0, model.mm_2_w, cur);
-        cur = ggml_add(ctx0, cur, model.mm_2_b);
+        cur = build_ffn(cur,
+            model.mm_1_w, model.mm_1_b,
+            nullptr, nullptr,
+            model.mm_2_w, model.mm_2_b,
+            FFN_GELU,
+            -1);
         cb(cur, "proj_out", -1);
     }
@@ -1883,9 +1878,12 @@ struct clip_graph {
     } else if (ctx->proj_type() == PROJECTOR_TYPE_VOXTRAL) {
         // projector
-        cur = ggml_mul_mat(ctx0, model.mm_1_w, cur);
-        cur = ggml_gelu_erf(ctx0, cur);
-        cur = ggml_mul_mat(ctx0, model.mm_2_w, cur);
+        cur = build_ffn(cur,
+            model.mm_1_w, model.mm_1_b,
+            nullptr, nullptr,
+            model.mm_2_w, model.mm_2_b,
+            FFN_GELU_ERF,
+            -1);
     } else {
         GGML_ABORT("%s: unknown projector type", __func__);
@@ -2070,17 +2068,48 @@ private:
         // self-attention
         {
-            ggml_tensor * Qcur = ggml_mul_mat(ctx0, layer.q_w, cur);
-            if (layer.q_b) {
-                Qcur = ggml_add(ctx0, Qcur, layer.q_b);
-            }
-
-            ggml_tensor * Kcur = ggml_mul_mat(ctx0, layer.k_w, cur);
-            if (layer.k_b) {
-                Kcur = ggml_add(ctx0, Kcur, layer.k_b);
-            }
-
-            ggml_tensor * Vcur = ggml_mul_mat(ctx0, layer.v_w, cur);
-            if (layer.v_b) {
-                Vcur = ggml_add(ctx0, Vcur, layer.v_b);
-            }
+            ggml_tensor * Qcur = nullptr;
+            ggml_tensor * Kcur = nullptr;
+            ggml_tensor * Vcur = nullptr;
+
+            if (layer.qkv_w != nullptr) {
+                // fused qkv
+                cur = ggml_mul_mat(ctx0, layer.qkv_w, cur);
+                if (layer.qkv_b != nullptr) {
+                    cur = ggml_add(ctx0, cur, layer.qkv_b);
+                }
+                Qcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos,
+                    /* nb1    */ ggml_row_size(cur->type, d_head),
+                    /* nb2    */ cur->nb[1],
+                    /* offset */ 0);
+                Kcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos,
+                    /* nb1    */ ggml_row_size(cur->type, d_head),
+                    /* nb2    */ cur->nb[1],
+                    /* offset */ ggml_row_size(cur->type, n_embd));
+                Vcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos,
+                    /* nb1    */ ggml_row_size(cur->type, d_head),
+                    /* nb2    */ cur->nb[1],
+                    /* offset */ ggml_row_size(cur->type, 2 * n_embd));
+                // TODO: q/k norm requires row size == n_embd, while here it's d_head
+                //       we can add support in the future if needed
+                GGML_ASSERT(layer.q_norm == nullptr && layer.k_norm == nullptr);
+            } else {
+                // separate q, k, v
+                Qcur = ggml_mul_mat(ctx0, layer.q_w, cur);
+                if (layer.q_b) {
+                    Qcur = ggml_add(ctx0, Qcur, layer.q_b);
+                }
+
+                Kcur = ggml_mul_mat(ctx0, layer.k_w, cur);
+                if (layer.k_b) {
+                    Kcur = ggml_add(ctx0, Kcur, layer.k_b);
+                }
+
+                Vcur = ggml_mul_mat(ctx0, layer.v_w, cur);
+                if (layer.v_b) {
+                    Vcur = ggml_add(ctx0, Vcur, layer.v_b);
+                }

@@ -2098,6 +2127,7 @@ private:
             Qcur = ggml_reshape_3d(ctx0, Qcur, d_head, n_head, n_pos);
             Kcur = ggml_reshape_3d(ctx0, Kcur, d_head, n_head, n_pos);
             Vcur = ggml_reshape_3d(ctx0, Vcur, d_head, n_head, n_pos);
+            }

             cb(Qcur, "Qcur", il);
             cb(Kcur, "Kcur", il);
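The fused path works because the qkv projection produces rows laid out as [Q | K | V], each n_embd = n_head * d_head wide, so Q, K, and V can be taken as strided views rather than three separate mat-muls: nb1 = row_size(d_head) steps from one head to the next within a token, nb2 = cur->nb[1] keeps the per-token stride of the fused tensor (3 * n_embd elements), and the byte offsets 0, row_size(n_embd), and row_size(2 * n_embd) select the three blocks. A standalone sketch of that offset arithmetic for F32 tensors, with illustrative sizes:

    #include <cstdio>
    #include <cstddef>

    int main() {
        // illustrative dimensions; in build_vit these come from the model hparams
        const size_t n_head = 16, d_head = 64;
        const size_t n_embd = n_head * d_head;    // 1024
        const size_t esz    = sizeof(float);      // ggml_row_size(GGML_TYPE_F32, n) == n * esz

        const size_t nb1   = d_head * esz;        // stride from one head to the next
        const size_t off_q = 0;                   // Q occupies the first n_embd of each row
        const size_t off_k = n_embd * esz;        // K starts one n_embd block in
        const size_t off_v = 2 * n_embd * esz;    // V starts two n_embd blocks in

        printf("fused row = %zu B, nb1 = %zu B, Q@%zu K@%zu V@%zu\n",
               3 * n_embd * esz, nb1, off_q, off_k, off_v);
        return 0;
    }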


@@ -318,7 +318,9 @@ int main(int argc, char ** argv) {
             g_is_generating = true;
             if (params.prompt.find(mtmd_default_marker()) == std::string::npos) {
                 for (size_t i = 0; i < params.image.size(); i++) {
-                    params.prompt += mtmd_default_marker();
+                    // most models require the marker before each image
+                    // ref: https://github.com/ggml-org/llama.cpp/pull/17616
+                    params.prompt = mtmd_default_marker() + params.prompt;
                 }
             }
             common_chat_msg msg;
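The behavioral change here: markers are now prepended, so all image markers end up before the user prompt instead of after it. A tiny standalone sketch of the loop's effect ("<__media__>" is the default marker string, as also seen in the glm-edge test prompt below):

    #include <cstdio>
    #include <string>

    int main() {
        std::string prompt       = "what is the publisher name of the newspaper?";
        const std::string marker = "<__media__>";   // mtmd_default_marker()
        const int n_images = 2;

        for (int i = 0; i < n_images; i++) {
            prompt = marker + prompt;   // new: markers land in front of the prompt
            // old: prompt += marker;   // markers landed after the prompt
        }
        printf("%s\n", prompt.c_str());
        // -> <__media__><__media__>what is the publisher name of the newspaper?
    }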


@@ -32,23 +32,32 @@ fi
 arr_prefix=()
 arr_hf=()
-arr_tmpl=() # chat template
+arr_extra_args=()
 arr_file=()

 add_test_vision() {
     local hf=$1
-    local tmpl=${2:-""} # default to empty string if not provided
+    shift
+    local extra_args=""
+    if [ $# -gt 0 ]; then
+        extra_args=$(printf " %q" "$@")
+    fi
     arr_prefix+=("[vision]")
     arr_hf+=("$hf")
-    arr_tmpl+=("$tmpl")
+    arr_extra_args+=("$extra_args")
     arr_file+=("test-1.jpeg")
 }

 add_test_audio() {
     local hf=$1
+    shift
+    local extra_args=""
+    if [ $# -gt 0 ]; then
+        extra_args=$(printf " %q" "$@")
+    fi
     arr_prefix+=("[audio] ")
     arr_hf+=("$hf")
-    arr_tmpl+=("") # no need for chat tmpl
+    arr_extra_args+=("$extra_args")
     arr_file+=("test-2.mp3")
 }
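The helpers capture arbitrary extra CLI args by shell-quoting each one with printf " %q", so the resulting string survives the eval in the test loop further down. A small shell sketch of the same pattern (the demo function and model ref are illustrative):

    demo() {
        local hf=$1
        shift                                  # everything after the model ref is extra args
        local extra_args=""
        if [ $# -gt 0 ]; then
            extra_args=$(printf " %q" "$@")    # quote each arg for later eval
        fi
        echo "hf=$hf extra=$extra_args"
    }

    demo "org/model:Q4_K_M" -p "name of the newspaper?<__media__>"
    # -> hf=org/model:Q4_K_M extra= -p name\ of\ the\ newspaper\?\<__media__\>
    #    (exact %q quoting may vary slightly by bash version)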
@@ -56,9 +65,9 @@ add_test_vision "ggml-org/SmolVLM-500M-Instruct-GGUF:Q8_0"
 add_test_vision "ggml-org/SmolVLM2-2.2B-Instruct-GGUF:Q4_K_M"
 add_test_vision "ggml-org/SmolVLM2-500M-Video-Instruct-GGUF:Q8_0"
 add_test_vision "ggml-org/gemma-3-4b-it-GGUF:Q4_K_M"
-add_test_vision "THUDM/glm-edge-v-5b-gguf:Q4_K_M"
-add_test_vision "second-state/Llava-v1.5-7B-GGUF:Q2_K" "vicuna"
-add_test_vision "cjpais/llava-1.6-mistral-7b-gguf:Q3_K_M" "vicuna"
+add_test_vision "THUDM/glm-edge-v-5b-gguf:Q4_K_M" -p "name of the newspaper?<__media__>"
+add_test_vision "second-state/Llava-v1.5-7B-GGUF:Q2_K" --chat-template vicuna
+add_test_vision "cjpais/llava-1.6-mistral-7b-gguf:Q3_K_M" --chat-template vicuna
 add_test_vision "ibm-research/granite-vision-3.2-2b-GGUF:Q4_K_M"
 add_test_vision "second-state/MiniCPM-Llama3-V-2_5-GGUF:Q2_K" # model from openbmb is corrupted
 add_test_vision "openbmb/MiniCPM-V-2_6-gguf:Q2_K"
@@ -79,7 +88,7 @@ add_test_audio "ggml-org/Voxtral-Mini-3B-2507-GGUF:Q4_K_M"
 # to test the big models, run: ./tests.sh big
 if [ "$RUN_BIG_TESTS" = true ]; then
     add_test_vision "ggml-org/pixtral-12b-GGUF:Q4_K_M"
-    add_test_vision "ggml-org/Mistral-Small-3.1-24B-Instruct-2503-GGUF" "mistral-v7"
+    add_test_vision "ggml-org/Mistral-Small-3.1-24B-Instruct-2503-GGUF" --chat-template mistral-v7
     add_test_vision "ggml-org/Qwen2-VL-2B-Instruct-GGUF:Q4_K_M"
     add_test_vision "ggml-org/Qwen2-VL-7B-Instruct-GGUF:Q4_K_M"
     add_test_vision "ggml-org/Qwen2.5-VL-3B-Instruct-GGUF:Q4_K_M"
@@ -89,7 +98,7 @@ if [ "$RUN_BIG_TESTS" = true ]; then
     add_test_vision "ggml-org/InternVL3-14B-Instruct-GGUF:Q4_K_M"
     add_test_vision "ggml-org/Qwen2.5-Omni-7B-GGUF:Q4_K_M"
     # add_test_vision "ggml-org/Qwen2.5-VL-32B-Instruct-GGUF:Q4_K_M" # does not work on my mac M3 Ultra
-    add_test_vision "ggml-org/Kimi-VL-A3B-Thinking-2506-GGUF:Q4_K_M"
+    # add_test_vision "ggml-org/Kimi-VL-A3B-Thinking-2506-GGUF:Q4_K_M" # not always working

     add_test_audio "ggml-org/ultravox-v0_5-llama-3_1-8b-GGUF:Q4_K_M"
     add_test_audio "ggml-org/Qwen2.5-Omni-7B-GGUF:Q4_K_M"
@@ -122,21 +131,25 @@ for i in "${!arr_hf[@]}"; do
     bin="llama-mtmd-cli"
     prefix="${arr_prefix[$i]}"
     hf="${arr_hf[$i]}"
-    tmpl="${arr_tmpl[$i]}"
+    extra_args="${arr_extra_args[$i]}"
     inp_file="${arr_file[$i]}"

     echo "Running test with binary: $bin and HF model: $hf"
     echo ""
     echo ""

-    output=$(\
-        "$PROJ_ROOT/build/bin/$bin" \
-        -hf "$hf" \
-        --image $SCRIPT_DIR/$inp_file \
-        -p "what is the publisher name of the newspaper?" \
-        --temp 0 -n 128 \
-        ${tmpl:+--chat-template "$tmpl"} \
-        2>&1 | tee /dev/tty)
+    cmd="$(printf %q "$PROJ_ROOT/build/bin/$bin") \
+        -hf $(printf %q "$hf") \
+        --image $(printf %q "$SCRIPT_DIR/$inp_file") \
+        --temp 0 -n 128 \
+        ${extra_args}"
+
+    # if extra_args does not contain -p, we add a default prompt
+    if ! [[ "$extra_args" =~ "-p" ]]; then
+        cmd+=" -p \"what is the publisher name of the newspaper?\""
+    fi
+
+    output=$(eval "$cmd" 2>&1 | tee /dev/tty)

     echo "$output" > $SCRIPT_DIR/output/$bin-$(echo "$hf" | tr '/' '-').log
@@ -144,9 +157,9 @@ for i in "${!arr_hf[@]}"; do
     if echo "$output" | grep -iq "new york" \
         || (echo "$output" | grep -iq "men" && echo "$output" | grep -iq "walk")
     then
-        result="$prefix \033[32mOK\033[0m:   $bin $hf"
+        result="$prefix \033[32mOK\033[0m:   $hf"
     else
-        result="$prefix \033[31mFAIL\033[0m: $bin $hf"
+        result="$prefix \033[31mFAIL\033[0m: $hf"
     fi
     echo -e "$result"
     arr_res+=("$result")