From c45f89d5516221e51eef93e467e3bec6eb811994 Mon Sep 17 00:00:00 2001
From: Shouyu <65317431+joeldushouyu@users.noreply.github.com>
Date: Mon, 15 Dec 2025 13:53:56 -0500
Subject: [PATCH] ggml-hexagon: mm for mtmd (#17894)

* feat: add run-mtmd script for hexagon

* fix: fix issue in fp16xfp32 mm

* fix: remove opt_experimental check for fp16xfp32 mm

* fix: ggml-hexagon: matmul fp16xfp32 supports non-contiguous src0

* fix: fix syntax check in run-mtmd.sh for cli
---
 ggml/src/ggml-hexagon/ggml-hexagon.cpp |  3 --
 ggml/src/ggml-hexagon/htp/matmul-ops.c | 65 +++++++++++++++-----------
 scripts/snapdragon/adb/run-mtmd.sh     | 65 ++++++++++++++++++++++++++
 3 files changed, 102 insertions(+), 31 deletions(-)
 create mode 100755 scripts/snapdragon/adb/run-mtmd.sh

diff --git a/ggml/src/ggml-hexagon/ggml-hexagon.cpp b/ggml/src/ggml-hexagon/ggml-hexagon.cpp
index 72a82a8911..514f086f68 100644
--- a/ggml/src/ggml-hexagon/ggml-hexagon.cpp
+++ b/ggml/src/ggml-hexagon/ggml-hexagon.cpp
@@ -1976,9 +1976,6 @@ static bool ggml_hexagon_supported_mul_mat(const struct ggml_hexagon_session * s
             break;

         case GGML_TYPE_F16:
-            if (!opt_experimental) {
-                return false;
-            }
             break;

         default:
diff --git a/ggml/src/ggml-hexagon/htp/matmul-ops.c b/ggml/src/ggml-hexagon/htp/matmul-ops.c
index c99b6a0d18..346f0bd339 100644
--- a/ggml/src/ggml-hexagon/htp/matmul-ops.c
+++ b/ggml/src/ggml-hexagon/htp/matmul-ops.c
@@ -903,7 +903,7 @@ static void vec_dot_f16_f32(const int n, float * restrict s, const void * restri
     const float * restrict vy = (const float * restrict) y;

     for (uint32_t i = 0; i < n; i++) {
-        rsum += vx[i] * (__fp16) vy[i];
+        rsum += (float) vx[i] * vy[i];
     }
     *s = rsum;
     return;
@@ -917,7 +917,7 @@ static void vec_dot_f16_f32(const int n, float * restrict s, const void * restri

     // for some reason we need volatile here so that the compiler doesn't try anything funky
     volatile HVX_Vector rsum = Q6_V_vsplat_R(0);
-
+    float r_sum_scalar = 0.0f;
     uint32_t i = 0;

     for (i = 0; i < nv0; i++) {
@@ -926,31 +926,42 @@ static void vec_dot_f16_f32(const int n, float * restrict s, const void * restri
         HVX_Vector x = vx[i];
         HVX_VectorPair xp = Q6_Wqf32_vmpy_VhfVhf(Q6_Vh_vshuff_Vh(x), Q6_Vh_vsplat_R(0x3C00)); // mul by 1.0

-        HVX_Vector hi = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(Q6_V_hi_W(xp)), Q6_V_hi_W(yp));
-        HVX_Vector lo = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(Q6_V_lo_W(xp)), Q6_V_lo_W(yp));
+        // NOTE: volatile is needed here to prevent compiler optimization;
+        // the compiler does not seem to guarantee read-after-write ordering otherwise.
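+        // (hi and lo hold qf32 partial products; they are accumulated into rsum
+        // in qf32 and converted back to IEEE fp32 only after the final reduction.)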
+        volatile HVX_Vector hi = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(Q6_V_hi_W(xp)), Q6_V_hi_W(yp));
+        volatile HVX_Vector lo = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(Q6_V_lo_W(xp)), Q6_V_lo_W(yp));

         HVX_Vector sum = Q6_Vqf32_vadd_Vqf32Vqf32(hi, lo);
         rsum = Q6_Vqf32_vadd_Vqf32Vqf32(rsum, sum);
     }

     if (nv1) {
-        HVX_VectorPair yp = vy[i];
+        // HVX_VectorPair yp = vy[i];

-        HVX_Vector x = vx[i];
-        HVX_VectorPair xp = Q6_Wqf32_vmpy_VhfVhf(Q6_Vh_vshuff_Vh(x), Q6_Vh_vsplat_R(0x3C00)); // mul by 1.0
+        // HVX_Vector x = vx[i];
+        // HVX_VectorPair xp = Q6_Wqf32_vmpy_VhfVhf(Q6_Vh_vshuff_Vh(x), Q6_Vh_vsplat_R(0x3C00)); // mul by 1.0

-        if (nv1 >= 32) {
-            HVX_Vector hi = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(Q6_V_hi_W(xp)), Q6_V_hi_W(yp));
-            rsum = Q6_Vqf32_vadd_Vqf32Vqf32(rsum, hi);
-            nv1 -= 32;
-        }
+        // if (nv1 >= 32) {
+        //     volatile HVX_Vector hi = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(Q6_V_hi_W(xp)), Q6_V_hi_W(yp));
+        //     rsum = Q6_Vqf32_vadd_Vqf32Vqf32(rsum, hi);
+        //     nv1 -= 32;
+        // }
+        // rsum = hvx_vec_qf32_reduce_sum(rsum);
+
+        // if (nv1) {
+        //     volatile HVX_Vector lo = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(Q6_V_lo_W(xp)), Q6_V_lo_W(yp));
+        //     HVX_Vector sum = hvx_vec_qf32_reduce_sum_n(lo, nv1);
+        //     rsum = Q6_Vqf32_vadd_Vqf32Vqf32(rsum, sum);
+        // }
+
+        // process the remainder with a scalar loop instead of the vector tail above
         rsum = hvx_vec_qf32_reduce_sum(rsum);
+        const __fp16 * restrict sx = (const __fp16 * restrict) x;
+        const float * restrict sy = (const float * restrict) y;

-        if (nv1) {
-            HVX_Vector lo = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(Q6_V_lo_W(xp)), Q6_V_lo_W(yp));
-            HVX_Vector sum = hvx_vec_qf32_reduce_sum_n(lo, nv1);
-            rsum = Q6_Vqf32_vadd_Vqf32Vqf32(rsum, sum);
+        for (uint32_t i = nv0 * 64; i < n; i++) {
+            r_sum_scalar += (float) sx[i] * sy[i];
         }

         // hvx_vec_dump_fp16("X", x);
@@ -961,7 +972,7 @@ static void vec_dot_f16_f32(const int n, float * restrict s, const void * restri
         rsum = hvx_vec_qf32_reduce_sum(rsum);
     }

-    *s = hvx_vec_get_fp32(Q6_Vsf_equals_Vqf32(rsum));
+    *s = hvx_vec_get_fp32(Q6_Vsf_equals_Vqf32(rsum)) + r_sum_scalar;

 # ifdef HTP_DEBUG
     {
@@ -1498,9 +1509,6 @@ static void matmul_f16_f32(struct htp_tensor * restrict src0,
     uint64_t t1, t2;
     t1 = HAP_perf_get_qtimer_count();

-    const size_t src0_row_size = sizeof(__fp16) * ne00;
-    const size_t src1_row_size = sizeof(float) * ne10;
-
     assert(ne12 % ne02 == 0);
     assert(ne13 % ne03 == 0);
@@ -1510,8 +1518,6 @@ static void matmul_f16_f32(struct htp_tensor * restrict src0,
     // This is the size of the rest of the dimensions of the result
     const uint32_t nr1 = ne1 * ne2 * ne3;

-    uint32_t chunk_size = 64;
-
     // distribute the thread work across the inner or outer loop based on which one is larger
     uint32_t nchunk0 = nr0 > nr1 ? nth : 1;    // parallelize by src0 rows
     uint32_t nchunk1 = nr0 > nr1 ? 1 : nth;    // parallelize by src1 rows
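+    // e.g. with nth = 4 worker threads: nr0 > nr1 gives nchunk0 = 4 and nchunk1 = 1,
+    // i.e. each thread takes a contiguous chunk of src0 rows (and vice versa otherwise)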
@@ -1544,11 +1550,11 @@ static void matmul_f16_f32(struct htp_tensor * restrict src0,
     const uint32_t blck_0 = 64;
     const uint32_t blck_1 = 64;

-    float tmp[32];
+    __attribute__((aligned(128))) float tmp[64];

     for (uint32_t iir1 = ir1_start; iir1 < ir1_end; iir1 += blck_1) {
         for (uint32_t iir0 = ir0_start; iir0 < ir0_end; iir0 += blck_0) {
-            for (uint32_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir1_end; ir1++) {
+            for (uint32_t ir1 = iir1; ir1 < MIN(iir1 + blck_1, ir1_end); ir1++) {
                 const uint32_t i13 = (ir1 / (ne12 * ne1));
                 const uint32_t i12 = (ir1 - i13 * ne12 * ne1) / ne1;
                 const uint32_t i11 = (ir1 - i13 * ne12 * ne1 - i12 * ne1);
@@ -1561,13 +1567,16 @@ static void matmul_f16_f32(struct htp_tensor * restrict src0,
                 const uint32_t i2 = i12;
                 const uint32_t i3 = i13;

-                const uint8_t * restrict src0_row = (const uint8_t *) src0->data + (0 + i02 * nb02 + i03 * nb03);
+                const uint8_t * restrict src0_base = (const uint8_t *) src0->data + (0 + i02 * nb02 + i03 * nb03);
                 const uint8_t * restrict src1_col =
-                    (const uint8_t *) src1->data + (i11 + i12 * ne11 + i13 * ne12 * ne11) * src1_row_size;
+                    (const uint8_t *) src1->data + (i11 * nb11 + i12 * nb12 + i13 * nb13);
                 float * dst_col = (float *) ((uint8_t * restrict) dst->data + (i1 * nb1 + i2 * nb2 + i3 * nb3));

-                for (uint32_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir0_end; ir0++) {
-                    vec_dot_f16_f32(ne00, &tmp[ir0 - iir0], src0_row + ir0 * src0_row_size, src1_col);
+                const uint32_t ir0_block_end = MIN(iir0 + blck_0, ir0_end);
+                for (uint32_t ir0 = iir0; ir0 < ir0_block_end; ir0++) {
+                    // Use nb01 stride for non-contiguous src0 support
+                    const uint8_t * restrict src0_row = src0_base + ir0 * nb01;
+                    vec_dot_f16_f32(ne00, &tmp[ir0 - iir0], src0_row, src1_col);
                 }

                 hvx_copy_fp32_ua((uint8_t *) &dst_col[iir0], (uint8_t *) tmp, MIN(iir0 + blck_0, ir0_end) - iir0);
diff --git a/scripts/snapdragon/adb/run-mtmd.sh b/scripts/snapdragon/adb/run-mtmd.sh
new file mode 100755
index 0000000000..91d868278a
--- /dev/null
+++ b/scripts/snapdragon/adb/run-mtmd.sh
@@ -0,0 +1,65 @@
+#!/bin/sh
+#
+
+# Basedir on device
+basedir=/data/local/tmp/llama.cpp
+
+cli_opts=
+
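+# Most defaults below can be overridden via environment variables
+# (B, S, M, MMPROJ, IMG, D, V, E, SCHED, PROF, OPMASK, NHVX, NDEV, MTMD_DEVICE)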
+[ "$B" != "" ] && branch=$B + +adbserial= +[ "$S" != "" ] && adbserial="-s $S" + +model="gemma-3-4b-it-Q4_0.gguf" +[ "$M" != "" ] && model="$M" + +mmproj="mmproj-F16.gguf" +[ "$MMPROJ" != "" ] && mmproj="$MMPROJ" + +image= +[ "$IMG" != "" ] && image="$IMG" + +device="HTP0" +[ "$D" != "" ] && device="$D" + +verbose= +[ "$V" != "" ] && verbose="GGML_HEXAGON_VERBOSE=$V" + +experimental="GGML_HEXAGON_EXPERIMENTAL=1" +[ "$E" != "" ] && experimental="GGML_HEXAGON_EXPERIMENTAL=$E" + +sched= +[ "$SCHED" != "" ] && sched="GGML_SCHED_DEBUG=2" cli_opts="$cli_opts -v" + +profile= +[ "$PROF" != "" ] && profile="GGML_HEXAGON_PROFILE=$PROF GGML_HEXAGON_OPSYNC=1" + +opmask= +[ "$OPMASK" != "" ] && opmask="GGML_HEXAGON_OPMASK=$OPMASK" + +nhvx= +[ "$NHVX" != "" ] && nhvx="GGML_HEXAGON_NHVX=$NHVX" + +ndev= +[ "$NDEV" != "" ] && ndev="GGML_HEXAGON_NDEV=$NDEV" + +# MTMD backend device for vision model (defaults to CPU if not set) +mtmd_backend= +[ "$MTMD_DEVICE" != "" ] && mtmd_backend="MTMD_BACKEND_DEVICE=$MTMD_DEVICE" + +set -x + +adb $adbserial shell " \ + cd $basedir; ulimit -c unlimited; \ + LD_LIBRARY_PATH=$basedir/$branch/lib \ + ADSP_LIBRARY_PATH=$basedir/$branch/lib \ + $verbose $experimental $sched $opmask $profile $nhvx $ndev $mtmd_backend \ + ./$branch/bin/llama-mtmd-cli --no-mmap -m $basedir/../gguf/$model \ + --mmproj $basedir/../gguf/$mmproj \ + --image $basedir/../gguf/$image \ + --poll 1000 -t 6 --cpu-mask 0xfc --cpu-strict 1 \ + --ctx-size 8192 --batch-size 128 -ctk q8_0 -ctv q8_0 -fa on \ + -ngl 99 --device $device -v $cli_opts $@ \ +"