From 500c627fbc565ef25ce1c49b63293a9a11cae0ec Mon Sep 17 00:00:00 2001
From: chraac <chraac@gmail.com>
Date: Fri, 19 Dec 2025 21:26:40 +0800
Subject: [PATCH] ggml-hexagon: drop volatile from HVX vector locals in vec_dot_f16_f32

---
 ggml/src/ggml-hexagon/htp/matmul-ops.c | 27 ++++++++++++--------------
 1 file changed, 12 insertions(+), 15 deletions(-)

diff --git a/ggml/src/ggml-hexagon/htp/matmul-ops.c b/ggml/src/ggml-hexagon/htp/matmul-ops.c
index e2b40da18f..1eeee823c3 100644
--- a/ggml/src/ggml-hexagon/htp/matmul-ops.c
+++ b/ggml/src/ggml-hexagon/htp/matmul-ops.c
@@ -912,23 +912,20 @@ static void vec_dot_f16_f32(const int n, float * restrict s, const void * restri
     const HVX_UVector * restrict vx     = (const HVX_UVector * restrict) x;
     const HVX_UVectorPair * restrict vy = (const HVX_UVectorPair * restrict) y;
 
-    uint32_t nv0 = n / VLEN_FP16;  // num full fp16 hvx vectors
-    uint32_t nv1 = n % VLEN_FP16;  // leftover elements
+    uint32_t nv0 = n / VLEN_FP16;                    // num full fp16 hvx vectors
+    uint32_t nv1 = n % VLEN_FP16;                    // leftover elements
 
-    // for some reason we need volatile here so that the compiler doesn't try anything funky
-    const HVX_Vector    zero = Q6_Vh_vsplat_R(0x3C00);  // 1.0 in fp16
-    volatile HVX_Vector rsum = Q6_V_vsplat_R(0);
-    uint32_t            i    = 0;
+    const HVX_Vector zero = Q6_Vh_vsplat_R(0x3C00);  // 1.0 in fp16
+    HVX_Vector       rsum = Q6_V_vsplat_R(0);
+    uint32_t         i    = 0;
 
     for (i = 0; i < nv0; i++) {
         HVX_VectorPair yp = vy[i];
         HVX_Vector     x  = vx[i];
         HVX_VectorPair xp = Q6_Wqf32_vmpy_VhfVhf(Q6_Vh_vshuff_Vh(x), zero);  // mul by 1.0
 
-        //NOTE: need volatile here to prevent compiler optimization
-        // Seem compiler cannot guarantee read-after-write??
-        volatile HVX_Vector hi = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(Q6_V_hi_W(xp)), Q6_V_hi_W(yp));
-        volatile HVX_Vector lo = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(Q6_V_lo_W(xp)), Q6_V_lo_W(yp));
+        HVX_Vector hi = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(Q6_V_hi_W(xp)), Q6_V_hi_W(yp));
+        HVX_Vector lo = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(Q6_V_lo_W(xp)), Q6_V_lo_W(yp));
 
         HVX_Vector sum = Q6_Vqf32_vadd_Vqf32Vqf32(hi, lo);
         rsum           = Q6_Vqf32_vadd_Vqf32Vqf32(rsum, sum);
@@ -942,8 +939,8 @@ static void vec_dot_f16_f32(const int n, float * restrict s, const void * restri
         HVX_Vector l_x;
         HVX_Vector l_y;
         if (nv1 >= 32) {
-            volatile HVX_Vector hi = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(Q6_V_lo_W(xp)), Q6_V_lo_W(yp));
-            rsum                   = Q6_Vqf32_vadd_Vqf32Vqf32(rsum, hi);
+            HVX_Vector hi = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(Q6_V_lo_W(xp)), Q6_V_lo_W(yp));
+            rsum          = Q6_Vqf32_vadd_Vqf32Vqf32(rsum, hi);
             nv1 -= 32;
             l_x = Q6_V_hi_W(xp);
             l_y = Q6_V_hi_W(yp);
@@ -953,9 +950,9 @@ static void vec_dot_f16_f32(const int n, float * restrict s, const void * restri
         }
 
         if (nv1) {
-            volatile HVX_Vector lo  = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(l_x), l_y);
-            HVX_Vector          sum = Q6_V_valign_VVR(lo, Q6_V_vzero(), nv1 * sizeof(float));
-            rsum                    = Q6_Vqf32_vadd_Vqf32Vqf32(rsum, sum);
+            HVX_Vector lo  = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(l_x), l_y);
+            HVX_Vector sum = Q6_V_valign_VVR(lo, Q6_V_vzero(), nv1 * sizeof(float));
+            rsum           = Q6_Vqf32_vadd_Vqf32Vqf32(rsum, sum);
         }
 
         // hvx_vec_dump_fp16("X", x);