From afaeb543bc36a1c2c64948d5ccc95b08bc06ef07 Mon Sep 17 00:00:00 2001
From: chraac <chraac@gmail.com>
Date: Wed, 17 Dec 2025 17:24:50 +0800
Subject: [PATCH 01/10] refactoring: enhance memory management with tracking
 buffer allocation

---
 ggml/src/ggml-hexagon/ggml-hexagon.cpp | 39 +++++++++++++++++++++-----
 1 file changed, 32 insertions(+), 7 deletions(-)

diff --git a/ggml/src/ggml-hexagon/ggml-hexagon.cpp b/ggml/src/ggml-hexagon/ggml-hexagon.cpp
index 13b96d61f8..b909487a2b 100644
--- a/ggml/src/ggml-hexagon/ggml-hexagon.cpp
+++ b/ggml/src/ggml-hexagon/ggml-hexagon.cpp
@@ -50,6 +50,8 @@ static int    opt_profile      = 0;
 static int    opt_hostbuf      = 1;
 static int    opt_experimental = 0;
 
+static const size_t kMaxMemPerSessInBytes = 2ULL * 1024 * 1024 * 1024;  // 2GB per-session budget; initial value of avail_mem_bytes
+
 // Enable all stages by default
 static int opt_opmask = HTP_OPMASK_QUEUE | HTP_OPMASK_QUANTIZE | HTP_OPMASK_COMPUTE;
 static int opt_opsync = 0;  // synchronous ops
@@ -140,6 +142,7 @@ struct ggml_hexagon_session {
     uint32_t         prof_usecs;
     uint32_t         prof_cycles;
     uint32_t         prof_pkts;
+    uint64_t         avail_mem_bytes = kMaxMemPerSessInBytes;  // available memory for allocations
 };
 
 void ggml_hexagon_session::enqueue(struct htp_general_req &req, struct dspqueue_buffer *bufs, uint32_t n_bufs, bool sync) {
@@ -267,13 +270,15 @@ struct ggml_backend_hexagon_buffer_context {
     }
 
     ggml_backend_hexagon_buffer_context(ggml_hexagon_session * sess, size_t size, bool repack) {
-        size += 4 * 1024;  // extra page for padding
+        size = get_padded_buffer_size(size);  // add padding for alignment
 
         if (rpcmem_alloc2) {
-            this->base = (uint8_t *) rpcmem_alloc2(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS | RPCMEM_HEAP_NOREG, size);
+            this->base =
+                (uint8_t *) rpcmem_alloc2(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS | RPCMEM_HEAP_NOREG, size);
         } else {
             GGML_LOG_INFO("ggml-hex: %s rpcmem_alloc2 not found, falling back to rpcmem_alloc\n", sess->name.c_str());
-            this->base = (uint8_t *) rpcmem_alloc(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS | RPCMEM_HEAP_NOREG, size);
+            this->base =
+                (uint8_t *) rpcmem_alloc(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS | RPCMEM_HEAP_NOREG, size);
         }
 
         if (!this->base) {
@@ -296,6 +301,7 @@ struct ggml_backend_hexagon_buffer_context {
         this->size   = size;
         this->mapped = false;
         this->repack = repack;
+        sess->avail_mem_bytes -= size;
     }
 
     ~ggml_backend_hexagon_buffer_context() {
@@ -304,8 +310,11 @@ struct ggml_backend_hexagon_buffer_context {
             rpcmem_free(this->base);
             this->base = NULL;
         }
+        this->sess->avail_mem_bytes += this->size;
     }
 
+    static size_t get_padded_buffer_size(size_t size) { return size + 4 * 1024; }  // requested size plus 4 KiB of padding
+
     ggml_hexagon_session * sess;  // primary session
     uint8_t *              base;
     size_t                 size;
@@ -1479,8 +1488,15 @@ static const char * ggml_backend_hexagon_buffer_type_name(ggml_backend_buffer_ty
 static ggml_backend_buffer_t ggml_backend_hexagon_buffer_type_alloc_buffer(
             ggml_backend_buffer_type_t buffer_type, size_t size) {
     auto sess = static_cast<ggml_backend_hexagon_buffer_type_context *>(buffer_type->context)->sess;
+    if (sess->avail_mem_bytes < ggml_backend_hexagon_buffer_context::get_padded_buffer_size(size)) {
+        GGML_LOG_INFO("ggml-hex: %s insufficient memory to allocate buffer of size %zu bytes (available %zu bytes)\n",
+                      sess->name.c_str(), size, sess->avail_mem_bytes);
+        return nullptr;
+    }
+
     try {
-        ggml_backend_hexagon_buffer_context * ctx = new ggml_backend_hexagon_buffer_context(sess, size, false /*repack*/);
+        ggml_backend_hexagon_buffer_context * ctx =
+            new ggml_backend_hexagon_buffer_context(sess, size, false /*repack*/);
         return ggml_backend_buffer_init(buffer_type, ggml_backend_hexagon_buffer_interface, ctx, size);
     } catch (const std::exception & exc) {
         GGML_LOG_ERROR("ggml-hex: %s failed to allocate buffer context: %s\n", sess->name.c_str(), exc.what());
@@ -1489,8 +1505,15 @@ static ggml_backend_buffer_t ggml_backend_hexagon_buffer_type_alloc_buffer(
 }
 
 static ggml_backend_buffer_t ggml_backend_hexagon_repack_buffer_type_alloc_buffer(
-            ggml_backend_buffer_type_t buffer_type, size_t size) {
+    ggml_backend_buffer_type_t buffer_type,
+    size_t                     size) {
     auto sess = static_cast<ggml_backend_hexagon_buffer_type_context *>(buffer_type->context)->sess;
+    if (sess->avail_mem_bytes < ggml_backend_hexagon_buffer_context::get_padded_buffer_size(size)) {
+        GGML_LOG_INFO("ggml-hex: %s insufficient memory to allocate repack buffer of size %zu bytes (available %zu bytes)\n",
+                      sess->name.c_str(), size, sess->avail_mem_bytes);
+        return nullptr;
+    }
+
     try {
         ggml_backend_hexagon_buffer_context * ctx = new ggml_backend_hexagon_buffer_context(sess, size, true /*repack*/);
         return ggml_backend_buffer_init(buffer_type, ggml_backend_hexagon_buffer_interface, ctx, size);
@@ -2681,9 +2704,11 @@ static const char * ggml_backend_hexagon_device_get_description(ggml_backend_dev
 }
 
 static void ggml_backend_hexagon_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
+    auto sess = static_cast<ggml_hexagon_session *>(dev->context);  // NOTE(review): assumes dev->context is the session
+    // NOTE(review): dev is read above, so the trailing GGML_UNUSED(dev) looks redundant now - confirm
     // ~2GB per session for now
-    *free  = 2ULL * 1024 * 1024 * 1024;
-    *total = *free;
+    *free  = sess->avail_mem_bytes;  // budget left: reduced when a buffer context is built, restored when destroyed
+    *total = kMaxMemPerSessInBytes;  // fixed per-session cap
 
     GGML_UNUSED(dev);
 }

From 7ef467ce20876c98610a4dab52709163587c073e Mon Sep 17 00:00:00 2001
From: chraac <chraac@gmail.com>
Date: Fri, 19 Dec 2025 11:15:35 +0800
Subject: [PATCH 02/10] refactoring: use VLEN_FP16 in vec_dot_f16_f32

---
 ggml/src/ggml-hexagon/htp/matmul-ops.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/ggml/src/ggml-hexagon/htp/matmul-ops.c b/ggml/src/ggml-hexagon/htp/matmul-ops.c
index f14523d485..312dfc40be 100644
--- a/ggml/src/ggml-hexagon/htp/matmul-ops.c
+++ b/ggml/src/ggml-hexagon/htp/matmul-ops.c
@@ -924,8 +924,8 @@ static void vec_dot_f16_f32(const int n, float * restrict s, const void * restri
     const HVX_UVector * restrict vx     = (const HVX_UVector * restrict) x;
     const HVX_UVectorPair * restrict vy = (const HVX_UVectorPair * restrict) y;
 
-    uint32_t nv0 = n / 64;  // num full fp16 hvx vectors
-    uint32_t nv1 = n % 64;  // leftover elements
+    uint32_t nv0 = n / VLEN_FP16;  // num full fp16 hvx vectors
+    uint32_t nv1 = n % VLEN_FP16;  // leftover elements
 
     // for some reason we need volatile here so that the compiler doesn't try anything funky
     volatile HVX_Vector rsum = Q6_V_vsplat_R(0);

From e0b1435b506307c52c3710075f307cf763eeadcb Mon Sep 17 00:00:00 2001
From: chraac <chraac@gmail.com>
Date: Fri, 19 Dec 2025 20:55:49 +0800
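Subject: [PATCH 03/10] refactoring: improve code formatting and alignment in
 matmul operations

Besides the formatting changes, vec_dot_f16_f32 drops the dead scalar
reference path and handles the leftover elements with HVX instead of a
scalar loop.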

---
 ggml/src/ggml-hexagon/htp/matmul-ops.c | 98 ++++++++++----------------
 1 file changed, 39 insertions(+), 59 deletions(-)

diff --git a/ggml/src/ggml-hexagon/htp/matmul-ops.c b/ggml/src/ggml-hexagon/htp/matmul-ops.c
index 312dfc40be..a88e4ad977 100644
--- a/ggml/src/ggml-hexagon/htp/matmul-ops.c
+++ b/ggml/src/ggml-hexagon/htp/matmul-ops.c
@@ -485,8 +485,8 @@ static void vec_dot_q4x4x2_q8x4x2_rx2(const int n,
     }
 
     // Convert into fp32 and reduce
-    r0_sum = hvx_vec_fp32_reduce_sum(Q6_Vsf_equals_Vqf32(r0_sum));
-    r1_sum = hvx_vec_fp32_reduce_sum(Q6_Vsf_equals_Vqf32(r1_sum));
+    r0_sum            = hvx_vec_fp32_reduce_sum(Q6_Vsf_equals_Vqf32(r0_sum));
+    r1_sum            = hvx_vec_fp32_reduce_sum(Q6_Vsf_equals_Vqf32(r1_sum));
     HVX_VectorPair p0 = Q6_W_vshuff_VVR(r1_sum, r0_sum, 4);
 
     hvx_vec_store_u(&s[0], 8, Q6_V_lo_W(p0));
@@ -658,8 +658,8 @@ static void vec_dot_q8x4x2_q8x4x2_rx2(const int n,
     }
 
     // Convert into fp32 and reduce
-    r0_sum = hvx_vec_fp32_reduce_sum(Q6_Vsf_equals_Vqf32(r0_sum));
-    r1_sum = hvx_vec_fp32_reduce_sum(Q6_Vsf_equals_Vqf32(r1_sum));
+    r0_sum            = hvx_vec_fp32_reduce_sum(Q6_Vsf_equals_Vqf32(r0_sum));
+    r1_sum            = hvx_vec_fp32_reduce_sum(Q6_Vsf_equals_Vqf32(r1_sum));
     HVX_VectorPair p0 = Q6_W_vshuff_VVR(r1_sum, r0_sum, 4);
 
     hvx_vec_store_u(&s[0], 8, Q6_V_lo_W(p0));
@@ -900,8 +900,8 @@ static void vec_dot_mxfp4x4x2_q8x4x2_rx2(const int n,
     }
 
     // Convert into fp32 and reduce
-    r0_sum = hvx_vec_fp32_reduce_sum(Q6_Vsf_equals_Vqf32(r0_sum));
-    r1_sum = hvx_vec_fp32_reduce_sum(Q6_Vsf_equals_Vqf32(r1_sum));
+    r0_sum            = hvx_vec_fp32_reduce_sum(Q6_Vsf_equals_Vqf32(r0_sum));
+    r1_sum            = hvx_vec_fp32_reduce_sum(Q6_Vsf_equals_Vqf32(r1_sum));
     HVX_VectorPair p0 = Q6_W_vshuff_VVR(r1_sum, r0_sum, 4);
 
     hvx_vec_store_u(&s[0], 8, Q6_V_lo_W(p0));
@@ -909,18 +909,6 @@ static void vec_dot_mxfp4x4x2_q8x4x2_rx2(const int n,
 
 #if 1
 static void vec_dot_f16_f32(const int n, float * restrict s, const void * restrict x, const void * restrict y) {
-    if (0) {
-        float rsum                 = 0;
-        const __fp16 * restrict vx = (const __fp16 * restrict) x;
-        const float * restrict vy  = (const float * restrict) y;
-
-        for (uint32_t i = 0; i < n; i++) {
-            rsum += (float)vx[i] * vy[i];
-        }
-        *s = rsum;
-        return;
-    }
-
     const HVX_UVector * restrict vx     = (const HVX_UVector * restrict) x;
     const HVX_UVectorPair * restrict vy = (const HVX_UVectorPair * restrict) y;
 
@@ -929,12 +917,10 @@ static void vec_dot_f16_f32(const int n, float * restrict s, const void * restri
 
     // for some reason we need volatile here so that the compiler doesn't try anything funky
     volatile HVX_Vector rsum = Q6_V_vsplat_R(0);
-    float r_sum_scalar = 0.0f;
-    uint32_t i = 0;
+    uint32_t            i    = 0;
 
     for (i = 0; i < nv0; i++) {
         HVX_VectorPair yp = vy[i];
-
         HVX_Vector     x  = vx[i];
         HVX_VectorPair xp = Q6_Wqf32_vmpy_VhfVhf(Q6_Vh_vshuff_Vh(x), Q6_Vh_vsplat_R(0x3C00));  // mul by 1.0
 
@@ -948,43 +934,37 @@ static void vec_dot_f16_f32(const int n, float * restrict s, const void * restri
     }
 
     if (nv1) {
-        // HVX_VectorPair yp = vy[i];
+        HVX_VectorPair yp = vy[i];
+        HVX_Vector     x  = vx[i];
+        HVX_VectorPair xp = Q6_Wqf32_vmpy_VhfVhf(Q6_Vh_vshuff_Vh(x), Q6_Vh_vsplat_R(0x3C00));  // mul by 1.0
 
-        // HVX_Vector     x  = vx[i];
-        // HVX_VectorPair xp = Q6_Wqf32_vmpy_VhfVhf(Q6_Vh_vshuff_Vh(x), Q6_Vh_vsplat_R(0x3C00));  // mul by 1.0
+        HVX_Vector l_x;
+        HVX_Vector l_y;
+        if (nv1 >= 32) {
+            volatile HVX_Vector hi = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(Q6_V_lo_W(xp)), Q6_V_lo_W(yp));
+            rsum                   = Q6_Vqf32_vadd_Vqf32Vqf32(rsum, hi);
+            nv1 -= 32;
+            l_x = Q6_V_hi_W(xp);
+            l_y = Q6_V_hi_W(yp);
+        } else {
+            l_x = Q6_V_lo_W(xp);
+            l_y = Q6_V_lo_W(yp);
+        }
 
-        // if (nv1 >= 32) {
-        //     volatile HVX_Vector hi = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(Q6_V_hi_W(xp)), Q6_V_hi_W(yp));
-        //     rsum          = Q6_Vqf32_vadd_Vqf32Vqf32(rsum, hi);
-        //     nv1 -= 32;
-        // }
-
-        // rsum = hvx_vec_qf32_reduce_sum(rsum);
-
-        // if (nv1) {
-        //     volatile HVX_Vector lo  = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(Q6_V_lo_W(xp)), Q6_V_lo_W(yp));
-        //     HVX_Vector sum = hvx_vec_qf32_reduce_sum_n(lo, nv1);
-        //     rsum           = Q6_Vqf32_vadd_Vqf32Vqf32(rsum, sum);
-        // }
-
-        //process the remainder using scalar loop
-        rsum = hvx_vec_qf32_reduce_sum(rsum);
-        const __fp16 * restrict sx = (const __fp16 * restrict) x;
-        const float * restrict sy  = (const float * restrict) y;
-
-        for (uint32_t i = nv0 * 64; i < n; i++) {
-            r_sum_scalar += (float) sx[i] * sy[i];
+        if (nv1) {
+            volatile HVX_Vector lo  = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(l_x), l_y);
+            HVX_Vector          sum = Q6_V_valign_VVR(lo, Q6_V_vzero(), nv1 * sizeof(float));
+            rsum                    = Q6_Vqf32_vadd_Vqf32Vqf32(rsum, sum);
         }
 
         // hvx_vec_dump_fp16("X", x);
         // hvx_vec_dump_fp16("Y", y);
         // hvx_vec_dump_fp32("SUM",  Q6_Vsf_equals_Vqf32(sum));
         // hvx_vec_dump_fp32("RSUM", Q6_Vsf_equals_Vqf32(rsum));
-    } else {
-        rsum = hvx_vec_qf32_reduce_sum(rsum);
     }
 
-    *s = hvx_vec_get_fp32(Q6_Vsf_equals_Vqf32(rsum)) + r_sum_scalar;
+    rsum = hvx_vec_qf32_reduce_sum(rsum);
+    *s   = hvx_vec_get_fp32(Q6_Vsf_equals_Vqf32(rsum));
 
 #    ifdef HTP_DEBUG
     {
@@ -1120,8 +1100,8 @@ static void matmul(struct htp_matmul_type * mt,
 
     const uint8_t * restrict src0_row = (const uint8_t *) src0->data;
 
-    // Prefill spad with src0 rows
-    #pragma unroll(4)
+// Prefill spad with src0 rows
+#pragma unroll(4)
     for (uint32_t ir0 = src0_start_row; ir0 < src0_end_row_x2; ir0 += 2) {
         const int is0 = (ir0 - src0_start_row);
         if (is0 >= HTP_SPAD_SRC0_NROWS) {
@@ -1135,7 +1115,7 @@ static void matmul(struct htp_matmul_type * mt,
     for (uint32_t ir0 = src0_start_row; ir0 < src0_end_row_x2; ir0 += 2) {
         const uint8_t * ss0 = dma_queue_pop(dma_queue).dst;
 
-        #pragma unroll(2)
+#pragma unroll(2)
         for (uint32_t ir1 = 0; ir1 < src1_nrows; ++ir1) {
             const uint8_t * restrict src1_col = (const uint8_t *) (src1_data + ir1 * src1_row_size);
             float * restrict dst_row          = (float *) (dst->data + (ir1 * dst_row_size));
@@ -1159,7 +1139,7 @@ static void matmul(struct htp_matmul_type * mt,
                        src0_row_size_padded, src0_row_size, 1);
         const uint8_t * ss0 = dma_queue_pop(dma_queue).dst;
 
-        #pragma unroll(2)
+#pragma unroll(2)
         for (uint32_t ir1 = 0; ir1 < src1_nrows; ++ir1) {
             const uint8_t * restrict src1_col = (const uint8_t *) (src1_data + ir1 * src1_row_size);
             float * restrict dst_row          = (float *) (dst->data + (ir1 * dst_row_size));
@@ -1222,8 +1202,8 @@ static void matvec(struct htp_matmul_type * mt,
     const uint8_t * restrict src1_col = (const uint8_t *) src1_data;
     float * restrict dst_col          = (float *) dst->data;
 
-    // Prefill spad with 2x src0 rows
-    #pragma unroll(2)
+// Prefill spad with 2x src0 rows
+#pragma unroll(2)
     for (uint32_t ir0 = src0_start_row; ir0 < src0_end_row_x2; ir0 += 2) {
         const uint32_t is0 = (ir0 - src0_start_row);
         if (is0 >= HTP_SPAD_SRC0_NROWS) {
@@ -1336,8 +1316,8 @@ static void matmul_id(struct htp_matmul_type * mt,
 
         const uint8_t * src0_row = (const uint8_t *) src0->data + (0 + cur_a * nb02 + 0);
 
-        // Prefill spad with src0 rows
-        #pragma unroll(4)
+// Prefill spad with src0 rows
+#pragma unroll(4)
         for (uint32_t ir0 = src0_start_row; ir0 < src0_end_row_x2; ir0 += 2) {
             const int is0 = (ir0 - src0_start_row);
             if (is0 >= HTP_SPAD_SRC0_NROWS) {
@@ -1460,8 +1440,8 @@ static void matvec_id(struct htp_matmul_type * mt,
         const uint8_t * restrict src1_col = (const uint8_t *) src1_data;
         float * restrict dst_row          = (float *) (dst->data + ie1 * nb1);
 
-        // Prefill spad with src0 rows
-        #pragma unroll(4)
+// Prefill spad with src0 rows
+#pragma unroll(4)
         for (uint32_t ir0 = src0_start_row; ir0 < src0_end_row_x2; ir0 += 2) {
             const int is0 = (ir0 - src0_start_row);
             if (is0 >= HTP_SPAD_SRC0_NROWS) {
@@ -2347,7 +2327,7 @@ int op_matmul_id(struct htp_ops_context * octx) {
 
                 assert(i02 >= 0 && i02 < n_as);
 
-                MMID_MATRIX_ROW(i02, matrix_row_counts[i02]) = (struct mmid_row_mapping) { id, iid1 };
+                MMID_MATRIX_ROW(i02, matrix_row_counts[i02]) = (struct mmid_row_mapping){ id, iid1 };
                 matrix_row_counts[i02] += 1;
             }
         }

From 398aa853116e3959a8406a3fc63831b3b473fd28 Mon Sep 17 00:00:00 2001
From: chraac <chraac@gmail.com>
Date: Fri, 19 Dec 2025 21:04:52 +0800
Subject: [PATCH 04/10] opt: hoist fp16 1.0 splat in vec_dot_f16_f32

---
 ggml/src/ggml-hexagon/htp/matmul-ops.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/ggml/src/ggml-hexagon/htp/matmul-ops.c b/ggml/src/ggml-hexagon/htp/matmul-ops.c
index a88e4ad977..e2b40da18f 100644
--- a/ggml/src/ggml-hexagon/htp/matmul-ops.c
+++ b/ggml/src/ggml-hexagon/htp/matmul-ops.c
@@ -916,13 +916,14 @@ static void vec_dot_f16_f32(const int n, float * restrict s, const void * restri
     uint32_t nv1 = n % VLEN_FP16;  // leftover elements
 
     // for some reason we need volatile here so that the compiler doesn't try anything funky
+    const HVX_Vector    zero = Q6_Vh_vsplat_R(0x3C00);  // 1.0 in fp16
     volatile HVX_Vector rsum = Q6_V_vsplat_R(0);
     uint32_t            i    = 0;
 
     for (i = 0; i < nv0; i++) {
         HVX_VectorPair yp = vy[i];
         HVX_Vector     x  = vx[i];
-        HVX_VectorPair xp = Q6_Wqf32_vmpy_VhfVhf(Q6_Vh_vshuff_Vh(x), Q6_Vh_vsplat_R(0x3C00));  // mul by 1.0
+        HVX_VectorPair xp = Q6_Wqf32_vmpy_VhfVhf(Q6_Vh_vshuff_Vh(x), zero);  // mul by 1.0
 
         //NOTE: need volatile here to prevent compiler optimization
         // Seem compiler cannot guarantee read-after-write??
@@ -936,7 +937,7 @@ static void vec_dot_f16_f32(const int n, float * restrict s, const void * restri
     if (nv1) {
         HVX_VectorPair yp = vy[i];
         HVX_Vector     x  = vx[i];
-        HVX_VectorPair xp = Q6_Wqf32_vmpy_VhfVhf(Q6_Vh_vshuff_Vh(x), Q6_Vh_vsplat_R(0x3C00));  // mul by 1.0
+        HVX_VectorPair xp = Q6_Wqf32_vmpy_VhfVhf(Q6_Vh_vshuff_Vh(x), zero);  // mul by 1.0
 
         HVX_Vector l_x;
         HVX_Vector l_y;

From 500c627fbc565ef25ce1c49b63293a9a11cae0ec Mon Sep 17 00:00:00 2001
From: chraac <chraac@gmail.com>
Date: Fri, 19 Dec 2025 21:26:40 +0800
Subject: [PATCH 05/10] refactoring: drop volatile in vec_dot_f16_f32

---
 ggml/src/ggml-hexagon/htp/matmul-ops.c | 27 ++++++++++++--------------
 1 file changed, 12 insertions(+), 15 deletions(-)

diff --git a/ggml/src/ggml-hexagon/htp/matmul-ops.c b/ggml/src/ggml-hexagon/htp/matmul-ops.c
index e2b40da18f..1eeee823c3 100644
--- a/ggml/src/ggml-hexagon/htp/matmul-ops.c
+++ b/ggml/src/ggml-hexagon/htp/matmul-ops.c
@@ -912,23 +912,20 @@ static void vec_dot_f16_f32(const int n, float * restrict s, const void * restri
     const HVX_UVector * restrict vx     = (const HVX_UVector * restrict) x;
     const HVX_UVectorPair * restrict vy = (const HVX_UVectorPair * restrict) y;
 
-    uint32_t nv0 = n / VLEN_FP16;  // num full fp16 hvx vectors
-    uint32_t nv1 = n % VLEN_FP16;  // leftover elements
+    uint32_t nv0 = n / VLEN_FP16;                    // num full fp16 hvx vectors
+    uint32_t nv1 = n % VLEN_FP16;                    // leftover elements
 
-    // for some reason we need volatile here so that the compiler doesn't try anything funky
-    const HVX_Vector    zero = Q6_Vh_vsplat_R(0x3C00);  // 1.0 in fp16
-    volatile HVX_Vector rsum = Q6_V_vsplat_R(0);
-    uint32_t            i    = 0;
+    const HVX_Vector zero = Q6_Vh_vsplat_R(0x3C00);  // 1.0 in fp16
+    HVX_Vector       rsum = Q6_V_vsplat_R(0);
+    uint32_t         i    = 0;
 
     for (i = 0; i < nv0; i++) {
         HVX_VectorPair yp = vy[i];
         HVX_Vector     x  = vx[i];
         HVX_VectorPair xp = Q6_Wqf32_vmpy_VhfVhf(Q6_Vh_vshuff_Vh(x), zero);  // mul by 1.0
 
-        //NOTE: need volatile here to prevent compiler optimization
-        // Seem compiler cannot guarantee read-after-write??
-        volatile HVX_Vector hi = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(Q6_V_hi_W(xp)), Q6_V_hi_W(yp));
-        volatile HVX_Vector lo = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(Q6_V_lo_W(xp)), Q6_V_lo_W(yp));
+        HVX_Vector hi = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(Q6_V_hi_W(xp)), Q6_V_hi_W(yp));
+        HVX_Vector lo = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(Q6_V_lo_W(xp)), Q6_V_lo_W(yp));
 
         HVX_Vector sum = Q6_Vqf32_vadd_Vqf32Vqf32(hi, lo);
         rsum           = Q6_Vqf32_vadd_Vqf32Vqf32(rsum, sum);
@@ -942,8 +939,8 @@ static void vec_dot_f16_f32(const int n, float * restrict s, const void * restri
         HVX_Vector l_x;
         HVX_Vector l_y;
         if (nv1 >= 32) {
-            volatile HVX_Vector hi = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(Q6_V_lo_W(xp)), Q6_V_lo_W(yp));
-            rsum                   = Q6_Vqf32_vadd_Vqf32Vqf32(rsum, hi);
+            HVX_Vector hi = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(Q6_V_lo_W(xp)), Q6_V_lo_W(yp));
+            rsum          = Q6_Vqf32_vadd_Vqf32Vqf32(rsum, hi);
             nv1 -= 32;
             l_x = Q6_V_hi_W(xp);
             l_y = Q6_V_hi_W(yp);
@@ -953,9 +950,9 @@ static void vec_dot_f16_f32(const int n, float * restrict s, const void * restri
         }
 
         if (nv1) {
-            volatile HVX_Vector lo  = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(l_x), l_y);
-            HVX_Vector          sum = Q6_V_valign_VVR(lo, Q6_V_vzero(), nv1 * sizeof(float));
-            rsum                    = Q6_Vqf32_vadd_Vqf32Vqf32(rsum, sum);
+            HVX_Vector lo  = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(l_x), l_y);
+            HVX_Vector sum = Q6_V_valign_VVR(lo, Q6_V_vzero(), nv1 * sizeof(float));
+            rsum           = Q6_Vqf32_vadd_Vqf32Vqf32(rsum, sum);
         }
 
         // hvx_vec_dump_fp16("X", x);

From 29171361b5ff11beba4a79b48ac1864b7e68db41 Mon Sep 17 00:00:00 2001
From: chraac <chraac@gmail.com>
Date: Sat, 20 Dec 2025 21:59:32 +0800
Subject: [PATCH 06/10] refactoring: use VLEN_FP32 in vec_dot_f16_f32

---
 ggml/src/ggml-hexagon/htp/matmul-ops.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/ggml/src/ggml-hexagon/htp/matmul-ops.c b/ggml/src/ggml-hexagon/htp/matmul-ops.c
index 1eeee823c3..6e2d46b112 100644
--- a/ggml/src/ggml-hexagon/htp/matmul-ops.c
+++ b/ggml/src/ggml-hexagon/htp/matmul-ops.c
@@ -938,10 +938,10 @@ static void vec_dot_f16_f32(const int n, float * restrict s, const void * restri
 
         HVX_Vector l_x;
         HVX_Vector l_y;
-        if (nv1 >= 32) {
+        if (nv1 >= VLEN_FP32) {
             HVX_Vector hi = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(Q6_V_lo_W(xp)), Q6_V_lo_W(yp));
             rsum          = Q6_Vqf32_vadd_Vqf32Vqf32(rsum, hi);
-            nv1 -= 32;
+            nv1 -= VLEN_FP32;
             l_x = Q6_V_hi_W(xp);
             l_y = Q6_V_hi_W(yp);
         } else {

From ea450209d05acc72f1adc61b4478803f234bfed2 Mon Sep 17 00:00:00 2001
From: chraac <chraac@gmail.com>
Date: Sat, 20 Dec 2025 23:06:02 +0800
Subject: [PATCH 07/10] opt: use qf32 internal precision for vec_dot_f16_f32

---
 ggml/src/ggml-hexagon/htp/matmul-ops.c | 38 ++++++++++++++------------
 1 file changed, 21 insertions(+), 17 deletions(-)

diff --git a/ggml/src/ggml-hexagon/htp/matmul-ops.c b/ggml/src/ggml-hexagon/htp/matmul-ops.c
index 6e2d46b112..bf0edd7fa9 100644
--- a/ggml/src/ggml-hexagon/htp/matmul-ops.c
+++ b/ggml/src/ggml-hexagon/htp/matmul-ops.c
@@ -915,17 +915,20 @@ static void vec_dot_f16_f32(const int n, float * restrict s, const void * restri
     uint32_t nv0 = n / VLEN_FP16;                    // num full fp16 hvx vectors
     uint32_t nv1 = n % VLEN_FP16;                    // leftover elements
 
-    const HVX_Vector zero = Q6_Vh_vsplat_R(0x3C00);  // 1.0 in fp16
+    const HVX_Vector one  = Q6_Vh_vsplat_R(0x3C00);  // 1.0 in fp16
+    const HVX_Vector zero = Q6_V_vsplat_R(0);       // 0.0 in fp16
     HVX_Vector       rsum = Q6_V_vsplat_R(0);
     uint32_t         i    = 0;
 
     for (i = 0; i < nv0; i++) {
-        HVX_VectorPair yp = vy[i];
-        HVX_Vector     x  = vx[i];
-        HVX_VectorPair xp = Q6_Wqf32_vmpy_VhfVhf(Q6_Vh_vshuff_Vh(x), zero);  // mul by 1.0
+        HVX_VectorPair yp   = vy[i];
+        HVX_Vector     x    = vx[i];
+        HVX_VectorPair xp   = Q6_Wqf32_vmpy_VhfVhf(Q6_Vh_vshuff_Vh(x), one);  // mul by 1.0
+        HVX_Vector     y_hi = Q6_Vqf32_vadd_VsfVsf(Q6_V_hi_W(yp), zero);      // convert to qf32
+        HVX_Vector     y_lo = Q6_Vqf32_vadd_VsfVsf(Q6_V_lo_W(yp), zero);      // convert to qf32
 
-        HVX_Vector hi = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(Q6_V_hi_W(xp)), Q6_V_hi_W(yp));
-        HVX_Vector lo = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(Q6_V_lo_W(xp)), Q6_V_lo_W(yp));
+        HVX_Vector hi = Q6_Vqf32_vmpy_Vqf32Vqf32(Q6_V_hi_W(xp), y_hi);
+        HVX_Vector lo = Q6_Vqf32_vmpy_Vqf32Vqf32(Q6_V_lo_W(xp), y_lo);
 
         HVX_Vector sum = Q6_Vqf32_vadd_Vqf32Vqf32(hi, lo);
         rsum           = Q6_Vqf32_vadd_Vqf32Vqf32(rsum, sum);
@@ -934,24 +937,25 @@ static void vec_dot_f16_f32(const int n, float * restrict s, const void * restri
     if (nv1) {
         HVX_VectorPair yp = vy[i];
         HVX_Vector     x  = vx[i];
-        HVX_VectorPair xp = Q6_Wqf32_vmpy_VhfVhf(Q6_Vh_vshuff_Vh(x), zero);  // mul by 1.0
+        HVX_VectorPair xp = Q6_Wqf32_vmpy_VhfVhf(Q6_Vh_vshuff_Vh(x), one);  // mul by 1.0
 
-        HVX_Vector l_x;
-        HVX_Vector l_y;
+        HVX_Vector leftover_x;
+        HVX_Vector leftover_y;
         if (nv1 >= VLEN_FP32) {
-            HVX_Vector hi = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(Q6_V_lo_W(xp)), Q6_V_lo_W(yp));
-            rsum          = Q6_Vqf32_vadd_Vqf32Vqf32(rsum, hi);
+            HVX_Vector y_lo = Q6_Vqf32_vadd_VsfVsf(Q6_V_lo_W(yp), zero);  // convert to qf32
+            HVX_Vector lo   = Q6_Vqf32_vmpy_Vqf32Vqf32(Q6_V_lo_W(xp), y_lo);
+            rsum            = Q6_Vqf32_vadd_Vqf32Vqf32(rsum, lo);
             nv1 -= VLEN_FP32;
-            l_x = Q6_V_hi_W(xp);
-            l_y = Q6_V_hi_W(yp);
+            leftover_x = Q6_V_hi_W(xp);
+            leftover_y = Q6_Vqf32_vadd_VsfVsf(Q6_V_hi_W(yp), zero);  // convert to qf32
         } else {
-            l_x = Q6_V_lo_W(xp);
-            l_y = Q6_V_lo_W(yp);
+            leftover_x = Q6_V_lo_W(xp);
+            leftover_y = Q6_Vqf32_vadd_VsfVsf(Q6_V_lo_W(yp), zero);  // convert to qf32
         }
 
         if (nv1) {
-            HVX_Vector lo  = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(l_x), l_y);
-            HVX_Vector sum = Q6_V_valign_VVR(lo, Q6_V_vzero(), nv1 * sizeof(float));
+            HVX_Vector lo  = Q6_Vqf32_vmpy_Vqf32Vqf32(leftover_x, leftover_y);
+            HVX_Vector sum = Q6_V_valign_VVR(lo, zero, nv1 * sizeof(float));
             rsum           = Q6_Vqf32_vadd_Vqf32Vqf32(rsum, sum);
         }
 

From cb0a8ff4e7dab37c118379543ee597c14452febc Mon Sep 17 00:00:00 2001
From: chraac <chraac@gmail.com>
Date: Mon, 22 Dec 2025 01:26:14 +0800
Subject: [PATCH 08/10] add unroll marker

---
 ggml/src/ggml-hexagon/htp/matmul-ops.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/ggml/src/ggml-hexagon/htp/matmul-ops.c b/ggml/src/ggml-hexagon/htp/matmul-ops.c
index bf0edd7fa9..5d7ec4d88d 100644
--- a/ggml/src/ggml-hexagon/htp/matmul-ops.c
+++ b/ggml/src/ggml-hexagon/htp/matmul-ops.c
@@ -920,6 +920,7 @@ static void vec_dot_f16_f32(const int n, float * restrict s, const void * restri
     HVX_Vector       rsum = Q6_V_vsplat_R(0);
     uint32_t         i    = 0;
 
+#pragma unroll(2)
     for (i = 0; i < nv0; i++) {
         HVX_VectorPair yp   = vy[i];
         HVX_Vector     x    = vx[i];

From 2058f28b3ea1be8f9ec836f490e49b841f09ec08 Mon Sep 17 00:00:00 2001
From: chraac <chraac@gmail.com>
Date: Mon, 22 Dec 2025 12:35:46 +0800
Subject: [PATCH 09/10] Revert "opt: use qf32 internal precision for
 vec_dot_f16_f32"

This reverts commit 8600ecd20d6c902fe16271d6af1e59504eff4a27.
---
 ggml/src/ggml-hexagon/htp/matmul-ops.c | 38 ++++++++++++--------------
 1 file changed, 17 insertions(+), 21 deletions(-)

diff --git a/ggml/src/ggml-hexagon/htp/matmul-ops.c b/ggml/src/ggml-hexagon/htp/matmul-ops.c
index 5d7ec4d88d..04794227ed 100644
--- a/ggml/src/ggml-hexagon/htp/matmul-ops.c
+++ b/ggml/src/ggml-hexagon/htp/matmul-ops.c
@@ -915,21 +915,18 @@ static void vec_dot_f16_f32(const int n, float * restrict s, const void * restri
     uint32_t nv0 = n / VLEN_FP16;                    // num full fp16 hvx vectors
     uint32_t nv1 = n % VLEN_FP16;                    // leftover elements
 
-    const HVX_Vector one  = Q6_Vh_vsplat_R(0x3C00);  // 1.0 in fp16
-    const HVX_Vector zero = Q6_V_vsplat_R(0);       // 0.0 in fp16
+    const HVX_Vector zero = Q6_Vh_vsplat_R(0x3C00);  // 1.0 in fp16
     HVX_Vector       rsum = Q6_V_vsplat_R(0);
     uint32_t         i    = 0;
 
 #pragma unroll(2)
     for (i = 0; i < nv0; i++) {
-        HVX_VectorPair yp   = vy[i];
-        HVX_Vector     x    = vx[i];
-        HVX_VectorPair xp   = Q6_Wqf32_vmpy_VhfVhf(Q6_Vh_vshuff_Vh(x), one);  // mul by 1.0
-        HVX_Vector     y_hi = Q6_Vqf32_vadd_VsfVsf(Q6_V_hi_W(yp), zero);      // convert to qf32
-        HVX_Vector     y_lo = Q6_Vqf32_vadd_VsfVsf(Q6_V_lo_W(yp), zero);      // convert to qf32
+        HVX_VectorPair yp = vy[i];
+        HVX_Vector     x  = vx[i];
+        HVX_VectorPair xp = Q6_Wqf32_vmpy_VhfVhf(Q6_Vh_vshuff_Vh(x), zero);  // mul by 1.0
 
-        HVX_Vector hi = Q6_Vqf32_vmpy_Vqf32Vqf32(Q6_V_hi_W(xp), y_hi);
-        HVX_Vector lo = Q6_Vqf32_vmpy_Vqf32Vqf32(Q6_V_lo_W(xp), y_lo);
+        HVX_Vector hi = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(Q6_V_hi_W(xp)), Q6_V_hi_W(yp));
+        HVX_Vector lo = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(Q6_V_lo_W(xp)), Q6_V_lo_W(yp));
 
         HVX_Vector sum = Q6_Vqf32_vadd_Vqf32Vqf32(hi, lo);
         rsum           = Q6_Vqf32_vadd_Vqf32Vqf32(rsum, sum);
@@ -938,25 +935,24 @@ static void vec_dot_f16_f32(const int n, float * restrict s, const void * restri
     if (nv1) {
         HVX_VectorPair yp = vy[i];
         HVX_Vector     x  = vx[i];
-        HVX_VectorPair xp = Q6_Wqf32_vmpy_VhfVhf(Q6_Vh_vshuff_Vh(x), one);  // mul by 1.0
+        HVX_VectorPair xp = Q6_Wqf32_vmpy_VhfVhf(Q6_Vh_vshuff_Vh(x), zero);  // mul by 1.0
 
-        HVX_Vector leftover_x;
-        HVX_Vector leftover_y;
+        HVX_Vector l_x;
+        HVX_Vector l_y;
         if (nv1 >= VLEN_FP32) {
-            HVX_Vector y_lo = Q6_Vqf32_vadd_VsfVsf(Q6_V_lo_W(yp), zero);  // convert to qf32
-            HVX_Vector lo   = Q6_Vqf32_vmpy_Vqf32Vqf32(Q6_V_lo_W(xp), y_lo);
-            rsum            = Q6_Vqf32_vadd_Vqf32Vqf32(rsum, lo);
+            HVX_Vector hi = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(Q6_V_lo_W(xp)), Q6_V_lo_W(yp));
+            rsum          = Q6_Vqf32_vadd_Vqf32Vqf32(rsum, hi);
             nv1 -= VLEN_FP32;
-            leftover_x = Q6_V_hi_W(xp);
-            leftover_y = Q6_Vqf32_vadd_VsfVsf(Q6_V_hi_W(yp), zero);  // convert to qf32
+            l_x = Q6_V_hi_W(xp);
+            l_y = Q6_V_hi_W(yp);
         } else {
-            leftover_x = Q6_V_lo_W(xp);
-            leftover_y = Q6_Vqf32_vadd_VsfVsf(Q6_V_lo_W(yp), zero);  // convert to qf32
+            l_x = Q6_V_lo_W(xp);
+            l_y = Q6_V_lo_W(yp);
         }
 
         if (nv1) {
-            HVX_Vector lo  = Q6_Vqf32_vmpy_Vqf32Vqf32(leftover_x, leftover_y);
-            HVX_Vector sum = Q6_V_valign_VVR(lo, zero, nv1 * sizeof(float));
+            HVX_Vector lo  = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(l_x), l_y);
+            HVX_Vector sum = Q6_V_valign_VVR(lo, Q6_V_vzero(), nv1 * sizeof(float));
             rsum           = Q6_Vqf32_vadd_Vqf32Vqf32(rsum, sum);
         }
 

From 5e3db77f6e9392091c80b21a0ec4b43544162637 Mon Sep 17 00:00:00 2001
From: Hongrui Chen <chraac@gmail.com>
Date: Thu, 25 Dec 2025 13:57:17 +0800
Subject: [PATCH 10/10] refactoring: rename fp16 1.0 constant to one

---
 ggml/src/ggml-hexagon/htp/matmul-ops.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/ggml/src/ggml-hexagon/htp/matmul-ops.c b/ggml/src/ggml-hexagon/htp/matmul-ops.c
index 04794227ed..7ad7a75f0d 100644
--- a/ggml/src/ggml-hexagon/htp/matmul-ops.c
+++ b/ggml/src/ggml-hexagon/htp/matmul-ops.c
@@ -915,7 +915,7 @@ static void vec_dot_f16_f32(const int n, float * restrict s, const void * restri
     uint32_t nv0 = n / VLEN_FP16;                    // num full fp16 hvx vectors
     uint32_t nv1 = n % VLEN_FP16;                    // leftover elements
 
-    const HVX_Vector zero = Q6_Vh_vsplat_R(0x3C00);  // 1.0 in fp16
+    const HVX_Vector one = Q6_Vh_vsplat_R(0x3C00);  // 1.0 in fp16
     HVX_Vector       rsum = Q6_V_vsplat_R(0);
     uint32_t         i    = 0;
 
@@ -923,7 +923,7 @@ static void vec_dot_f16_f32(const int n, float * restrict s, const void * restri
     for (i = 0; i < nv0; i++) {
         HVX_VectorPair yp = vy[i];
         HVX_Vector     x  = vx[i];
-        HVX_VectorPair xp = Q6_Wqf32_vmpy_VhfVhf(Q6_Vh_vshuff_Vh(x), zero);  // mul by 1.0
+        HVX_VectorPair xp = Q6_Wqf32_vmpy_VhfVhf(Q6_Vh_vshuff_Vh(x), one);  // mul by 1.0
 
         HVX_Vector hi = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(Q6_V_hi_W(xp)), Q6_V_hi_W(yp));
         HVX_Vector lo = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(Q6_V_lo_W(xp)), Q6_V_lo_W(yp));
@@ -935,7 +935,7 @@ static void vec_dot_f16_f32(const int n, float * restrict s, const void * restri
     if (nv1) {
         HVX_VectorPair yp = vy[i];
         HVX_Vector     x  = vx[i];
-        HVX_VectorPair xp = Q6_Wqf32_vmpy_VhfVhf(Q6_Vh_vshuff_Vh(x), zero);  // mul by 1.0
+        HVX_VectorPair xp = Q6_Wqf32_vmpy_VhfVhf(Q6_Vh_vshuff_Vh(x), one);  // mul by 1.0
 
         HVX_Vector l_x;
         HVX_Vector l_y;