From 83412e0b6a20733afc9fbe2ad4785420c1bd2239 Mon Sep 17 00:00:00 2001 From: shouyud Date: Wed, 10 Dec 2025 18:33:00 -0500 Subject: [PATCH 01/14] feat: inital support for gelu using sigmoid approximation --- ggml/src/ggml-hexagon/ggml-hexagon.cpp | 21 +++++- ggml/src/ggml-hexagon/htp/act-ops.c | 89 +++++++++++++++++++++++++- ggml/src/ggml-hexagon/htp/htp-msg.h | 11 ++-- ggml/src/ggml-hexagon/htp/main.c | 1 + 4 files changed, 113 insertions(+), 9 deletions(-) diff --git a/ggml/src/ggml-hexagon/ggml-hexagon.cpp b/ggml/src/ggml-hexagon/ggml-hexagon.cpp index 72a82a8911..c45b292a52 100644 --- a/ggml/src/ggml-hexagon/ggml-hexagon.cpp +++ b/ggml/src/ggml-hexagon/ggml-hexagon.cpp @@ -2164,8 +2164,14 @@ static bool ggml_hexagon_supported_activations(const struct ggml_hexagon_session } // src0, src1 & dst must be mapped to the same session - if (!hex_supported_buffer(sess, src0, src1, dst)) { - return false; + if(src1){ + if (!hex_supported_buffer(sess, src0, src1, dst)) { + return false; + } + }else{ + if (!hex_supported_buffer(sess, src0, dst)) { + return false; + } } return true; @@ -2665,6 +2671,10 @@ static void ggml_hexagon_unary(const struct ggml_tensor * op, uint32_t flags) { req.op = HTP_OP_UNARY_SILU; supported = true; } + else if (ggml_get_unary_op(dst) == GGML_UNARY_OP_GELU){ + req.op = HTP_OP_UNARY_GELU; + supported = true; + } break; case GGML_OP_GLU: @@ -2680,6 +2690,7 @@ static void ggml_hexagon_unary(const struct ggml_tensor * op, uint32_t flags) { case GGML_OP_SOFT_MAX: req.op = HTP_OP_SOFTMAX; supported = true; + break; default: break; @@ -2959,6 +2970,8 @@ static ggml_status ggml_backend_hexagon_graph_compute(ggml_backend_t backend, gg case GGML_OP_UNARY: if (ggml_get_unary_op(node) == GGML_UNARY_OP_SILU) { ggml_hexagon_unary(node, flags); + } else if (ggml_get_unary_op(node) == GGML_UNARY_OP_GELU) { + ggml_hexagon_unary(node, flags); } break; case GGML_OP_GLU: @@ -3257,7 +3270,6 @@ static bool ggml_backend_hexagon_device_supports_op(ggml_backend_dev_t dev, cons auto sess = static_cast(dev->context); bool supp = false; - switch (op->op) { case GGML_OP_NONE: case GGML_OP_RESHAPE: @@ -3297,6 +3309,9 @@ static bool ggml_backend_hexagon_device_supports_op(ggml_backend_dev_t dev, cons if (ggml_get_unary_op(op) == GGML_UNARY_OP_SILU) { supp = ggml_hexagon_supported_activations(sess, op); } + else if (ggml_get_unary_op(op) == GGML_UNARY_OP_GELU){ + supp = ggml_hexagon_supported_activations(sess, op); + } break; case GGML_OP_GLU: diff --git a/ggml/src/ggml-hexagon/htp/act-ops.c b/ggml/src/ggml-hexagon/htp/act-ops.c index 87b09cca3a..2db4a2a35b 100644 --- a/ggml/src/ggml-hexagon/htp/act-ops.c +++ b/ggml/src/ggml-hexagon/htp/act-ops.c @@ -255,6 +255,90 @@ static void glu_swiglu_oai_fp32_per_thread(const struct htp_tensor * src0, src1->ne[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1)); } + +static void unary_gelu_fp32_per_thread(const struct htp_tensor * src0, + struct htp_tensor * dst, + const int32_t * op_params, + struct htp_spad * src0_spad, + struct htp_spad * dst_spad, + uint32_t nth, + uint32_t ith, + uint32_t src0_nrows_per_thread) { + htp_act_preamble2; + + uint64_t t1, t2; + t1 = HAP_perf_get_qtimer_count(); + + const size_t src0_row_size = nb01; + const size_t dst_row_size = nb1; + + const uint32_t src0_nrows = ne01 * ne02 * ne03; + + const uint32_t src0_start_row = src0_nrows_per_thread * ith; + const uint32_t src0_end_row = MIN(src0_start_row + src0_nrows_per_thread, src0_nrows); + + // no work for this thread + if 
(src0_start_row >= src0_end_row) { + return; + } + + int is_aligned = 1; + int opt_path = 0; + if (!htp_is_aligned((void *) src0->data, VLEN) || !htp_is_aligned((void *) dst->data, VLEN)) { + is_aligned = 0; + FARF(HIGH, "silu-f32: unaligned addresses in elementwise op, possibly slower execution\n"); + } + if ((1 == is_aligned) && !(nb01 & (VLEN - 1))) { + opt_path = 1; + } + + const uint8_t * restrict data_src0 = (const uint8_t *) src0->data; + uint8_t * restrict data_dst = (uint8_t *) dst->data; + + uint8_t * restrict src0_spad_data = src0_spad->data + (ith * src0_row_size); + uint8_t * restrict dst_spad_data = dst_spad->data + (ith * dst_row_size); + + for (uint32_t ir = src0_start_row; ir < src0_end_row; ir++) { + const float * restrict src0 = (float *) (data_src0 + (ir * src0_row_size)); + float * restrict dst = (float *) (data_dst + (ir * dst_row_size)); + + if (ir + 1 < src0_end_row) { + htp_l2fetch(src0 + src0_row_size, 1, src0_row_size, src0_row_size); + } + + + // gelu = 0.5 * x * (1.0 + tanh( sqrt(2/pi) * (x + 0.044715 * x^3) )) // gelu_tanh + // gelu = x * sigmoid(1.702 * x) // current implementation + if (1 == opt_path) { + hvx_mul_scalar_f32( (const uint8_t *) src0, (float)1.702, (uint8_t *) src0_spad_data, ne0); + hvx_fast_sigmoid_f32((const uint8_t *) src0_spad_data, (uint8_t *) src0_spad_data, ne0); + + hvx_mul_f32_opt((const uint8_t *) src0, src0_spad_data, (uint8_t *) dst, ne0); + } else { + hvx_mul_scalar_f32( (const uint8_t *) src0, (float)1.702, (uint8_t *) src0_spad_data, ne0); + // sigmoid + hvx_exp_f32((const uint8_t *) src0_spad_data, src0_spad_data, ne0, true); + hvx_add_scalar_f32(src0_spad_data, 1.0, dst_spad_data, ne0); + hvx_inverse_f32(dst_spad_data, src0_spad_data, ne0); + + hvx_mul_f32((const uint8_t *) src0, src0_spad_data, (uint8_t *) dst, ne0); + } + } + + t2 = HAP_perf_get_qtimer_count(); + + FARF(HIGH, "gelu-f32 %d/%d/%d: %ux%ux%ux%u (%u:%u) -> %ux%ux%ux%u usec %u\n", ith, nth, opt_path, ne00, ne01, ne02, + ne03, src0_start_row, src0_end_row, ne0, ne1, ne2, ne3, (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1)); +} + +static void unary_gelu_fp32(unsigned int n, unsigned int i, void * data) { + struct htp_ops_context * octx = (struct htp_ops_context *) data; + unary_gelu_fp32_per_thread(&octx->src0, &octx->dst, octx->op_params, &octx->src0_spad, &octx->dst_spad, n, i, + octx->src0_nrows_per_thread); +} + + + static void unary_silu_fp32_per_thread(const struct htp_tensor * src0, struct htp_tensor * dst, const int32_t * op_params, @@ -371,7 +455,10 @@ static int execute_op_activations_fp32(struct htp_ops_context * octx) { act_op_func = glu_swiglu_oai_fp32; op_type = "swiglu-oai-f32"; break; - + case HTP_OP_UNARY_GELU: + act_op_func = unary_gelu_fp32; + op_type = "gelu-f32"; + break; default: FARF(ERROR, "Unsupported activations Op %u\n", octx->op); return HTP_STATUS_NO_SUPPORT; diff --git a/ggml/src/ggml-hexagon/htp/htp-msg.h b/ggml/src/ggml-hexagon/htp/htp-msg.h index 9278f41f4e..a61652304a 100644 --- a/ggml/src/ggml-hexagon/htp/htp-msg.h +++ b/ggml/src/ggml-hexagon/htp/htp-msg.h @@ -51,11 +51,12 @@ enum htp_op { HTP_OP_MUL_MAT_ID = 5, HTP_OP_RMS_NORM = 6, HTP_OP_UNARY_SILU = 7, - HTP_OP_GLU_SWIGLU = 8, - HTP_OP_GLU_SWIGLU_OAI = 9, - HTP_OP_SOFTMAX = 10, - HTP_OP_ADD_ID = 11, - HTP_OP_ROPE = 12, + HTP_OP_UNARY_GELU = 8, + HTP_OP_GLU_SWIGLU = 9, + HTP_OP_GLU_SWIGLU_OAI = 10, + HTP_OP_SOFTMAX = 11, + HTP_OP_ADD_ID = 12, + HTP_OP_ROPE = 13, INVALID }; diff --git a/ggml/src/ggml-hexagon/htp/main.c b/ggml/src/ggml-hexagon/htp/main.c index 
b60b352a7b..e30ae69502 100644 --- a/ggml/src/ggml-hexagon/htp/main.c +++ b/ggml/src/ggml-hexagon/htp/main.c @@ -798,6 +798,7 @@ static void htp_packet_callback(dspqueue_t queue, int error, void * context) { break; case HTP_OP_UNARY_SILU: + case HTP_OP_UNARY_GELU: if (n_bufs != 2) { FARF(ERROR, "Bad act-req buffer list"); continue; From 2a787a61d11f9e63e5943a2e6d134b2f0c402ace Mon Sep 17 00:00:00 2001 From: shouyud Date: Thu, 11 Dec 2025 16:15:46 -0500 Subject: [PATCH 02/14] snapshot: faster gelu using polynomial approximation --- ggml/src/ggml-hexagon/htp/CMakeLists.txt | 1 + ggml/src/ggml-hexagon/htp/act-ops.c | 16 +- ggml/src/ggml-hexagon/htp/hvx-exp.c | 4 +- ggml/src/ggml-hexagon/htp/hvx-utils.c | 36 +- .../src/ggml-hexagon/htp/qhcg_approximation.c | 518 ++++++++++++++++++ .../src/ggml-hexagon/htp/qhcg_approximation.h | 21 + ggml/src/ggml-hexagon/htp/qhcg_internal.h | 91 +++ 7 files changed, 666 insertions(+), 21 deletions(-) create mode 100644 ggml/src/ggml-hexagon/htp/qhcg_approximation.c create mode 100644 ggml/src/ggml-hexagon/htp/qhcg_approximation.h create mode 100644 ggml/src/ggml-hexagon/htp/qhcg_internal.h diff --git a/ggml/src/ggml-hexagon/htp/CMakeLists.txt b/ggml/src/ggml-hexagon/htp/CMakeLists.txt index 22e3fea11d..fa350be19e 100644 --- a/ggml/src/ggml-hexagon/htp/CMakeLists.txt +++ b/ggml/src/ggml-hexagon/htp/CMakeLists.txt @@ -28,6 +28,7 @@ add_library(${HTP_LIB} SHARED softmax-ops.c act-ops.c rope-ops.c + qhcg_approximation.c ) target_compile_definitions(${HTP_LIB} PRIVATE diff --git a/ggml/src/ggml-hexagon/htp/act-ops.c b/ggml/src/ggml-hexagon/htp/act-ops.c index 2db4a2a35b..b0d4d1a477 100644 --- a/ggml/src/ggml-hexagon/htp/act-ops.c +++ b/ggml/src/ggml-hexagon/htp/act-ops.c @@ -24,6 +24,9 @@ #include "hvx-utils.h" #include "ops-utils.h" + +#include "qhcg_approximation.h" + #define htp_act_preamble3 \ const uint32_t ne00 = src0->ne[0]; \ const uint32_t ne01 = src0->ne[1]; \ @@ -306,7 +309,7 @@ static void unary_gelu_fp32_per_thread(const struct htp_tensor * src0, htp_l2fetch(src0 + src0_row_size, 1, src0_row_size, src0_row_size); } - + #if 0 // gelu = 0.5 * x * (1.0 + tanh( sqrt(2/pi) * (x + 0.044715 * x^3) )) // gelu_tanh // gelu = x * sigmoid(1.702 * x) // current implementation if (1 == opt_path) { @@ -323,6 +326,17 @@ static void unary_gelu_fp32_per_thread(const struct htp_tensor * src0, hvx_mul_f32((const uint8_t *) src0, src0_spad_data, (uint8_t *) dst, ne0); } + #else + + // alternative method + float low_bound = -6.0f; + float up_bound = 6.0f; + + qhcg_approximation( (float*)src0, (float*)dst, ne0, low_bound, up_bound ); + + + #endif + } t2 = HAP_perf_get_qtimer_count(); diff --git a/ggml/src/ggml-hexagon/htp/hvx-exp.c b/ggml/src/ggml-hexagon/htp/hvx-exp.c index 21bf46a542..1f5e9e476c 100644 --- a/ggml/src/ggml-hexagon/htp/hvx-exp.c +++ b/ggml/src/ggml-hexagon/htp/hvx-exp.c @@ -31,13 +31,13 @@ void hvx_exp_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int int unaligned_addr = 0; int unaligned_loop = 0; if ((0 == htp_is_aligned((void *) src, VLEN)) || (0 == htp_is_aligned((void *) dst, VLEN))) { - FARF(HIGH, "hvx_exp_f32: unaligned address in hvx op, possibly slower execution\n"); + //FARF(HIGH, "hvx_exp_f32: unaligned address in hvx op, possibly slower execution\n"); unaligned_addr = 1; } // assert((0 == unaligned_addr) || (0 == num_elems_whole)); if ((1 == unaligned_addr) && (num_elems_whole != 0)) { unaligned_loop = 1; - FARF(HIGH, "hvx_exp_f32: unaligned loop in hvx op, possibly slower execution\n"); + //FARF(HIGH, "hvx_exp_f32: unaligned 
loop in hvx op, possibly slower execution\n"); } HVX_Vector vec_out = Q6_V_vzero(); diff --git a/ggml/src/ggml-hexagon/htp/hvx-utils.c b/ggml/src/ggml-hexagon/htp/hvx-utils.c index e02b1d9099..4597423eb9 100644 --- a/ggml/src/ggml-hexagon/htp/hvx-utils.c +++ b/ggml/src/ggml-hexagon/htp/hvx-utils.c @@ -40,13 +40,13 @@ void hvx_mul_f32(const uint8_t * restrict src0, int unaligned_loop = 0; if ((0 == htp_is_aligned((void *) src0, VLEN)) || (0 == htp_is_aligned((void *) src1, VLEN)) || (0 == htp_is_aligned((void *) dst, VLEN))) { - FARF(HIGH, "hvx_mul_f32: unaligned address in hvx op, possibly slower execution\n"); + //FARF(HIGH, "hvx_mul_f32: unaligned address in hvx op, possibly slower execution\n"); unaligned_addr = 1; } if ((1 == unaligned_addr) && (num_elems_whole != 0)) { unaligned_loop = 1; - FARF(HIGH, "hvx_mul_f32: unaligned loop in hvx op, possibly slower execution\n"); + //FARF(HIGH, "hvx_mul_f32: unaligned loop in hvx op, possibly slower execution\n"); } if (0 == unaligned_loop) { @@ -252,13 +252,13 @@ void hvx_add_f32(const uint8_t * restrict src0, int unaligned_loop = 0; if ((0 == htp_is_aligned((void *) src0, VLEN)) || (0 == htp_is_aligned((void *) src1, VLEN)) || (0 == htp_is_aligned((void *) dst, VLEN))) { - FARF(HIGH, "hvx_add_f32: unaligned address in hvx op, possibly slower execution\n"); + //FARF(HIGH, "hvx_add_f32: unaligned address in hvx op, possibly slower execution\n"); unaligned_addr = 1; } if ((1 == unaligned_addr) && (num_elems_whole != 0)) { unaligned_loop = 1; - FARF(HIGH, "hvx_add_f32: unaligned loop in hvx op, possibly slower execution\n"); + //FARF(HIGH, "hvx_add_f32: unaligned loop in hvx op, possibly slower execution\n"); } if (0 == unaligned_loop) { @@ -392,13 +392,13 @@ void hvx_add_scalar_f32(const uint8_t * restrict src, const float val, uint8_t * int unaligned_addr = 0; int unaligned_loop = 0; if ((0 == htp_is_aligned((void *) src, VLEN)) || (0 == htp_is_aligned((void *) dst, VLEN))) { - FARF(HIGH, "hvx_add_scalar_f32: unaligned address in hvx op, possibly slower execution\n"); + //FARF(HIGH, "hvx_add_scalar_f32: unaligned address in hvx op, possibly slower execution\n"); unaligned_addr = 1; } if ((1 == unaligned_addr) && (num_elems_whole != 0)) { unaligned_loop = 1; - FARF(HIGH, "hvx_add_scalar_f32: unaligned loop in hvx op, possibly slower execution\n"); + //FARF(HIGH, "hvx_add_scalar_f32: unaligned loop in hvx op, possibly slower execution\n"); } static const float kInf = INFINITY; @@ -454,13 +454,13 @@ void hvx_mul_scalar_f32(const uint8_t * restrict src, const float val, uint8_t * int unaligned_addr = 0; int unaligned_loop = 0; if ((0 == htp_is_aligned((void *) src, VLEN)) || (0 == htp_is_aligned((void *) dst, VLEN))) { - FARF(HIGH, "hvx_mul_scalar_f32: unaligned address in hvx op, possibly slower execution\n"); + //FARF(HIGH, "hvx_mul_scalar_f32: unaligned address in hvx op, possibly slower execution\n"); unaligned_addr = 1; } if ((1 == unaligned_addr) && (num_elems_whole != 0)) { unaligned_loop = 1; - FARF(HIGH, "hvx_mul_scalar_f32: unaligned loop in hvx op, possibly slower execution\n"); + //FARF(HIGH, "hvx_mul_scalar_f32: unaligned loop in hvx op, possibly slower execution\n"); } HVX_Vector val_vec = hvx_vec_splat_fp32(val); @@ -507,13 +507,13 @@ void hvx_sub_f32(const uint8_t * restrict src0, int unaligned_loop = 0; if ((0 == htp_is_aligned((void *) src0, VLEN)) || (0 == htp_is_aligned((void *) src1, VLEN)) || (0 == htp_is_aligned((void *) dst, VLEN))) { - FARF(HIGH, "hvx_sub_f32: unaligned address in hvx op, possibly slower execution\n"); + 
//FARF(HIGH, "hvx_sub_f32: unaligned address in hvx op, possibly slower execution\n"); unaligned_addr = 1; } if ((1 == unaligned_addr) && (num_elems_whole != 0)) { unaligned_loop = 1; - FARF(HIGH, "hvx_sub_f32: unaligned loop in hvx op, possibly slower execution\n"); + //FARF(HIGH, "hvx_sub_f32: unaligned loop in hvx op, possibly slower execution\n"); } if (0 == unaligned_loop) { @@ -647,13 +647,13 @@ void hvx_sub_scalar_f32(const uint8_t * restrict src, const float val, uint8_t * int unaligned_addr = 0; int unaligned_loop = 0; if ((0 == htp_is_aligned((void *) src, VLEN)) || (0 == htp_is_aligned((void *) dst, VLEN))) { - FARF(HIGH, "hvx_sub_scalar_f32: unaligned address in hvx op, possibly slower execution\n"); + //FARF(HIGH, "hvx_sub_scalar_f32: unaligned address in hvx op, possibly slower execution\n"); unaligned_addr = 1; } if ((1 == unaligned_addr) && (num_elems_whole != 0)) { unaligned_loop = 1; - FARF(HIGH, "hvx_sub_scalar_f32: unaligned loop in hvx op, possibly slower execution\n"); + //FARF(HIGH, "hvx_sub_scalar_f32: unaligned loop in hvx op, possibly slower execution\n"); } HVX_Vector val_vec = hvx_vec_splat_fp32(val); @@ -733,13 +733,13 @@ float hvx_self_sum_f32(const uint8_t * restrict src, const int num_elems) { int unaligned_addr = 0; int unaligned_loop = 0; if (0 == htp_is_aligned((void *) src, VLEN)) { - FARF(HIGH, "hvx_self_sum_f32: unaligned address in hvx op, possibly slower execution\n"); + //FARF(HIGH, "hvx_self_sum_f32: unaligned address in hvx op, possibly slower execution\n"); unaligned_addr = 1; } if ((1 == unaligned_addr) && (num_elems_whole != 0)) { unaligned_loop = 1; - FARF(HIGH, "hvx_self_sum_f32: unaligned loop in hvx op, possibly slower execution\n"); + //FARF(HIGH, "hvx_self_sum_f32: unaligned loop in hvx op, possibly slower execution\n"); } HVX_Vector sum_vec = Q6_V_vsplat_R(0x00000000); @@ -782,13 +782,13 @@ void hvx_scale_f32(const uint8_t * restrict src, uint8_t * restrict dst, const i int unaligned_addr = 0; int unaligned_loop = 0; if ((0 == htp_is_aligned((void *) src, VLEN)) || (0 == htp_is_aligned((void *) dst, VLEN))) { - FARF(HIGH, "hvx_scale_f32: unaligned address in hvx op, possibly slower execution\n"); + //FARF(HIGH, "hvx_scale_f32: unaligned address in hvx op, possibly slower execution\n"); unaligned_addr = 1; } if ((1 == unaligned_addr) && (num_elems_whole != 0)) { unaligned_loop = 1; - FARF(HIGH, "hvx_scale_f32: unaligned loop in hvx op, possibly slower execution\n"); + //FARF(HIGH, "hvx_scale_f32: unaligned loop in hvx op, possibly slower execution\n"); } HVX_Vector scale_vec = hvx_vec_splat_fp32(scale); @@ -831,13 +831,13 @@ float hvx_self_max_f32(const uint8_t * restrict src, const int num_elems) { int unaligned_addr = 0; int unaligned_loop = 0; if (0 == htp_is_aligned((void *) src, VLEN)) { - FARF(HIGH, "hvx_self_max_f32: unaligned address in hvx op, possibly slower execution\n"); + //FARF(HIGH, "hvx_self_max_f32: unaligned address in hvx op, possibly slower execution\n"); unaligned_addr = 1; } if ((1 == unaligned_addr) && (num_elems_whole != 0)) { unaligned_loop = 1; - FARF(HIGH, "hvx_self_max_f32: unaligned loop in hvx op, possibly slower execution\n"); + //FARF(HIGH, "hvx_self_max_f32: unaligned loop in hvx op, possibly slower execution\n"); } HVX_Vector vec_max = hvx_vec_splat_fp32(((const float *) src)[0]); diff --git a/ggml/src/ggml-hexagon/htp/qhcg_approximation.c b/ggml/src/ggml-hexagon/htp/qhcg_approximation.c new file mode 100644 index 0000000000..5068c5fa34 --- /dev/null +++ b/ggml/src/ggml-hexagon/htp/qhcg_approximation.c @@ 
-0,0 +1,518 @@ +/**============================================================================= +@file + qhcg_approximation.c + +@brief + Calculate polynomial approximation of the function below in + floating-point arithmetic using HVX instructions. + + Function: gelu(x) + + Function is approximated in specified input range from -6.0 to 6.0, + where inputs and outputs are arrays of 32-bit float values. + + Approximation is performed using the following method: + + 1) Input range is split into 16 equidistant segments + 2) For each segment, Numpy's polynomial package is used to find the best + polynomial approximation of order N with the corresponding C0, C1, ..., Cn. + 3) VLUT instructions are used to select appropriate coefficients for each input sample + 4) Horner's method is used to compute polynomial values: + f(x) = ((((Cn*x + Cn-1)*x + Cn-2)*x + ...)*x + C1)*x + C0 + +Copyright (c) 2020 Qualcomm Technologies Incorporated. +All Rights Reserved. Qualcomm Proprietary and Confidential. +=============================================================================**/ + +#if __HVX_ARCH__ >= 68 + +#include "qhcg_approximation.h" +#include "qhcg_internal.h" + +#define BLOCK_SIZE (8*1024/128) /* vector chunks */ +#define L2FETCH_AHEAD (BLOCK_SIZE) + +/* Polynomial coefficients */ +static const float c0_coeffs[32] __attribute__((aligned(VLEN))) = +{ + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, +-0.1025868397073178,-1.1184356646199394,-1.9705895994321767,0.11469604839384463,0.40991447569341943,0.00424292239610935,-0.0017846707638177889,4.125901398310816e-09, +9.718309490480692e-11,-0.0015488336803479719,0.001064556481209511,0.3906162486717146,0.19084584900320978,-1.911422745140333,-1.1879384314707315,-0.10823562636002611, +}; +static const float c1_coeffs[32] __attribute__((aligned(VLEN))) = +{ + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, +-0.1234196807250312,-1.5042580469229814,-2.7701977888429816,1.1561921948215528,1.73891533063333,0.49580124294548433,0.4867587290479026,0.500000435462697, +0.4999997919981341,0.5116842338641109,0.5163606020356294,-0.6867154811454343,-0.31551789326265844,3.6694157536939014,2.6042137343731855,1.1304321895807614, +}; +static const float c2_coeffs[32] __attribute__((aligned(VLEN))) = +{ + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, +-0.06367500510012546,-0.8689061926460069,-1.674005553705795,1.5013658408230053,1.9798213609930566,0.33731544915026324,0.35673778512915555,0.398953295788538, +0.3989496120997857,0.3611051680040998,0.31742994078248077,1.9193992198306873,1.6441036493618186,-1.600477678714911,-0.9304878890577859,-0.06740463140212431, +}; +static const float c3_coeffs[32] __attribute__((aligned(VLEN))) = +{ + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, +-0.01826132753437031,-0.27938965958246625,-0.56347555462781,0.8662803078586866,1.0748504969694623,-0.13840364789720844,-0.07490683960610874,0.00011501805987770841, +-8.89610380930177e-05,0.06815977365648013,0.1564140217086786,-1.036053072449464,-0.9372597866516783,0.5336910940777527,0.3004584208315817,0.019362956684359556, +}; +static const float c4_coeffs[32] __attribute__((aligned(VLEN))) = +{ + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, +-0.003143995202268695,-0.05400134785573118,-0.11405541169720136,0.2730279204613817,0.3235877725936627,-0.21757221119731435,-0.14645680741997966,-0.06620698974306806, 
+-0.06630082288474698,-0.14025595963442758,-0.22733077791076023,0.30866276792496655,0.29418673249390104,-0.10682071320119783,-0.05832443091378418,-0.003339162830362702, +}; +static const float c5_coeffs[32] __attribute__((aligned(VLEN))) = +{ + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, +-0.0003249391771954532,-0.006273424264889337,-0.01387698764918236,0.04913223606829913,0.05545197372332761,-0.09031312961799431,-0.04973154630108704,0.001825035162087615, +-0.0016453620553813022,0.046340833813673266,0.09347637717015225,-0.0520121796723486,-0.0529133082948728,0.012823220702040979,0.00680542921713579,0.00034567793526667706, +}; +static const float c6_coeffs[32] __attribute__((aligned(VLEN))) = +{ + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, +-1.866614032852734e-05,-0.0004055443985444202,-0.0009392734765841164,0.004770459893478637,0.00504881095326808,-0.016904688419800747,-0.0049839929986692675,0.013321931926939602, +0.01314779303860721,-0.003962384692377502,-0.017472688915232914,0.004609031329816739,0.005145502689303376,-0.0008540539868357813,-0.0004419008815610675,-1.989001488248583e-05, +}; +static const float c7_coeffs[32] __attribute__((aligned(VLEN))) = +{ + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, +-4.5974928158662533e-07,-1.1252676844375516e-05,-2.7270249210246306e-05,0.0001949129249064408,0.00018774479051508836,-0.001238293494672944,0.000199977799551612,0.0029325624122584024, +-0.0028654073893250895,-0.00033083897498689484,0.0012818786224478341,-0.00016368340082111905,-0.0002108418119120288,2.431836142521883e-05,1.2317036094266618e-05,4.906925630164402e-07, +}; + +/** + * @brief Polynomial approximation of gelu(x) function. + * @param[in] input Input array of elements in IEEE 32-bit floating-point format. + * @param[out] output Output array of elements in IEEE 32-bit floating-point format. + * @param[in] length Number of elements in input/output arrays. + * @return Returns 0 on successful execution. Otherwise -1. 
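[Editor's aside, not part of the patch] As a scalar reference for the routine below: the segment index is recovered from the float's bit pattern (after mapping the input into [16, 32) the exponent is constant, so the top four mantissa bits equal floor(x') - 16), the per-segment order-7 polynomial is then evaluated with Horner's method, and outputs are clamped to 0 below the lower bound and passed through unchanged above the upper bound, as the vmux code below does. The helper names gelu_poly_ref and segment_index_ref are illustrative only; the coefficient tables c0_coeffs..c7_coeffs are the ones defined above, with entries 16..31 holding the 16 segments.

#include <stdint.h>
#include <string.h>

static uint32_t segment_index_ref(float x, float lo, float hi)
{
    /* map [lo, hi) -> [0, 16), then shift to [16, 32) so the exponent is fixed
       (the HVX code slightly reduces this scale to avoid an out-of-range index) */
    float t = (x - lo) * (16.0f / (hi - lo)) + 16.0f;
    uint32_t bits;
    memcpy(&bits, &t, sizeof(bits));
    /* for a float in [16, 32) the top 4 mantissa bits are floor(t) - 16 */
    return (bits >> 19) & 0xF;
}

static float gelu_poly_ref(float x, float lo, float hi)
{
    if (x < lo) return 0.0f;  /* below the fitted range: gelu(x) ~ 0 */
    if (x > hi) return x;     /* above the fitted range: gelu(x) ~ x */

    const float * C[8] = { c0_coeffs, c1_coeffs, c2_coeffs, c3_coeffs,
                           c4_coeffs, c5_coeffs, c6_coeffs, c7_coeffs };
    uint32_t seg = 16 + segment_index_ref(x, lo, hi);

    /* Horner: f(x) = ((((C7*x + C6)*x + C5)*x + ...)*x + C1)*x + C0 */
    float y = C[7][seg];
    for (int k = 6; k >= 0; --k) {
        y = y * x + C[k][seg];
    }
    return y;
}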
+ */ +int32_t qhcg_approximation(float *restrict input, float *restrict output, uint32_t size, + float limit_left, float limit_right) + +{ + HVX_Vector *input_v_ptr; + HVX_UVector *output_v_ptr; + HVX_Vector input_min_v_f; + HVX_Vector input_max_v_f; + + HVX_Vector input_shifted_v_qf32; + HVX_Vector input_scaled_v_qf32; + HVX_Vector scale_v; + HVX_Vector input_v_qf32; + HVX_Vector const16_0_v_sf; + HVX_Vector zero_v_sf; + HVX_Vector mask_idx1_v, mask_idx2_v; + HVX_Vector tmp_v, idx1_v, idx2_v; + HVX_Vector output_v; + HVX_Vector slinep; + HVX_Vector slinec; + HVX_Vector sline; + HVX_Vector sline_tmp; + HVX_Vector sout; + int32_t block, l2fetch_block; + int32_t leftover = size & 31; + int32_t vectors_in_rounddown = size / 32; + int32_t leftover_size = leftover * sizeof(float); + HVX_DV c0_coeff_dv; + HVX_VectorPair c0_coeff_vp; + HVX_Vector c0_coeff_v; + HVX_DV c1_coeff_dv; + HVX_VectorPair c1_coeff_vp; + HVX_Vector c1_coeff_v; + HVX_DV c2_coeff_dv; + HVX_VectorPair c2_coeff_vp; + HVX_Vector c2_coeff_v; + HVX_DV c3_coeff_dv; + HVX_VectorPair c3_coeff_vp; + HVX_Vector c3_coeff_v; + HVX_DV c4_coeff_dv; + HVX_VectorPair c4_coeff_vp; + HVX_Vector c4_coeff_v; + HVX_DV c5_coeff_dv; + HVX_VectorPair c5_coeff_vp; + HVX_Vector c5_coeff_v; + HVX_DV c6_coeff_dv; + HVX_VectorPair c6_coeff_vp; + HVX_Vector c6_coeff_v; + HVX_DV c7_coeff_dv; + HVX_VectorPair c7_coeff_vp; + HVX_Vector c7_coeff_v; + + HVX_Vector zero_vec = Q6_V_vsplat_R(0x00000000); + + /* Check input arguments. Return error status if some argument has invalid value */ + if ((input == 0) || (output == 0) || (size == 0)) + { + return -1; + } + + input_v_ptr = (HVX_Vector *) input; + output_v_ptr = (HVX_UVector *) output; + + /* + * If input data is not aligned to HVX vector size, compose aligned vectors + * from data loaded in slinep and slinec + */ + slinep = *input_v_ptr++; + + /* + * Splat scale factor in order to be used later for finding indexes of coefficients. + * Scale factor is represented in IEEE 16-bit floating-point format and it is + * calculated using the following formula: + * scale_factor = (16.0 / (b0 - a0)) + * NOTE: Calculated value is slightly decreased in order to avoid out of bound + * indexes during VLUT lookup. + */ + scale_v = Q6_V_vsplat_R(0x3faaaaa9); + + /* + * Vector of zeroes used as neutral element in sf to qf32 conversions. + * NOTE: Some of conversions (i.e conversion of scale factor and coefficients) + * can be avoided in real-time, but this is not done in order to don't + * sacrify code readibility in expense of insignificant performance improvement. + */ + zero_v_sf = Q6_V_vzero(); + + /* Mask for extracting only 4 bits of mantissa */ + mask_idx1_v = Q6_V_vsplat_R(0x0000000F); + mask_idx2_v = Q6_V_vsplat_R(0x00000010); + + /* 16.0 in IEEE 16-bit floating-point representation */ + const16_0_v_sf = Q6_V_vsplat_R(0x41800000); + + /* + * Prepare vector of input_min values, that is used later in shifting input range. + * input_min is low boundary of specified input range. + */ + int32_t input_min_bits = *((int32_t *) &limit_left); + int32_t input_max_bits = *((int32_t *) &limit_right); + + input_min_v_f = Q6_V_vsplat_R(input_min_bits); + input_max_v_f = Q6_V_vsplat_R(input_max_bits); + + /* Convert scale factor from sf to q32. 
Use the same vector for both formats */ + scale_v = Q6_Vqf32_vadd_VsfVsf(scale_v, zero_v_sf); + + /* Load coefficients */ + c0_coeff_v = *((HVX_Vector *)(c0_coeffs)); + c1_coeff_v = *((HVX_Vector *)(c1_coeffs)); + c2_coeff_v = *((HVX_Vector *)(c2_coeffs)); + c3_coeff_v = *((HVX_Vector *)(c3_coeffs)); + c4_coeff_v = *((HVX_Vector *)(c4_coeffs)); + c5_coeff_v = *((HVX_Vector *)(c5_coeffs)); + c6_coeff_v = *((HVX_Vector *)(c6_coeffs)); + c7_coeff_v = *((HVX_Vector *)(c7_coeffs)); + + /* Convert coefficients from sf to qf32 format. Use the same vector for both representations */ + c0_coeff_v = Q6_Vqf32_vadd_VsfVsf(c0_coeff_v, zero_v_sf); + c1_coeff_v = Q6_Vqf32_vadd_VsfVsf(c1_coeff_v, zero_v_sf); + c2_coeff_v = Q6_Vqf32_vadd_VsfVsf(c2_coeff_v, zero_v_sf); + c3_coeff_v = Q6_Vqf32_vadd_VsfVsf(c3_coeff_v, zero_v_sf); + c4_coeff_v = Q6_Vqf32_vadd_VsfVsf(c4_coeff_v, zero_v_sf); + c5_coeff_v = Q6_Vqf32_vadd_VsfVsf(c5_coeff_v, zero_v_sf); + c6_coeff_v = Q6_Vqf32_vadd_VsfVsf(c6_coeff_v, zero_v_sf); + c7_coeff_v = Q6_Vqf32_vadd_VsfVsf(c7_coeff_v, zero_v_sf); + + /* Split 32-bit coefficients to lower and upper part in order to obtain them later with VLUT16. */ + c0_coeff_dv.VV = Q6_Wuw_vzxt_Vuh(c0_coeff_v); + c1_coeff_dv.VV = Q6_Wuw_vzxt_Vuh(c1_coeff_v); + c2_coeff_dv.VV = Q6_Wuw_vzxt_Vuh(c2_coeff_v); + c3_coeff_dv.VV = Q6_Wuw_vzxt_Vuh(c3_coeff_v); + c4_coeff_dv.VV = Q6_Wuw_vzxt_Vuh(c4_coeff_v); + c5_coeff_dv.VV = Q6_Wuw_vzxt_Vuh(c5_coeff_v); + c6_coeff_dv.VV = Q6_Wuw_vzxt_Vuh(c6_coeff_v); + c7_coeff_dv.VV = Q6_Wuw_vzxt_Vuh(c7_coeff_v); + + /* + * Handle number of whole vectors in input data. + * Don't process last vector in order to avoid out-of-boundary load. + */ + for (int32_t i = vectors_in_rounddown - 1; i > 0; i -= BLOCK_SIZE) + { + block = Q6_R_min_RR(i, BLOCK_SIZE); + l2fetch_block = Q6_R_min_RR(i - L2FETCH_AHEAD, BLOCK_SIZE); + + if (l2fetch_block > 0) + { + l2fetch(input_v_ptr + L2FETCH_AHEAD, 128, 128, l2fetch_block, 0); + } + + /* Process one vector at a time */ + for (int32_t j = 0; j < block; ++j) + { + slinec = *input_v_ptr++; + + /* Compose vector of input data from slinec and slinep */ + sline = Q6_V_valign_VVR(slinec, slinep, (size_t) input); + sline_tmp = sline; + /* Shift input range from [input_min, input_max] to [0, input_max - input_min] */ + input_shifted_v_qf32 = Q6_Vqf32_vsub_VsfVsf(sline, input_min_v_f); + + /* + * Scale shifted input range from [0, input_max - input_min] to [0,16.0) + * in order to get corresponding coefficient indexes + */ + input_scaled_v_qf32 = Q6_Vqf32_vmpy_Vqf32Vqf32(input_shifted_v_qf32, scale_v); + + /* + * VLUT 16 requires integer indexes. Shift scaled input range from [0,16.0) + * to [16.0,32.0) in order to convert float indexes to integer values. + * Float values, represented in IEEE 754, in range [16.0,32.0] have the + * same exponent, which means 4 MSB of mantissa carry information about + * integer index. 
+ */ + input_scaled_v_qf32 = Q6_Vqf32_vadd_Vqf32Vsf(input_scaled_v_qf32, const16_0_v_sf); + + /* Convert back from qf32 to sf in order to extract integer index */ + tmp_v = Q6_Vsf_equals_Vqf32(input_scaled_v_qf32); + + /* Only 4 MSB bits of mantissa represent segment index */ + idx1_v = Q6_Vuw_vlsr_VuwR(tmp_v, 19); + + idx1_v = Q6_V_vand_VV(idx1_v, mask_idx1_v); + idx1_v = Q6_V_vor_VV(idx1_v, mask_idx2_v); + idx2_v = Q6_Vw_vasl_VwR(idx1_v, 16); + + /* Obtain the polynomial coefficients from lookup table */ + c0_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c0_coeff_dv.VV), 1); + c0_coeff_vp = Q6_Wh_vlut16or_WhVbVhR(c0_coeff_vp, idx2_v, Q6_V_hi_W(c0_coeff_dv.VV), 1); + c1_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c1_coeff_dv.VV), 1); + c1_coeff_vp = Q6_Wh_vlut16or_WhVbVhR(c1_coeff_vp, idx2_v, Q6_V_hi_W(c1_coeff_dv.VV), 1); + c2_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c2_coeff_dv.VV), 1); + c2_coeff_vp = Q6_Wh_vlut16or_WhVbVhR(c2_coeff_vp, idx2_v, Q6_V_hi_W(c2_coeff_dv.VV), 1); + c3_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c3_coeff_dv.VV), 1); + c3_coeff_vp = Q6_Wh_vlut16or_WhVbVhR(c3_coeff_vp, idx2_v, Q6_V_hi_W(c3_coeff_dv.VV), 1); + c4_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c4_coeff_dv.VV), 1); + c4_coeff_vp = Q6_Wh_vlut16or_WhVbVhR(c4_coeff_vp, idx2_v, Q6_V_hi_W(c4_coeff_dv.VV), 1); + c5_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c5_coeff_dv.VV), 1); + c5_coeff_vp = Q6_Wh_vlut16or_WhVbVhR(c5_coeff_vp, idx2_v, Q6_V_hi_W(c5_coeff_dv.VV), 1); + c6_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c6_coeff_dv.VV), 1); + c6_coeff_vp = Q6_Wh_vlut16or_WhVbVhR(c6_coeff_vp, idx2_v, Q6_V_hi_W(c6_coeff_dv.VV), 1); + c7_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c7_coeff_dv.VV), 1); + c7_coeff_vp = Q6_Wh_vlut16or_WhVbVhR(c7_coeff_vp, idx2_v, Q6_V_hi_W(c7_coeff_dv.VV), 1); + + /* Convert input from sf vector to qf32 vector for Horner's method*/ + input_v_qf32 = Q6_Vqf32_vadd_VsfVsf(sline, zero_v_sf); + + /* Perform evaluation of polynomial using Horner's method */ + output_v = Q6_Vqf32_vmpy_Vqf32Vqf32(Q6_V_lo_W(c7_coeff_vp), input_v_qf32); + output_v = Q6_Vqf32_vadd_Vqf32Vqf32(output_v, Q6_V_lo_W(c6_coeff_vp)); + output_v = Q6_Vqf32_vmpy_Vqf32Vqf32(output_v, input_v_qf32); + output_v = Q6_Vqf32_vadd_Vqf32Vqf32(output_v, Q6_V_lo_W(c5_coeff_vp)); + output_v = Q6_Vqf32_vmpy_Vqf32Vqf32(output_v, input_v_qf32); + output_v = Q6_Vqf32_vadd_Vqf32Vqf32(output_v, Q6_V_lo_W(c4_coeff_vp)); + output_v = Q6_Vqf32_vmpy_Vqf32Vqf32(output_v, input_v_qf32); + output_v = Q6_Vqf32_vadd_Vqf32Vqf32(output_v, Q6_V_lo_W(c3_coeff_vp)); + output_v = Q6_Vqf32_vmpy_Vqf32Vqf32(output_v, input_v_qf32); + output_v = Q6_Vqf32_vadd_Vqf32Vqf32(output_v, Q6_V_lo_W(c2_coeff_vp)); + output_v = Q6_Vqf32_vmpy_Vqf32Vqf32(output_v, input_v_qf32); + output_v = Q6_Vqf32_vadd_Vqf32Vqf32(output_v, Q6_V_lo_W(c1_coeff_vp)); + output_v = Q6_Vqf32_vmpy_Vqf32Vqf32(output_v, input_v_qf32); + output_v = Q6_Vqf32_vadd_Vqf32Vqf32(output_v, Q6_V_lo_W(c0_coeff_vp)); + + // /* Store results to the output buffer and convert from qf32 to sf */ + // *((HVX_UVector *)(output_v_ptr++)) = Q6_Vsf_equals_Vqf32(output_v); + + + /* Convert from qf32 to sf, store output and go to handle leftover */ + HVX_Vector output_v_f32 = Q6_Vsf_equals_Vqf32(output_v); + HVX_VectorPred pred_cap_left = Q6_Q_vcmp_gt_VsfVsf(input_min_v_f, sline_tmp); // 1 if input_min_v_f > sline_tmp + output_v_f32 = Q6_V_vmux_QVV(pred_cap_left, zero_vec, output_v_f32); // if sline_tmp> input_min_v_f, set to zero + + HVX_VectorPred pred_cap_right = 
Q6_Q_vcmp_gt_VsfVsf(sline_tmp, input_max_v_f); // 1 if sline_tmp > input_max_v_f + output_v_f32 = Q6_V_vmux_QVV(pred_cap_right, sline_tmp, output_v_f32); // if sline_tmp> input_max_v_f, set to whatever the sline_tmp was + + *((HVX_UVector *)(output_v_ptr++)) = output_v_f32; + + + /* Prepare slinep for next iteration */ + slinep = slinec; + } + } + + /* Handle last whole vector from input data */ + if (vectors_in_rounddown > 0) + { + slinec = is_aligned(input_v_ptr, 128) && leftover == 0 ? slinep : *input_v_ptr++; + sline = Q6_V_valign_VVR(slinec, slinep, (size_t) input); + sline_tmp = sline; + + /* Shift input range from [input_min, input_max] to [0, input_max - input_min] */ + input_shifted_v_qf32 = Q6_Vqf32_vsub_VsfVsf(sline, input_min_v_f); + + /* Scale shifted input range from [0, input_max - input_min] to [0,16.0) */ + input_scaled_v_qf32 = Q6_Vqf32_vmpy_Vqf32Vqf32(input_shifted_v_qf32, scale_v); + + /* + * VLUT 16 requires integer indexes. Shift scaled input range from [0,16.0) + * to [16.0,32.0) in order to convert float indexes to integer values. + * Float values, represented in IEEE 754, in range [16.0,32.0] have the + * same exponent, which means 4 MSB of mantissa carry information about + * integer index. + */ + input_scaled_v_qf32 = Q6_Vqf32_vadd_Vqf32Vsf(input_scaled_v_qf32, const16_0_v_sf); + + /* Convert back from qf32 to sf in order to extract integer index */ + tmp_v = Q6_Vsf_equals_Vqf32(input_scaled_v_qf32); + + /* Only 4 MSB bits of mantissa represent segment index */ + idx1_v = Q6_Vuw_vlsr_VuwR(tmp_v, 19); + + /* Ensure only 4 MSB bits of mantissa are used as indexes */ + idx1_v = Q6_V_vand_VV(idx1_v, mask_idx1_v); + idx1_v = Q6_V_vor_VV(idx1_v, mask_idx2_v); + idx2_v = Q6_Vw_vasl_VwR(idx1_v, 16); + + /* Obtain the polynomial coefficients from lookup table */ + c0_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c0_coeff_dv.VV), 1); + c0_coeff_vp = Q6_Wh_vlut16or_WhVbVhR(c0_coeff_vp, idx2_v, Q6_V_hi_W(c0_coeff_dv.VV), 1); + c1_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c1_coeff_dv.VV), 1); + c1_coeff_vp = Q6_Wh_vlut16or_WhVbVhR(c1_coeff_vp, idx2_v, Q6_V_hi_W(c1_coeff_dv.VV), 1); + c2_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c2_coeff_dv.VV), 1); + c2_coeff_vp = Q6_Wh_vlut16or_WhVbVhR(c2_coeff_vp, idx2_v, Q6_V_hi_W(c2_coeff_dv.VV), 1); + c3_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c3_coeff_dv.VV), 1); + c3_coeff_vp = Q6_Wh_vlut16or_WhVbVhR(c3_coeff_vp, idx2_v, Q6_V_hi_W(c3_coeff_dv.VV), 1); + c4_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c4_coeff_dv.VV), 1); + c4_coeff_vp = Q6_Wh_vlut16or_WhVbVhR(c4_coeff_vp, idx2_v, Q6_V_hi_W(c4_coeff_dv.VV), 1); + c5_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c5_coeff_dv.VV), 1); + c5_coeff_vp = Q6_Wh_vlut16or_WhVbVhR(c5_coeff_vp, idx2_v, Q6_V_hi_W(c5_coeff_dv.VV), 1); + c6_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c6_coeff_dv.VV), 1); + c6_coeff_vp = Q6_Wh_vlut16or_WhVbVhR(c6_coeff_vp, idx2_v, Q6_V_hi_W(c6_coeff_dv.VV), 1); + c7_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c7_coeff_dv.VV), 1); + c7_coeff_vp = Q6_Wh_vlut16or_WhVbVhR(c7_coeff_vp, idx2_v, Q6_V_hi_W(c7_coeff_dv.VV), 1); + + /* Convert input from sf vector to qf32 vector for Horner's method*/ + input_v_qf32 = Q6_Vqf32_vadd_VsfVsf(sline, zero_v_sf); + + /* Perform evaluation of polynomial using Horner's method */ + output_v = Q6_Vqf32_vmpy_Vqf32Vqf32(Q6_V_lo_W(c7_coeff_vp), input_v_qf32); + output_v = Q6_Vqf32_vadd_Vqf32Vqf32(output_v, Q6_V_lo_W(c6_coeff_vp)); + output_v = Q6_Vqf32_vmpy_Vqf32Vqf32(output_v, input_v_qf32); + output_v = 
Q6_Vqf32_vadd_Vqf32Vqf32(output_v, Q6_V_lo_W(c5_coeff_vp)); + output_v = Q6_Vqf32_vmpy_Vqf32Vqf32(output_v, input_v_qf32); + output_v = Q6_Vqf32_vadd_Vqf32Vqf32(output_v, Q6_V_lo_W(c4_coeff_vp)); + output_v = Q6_Vqf32_vmpy_Vqf32Vqf32(output_v, input_v_qf32); + output_v = Q6_Vqf32_vadd_Vqf32Vqf32(output_v, Q6_V_lo_W(c3_coeff_vp)); + output_v = Q6_Vqf32_vmpy_Vqf32Vqf32(output_v, input_v_qf32); + output_v = Q6_Vqf32_vadd_Vqf32Vqf32(output_v, Q6_V_lo_W(c2_coeff_vp)); + output_v = Q6_Vqf32_vmpy_Vqf32Vqf32(output_v, input_v_qf32); + output_v = Q6_Vqf32_vadd_Vqf32Vqf32(output_v, Q6_V_lo_W(c1_coeff_vp)); + output_v = Q6_Vqf32_vmpy_Vqf32Vqf32(output_v, input_v_qf32); + output_v = Q6_Vqf32_vadd_Vqf32Vqf32(output_v, Q6_V_lo_W(c0_coeff_vp)); + + /* Convert from qf32 to sf, store output and go to handle leftover */ + HVX_Vector output_v_f32 = Q6_Vsf_equals_Vqf32(output_v); + HVX_VectorPred pred_cap_left = Q6_Q_vcmp_gt_VsfVsf(input_min_v_f, sline_tmp); // 1 if input_min_v_f > sline_tmp + output_v_f32 = Q6_V_vmux_QVV(pred_cap_left, zero_vec, output_v_f32); // if sline_tmp> input_min_v_f, set to zero + + HVX_VectorPred pred_cap_right = Q6_Q_vcmp_gt_VsfVsf(sline_tmp, input_max_v_f); // 1 if sline_tmp > input_max_v_f + output_v_f32 = Q6_V_vmux_QVV(pred_cap_right, sline_tmp, output_v_f32); // if sline_tmp> input_max_v_f, set to whatever the sline_tmp was + + *((HVX_UVector *)(output_v_ptr++)) = output_v_f32; + + slinep = slinec; + } + + /* Handle leftover elements */ + if (leftover > 0) + { + slinec = (is_in_one_chunk(input_v_ptr, leftover_size, 128) + ? slinep + : *input_v_ptr++); + + sline = Q6_V_valign_VVR(slinec, slinep, (size_t) input); + sline_tmp = sline; + + /* Shift input range from [input_min, input_max] to [0, input_max - input_min] */ + input_shifted_v_qf32 = Q6_Vqf32_vsub_VsfVsf(sline, input_min_v_f); + + /* Scale shifted input range from [0, input_max - input_min] to [0,16.0) */ + input_scaled_v_qf32 = Q6_Vqf32_vmpy_Vqf32Vqf32(input_shifted_v_qf32, scale_v); + + /* + * VLUT 16 requires integer indexes. Shift scaled input range from [0,16.0) + * to [16.0,32.0) in order to convert float indexes to integer values. + * Float values, represented in IEEE 754, in range [16.0,32.0] have the + * same exponent, which means 4 MSB of mantissa carry information about + * integer index. 
+ */ + input_scaled_v_qf32 = Q6_Vqf32_vadd_Vqf32Vsf(input_scaled_v_qf32, const16_0_v_sf); + + /* Convert back from qf32 to sf in order to extract integer index */ + tmp_v = Q6_Vsf_equals_Vqf32(input_scaled_v_qf32); + + /* Only 4 MSB bits of mantissa represent segment index */ + idx1_v = Q6_Vuw_vlsr_VuwR(tmp_v, 19); + + /* Ensure only 4 MSB bits of mantissa are used as indexes */ + idx1_v = Q6_V_vand_VV(idx1_v, mask_idx1_v); + idx1_v = Q6_V_vor_VV(idx1_v, mask_idx2_v); + idx2_v = Q6_Vw_vasl_VwR(idx1_v, 16); + + /* Obtain the polynomial coefficients from lookup table */ + c0_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c0_coeff_dv.VV), 1); + c0_coeff_vp = Q6_Wh_vlut16or_WhVbVhR(c0_coeff_vp, idx2_v, Q6_V_hi_W(c0_coeff_dv.VV), 1); + c1_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c1_coeff_dv.VV), 1); + c1_coeff_vp = Q6_Wh_vlut16or_WhVbVhR(c1_coeff_vp, idx2_v, Q6_V_hi_W(c1_coeff_dv.VV), 1); + c2_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c2_coeff_dv.VV), 1); + c2_coeff_vp = Q6_Wh_vlut16or_WhVbVhR(c2_coeff_vp, idx2_v, Q6_V_hi_W(c2_coeff_dv.VV), 1); + c3_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c3_coeff_dv.VV), 1); + c3_coeff_vp = Q6_Wh_vlut16or_WhVbVhR(c3_coeff_vp, idx2_v, Q6_V_hi_W(c3_coeff_dv.VV), 1); + c4_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c4_coeff_dv.VV), 1); + c4_coeff_vp = Q6_Wh_vlut16or_WhVbVhR(c4_coeff_vp, idx2_v, Q6_V_hi_W(c4_coeff_dv.VV), 1); + c5_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c5_coeff_dv.VV), 1); + c5_coeff_vp = Q6_Wh_vlut16or_WhVbVhR(c5_coeff_vp, idx2_v, Q6_V_hi_W(c5_coeff_dv.VV), 1); + c6_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c6_coeff_dv.VV), 1); + c6_coeff_vp = Q6_Wh_vlut16or_WhVbVhR(c6_coeff_vp, idx2_v, Q6_V_hi_W(c6_coeff_dv.VV), 1); + c7_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c7_coeff_dv.VV), 1); + c7_coeff_vp = Q6_Wh_vlut16or_WhVbVhR(c7_coeff_vp, idx2_v, Q6_V_hi_W(c7_coeff_dv.VV), 1); + + /* Convert input from sf vector to qf32 vector for Horner's method*/ + input_v_qf32 = Q6_Vqf32_vadd_VsfVsf(sline, zero_v_sf); + + /* Perform evaluation of polynomial using Horner's method */ + output_v = Q6_Vqf32_vmpy_Vqf32Vqf32(Q6_V_lo_W(c7_coeff_vp), input_v_qf32); + output_v = Q6_Vqf32_vadd_Vqf32Vqf32(output_v, Q6_V_lo_W(c6_coeff_vp)); + output_v = Q6_Vqf32_vmpy_Vqf32Vqf32(output_v, input_v_qf32); + output_v = Q6_Vqf32_vadd_Vqf32Vqf32(output_v, Q6_V_lo_W(c5_coeff_vp)); + output_v = Q6_Vqf32_vmpy_Vqf32Vqf32(output_v, input_v_qf32); + output_v = Q6_Vqf32_vadd_Vqf32Vqf32(output_v, Q6_V_lo_W(c4_coeff_vp)); + output_v = Q6_Vqf32_vmpy_Vqf32Vqf32(output_v, input_v_qf32); + output_v = Q6_Vqf32_vadd_Vqf32Vqf32(output_v, Q6_V_lo_W(c3_coeff_vp)); + output_v = Q6_Vqf32_vmpy_Vqf32Vqf32(output_v, input_v_qf32); + output_v = Q6_Vqf32_vadd_Vqf32Vqf32(output_v, Q6_V_lo_W(c2_coeff_vp)); + output_v = Q6_Vqf32_vmpy_Vqf32Vqf32(output_v, input_v_qf32); + output_v = Q6_Vqf32_vadd_Vqf32Vqf32(output_v, Q6_V_lo_W(c1_coeff_vp)); + output_v = Q6_Vqf32_vmpy_Vqf32Vqf32(output_v, input_v_qf32); + output_v = Q6_Vqf32_vadd_Vqf32Vqf32(output_v, Q6_V_lo_W(c0_coeff_vp)); + + /* Convert from qf32 to sf */ + // sout = Q6_Vsf_equals_Vqf32(output_v); + HVX_Vector output_v_f32 = Q6_Vsf_equals_Vqf32(output_v); + HVX_VectorPred pred_cap_left = Q6_Q_vcmp_gt_VsfVsf(input_min_v_f, sline_tmp); // 1 if input_min_v_f > sline_tmp + output_v_f32 = Q6_V_vmux_QVV(pred_cap_left, zero_vec, output_v_f32); // if sline_tmp> input_min_v_f, set to zero + + + HVX_VectorPred pred_cap_right = Q6_Q_vcmp_gt_VsfVsf(sline_tmp, input_max_v_f); // 1 if sline_tmp > input_max_v_f + output_v_f32 
= Q6_V_vmux_QVV(pred_cap_right, sline_tmp, output_v_f32); // if sline_tmp> input_max_v_f, set to whatever the sline_tmp was + + sout = output_v_f32; + /* Store output */ + vstu_variable(output_v_ptr, leftover_size, sout); + } + + return 0; +} + +#endif /* __HVX_ARCH__ >= 68 */ diff --git a/ggml/src/ggml-hexagon/htp/qhcg_approximation.h b/ggml/src/ggml-hexagon/htp/qhcg_approximation.h new file mode 100644 index 0000000000..6f70e209ff --- /dev/null +++ b/ggml/src/ggml-hexagon/htp/qhcg_approximation.h @@ -0,0 +1,21 @@ +/**============================================================================= +@file + qhcg_approximation.h + +@brief + Header file of polynomial approximation generated by QHCG + +Copyright (c) 2020 Qualcomm Technologies Incorporated. +All Rights Reserved. Qualcomm Proprietary and Confidential. +=============================================================================**/ + +#ifndef __qhcg_approximation__ +#define __qhcg_approximation__ + +#include + +int32_t qhcg_approximation(float *inputs, float *outputs, uint32_t length, + float limit_left, float limit_right +); + +#endif /* __qhcg_approximation__ */ diff --git a/ggml/src/ggml-hexagon/htp/qhcg_internal.h b/ggml/src/ggml-hexagon/htp/qhcg_internal.h new file mode 100644 index 0000000000..618610dc88 --- /dev/null +++ b/ggml/src/ggml-hexagon/htp/qhcg_internal.h @@ -0,0 +1,91 @@ +/**============================================================================= +@file + hvx_internal.h + +@brief + Header file for HVX routines. + +Copyright (c) 2020 Qualcomm Technologies Incorporated. +All Rights Reserved. Qualcomm Proprietary and Confidential. +=============================================================================**/ + +#ifndef _HVX_INTERNAL_H +#define _HVX_INTERNAL_H + +#include // size_t +#include + +#define HVX_INLINE_ALWAYS inline __attribute__((unused,always_inline)) + +#ifndef LOG2VLEN +#define LOG2VLEN 7 +#endif +#define VLEN (1<>1 // HVX vector - number of int16_t elements +#define VLEN_WORD (1<>2 // HVX vector - number of int32_t elements + +typedef union +{ + HVX_VectorPair VV; + struct + { + HVX_Vector lo; + HVX_Vector hi; + } V; +} HVX_DV; + +static HVX_INLINE_ALWAYS void l2fetch(const void *p, uint32_t stride, + uint32_t width, uint32_t height, + uint32_t dir) +{ + uint64_t control = HEXAGON_V64_CREATE_H(dir, stride, width, height); + __asm__ __volatile__ (" l2fetch(%0,%1) " : :"r"(p),"r"(control)); +} + +/* Return whether address is aligned. */ + +static HVX_INLINE_ALWAYS int32_t is_aligned(void *addr, uint32_t align) +{ + return ((size_t) addr & (align - 1)) == 0; +} + +/* Return whether 'n' elements from vector are in the one chunk of 'chunk_size'. */ + +static HVX_INLINE_ALWAYS int32_t is_in_one_chunk(void *addr, uint32_t n, + uint32_t chunk_size) +{ + uint32_t left_off = (size_t) addr & (chunk_size - 1); + uint32_t right_off = left_off + n; + return right_off <= chunk_size; +} + +/* + * This function stores the first n bytes from vector vin to address 'addr'. + * n must be in range 1..128 and addr may have any alignment. Does one or + * two masked stores. + */ + +static HVX_INLINE_ALWAYS void vstu_variable(void *addr, uint32_t n, + HVX_Vector vin) +{ + /* Rotate as needed. 
*/ + vin = Q6_V_vlalign_VVR(vin, vin, (size_t) addr); + + uint32_t left_off = (size_t) addr & 127; + uint32_t right_off = left_off + n; + + HVX_VectorPred ql_not = Q6_Q_vsetq_R((size_t) addr); + HVX_VectorPred qr = Q6_Q_vsetq2_R(right_off); + + if (right_off > 128) + { + Q6_vmem_QRIV(qr, (HVX_Vector*) addr + 1, vin); + /* all 1's */ + qr = Q6_Q_vcmp_eq_VbVb(vin, vin); + } + + ql_not = Q6_Q_or_QQn(ql_not, qr); + Q6_vmem_QnRIV(ql_not, (HVX_Vector*) addr, vin); +} + +#endif /* _HVX_INTERNAL_H */ From 72339994d45b2bed887e79994403c378d90b62b5 Mon Sep 17 00:00:00 2001 From: shouyud Date: Thu, 11 Dec 2025 16:59:06 -0500 Subject: [PATCH 03/14] test: disable l2-block prefetch in polynomail approximation --- ggml/src/ggml-hexagon/htp/qhcg_approximation.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/ggml/src/ggml-hexagon/htp/qhcg_approximation.c b/ggml/src/ggml-hexagon/htp/qhcg_approximation.c index 5068c5fa34..c2ecfff4fe 100644 --- a/ggml/src/ggml-hexagon/htp/qhcg_approximation.c +++ b/ggml/src/ggml-hexagon/htp/qhcg_approximation.c @@ -120,7 +120,7 @@ int32_t qhcg_approximation(float *restrict input, float *restrict output, uint32 HVX_Vector sline; HVX_Vector sline_tmp; HVX_Vector sout; - int32_t block, l2fetch_block; + int32_t block; //l2fetch_block; int32_t leftover = size & 31; int32_t vectors_in_rounddown = size / 32; int32_t leftover_size = leftover * sizeof(float); @@ -241,12 +241,12 @@ int32_t qhcg_approximation(float *restrict input, float *restrict output, uint32 for (int32_t i = vectors_in_rounddown - 1; i > 0; i -= BLOCK_SIZE) { block = Q6_R_min_RR(i, BLOCK_SIZE); - l2fetch_block = Q6_R_min_RR(i - L2FETCH_AHEAD, BLOCK_SIZE); + //l2fetch_block = Q6_R_min_RR(i - L2FETCH_AHEAD, BLOCK_SIZE); - if (l2fetch_block > 0) - { - l2fetch(input_v_ptr + L2FETCH_AHEAD, 128, 128, l2fetch_block, 0); - } + // if (l2fetch_block > 0) + // { + // l2fetch(input_v_ptr + L2FETCH_AHEAD, 128, 128, l2fetch_block, 0); + // } /* Process one vector at a time */ for (int32_t j = 0; j < block; ++j) From 470b499130e46e7aa3acc7022240775278cbcf1f Mon Sep 17 00:00:00 2001 From: shouyud Date: Fri, 12 Dec 2025 10:03:51 -0500 Subject: [PATCH 04/14] Revert "test: disable l2-block prefetch in polynomail approximation" This reverts commit 72339994d45b2bed887e79994403c378d90b62b5. 
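[Editor's aside, not part of the patch] This revert re-enables the l2 prefetch in the polynomial routine; the next one removes the polynomial experiment entirely and falls back to the sigmoid-based GELU path from the first patch. For reference, a scalar sketch of the two approximations named in the act-ops.c comments; the function names here are illustrative only:

#include <math.h>

/* restored path: gelu(x) ~= x * sigmoid(1.702 * x) */
static inline float gelu_sigmoid_ref(float x)
{
    return x / (1.0f + expf(-1.702f * x));
}

/* tanh form quoted in the comment, for comparison:
   gelu(x) ~= 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3))) */
static inline float gelu_tanh_ref(float x)
{
    const float k = 0.7978845608f; /* sqrt(2/pi) */
    return 0.5f * x * (1.0f + tanhf(k * (x + 0.044715f * x * x * x)));
}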
--- ggml/src/ggml-hexagon/htp/qhcg_approximation.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/ggml/src/ggml-hexagon/htp/qhcg_approximation.c b/ggml/src/ggml-hexagon/htp/qhcg_approximation.c index c2ecfff4fe..5068c5fa34 100644 --- a/ggml/src/ggml-hexagon/htp/qhcg_approximation.c +++ b/ggml/src/ggml-hexagon/htp/qhcg_approximation.c @@ -120,7 +120,7 @@ int32_t qhcg_approximation(float *restrict input, float *restrict output, uint32 HVX_Vector sline; HVX_Vector sline_tmp; HVX_Vector sout; - int32_t block; //l2fetch_block; + int32_t block, l2fetch_block; int32_t leftover = size & 31; int32_t vectors_in_rounddown = size / 32; int32_t leftover_size = leftover * sizeof(float); @@ -241,12 +241,12 @@ int32_t qhcg_approximation(float *restrict input, float *restrict output, uint32 for (int32_t i = vectors_in_rounddown - 1; i > 0; i -= BLOCK_SIZE) { block = Q6_R_min_RR(i, BLOCK_SIZE); - //l2fetch_block = Q6_R_min_RR(i - L2FETCH_AHEAD, BLOCK_SIZE); + l2fetch_block = Q6_R_min_RR(i - L2FETCH_AHEAD, BLOCK_SIZE); - // if (l2fetch_block > 0) - // { - // l2fetch(input_v_ptr + L2FETCH_AHEAD, 128, 128, l2fetch_block, 0); - // } + if (l2fetch_block > 0) + { + l2fetch(input_v_ptr + L2FETCH_AHEAD, 128, 128, l2fetch_block, 0); + } /* Process one vector at a time */ for (int32_t j = 0; j < block; ++j) From 999492fe9b1149df30df57ce91ad08bd8b96c0c0 Mon Sep 17 00:00:00 2001 From: shouyud Date: Fri, 12 Dec 2025 10:04:05 -0500 Subject: [PATCH 05/14] Revert "snapshot: faster gelu using polynomial approximation" This reverts commit 2a787a61d11f9e63e5943a2e6d134b2f0c402ace. --- ggml/src/ggml-hexagon/htp/CMakeLists.txt | 1 - ggml/src/ggml-hexagon/htp/act-ops.c | 16 +- ggml/src/ggml-hexagon/htp/hvx-exp.c | 4 +- ggml/src/ggml-hexagon/htp/hvx-utils.c | 36 +- .../src/ggml-hexagon/htp/qhcg_approximation.c | 518 ------------------ .../src/ggml-hexagon/htp/qhcg_approximation.h | 21 - ggml/src/ggml-hexagon/htp/qhcg_internal.h | 91 --- 7 files changed, 21 insertions(+), 666 deletions(-) delete mode 100644 ggml/src/ggml-hexagon/htp/qhcg_approximation.c delete mode 100644 ggml/src/ggml-hexagon/htp/qhcg_approximation.h delete mode 100644 ggml/src/ggml-hexagon/htp/qhcg_internal.h diff --git a/ggml/src/ggml-hexagon/htp/CMakeLists.txt b/ggml/src/ggml-hexagon/htp/CMakeLists.txt index fa350be19e..22e3fea11d 100644 --- a/ggml/src/ggml-hexagon/htp/CMakeLists.txt +++ b/ggml/src/ggml-hexagon/htp/CMakeLists.txt @@ -28,7 +28,6 @@ add_library(${HTP_LIB} SHARED softmax-ops.c act-ops.c rope-ops.c - qhcg_approximation.c ) target_compile_definitions(${HTP_LIB} PRIVATE diff --git a/ggml/src/ggml-hexagon/htp/act-ops.c b/ggml/src/ggml-hexagon/htp/act-ops.c index b0d4d1a477..2db4a2a35b 100644 --- a/ggml/src/ggml-hexagon/htp/act-ops.c +++ b/ggml/src/ggml-hexagon/htp/act-ops.c @@ -24,9 +24,6 @@ #include "hvx-utils.h" #include "ops-utils.h" - -#include "qhcg_approximation.h" - #define htp_act_preamble3 \ const uint32_t ne00 = src0->ne[0]; \ const uint32_t ne01 = src0->ne[1]; \ @@ -309,7 +306,7 @@ static void unary_gelu_fp32_per_thread(const struct htp_tensor * src0, htp_l2fetch(src0 + src0_row_size, 1, src0_row_size, src0_row_size); } - #if 0 + // gelu = 0.5 * x * (1.0 + tanh( sqrt(2/pi) * (x + 0.044715 * x^3) )) // gelu_tanh // gelu = x * sigmoid(1.702 * x) // current implementation if (1 == opt_path) { @@ -326,17 +323,6 @@ static void unary_gelu_fp32_per_thread(const struct htp_tensor * src0, hvx_mul_f32((const uint8_t *) src0, src0_spad_data, (uint8_t *) dst, ne0); } - #else - - // alternative method - float low_bound = 
-6.0f; - float up_bound = 6.0f; - - qhcg_approximation( (float*)src0, (float*)dst, ne0, low_bound, up_bound ); - - - #endif - } t2 = HAP_perf_get_qtimer_count(); diff --git a/ggml/src/ggml-hexagon/htp/hvx-exp.c b/ggml/src/ggml-hexagon/htp/hvx-exp.c index 1f5e9e476c..21bf46a542 100644 --- a/ggml/src/ggml-hexagon/htp/hvx-exp.c +++ b/ggml/src/ggml-hexagon/htp/hvx-exp.c @@ -31,13 +31,13 @@ void hvx_exp_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int int unaligned_addr = 0; int unaligned_loop = 0; if ((0 == htp_is_aligned((void *) src, VLEN)) || (0 == htp_is_aligned((void *) dst, VLEN))) { - //FARF(HIGH, "hvx_exp_f32: unaligned address in hvx op, possibly slower execution\n"); + FARF(HIGH, "hvx_exp_f32: unaligned address in hvx op, possibly slower execution\n"); unaligned_addr = 1; } // assert((0 == unaligned_addr) || (0 == num_elems_whole)); if ((1 == unaligned_addr) && (num_elems_whole != 0)) { unaligned_loop = 1; - //FARF(HIGH, "hvx_exp_f32: unaligned loop in hvx op, possibly slower execution\n"); + FARF(HIGH, "hvx_exp_f32: unaligned loop in hvx op, possibly slower execution\n"); } HVX_Vector vec_out = Q6_V_vzero(); diff --git a/ggml/src/ggml-hexagon/htp/hvx-utils.c b/ggml/src/ggml-hexagon/htp/hvx-utils.c index 4597423eb9..e02b1d9099 100644 --- a/ggml/src/ggml-hexagon/htp/hvx-utils.c +++ b/ggml/src/ggml-hexagon/htp/hvx-utils.c @@ -40,13 +40,13 @@ void hvx_mul_f32(const uint8_t * restrict src0, int unaligned_loop = 0; if ((0 == htp_is_aligned((void *) src0, VLEN)) || (0 == htp_is_aligned((void *) src1, VLEN)) || (0 == htp_is_aligned((void *) dst, VLEN))) { - //FARF(HIGH, "hvx_mul_f32: unaligned address in hvx op, possibly slower execution\n"); + FARF(HIGH, "hvx_mul_f32: unaligned address in hvx op, possibly slower execution\n"); unaligned_addr = 1; } if ((1 == unaligned_addr) && (num_elems_whole != 0)) { unaligned_loop = 1; - //FARF(HIGH, "hvx_mul_f32: unaligned loop in hvx op, possibly slower execution\n"); + FARF(HIGH, "hvx_mul_f32: unaligned loop in hvx op, possibly slower execution\n"); } if (0 == unaligned_loop) { @@ -252,13 +252,13 @@ void hvx_add_f32(const uint8_t * restrict src0, int unaligned_loop = 0; if ((0 == htp_is_aligned((void *) src0, VLEN)) || (0 == htp_is_aligned((void *) src1, VLEN)) || (0 == htp_is_aligned((void *) dst, VLEN))) { - //FARF(HIGH, "hvx_add_f32: unaligned address in hvx op, possibly slower execution\n"); + FARF(HIGH, "hvx_add_f32: unaligned address in hvx op, possibly slower execution\n"); unaligned_addr = 1; } if ((1 == unaligned_addr) && (num_elems_whole != 0)) { unaligned_loop = 1; - //FARF(HIGH, "hvx_add_f32: unaligned loop in hvx op, possibly slower execution\n"); + FARF(HIGH, "hvx_add_f32: unaligned loop in hvx op, possibly slower execution\n"); } if (0 == unaligned_loop) { @@ -392,13 +392,13 @@ void hvx_add_scalar_f32(const uint8_t * restrict src, const float val, uint8_t * int unaligned_addr = 0; int unaligned_loop = 0; if ((0 == htp_is_aligned((void *) src, VLEN)) || (0 == htp_is_aligned((void *) dst, VLEN))) { - //FARF(HIGH, "hvx_add_scalar_f32: unaligned address in hvx op, possibly slower execution\n"); + FARF(HIGH, "hvx_add_scalar_f32: unaligned address in hvx op, possibly slower execution\n"); unaligned_addr = 1; } if ((1 == unaligned_addr) && (num_elems_whole != 0)) { unaligned_loop = 1; - //FARF(HIGH, "hvx_add_scalar_f32: unaligned loop in hvx op, possibly slower execution\n"); + FARF(HIGH, "hvx_add_scalar_f32: unaligned loop in hvx op, possibly slower execution\n"); } static const float kInf = INFINITY; @@ -454,13 +454,13 @@ 
void hvx_mul_scalar_f32(const uint8_t * restrict src, const float val, uint8_t * int unaligned_addr = 0; int unaligned_loop = 0; if ((0 == htp_is_aligned((void *) src, VLEN)) || (0 == htp_is_aligned((void *) dst, VLEN))) { - //FARF(HIGH, "hvx_mul_scalar_f32: unaligned address in hvx op, possibly slower execution\n"); + FARF(HIGH, "hvx_mul_scalar_f32: unaligned address in hvx op, possibly slower execution\n"); unaligned_addr = 1; } if ((1 == unaligned_addr) && (num_elems_whole != 0)) { unaligned_loop = 1; - //FARF(HIGH, "hvx_mul_scalar_f32: unaligned loop in hvx op, possibly slower execution\n"); + FARF(HIGH, "hvx_mul_scalar_f32: unaligned loop in hvx op, possibly slower execution\n"); } HVX_Vector val_vec = hvx_vec_splat_fp32(val); @@ -507,13 +507,13 @@ void hvx_sub_f32(const uint8_t * restrict src0, int unaligned_loop = 0; if ((0 == htp_is_aligned((void *) src0, VLEN)) || (0 == htp_is_aligned((void *) src1, VLEN)) || (0 == htp_is_aligned((void *) dst, VLEN))) { - //FARF(HIGH, "hvx_sub_f32: unaligned address in hvx op, possibly slower execution\n"); + FARF(HIGH, "hvx_sub_f32: unaligned address in hvx op, possibly slower execution\n"); unaligned_addr = 1; } if ((1 == unaligned_addr) && (num_elems_whole != 0)) { unaligned_loop = 1; - //FARF(HIGH, "hvx_sub_f32: unaligned loop in hvx op, possibly slower execution\n"); + FARF(HIGH, "hvx_sub_f32: unaligned loop in hvx op, possibly slower execution\n"); } if (0 == unaligned_loop) { @@ -647,13 +647,13 @@ void hvx_sub_scalar_f32(const uint8_t * restrict src, const float val, uint8_t * int unaligned_addr = 0; int unaligned_loop = 0; if ((0 == htp_is_aligned((void *) src, VLEN)) || (0 == htp_is_aligned((void *) dst, VLEN))) { - //FARF(HIGH, "hvx_sub_scalar_f32: unaligned address in hvx op, possibly slower execution\n"); + FARF(HIGH, "hvx_sub_scalar_f32: unaligned address in hvx op, possibly slower execution\n"); unaligned_addr = 1; } if ((1 == unaligned_addr) && (num_elems_whole != 0)) { unaligned_loop = 1; - //FARF(HIGH, "hvx_sub_scalar_f32: unaligned loop in hvx op, possibly slower execution\n"); + FARF(HIGH, "hvx_sub_scalar_f32: unaligned loop in hvx op, possibly slower execution\n"); } HVX_Vector val_vec = hvx_vec_splat_fp32(val); @@ -733,13 +733,13 @@ float hvx_self_sum_f32(const uint8_t * restrict src, const int num_elems) { int unaligned_addr = 0; int unaligned_loop = 0; if (0 == htp_is_aligned((void *) src, VLEN)) { - //FARF(HIGH, "hvx_self_sum_f32: unaligned address in hvx op, possibly slower execution\n"); + FARF(HIGH, "hvx_self_sum_f32: unaligned address in hvx op, possibly slower execution\n"); unaligned_addr = 1; } if ((1 == unaligned_addr) && (num_elems_whole != 0)) { unaligned_loop = 1; - //FARF(HIGH, "hvx_self_sum_f32: unaligned loop in hvx op, possibly slower execution\n"); + FARF(HIGH, "hvx_self_sum_f32: unaligned loop in hvx op, possibly slower execution\n"); } HVX_Vector sum_vec = Q6_V_vsplat_R(0x00000000); @@ -782,13 +782,13 @@ void hvx_scale_f32(const uint8_t * restrict src, uint8_t * restrict dst, const i int unaligned_addr = 0; int unaligned_loop = 0; if ((0 == htp_is_aligned((void *) src, VLEN)) || (0 == htp_is_aligned((void *) dst, VLEN))) { - //FARF(HIGH, "hvx_scale_f32: unaligned address in hvx op, possibly slower execution\n"); + FARF(HIGH, "hvx_scale_f32: unaligned address in hvx op, possibly slower execution\n"); unaligned_addr = 1; } if ((1 == unaligned_addr) && (num_elems_whole != 0)) { unaligned_loop = 1; - //FARF(HIGH, "hvx_scale_f32: unaligned loop in hvx op, possibly slower execution\n"); + FARF(HIGH, 
"hvx_scale_f32: unaligned loop in hvx op, possibly slower execution\n"); } HVX_Vector scale_vec = hvx_vec_splat_fp32(scale); @@ -831,13 +831,13 @@ float hvx_self_max_f32(const uint8_t * restrict src, const int num_elems) { int unaligned_addr = 0; int unaligned_loop = 0; if (0 == htp_is_aligned((void *) src, VLEN)) { - //FARF(HIGH, "hvx_self_max_f32: unaligned address in hvx op, possibly slower execution\n"); + FARF(HIGH, "hvx_self_max_f32: unaligned address in hvx op, possibly slower execution\n"); unaligned_addr = 1; } if ((1 == unaligned_addr) && (num_elems_whole != 0)) { unaligned_loop = 1; - //FARF(HIGH, "hvx_self_max_f32: unaligned loop in hvx op, possibly slower execution\n"); + FARF(HIGH, "hvx_self_max_f32: unaligned loop in hvx op, possibly slower execution\n"); } HVX_Vector vec_max = hvx_vec_splat_fp32(((const float *) src)[0]); diff --git a/ggml/src/ggml-hexagon/htp/qhcg_approximation.c b/ggml/src/ggml-hexagon/htp/qhcg_approximation.c deleted file mode 100644 index 5068c5fa34..0000000000 --- a/ggml/src/ggml-hexagon/htp/qhcg_approximation.c +++ /dev/null @@ -1,518 +0,0 @@ -/**============================================================================= -@file - qhcg_approximation.c - -@brief - Calculate polynomial approximation of the function below in - floating-point arithmetic using HVX instructions. - - Function: gelu(x) - - Function is approximated in specified input range from -6.0 to 6.0, - where inputs and outputs are arrays of 32-bit float values. - - Approximation is performed using the following method: - - 1) Input range is split into 16 equidistant segments - 2) For each segment, Numpy's polynomial package is used to find the best - polynomial approximation of order N with the corresponding C0, C1, ..., Cn. - 3) VLUT instructions are used to select appropriate coefficients for each input sample - 4) Horner's method is used to compute polynomial values: - f(x) = ((((Cn*x + Cn-1)*x + Cn-2)*x + ...)*x + C1)*x + C0 - -Copyright (c) 2020 Qualcomm Technologies Incorporated. -All Rights Reserved. Qualcomm Proprietary and Confidential. 
-=============================================================================**/ - -#if __HVX_ARCH__ >= 68 - -#include "qhcg_approximation.h" -#include "qhcg_internal.h" - -#define BLOCK_SIZE (8*1024/128) /* vector chunks */ -#define L2FETCH_AHEAD (BLOCK_SIZE) - -/* Polynomial coefficients */ -static const float c0_coeffs[32] __attribute__((aligned(VLEN))) = -{ - 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, - 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, --0.1025868397073178,-1.1184356646199394,-1.9705895994321767,0.11469604839384463,0.40991447569341943,0.00424292239610935,-0.0017846707638177889,4.125901398310816e-09, -9.718309490480692e-11,-0.0015488336803479719,0.001064556481209511,0.3906162486717146,0.19084584900320978,-1.911422745140333,-1.1879384314707315,-0.10823562636002611, -}; -static const float c1_coeffs[32] __attribute__((aligned(VLEN))) = -{ - 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, - 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, --0.1234196807250312,-1.5042580469229814,-2.7701977888429816,1.1561921948215528,1.73891533063333,0.49580124294548433,0.4867587290479026,0.500000435462697, -0.4999997919981341,0.5116842338641109,0.5163606020356294,-0.6867154811454343,-0.31551789326265844,3.6694157536939014,2.6042137343731855,1.1304321895807614, -}; -static const float c2_coeffs[32] __attribute__((aligned(VLEN))) = -{ - 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, - 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, --0.06367500510012546,-0.8689061926460069,-1.674005553705795,1.5013658408230053,1.9798213609930566,0.33731544915026324,0.35673778512915555,0.398953295788538, -0.3989496120997857,0.3611051680040998,0.31742994078248077,1.9193992198306873,1.6441036493618186,-1.600477678714911,-0.9304878890577859,-0.06740463140212431, -}; -static const float c3_coeffs[32] __attribute__((aligned(VLEN))) = -{ - 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, - 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, --0.01826132753437031,-0.27938965958246625,-0.56347555462781,0.8662803078586866,1.0748504969694623,-0.13840364789720844,-0.07490683960610874,0.00011501805987770841, --8.89610380930177e-05,0.06815977365648013,0.1564140217086786,-1.036053072449464,-0.9372597866516783,0.5336910940777527,0.3004584208315817,0.019362956684359556, -}; -static const float c4_coeffs[32] __attribute__((aligned(VLEN))) = -{ - 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, - 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, --0.003143995202268695,-0.05400134785573118,-0.11405541169720136,0.2730279204613817,0.3235877725936627,-0.21757221119731435,-0.14645680741997966,-0.06620698974306806, --0.06630082288474698,-0.14025595963442758,-0.22733077791076023,0.30866276792496655,0.29418673249390104,-0.10682071320119783,-0.05832443091378418,-0.003339162830362702, -}; -static const float c5_coeffs[32] __attribute__((aligned(VLEN))) = -{ - 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, - 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, --0.0003249391771954532,-0.006273424264889337,-0.01387698764918236,0.04913223606829913,0.05545197372332761,-0.09031312961799431,-0.04973154630108704,0.001825035162087615, --0.0016453620553813022,0.046340833813673266,0.09347637717015225,-0.0520121796723486,-0.0529133082948728,0.012823220702040979,0.00680542921713579,0.00034567793526667706, -}; -static const float c6_coeffs[32] __attribute__((aligned(VLEN))) = -{ - 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, - 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, --1.866614032852734e-05,-0.0004055443985444202,-0.0009392734765841164,0.004770459893478637,0.00504881095326808,-0.016904688419800747,-0.0049839929986692675,0.013321931926939602, 
-0.01314779303860721,-0.003962384692377502,-0.017472688915232914,0.004609031329816739,0.005145502689303376,-0.0008540539868357813,-0.0004419008815610675,-1.989001488248583e-05, -}; -static const float c7_coeffs[32] __attribute__((aligned(VLEN))) = -{ - 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, - 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, --4.5974928158662533e-07,-1.1252676844375516e-05,-2.7270249210246306e-05,0.0001949129249064408,0.00018774479051508836,-0.001238293494672944,0.000199977799551612,0.0029325624122584024, --0.0028654073893250895,-0.00033083897498689484,0.0012818786224478341,-0.00016368340082111905,-0.0002108418119120288,2.431836142521883e-05,1.2317036094266618e-05,4.906925630164402e-07, -}; - -/** - * @brief Polynomial approximation of gelu(x) function. - * @param[in] input Input array of elements in IEEE 32-bit floating-point format. - * @param[out] output Output array of elements in IEEE 32-bit floating-point format. - * @param[in] length Number of elements in input/output arrays. - * @return Returns 0 on successful execution. Otherwise -1. - */ -int32_t qhcg_approximation(float *restrict input, float *restrict output, uint32_t size, - float limit_left, float limit_right) - -{ - HVX_Vector *input_v_ptr; - HVX_UVector *output_v_ptr; - HVX_Vector input_min_v_f; - HVX_Vector input_max_v_f; - - HVX_Vector input_shifted_v_qf32; - HVX_Vector input_scaled_v_qf32; - HVX_Vector scale_v; - HVX_Vector input_v_qf32; - HVX_Vector const16_0_v_sf; - HVX_Vector zero_v_sf; - HVX_Vector mask_idx1_v, mask_idx2_v; - HVX_Vector tmp_v, idx1_v, idx2_v; - HVX_Vector output_v; - HVX_Vector slinep; - HVX_Vector slinec; - HVX_Vector sline; - HVX_Vector sline_tmp; - HVX_Vector sout; - int32_t block, l2fetch_block; - int32_t leftover = size & 31; - int32_t vectors_in_rounddown = size / 32; - int32_t leftover_size = leftover * sizeof(float); - HVX_DV c0_coeff_dv; - HVX_VectorPair c0_coeff_vp; - HVX_Vector c0_coeff_v; - HVX_DV c1_coeff_dv; - HVX_VectorPair c1_coeff_vp; - HVX_Vector c1_coeff_v; - HVX_DV c2_coeff_dv; - HVX_VectorPair c2_coeff_vp; - HVX_Vector c2_coeff_v; - HVX_DV c3_coeff_dv; - HVX_VectorPair c3_coeff_vp; - HVX_Vector c3_coeff_v; - HVX_DV c4_coeff_dv; - HVX_VectorPair c4_coeff_vp; - HVX_Vector c4_coeff_v; - HVX_DV c5_coeff_dv; - HVX_VectorPair c5_coeff_vp; - HVX_Vector c5_coeff_v; - HVX_DV c6_coeff_dv; - HVX_VectorPair c6_coeff_vp; - HVX_Vector c6_coeff_v; - HVX_DV c7_coeff_dv; - HVX_VectorPair c7_coeff_vp; - HVX_Vector c7_coeff_v; - - HVX_Vector zero_vec = Q6_V_vsplat_R(0x00000000); - - /* Check input arguments. Return error status if some argument has invalid value */ - if ((input == 0) || (output == 0) || (size == 0)) - { - return -1; - } - - input_v_ptr = (HVX_Vector *) input; - output_v_ptr = (HVX_UVector *) output; - - /* - * If input data is not aligned to HVX vector size, compose aligned vectors - * from data loaded in slinep and slinec - */ - slinep = *input_v_ptr++; - - /* - * Splat scale factor in order to be used later for finding indexes of coefficients. - * Scale factor is represented in IEEE 16-bit floating-point format and it is - * calculated using the following formula: - * scale_factor = (16.0 / (b0 - a0)) - * NOTE: Calculated value is slightly decreased in order to avoid out of bound - * indexes during VLUT lookup. - */ - scale_v = Q6_V_vsplat_R(0x3faaaaa9); - - /* - * Vector of zeroes used as neutral element in sf to qf32 conversions. 
- * NOTE: Some of conversions (i.e conversion of scale factor and coefficients) - * can be avoided in real-time, but this is not done in order to don't - * sacrify code readibility in expense of insignificant performance improvement. - */ - zero_v_sf = Q6_V_vzero(); - - /* Mask for extracting only 4 bits of mantissa */ - mask_idx1_v = Q6_V_vsplat_R(0x0000000F); - mask_idx2_v = Q6_V_vsplat_R(0x00000010); - - /* 16.0 in IEEE 16-bit floating-point representation */ - const16_0_v_sf = Q6_V_vsplat_R(0x41800000); - - /* - * Prepare vector of input_min values, that is used later in shifting input range. - * input_min is low boundary of specified input range. - */ - int32_t input_min_bits = *((int32_t *) &limit_left); - int32_t input_max_bits = *((int32_t *) &limit_right); - - input_min_v_f = Q6_V_vsplat_R(input_min_bits); - input_max_v_f = Q6_V_vsplat_R(input_max_bits); - - /* Convert scale factor from sf to q32. Use the same vector for both formats */ - scale_v = Q6_Vqf32_vadd_VsfVsf(scale_v, zero_v_sf); - - /* Load coefficients */ - c0_coeff_v = *((HVX_Vector *)(c0_coeffs)); - c1_coeff_v = *((HVX_Vector *)(c1_coeffs)); - c2_coeff_v = *((HVX_Vector *)(c2_coeffs)); - c3_coeff_v = *((HVX_Vector *)(c3_coeffs)); - c4_coeff_v = *((HVX_Vector *)(c4_coeffs)); - c5_coeff_v = *((HVX_Vector *)(c5_coeffs)); - c6_coeff_v = *((HVX_Vector *)(c6_coeffs)); - c7_coeff_v = *((HVX_Vector *)(c7_coeffs)); - - /* Convert coefficients from sf to qf32 format. Use the same vector for both representations */ - c0_coeff_v = Q6_Vqf32_vadd_VsfVsf(c0_coeff_v, zero_v_sf); - c1_coeff_v = Q6_Vqf32_vadd_VsfVsf(c1_coeff_v, zero_v_sf); - c2_coeff_v = Q6_Vqf32_vadd_VsfVsf(c2_coeff_v, zero_v_sf); - c3_coeff_v = Q6_Vqf32_vadd_VsfVsf(c3_coeff_v, zero_v_sf); - c4_coeff_v = Q6_Vqf32_vadd_VsfVsf(c4_coeff_v, zero_v_sf); - c5_coeff_v = Q6_Vqf32_vadd_VsfVsf(c5_coeff_v, zero_v_sf); - c6_coeff_v = Q6_Vqf32_vadd_VsfVsf(c6_coeff_v, zero_v_sf); - c7_coeff_v = Q6_Vqf32_vadd_VsfVsf(c7_coeff_v, zero_v_sf); - - /* Split 32-bit coefficients to lower and upper part in order to obtain them later with VLUT16. */ - c0_coeff_dv.VV = Q6_Wuw_vzxt_Vuh(c0_coeff_v); - c1_coeff_dv.VV = Q6_Wuw_vzxt_Vuh(c1_coeff_v); - c2_coeff_dv.VV = Q6_Wuw_vzxt_Vuh(c2_coeff_v); - c3_coeff_dv.VV = Q6_Wuw_vzxt_Vuh(c3_coeff_v); - c4_coeff_dv.VV = Q6_Wuw_vzxt_Vuh(c4_coeff_v); - c5_coeff_dv.VV = Q6_Wuw_vzxt_Vuh(c5_coeff_v); - c6_coeff_dv.VV = Q6_Wuw_vzxt_Vuh(c6_coeff_v); - c7_coeff_dv.VV = Q6_Wuw_vzxt_Vuh(c7_coeff_v); - - /* - * Handle number of whole vectors in input data. - * Don't process last vector in order to avoid out-of-boundary load. - */ - for (int32_t i = vectors_in_rounddown - 1; i > 0; i -= BLOCK_SIZE) - { - block = Q6_R_min_RR(i, BLOCK_SIZE); - l2fetch_block = Q6_R_min_RR(i - L2FETCH_AHEAD, BLOCK_SIZE); - - if (l2fetch_block > 0) - { - l2fetch(input_v_ptr + L2FETCH_AHEAD, 128, 128, l2fetch_block, 0); - } - - /* Process one vector at a time */ - for (int32_t j = 0; j < block; ++j) - { - slinec = *input_v_ptr++; - - /* Compose vector of input data from slinec and slinep */ - sline = Q6_V_valign_VVR(slinec, slinep, (size_t) input); - sline_tmp = sline; - /* Shift input range from [input_min, input_max] to [0, input_max - input_min] */ - input_shifted_v_qf32 = Q6_Vqf32_vsub_VsfVsf(sline, input_min_v_f); - - /* - * Scale shifted input range from [0, input_max - input_min] to [0,16.0) - * in order to get corresponding coefficient indexes - */ - input_scaled_v_qf32 = Q6_Vqf32_vmpy_Vqf32Vqf32(input_shifted_v_qf32, scale_v); - - /* - * VLUT 16 requires integer indexes. 
Shift scaled input range from [0,16.0) - * to [16.0,32.0) in order to convert float indexes to integer values. - * Float values, represented in IEEE 754, in range [16.0,32.0] have the - * same exponent, which means 4 MSB of mantissa carry information about - * integer index. - */ - input_scaled_v_qf32 = Q6_Vqf32_vadd_Vqf32Vsf(input_scaled_v_qf32, const16_0_v_sf); - - /* Convert back from qf32 to sf in order to extract integer index */ - tmp_v = Q6_Vsf_equals_Vqf32(input_scaled_v_qf32); - - /* Only 4 MSB bits of mantissa represent segment index */ - idx1_v = Q6_Vuw_vlsr_VuwR(tmp_v, 19); - - idx1_v = Q6_V_vand_VV(idx1_v, mask_idx1_v); - idx1_v = Q6_V_vor_VV(idx1_v, mask_idx2_v); - idx2_v = Q6_Vw_vasl_VwR(idx1_v, 16); - - /* Obtain the polynomial coefficients from lookup table */ - c0_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c0_coeff_dv.VV), 1); - c0_coeff_vp = Q6_Wh_vlut16or_WhVbVhR(c0_coeff_vp, idx2_v, Q6_V_hi_W(c0_coeff_dv.VV), 1); - c1_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c1_coeff_dv.VV), 1); - c1_coeff_vp = Q6_Wh_vlut16or_WhVbVhR(c1_coeff_vp, idx2_v, Q6_V_hi_W(c1_coeff_dv.VV), 1); - c2_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c2_coeff_dv.VV), 1); - c2_coeff_vp = Q6_Wh_vlut16or_WhVbVhR(c2_coeff_vp, idx2_v, Q6_V_hi_W(c2_coeff_dv.VV), 1); - c3_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c3_coeff_dv.VV), 1); - c3_coeff_vp = Q6_Wh_vlut16or_WhVbVhR(c3_coeff_vp, idx2_v, Q6_V_hi_W(c3_coeff_dv.VV), 1); - c4_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c4_coeff_dv.VV), 1); - c4_coeff_vp = Q6_Wh_vlut16or_WhVbVhR(c4_coeff_vp, idx2_v, Q6_V_hi_W(c4_coeff_dv.VV), 1); - c5_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c5_coeff_dv.VV), 1); - c5_coeff_vp = Q6_Wh_vlut16or_WhVbVhR(c5_coeff_vp, idx2_v, Q6_V_hi_W(c5_coeff_dv.VV), 1); - c6_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c6_coeff_dv.VV), 1); - c6_coeff_vp = Q6_Wh_vlut16or_WhVbVhR(c6_coeff_vp, idx2_v, Q6_V_hi_W(c6_coeff_dv.VV), 1); - c7_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c7_coeff_dv.VV), 1); - c7_coeff_vp = Q6_Wh_vlut16or_WhVbVhR(c7_coeff_vp, idx2_v, Q6_V_hi_W(c7_coeff_dv.VV), 1); - - /* Convert input from sf vector to qf32 vector for Horner's method*/ - input_v_qf32 = Q6_Vqf32_vadd_VsfVsf(sline, zero_v_sf); - - /* Perform evaluation of polynomial using Horner's method */ - output_v = Q6_Vqf32_vmpy_Vqf32Vqf32(Q6_V_lo_W(c7_coeff_vp), input_v_qf32); - output_v = Q6_Vqf32_vadd_Vqf32Vqf32(output_v, Q6_V_lo_W(c6_coeff_vp)); - output_v = Q6_Vqf32_vmpy_Vqf32Vqf32(output_v, input_v_qf32); - output_v = Q6_Vqf32_vadd_Vqf32Vqf32(output_v, Q6_V_lo_W(c5_coeff_vp)); - output_v = Q6_Vqf32_vmpy_Vqf32Vqf32(output_v, input_v_qf32); - output_v = Q6_Vqf32_vadd_Vqf32Vqf32(output_v, Q6_V_lo_W(c4_coeff_vp)); - output_v = Q6_Vqf32_vmpy_Vqf32Vqf32(output_v, input_v_qf32); - output_v = Q6_Vqf32_vadd_Vqf32Vqf32(output_v, Q6_V_lo_W(c3_coeff_vp)); - output_v = Q6_Vqf32_vmpy_Vqf32Vqf32(output_v, input_v_qf32); - output_v = Q6_Vqf32_vadd_Vqf32Vqf32(output_v, Q6_V_lo_W(c2_coeff_vp)); - output_v = Q6_Vqf32_vmpy_Vqf32Vqf32(output_v, input_v_qf32); - output_v = Q6_Vqf32_vadd_Vqf32Vqf32(output_v, Q6_V_lo_W(c1_coeff_vp)); - output_v = Q6_Vqf32_vmpy_Vqf32Vqf32(output_v, input_v_qf32); - output_v = Q6_Vqf32_vadd_Vqf32Vqf32(output_v, Q6_V_lo_W(c0_coeff_vp)); - - // /* Store results to the output buffer and convert from qf32 to sf */ - // *((HVX_UVector *)(output_v_ptr++)) = Q6_Vsf_equals_Vqf32(output_v); - - - /* Convert from qf32 to sf, store output and go to handle leftover */ - HVX_Vector output_v_f32 = Q6_Vsf_equals_Vqf32(output_v); - 
HVX_VectorPred pred_cap_left = Q6_Q_vcmp_gt_VsfVsf(input_min_v_f, sline_tmp); // 1 if input_min_v_f > sline_tmp - output_v_f32 = Q6_V_vmux_QVV(pred_cap_left, zero_vec, output_v_f32); // if sline_tmp> input_min_v_f, set to zero - - HVX_VectorPred pred_cap_right = Q6_Q_vcmp_gt_VsfVsf(sline_tmp, input_max_v_f); // 1 if sline_tmp > input_max_v_f - output_v_f32 = Q6_V_vmux_QVV(pred_cap_right, sline_tmp, output_v_f32); // if sline_tmp> input_max_v_f, set to whatever the sline_tmp was - - *((HVX_UVector *)(output_v_ptr++)) = output_v_f32; - - - /* Prepare slinep for next iteration */ - slinep = slinec; - } - } - - /* Handle last whole vector from input data */ - if (vectors_in_rounddown > 0) - { - slinec = is_aligned(input_v_ptr, 128) && leftover == 0 ? slinep : *input_v_ptr++; - sline = Q6_V_valign_VVR(slinec, slinep, (size_t) input); - sline_tmp = sline; - - /* Shift input range from [input_min, input_max] to [0, input_max - input_min] */ - input_shifted_v_qf32 = Q6_Vqf32_vsub_VsfVsf(sline, input_min_v_f); - - /* Scale shifted input range from [0, input_max - input_min] to [0,16.0) */ - input_scaled_v_qf32 = Q6_Vqf32_vmpy_Vqf32Vqf32(input_shifted_v_qf32, scale_v); - - /* - * VLUT 16 requires integer indexes. Shift scaled input range from [0,16.0) - * to [16.0,32.0) in order to convert float indexes to integer values. - * Float values, represented in IEEE 754, in range [16.0,32.0] have the - * same exponent, which means 4 MSB of mantissa carry information about - * integer index. - */ - input_scaled_v_qf32 = Q6_Vqf32_vadd_Vqf32Vsf(input_scaled_v_qf32, const16_0_v_sf); - - /* Convert back from qf32 to sf in order to extract integer index */ - tmp_v = Q6_Vsf_equals_Vqf32(input_scaled_v_qf32); - - /* Only 4 MSB bits of mantissa represent segment index */ - idx1_v = Q6_Vuw_vlsr_VuwR(tmp_v, 19); - - /* Ensure only 4 MSB bits of mantissa are used as indexes */ - idx1_v = Q6_V_vand_VV(idx1_v, mask_idx1_v); - idx1_v = Q6_V_vor_VV(idx1_v, mask_idx2_v); - idx2_v = Q6_Vw_vasl_VwR(idx1_v, 16); - - /* Obtain the polynomial coefficients from lookup table */ - c0_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c0_coeff_dv.VV), 1); - c0_coeff_vp = Q6_Wh_vlut16or_WhVbVhR(c0_coeff_vp, idx2_v, Q6_V_hi_W(c0_coeff_dv.VV), 1); - c1_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c1_coeff_dv.VV), 1); - c1_coeff_vp = Q6_Wh_vlut16or_WhVbVhR(c1_coeff_vp, idx2_v, Q6_V_hi_W(c1_coeff_dv.VV), 1); - c2_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c2_coeff_dv.VV), 1); - c2_coeff_vp = Q6_Wh_vlut16or_WhVbVhR(c2_coeff_vp, idx2_v, Q6_V_hi_W(c2_coeff_dv.VV), 1); - c3_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c3_coeff_dv.VV), 1); - c3_coeff_vp = Q6_Wh_vlut16or_WhVbVhR(c3_coeff_vp, idx2_v, Q6_V_hi_W(c3_coeff_dv.VV), 1); - c4_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c4_coeff_dv.VV), 1); - c4_coeff_vp = Q6_Wh_vlut16or_WhVbVhR(c4_coeff_vp, idx2_v, Q6_V_hi_W(c4_coeff_dv.VV), 1); - c5_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c5_coeff_dv.VV), 1); - c5_coeff_vp = Q6_Wh_vlut16or_WhVbVhR(c5_coeff_vp, idx2_v, Q6_V_hi_W(c5_coeff_dv.VV), 1); - c6_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c6_coeff_dv.VV), 1); - c6_coeff_vp = Q6_Wh_vlut16or_WhVbVhR(c6_coeff_vp, idx2_v, Q6_V_hi_W(c6_coeff_dv.VV), 1); - c7_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c7_coeff_dv.VV), 1); - c7_coeff_vp = Q6_Wh_vlut16or_WhVbVhR(c7_coeff_vp, idx2_v, Q6_V_hi_W(c7_coeff_dv.VV), 1); - - /* Convert input from sf vector to qf32 vector for Horner's method*/ - input_v_qf32 = Q6_Vqf32_vadd_VsfVsf(sline, zero_v_sf); - - /* Perform evaluation of 
polynomial using Horner's method */ - output_v = Q6_Vqf32_vmpy_Vqf32Vqf32(Q6_V_lo_W(c7_coeff_vp), input_v_qf32); - output_v = Q6_Vqf32_vadd_Vqf32Vqf32(output_v, Q6_V_lo_W(c6_coeff_vp)); - output_v = Q6_Vqf32_vmpy_Vqf32Vqf32(output_v, input_v_qf32); - output_v = Q6_Vqf32_vadd_Vqf32Vqf32(output_v, Q6_V_lo_W(c5_coeff_vp)); - output_v = Q6_Vqf32_vmpy_Vqf32Vqf32(output_v, input_v_qf32); - output_v = Q6_Vqf32_vadd_Vqf32Vqf32(output_v, Q6_V_lo_W(c4_coeff_vp)); - output_v = Q6_Vqf32_vmpy_Vqf32Vqf32(output_v, input_v_qf32); - output_v = Q6_Vqf32_vadd_Vqf32Vqf32(output_v, Q6_V_lo_W(c3_coeff_vp)); - output_v = Q6_Vqf32_vmpy_Vqf32Vqf32(output_v, input_v_qf32); - output_v = Q6_Vqf32_vadd_Vqf32Vqf32(output_v, Q6_V_lo_W(c2_coeff_vp)); - output_v = Q6_Vqf32_vmpy_Vqf32Vqf32(output_v, input_v_qf32); - output_v = Q6_Vqf32_vadd_Vqf32Vqf32(output_v, Q6_V_lo_W(c1_coeff_vp)); - output_v = Q6_Vqf32_vmpy_Vqf32Vqf32(output_v, input_v_qf32); - output_v = Q6_Vqf32_vadd_Vqf32Vqf32(output_v, Q6_V_lo_W(c0_coeff_vp)); - - /* Convert from qf32 to sf, store output and go to handle leftover */ - HVX_Vector output_v_f32 = Q6_Vsf_equals_Vqf32(output_v); - HVX_VectorPred pred_cap_left = Q6_Q_vcmp_gt_VsfVsf(input_min_v_f, sline_tmp); // 1 if input_min_v_f > sline_tmp - output_v_f32 = Q6_V_vmux_QVV(pred_cap_left, zero_vec, output_v_f32); // if sline_tmp> input_min_v_f, set to zero - - HVX_VectorPred pred_cap_right = Q6_Q_vcmp_gt_VsfVsf(sline_tmp, input_max_v_f); // 1 if sline_tmp > input_max_v_f - output_v_f32 = Q6_V_vmux_QVV(pred_cap_right, sline_tmp, output_v_f32); // if sline_tmp> input_max_v_f, set to whatever the sline_tmp was - - *((HVX_UVector *)(output_v_ptr++)) = output_v_f32; - - slinep = slinec; - } - - /* Handle leftover elements */ - if (leftover > 0) - { - slinec = (is_in_one_chunk(input_v_ptr, leftover_size, 128) - ? slinep - : *input_v_ptr++); - - sline = Q6_V_valign_VVR(slinec, slinep, (size_t) input); - sline_tmp = sline; - - /* Shift input range from [input_min, input_max] to [0, input_max - input_min] */ - input_shifted_v_qf32 = Q6_Vqf32_vsub_VsfVsf(sline, input_min_v_f); - - /* Scale shifted input range from [0, input_max - input_min] to [0,16.0) */ - input_scaled_v_qf32 = Q6_Vqf32_vmpy_Vqf32Vqf32(input_shifted_v_qf32, scale_v); - - /* - * VLUT 16 requires integer indexes. Shift scaled input range from [0,16.0) - * to [16.0,32.0) in order to convert float indexes to integer values. - * Float values, represented in IEEE 754, in range [16.0,32.0] have the - * same exponent, which means 4 MSB of mantissa carry information about - * integer index. 
- */ - input_scaled_v_qf32 = Q6_Vqf32_vadd_Vqf32Vsf(input_scaled_v_qf32, const16_0_v_sf); - - /* Convert back from qf32 to sf in order to extract integer index */ - tmp_v = Q6_Vsf_equals_Vqf32(input_scaled_v_qf32); - - /* Only 4 MSB bits of mantissa represent segment index */ - idx1_v = Q6_Vuw_vlsr_VuwR(tmp_v, 19); - - /* Ensure only 4 MSB bits of mantissa are used as indexes */ - idx1_v = Q6_V_vand_VV(idx1_v, mask_idx1_v); - idx1_v = Q6_V_vor_VV(idx1_v, mask_idx2_v); - idx2_v = Q6_Vw_vasl_VwR(idx1_v, 16); - - /* Obtain the polynomial coefficients from lookup table */ - c0_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c0_coeff_dv.VV), 1); - c0_coeff_vp = Q6_Wh_vlut16or_WhVbVhR(c0_coeff_vp, idx2_v, Q6_V_hi_W(c0_coeff_dv.VV), 1); - c1_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c1_coeff_dv.VV), 1); - c1_coeff_vp = Q6_Wh_vlut16or_WhVbVhR(c1_coeff_vp, idx2_v, Q6_V_hi_W(c1_coeff_dv.VV), 1); - c2_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c2_coeff_dv.VV), 1); - c2_coeff_vp = Q6_Wh_vlut16or_WhVbVhR(c2_coeff_vp, idx2_v, Q6_V_hi_W(c2_coeff_dv.VV), 1); - c3_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c3_coeff_dv.VV), 1); - c3_coeff_vp = Q6_Wh_vlut16or_WhVbVhR(c3_coeff_vp, idx2_v, Q6_V_hi_W(c3_coeff_dv.VV), 1); - c4_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c4_coeff_dv.VV), 1); - c4_coeff_vp = Q6_Wh_vlut16or_WhVbVhR(c4_coeff_vp, idx2_v, Q6_V_hi_W(c4_coeff_dv.VV), 1); - c5_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c5_coeff_dv.VV), 1); - c5_coeff_vp = Q6_Wh_vlut16or_WhVbVhR(c5_coeff_vp, idx2_v, Q6_V_hi_W(c5_coeff_dv.VV), 1); - c6_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c6_coeff_dv.VV), 1); - c6_coeff_vp = Q6_Wh_vlut16or_WhVbVhR(c6_coeff_vp, idx2_v, Q6_V_hi_W(c6_coeff_dv.VV), 1); - c7_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c7_coeff_dv.VV), 1); - c7_coeff_vp = Q6_Wh_vlut16or_WhVbVhR(c7_coeff_vp, idx2_v, Q6_V_hi_W(c7_coeff_dv.VV), 1); - - /* Convert input from sf vector to qf32 vector for Horner's method*/ - input_v_qf32 = Q6_Vqf32_vadd_VsfVsf(sline, zero_v_sf); - - /* Perform evaluation of polynomial using Horner's method */ - output_v = Q6_Vqf32_vmpy_Vqf32Vqf32(Q6_V_lo_W(c7_coeff_vp), input_v_qf32); - output_v = Q6_Vqf32_vadd_Vqf32Vqf32(output_v, Q6_V_lo_W(c6_coeff_vp)); - output_v = Q6_Vqf32_vmpy_Vqf32Vqf32(output_v, input_v_qf32); - output_v = Q6_Vqf32_vadd_Vqf32Vqf32(output_v, Q6_V_lo_W(c5_coeff_vp)); - output_v = Q6_Vqf32_vmpy_Vqf32Vqf32(output_v, input_v_qf32); - output_v = Q6_Vqf32_vadd_Vqf32Vqf32(output_v, Q6_V_lo_W(c4_coeff_vp)); - output_v = Q6_Vqf32_vmpy_Vqf32Vqf32(output_v, input_v_qf32); - output_v = Q6_Vqf32_vadd_Vqf32Vqf32(output_v, Q6_V_lo_W(c3_coeff_vp)); - output_v = Q6_Vqf32_vmpy_Vqf32Vqf32(output_v, input_v_qf32); - output_v = Q6_Vqf32_vadd_Vqf32Vqf32(output_v, Q6_V_lo_W(c2_coeff_vp)); - output_v = Q6_Vqf32_vmpy_Vqf32Vqf32(output_v, input_v_qf32); - output_v = Q6_Vqf32_vadd_Vqf32Vqf32(output_v, Q6_V_lo_W(c1_coeff_vp)); - output_v = Q6_Vqf32_vmpy_Vqf32Vqf32(output_v, input_v_qf32); - output_v = Q6_Vqf32_vadd_Vqf32Vqf32(output_v, Q6_V_lo_W(c0_coeff_vp)); - - /* Convert from qf32 to sf */ - // sout = Q6_Vsf_equals_Vqf32(output_v); - HVX_Vector output_v_f32 = Q6_Vsf_equals_Vqf32(output_v); - HVX_VectorPred pred_cap_left = Q6_Q_vcmp_gt_VsfVsf(input_min_v_f, sline_tmp); // 1 if input_min_v_f > sline_tmp - output_v_f32 = Q6_V_vmux_QVV(pred_cap_left, zero_vec, output_v_f32); // if sline_tmp> input_min_v_f, set to zero - - - HVX_VectorPred pred_cap_right = Q6_Q_vcmp_gt_VsfVsf(sline_tmp, input_max_v_f); // 1 if sline_tmp > input_max_v_f - output_v_f32 
= Q6_V_vmux_QVV(pred_cap_right, sline_tmp, output_v_f32); // if sline_tmp> input_max_v_f, set to whatever the sline_tmp was - - sout = output_v_f32; - /* Store output */ - vstu_variable(output_v_ptr, leftover_size, sout); - } - - return 0; -} - -#endif /* __HVX_ARCH__ >= 68 */ diff --git a/ggml/src/ggml-hexagon/htp/qhcg_approximation.h b/ggml/src/ggml-hexagon/htp/qhcg_approximation.h deleted file mode 100644 index 6f70e209ff..0000000000 --- a/ggml/src/ggml-hexagon/htp/qhcg_approximation.h +++ /dev/null @@ -1,21 +0,0 @@ -/**============================================================================= -@file - qhcg_approximation.h - -@brief - Header file of polynomial approximation generated by QHCG - -Copyright (c) 2020 Qualcomm Technologies Incorporated. -All Rights Reserved. Qualcomm Proprietary and Confidential. -=============================================================================**/ - -#ifndef __qhcg_approximation__ -#define __qhcg_approximation__ - -#include - -int32_t qhcg_approximation(float *inputs, float *outputs, uint32_t length, - float limit_left, float limit_right -); - -#endif /* __qhcg_approximation__ */ diff --git a/ggml/src/ggml-hexagon/htp/qhcg_internal.h b/ggml/src/ggml-hexagon/htp/qhcg_internal.h deleted file mode 100644 index 618610dc88..0000000000 --- a/ggml/src/ggml-hexagon/htp/qhcg_internal.h +++ /dev/null @@ -1,91 +0,0 @@ -/**============================================================================= -@file - hvx_internal.h - -@brief - Header file for HVX routines. - -Copyright (c) 2020 Qualcomm Technologies Incorporated. -All Rights Reserved. Qualcomm Proprietary and Confidential. -=============================================================================**/ - -#ifndef _HVX_INTERNAL_H -#define _HVX_INTERNAL_H - -#include // size_t -#include - -#define HVX_INLINE_ALWAYS inline __attribute__((unused,always_inline)) - -#ifndef LOG2VLEN -#define LOG2VLEN 7 -#endif -#define VLEN (1<>1 // HVX vector - number of int16_t elements -#define VLEN_WORD (1<>2 // HVX vector - number of int32_t elements - -typedef union -{ - HVX_VectorPair VV; - struct - { - HVX_Vector lo; - HVX_Vector hi; - } V; -} HVX_DV; - -static HVX_INLINE_ALWAYS void l2fetch(const void *p, uint32_t stride, - uint32_t width, uint32_t height, - uint32_t dir) -{ - uint64_t control = HEXAGON_V64_CREATE_H(dir, stride, width, height); - __asm__ __volatile__ (" l2fetch(%0,%1) " : :"r"(p),"r"(control)); -} - -/* Return whether address is aligned. */ - -static HVX_INLINE_ALWAYS int32_t is_aligned(void *addr, uint32_t align) -{ - return ((size_t) addr & (align - 1)) == 0; -} - -/* Return whether 'n' elements from vector are in the one chunk of 'chunk_size'. */ - -static HVX_INLINE_ALWAYS int32_t is_in_one_chunk(void *addr, uint32_t n, - uint32_t chunk_size) -{ - uint32_t left_off = (size_t) addr & (chunk_size - 1); - uint32_t right_off = left_off + n; - return right_off <= chunk_size; -} - -/* - * This function stores the first n bytes from vector vin to address 'addr'. - * n must be in range 1..128 and addr may have any alignment. Does one or - * two masked stores. - */ - -static HVX_INLINE_ALWAYS void vstu_variable(void *addr, uint32_t n, - HVX_Vector vin) -{ - /* Rotate as needed. 
*/ - vin = Q6_V_vlalign_VVR(vin, vin, (size_t) addr); - - uint32_t left_off = (size_t) addr & 127; - uint32_t right_off = left_off + n; - - HVX_VectorPred ql_not = Q6_Q_vsetq_R((size_t) addr); - HVX_VectorPred qr = Q6_Q_vsetq2_R(right_off); - - if (right_off > 128) - { - Q6_vmem_QRIV(qr, (HVX_Vector*) addr + 1, vin); - /* all 1's */ - qr = Q6_Q_vcmp_eq_VbVb(vin, vin); - } - - ql_not = Q6_Q_or_QQn(ql_not, qr); - Q6_vmem_QnRIV(ql_not, (HVX_Vector*) addr, vin); -} - -#endif /* _HVX_INTERNAL_H */ From 84f2f23aa9f17e2fa826db969cd825d0ab192995 Mon Sep 17 00:00:00 2001 From: shouyud Date: Fri, 12 Dec 2025 10:11:23 -0500 Subject: [PATCH 06/14] debug: temporarily disable unnecessary log message for debug purpose --- ggml/src/ggml-hexagon/htp/hvx-utils.c | 42 +++++++++++++-------------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/ggml/src/ggml-hexagon/htp/hvx-utils.c b/ggml/src/ggml-hexagon/htp/hvx-utils.c index e02b1d9099..d6e928c96f 100644 --- a/ggml/src/ggml-hexagon/htp/hvx-utils.c +++ b/ggml/src/ggml-hexagon/htp/hvx-utils.c @@ -40,13 +40,13 @@ void hvx_mul_f32(const uint8_t * restrict src0, int unaligned_loop = 0; if ((0 == htp_is_aligned((void *) src0, VLEN)) || (0 == htp_is_aligned((void *) src1, VLEN)) || (0 == htp_is_aligned((void *) dst, VLEN))) { - FARF(HIGH, "hvx_mul_f32: unaligned address in hvx op, possibly slower execution\n"); + //FARF(HIGH, "hvx_mul_f32: unaligned address in hvx op, possibly slower execution\n"); unaligned_addr = 1; } if ((1 == unaligned_addr) && (num_elems_whole != 0)) { unaligned_loop = 1; - FARF(HIGH, "hvx_mul_f32: unaligned loop in hvx op, possibly slower execution\n"); + //FARF(HIGH, "hvx_mul_f32: unaligned loop in hvx op, possibly slower execution\n"); } if (0 == unaligned_loop) { @@ -252,13 +252,13 @@ void hvx_add_f32(const uint8_t * restrict src0, int unaligned_loop = 0; if ((0 == htp_is_aligned((void *) src0, VLEN)) || (0 == htp_is_aligned((void *) src1, VLEN)) || (0 == htp_is_aligned((void *) dst, VLEN))) { - FARF(HIGH, "hvx_add_f32: unaligned address in hvx op, possibly slower execution\n"); + //FARF(HIGH, "hvx_add_f32: unaligned address in hvx op, possibly slower execution\n"); unaligned_addr = 1; } if ((1 == unaligned_addr) && (num_elems_whole != 0)) { unaligned_loop = 1; - FARF(HIGH, "hvx_add_f32: unaligned loop in hvx op, possibly slower execution\n"); + //FARF(HIGH, "hvx_add_f32: unaligned loop in hvx op, possibly slower execution\n"); } if (0 == unaligned_loop) { @@ -392,13 +392,13 @@ void hvx_add_scalar_f32(const uint8_t * restrict src, const float val, uint8_t * int unaligned_addr = 0; int unaligned_loop = 0; if ((0 == htp_is_aligned((void *) src, VLEN)) || (0 == htp_is_aligned((void *) dst, VLEN))) { - FARF(HIGH, "hvx_add_scalar_f32: unaligned address in hvx op, possibly slower execution\n"); + //FARF(HIGH, "hvx_add_scalar_f32: unaligned address in hvx op, possibly slower execution\n"); unaligned_addr = 1; } if ((1 == unaligned_addr) && (num_elems_whole != 0)) { unaligned_loop = 1; - FARF(HIGH, "hvx_add_scalar_f32: unaligned loop in hvx op, possibly slower execution\n"); + //FARF(HIGH, "hvx_add_scalar_f32: unaligned loop in hvx op, possibly slower execution\n"); } static const float kInf = INFINITY; @@ -454,13 +454,13 @@ void hvx_mul_scalar_f32(const uint8_t * restrict src, const float val, uint8_t * int unaligned_addr = 0; int unaligned_loop = 0; if ((0 == htp_is_aligned((void *) src, VLEN)) || (0 == htp_is_aligned((void *) dst, VLEN))) { - FARF(HIGH, "hvx_mul_scalar_f32: unaligned address in hvx op, possibly slower 
execution\n"); + //FARF(HIGH, "hvx_mul_scalar_f32: unaligned address in hvx op, possibly slower execution\n"); unaligned_addr = 1; } if ((1 == unaligned_addr) && (num_elems_whole != 0)) { unaligned_loop = 1; - FARF(HIGH, "hvx_mul_scalar_f32: unaligned loop in hvx op, possibly slower execution\n"); + //FARF(HIGH, "hvx_mul_scalar_f32: unaligned loop in hvx op, possibly slower execution\n"); } HVX_Vector val_vec = hvx_vec_splat_fp32(val); @@ -507,13 +507,13 @@ void hvx_sub_f32(const uint8_t * restrict src0, int unaligned_loop = 0; if ((0 == htp_is_aligned((void *) src0, VLEN)) || (0 == htp_is_aligned((void *) src1, VLEN)) || (0 == htp_is_aligned((void *) dst, VLEN))) { - FARF(HIGH, "hvx_sub_f32: unaligned address in hvx op, possibly slower execution\n"); + //FARF(HIGH, "hvx_sub_f32: unaligned address in hvx op, possibly slower execution\n"); unaligned_addr = 1; } if ((1 == unaligned_addr) && (num_elems_whole != 0)) { unaligned_loop = 1; - FARF(HIGH, "hvx_sub_f32: unaligned loop in hvx op, possibly slower execution\n"); + //FARF(HIGH, "hvx_sub_f32: unaligned loop in hvx op, possibly slower execution\n"); } if (0 == unaligned_loop) { @@ -647,13 +647,13 @@ void hvx_sub_scalar_f32(const uint8_t * restrict src, const float val, uint8_t * int unaligned_addr = 0; int unaligned_loop = 0; if ((0 == htp_is_aligned((void *) src, VLEN)) || (0 == htp_is_aligned((void *) dst, VLEN))) { - FARF(HIGH, "hvx_sub_scalar_f32: unaligned address in hvx op, possibly slower execution\n"); + //FARF(HIGH, "hvx_sub_scalar_f32: unaligned address in hvx op, possibly slower execution\n"); unaligned_addr = 1; } if ((1 == unaligned_addr) && (num_elems_whole != 0)) { unaligned_loop = 1; - FARF(HIGH, "hvx_sub_scalar_f32: unaligned loop in hvx op, possibly slower execution\n"); + //FARF(HIGH, "hvx_sub_scalar_f32: unaligned loop in hvx op, possibly slower execution\n"); } HVX_Vector val_vec = hvx_vec_splat_fp32(val); @@ -694,7 +694,7 @@ float hvx_sum_of_squares_f32(const uint8_t * restrict src, const int num_elems) int num_elems_whole = num_elems - left_over; if (0 == htp_is_aligned((void *) src, VLEN)) { - FARF(HIGH, "hvx_sum_of_squares_f32: unaligned address in hvx op, possibly slower execution\n"); + //FARF(HIGH, "hvx_sum_of_squares_f32: unaligned address in hvx op, possibly slower execution\n"); } assert((1 == htp_is_aligned((void *) src, VLEN)) || (0 == num_elems_whole)); @@ -733,13 +733,13 @@ float hvx_self_sum_f32(const uint8_t * restrict src, const int num_elems) { int unaligned_addr = 0; int unaligned_loop = 0; if (0 == htp_is_aligned((void *) src, VLEN)) { - FARF(HIGH, "hvx_self_sum_f32: unaligned address in hvx op, possibly slower execution\n"); + //FARF(HIGH, "hvx_self_sum_f32: unaligned address in hvx op, possibly slower execution\n"); unaligned_addr = 1; } if ((1 == unaligned_addr) && (num_elems_whole != 0)) { unaligned_loop = 1; - FARF(HIGH, "hvx_self_sum_f32: unaligned loop in hvx op, possibly slower execution\n"); + //FARF(HIGH, "hvx_self_sum_f32: unaligned loop in hvx op, possibly slower execution\n"); } HVX_Vector sum_vec = Q6_V_vsplat_R(0x00000000); @@ -782,13 +782,13 @@ void hvx_scale_f32(const uint8_t * restrict src, uint8_t * restrict dst, const i int unaligned_addr = 0; int unaligned_loop = 0; if ((0 == htp_is_aligned((void *) src, VLEN)) || (0 == htp_is_aligned((void *) dst, VLEN))) { - FARF(HIGH, "hvx_scale_f32: unaligned address in hvx op, possibly slower execution\n"); + //FARF(HIGH, "hvx_scale_f32: unaligned address in hvx op, possibly slower execution\n"); unaligned_addr = 1; } if ((1 == 
unaligned_addr) && (num_elems_whole != 0)) { unaligned_loop = 1; - FARF(HIGH, "hvx_scale_f32: unaligned loop in hvx op, possibly slower execution\n"); + //FARF(HIGH, "hvx_scale_f32: unaligned loop in hvx op, possibly slower execution\n"); } HVX_Vector scale_vec = hvx_vec_splat_fp32(scale); @@ -831,13 +831,13 @@ float hvx_self_max_f32(const uint8_t * restrict src, const int num_elems) { int unaligned_addr = 0; int unaligned_loop = 0; if (0 == htp_is_aligned((void *) src, VLEN)) { - FARF(HIGH, "hvx_self_max_f32: unaligned address in hvx op, possibly slower execution\n"); + //FARF(HIGH, "hvx_self_max_f32: unaligned address in hvx op, possibly slower execution\n"); unaligned_addr = 1; } if ((1 == unaligned_addr) && (num_elems_whole != 0)) { unaligned_loop = 1; - FARF(HIGH, "hvx_self_max_f32: unaligned loop in hvx op, possibly slower execution\n"); + //FARF(HIGH, "hvx_self_max_f32: unaligned loop in hvx op, possibly slower execution\n"); } HVX_Vector vec_max = hvx_vec_splat_fp32(((const float *) src)[0]); @@ -877,7 +877,7 @@ void hvx_min_scalar_f32(const uint8_t * restrict src, const float val, uint8_t * size_t num_elems_whole = num_elems - left_over; if ((0 == htp_is_aligned((void *) src, VLEN)) || (0 == htp_is_aligned((void *) dst, VLEN))) { - FARF(HIGH, "hvx_min_scalar_f32: unaligned address in hvx op, possibly slower execution\n"); + //FARF(HIGH, "hvx_min_scalar_f32: unaligned address in hvx op, possibly slower execution\n"); } assert((1 == htp_is_aligned((void *) src, VLEN)) || (0 == num_elems_whole)); @@ -916,7 +916,7 @@ void hvx_clamp_scalar_f32(const uint8_t * restrict src, size_t num_elems_whole = num_elems - left_over; if ((0 == htp_is_aligned((void *) src, VLEN)) || (0 == htp_is_aligned((void *) dst, VLEN))) { - FARF(HIGH, "hvx_clamp_scalar_f32: unaligned address in hvx op, possibly slower execution\n"); + //FARF(HIGH, "hvx_clamp_scalar_f32: unaligned address in hvx op, possibly slower execution\n"); } assert((1 == htp_is_aligned((void *) src, VLEN)) || (0 == num_elems_whole)); From fc2289dc96e2c2622922189b0e04bb30302c3aa1 Mon Sep 17 00:00:00 2001 From: shouyud Date: Fri, 12 Dec 2025 11:58:45 -0500 Subject: [PATCH 07/14] Feat: optimized unaligned sigmoid_f32 --- ggml/src/ggml-hexagon/htp/act-ops.c | 5 +- ggml/src/ggml-hexagon/htp/hvx-utils.h | 108 ++++++++++++++++++++++++++ 2 files changed, 109 insertions(+), 4 deletions(-) diff --git a/ggml/src/ggml-hexagon/htp/act-ops.c b/ggml/src/ggml-hexagon/htp/act-ops.c index 2db4a2a35b..5266567d37 100644 --- a/ggml/src/ggml-hexagon/htp/act-ops.c +++ b/ggml/src/ggml-hexagon/htp/act-ops.c @@ -317,10 +317,7 @@ static void unary_gelu_fp32_per_thread(const struct htp_tensor * src0, } else { hvx_mul_scalar_f32( (const uint8_t *) src0, (float)1.702, (uint8_t *) src0_spad_data, ne0); // sigmoid - hvx_exp_f32((const uint8_t *) src0_spad_data, src0_spad_data, ne0, true); - hvx_add_scalar_f32(src0_spad_data, 1.0, dst_spad_data, ne0); - hvx_inverse_f32(dst_spad_data, src0_spad_data, ne0); - + hvx_sigmoid_f32((const uint8_t *) src0_spad_data, (uint8_t *) src0_spad_data, ne0); hvx_mul_f32((const uint8_t *) src0, src0_spad_data, (uint8_t *) dst, ne0); } } diff --git a/ggml/src/ggml-hexagon/htp/hvx-utils.h b/ggml/src/ggml-hexagon/htp/hvx-utils.h index 80658105c5..6c713b40eb 100644 --- a/ggml/src/ggml-hexagon/htp/hvx-utils.h +++ b/ggml/src/ggml-hexagon/htp/hvx-utils.h @@ -265,12 +265,16 @@ static inline void hvx_bcast_fp32_a(uint8_t * restrict dst, float elem, uint32_t } } + +/* Return whether 'n' elements from vector are in the one chunk of 'chunk_size'.
*/ static __attribute__((always_inline)) int32_t is_in_one_chunk(void * addr, uint32_t n, uint32_t chunk_size) { uint32_t left_off = (size_t) addr & (chunk_size - 1); uint32_t right_off = left_off + n; return right_off <= chunk_size; } + + static void hvx_vec_dump_fp16_n(char * pref, HVX_Vector v, uint32_t n) { HVX_VectorAlias u = { .v = v }; @@ -994,6 +998,110 @@ static inline void hvx_fast_sigmoid_f32(const uint8_t * restrict src, uint8_t * } } + +static inline void hvx_sigmoid_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int num_elems){ + int step_of_1 = num_elems >> 5; // divby 32, because 32 float = 128 bytes per HVX vector + int leftover = num_elems - (step_of_1 * VLEN_FP32); + + // assert(remaining == 0);//TODO: handle remaining elements later + + + int32_t leftover_size = leftover * sizeof(float); + + static const float kMinExp = -87.f; // 0 + static const float kMaxExp = 87.f; // 1 + + const HVX_Vector one = hvx_vec_splat_fp32(1.f); + const HVX_Vector max_exp = hvx_vec_splat_fp32(kMaxExp); + const HVX_Vector min_exp = hvx_vec_splat_fp32(kMinExp); + + const float *input = (float *)src; + float *output = (float *)dst; + + HVX_Vector * input_v_ptr = (HVX_Vector *) input; + HVX_UVector * output_v_ptr = (HVX_UVector *) output; + + + HVX_Vector slinep; + HVX_Vector slinec; + HVX_Vector sline; + + + slinep = *input_v_ptr++; + #pragma unroll(4) + for(uint32_t i = step_of_1 -1; i> 0; i--){ + slinec = *input_v_ptr++; + sline = Q6_V_valign_VVR(slinec, slinep, (size_t) input); + *((HVX_UVector *)(output_v_ptr++)) = hvx_vec_fast_sigmoid_fp32_guard(sline, one, max_exp, min_exp); + /* Prepare slinep for next iteration */ + slinep = slinec; + } + + if(step_of_1> 0){ + + slinec = htp_is_aligned(input_v_ptr, 128) && leftover == 0 ? slinep : *input_v_ptr++; + sline = Q6_V_valign_VVR(slinec, slinep, (size_t) input); + *((HVX_UVector *)(output_v_ptr++)) = hvx_vec_fast_sigmoid_fp32_guard(sline, one, max_exp, min_exp);; + + slinep = slinec; + } + if(leftover> 0){ + slinec = (is_in_one_chunk(input_v_ptr, leftover_size, 128) + ? 
slinep + : *input_v_ptr++); + + sline = Q6_V_valign_VVR(slinec, slinep, (size_t) input); + + HVX_Vector sout = hvx_vec_fast_sigmoid_fp32_guard(sline, one, max_exp, min_exp); + /* Store output */ + hvx_vec_store_u(output_v_ptr, leftover_size, sout); + } + + +} + + + +// static inline void hvx_sigmoid_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int num_elems){ +// int step_of_1 = num_elems >> 5; // divby 32, because 32 float = 128 bytes per HVX vector +// int leftover = num_elems - (step_of_1 * VLEN_FP32); + +// // assert(remaining == 0);//TODO: handle remaining elements later + + +// int32_t leftover_size = leftover * sizeof(float); + +// static const float kMinExp = -87.f; // 0 +// static const float kMaxExp = 87.f; // 1 + +// const HVX_Vector one = hvx_vec_splat_fp32(1.f); +// const HVX_Vector max_exp = hvx_vec_splat_fp32(kMaxExp); +// const HVX_Vector min_exp = hvx_vec_splat_fp32(kMinExp); + +// const float *input = (float *)src; +// float *output = (float *)dst; + +// HVX_UVector * input_v_ptr = (HVX_UVector *) input; +// HVX_UVector * output_v_ptr = (HVX_UVector *) output; + +// // #pragma unroll(4) NOTE: this actual got slower +// for(uint32_t i = step_of_1; i> 0; i--){ +// *((HVX_UVector *)(output_v_ptr++)) = hvx_vec_fast_sigmoid_fp32_guard(*(input_v_ptr++), one, max_exp, min_exp); + +// } + + +// if(leftover> 0){ + + +// HVX_Vector sout = hvx_vec_fast_sigmoid_fp32_guard(*(input_v_ptr++), one, max_exp, min_exp); +// /* Store output */ +// hvx_vec_store_u(output_v_ptr, leftover_size, sout); +// } + + +// } + float hvx_sum_of_squares_f32(const uint8_t * restrict src, const int num_elems); void hvx_mul_f32(const uint8_t * restrict src0, const uint8_t * restrict src1, From 8bc299ddefc2a46bbc0a3800f2bed7ae390ffd49 Mon Sep 17 00:00:00 2001 From: shouyud Date: Fri, 12 Dec 2025 17:15:53 -0500 Subject: [PATCH 08/14] Feat: larger l2prefetch block --- ggml/src/ggml-hexagon/htp/act-ops.c | 42 +++++++++++++++++------------ 1 file changed, 25 insertions(+), 17 deletions(-) diff --git a/ggml/src/ggml-hexagon/htp/act-ops.c b/ggml/src/ggml-hexagon/htp/act-ops.c index 5266567d37..97823575c1 100644 --- a/ggml/src/ggml-hexagon/htp/act-ops.c +++ b/ggml/src/ggml-hexagon/htp/act-ops.c @@ -298,27 +298,35 @@ static void unary_gelu_fp32_per_thread(const struct htp_tensor * src0, uint8_t * restrict src0_spad_data = src0_spad->data + (ith * src0_row_size); uint8_t * restrict dst_spad_data = dst_spad->data + (ith * dst_row_size); - for (uint32_t ir = src0_start_row; ir < src0_end_row; ir++) { - const float * restrict src0 = (float *) (data_src0 + (ir * src0_row_size)); - float * restrict dst = (float *) (data_dst + (ir * dst_row_size)); - - if (ir + 1 < src0_end_row) { - htp_l2fetch(src0 + src0_row_size, 1, src0_row_size, src0_row_size); + const int BLOCK = 8; + for (uint32_t ir = src0_start_row; ir < src0_end_row; ir += BLOCK) { + const uint32_t block_end = MIN(ir + BLOCK, src0_end_row); + + // Prefetch next block + if (block_end < src0_end_row) { + const float * restrict prefetch_ptr = (float *) (data_src0 + (block_end * src0_row_size)); + htp_l2fetch(prefetch_ptr, 1, block_end * src0_row_size, src0_row_size); } + // Process rows in current block + for (uint32_t ib = ir; ib < block_end; ib++) { + const float * restrict src0 = (float *) (data_src0 + (ib * src0_row_size)); + float * restrict dst = (float *) (data_dst + (ib * dst_row_size)); - // gelu = 0.5 * x * (1.0 + tanh( sqrt(2/pi) * (x + 0.044715 * x^3) )) // gelu_tanh - // gelu = x * sigmoid(1.702 * x) // current implementation - if (1 
== opt_path) { - hvx_mul_scalar_f32( (const uint8_t *) src0, (float)1.702, (uint8_t *) src0_spad_data, ne0); - hvx_fast_sigmoid_f32((const uint8_t *) src0_spad_data, (uint8_t *) src0_spad_data, ne0); + // gelu = 0.5 * x * (1.0 + tanh( sqrt(2/pi) * (x + 0.044715 * x^3) )) // gelu_tanh + // gelu = x * sigmoid(1.702 * x) // current implementation + if (1 == opt_path) { + hvx_mul_scalar_f32( (const uint8_t *) src0, (float)1.702, (uint8_t *) src0_spad_data, ne0); + hvx_fast_sigmoid_f32((const uint8_t *) src0_spad_data, (uint8_t *) src0_spad_data, ne0); - hvx_mul_f32_opt((const uint8_t *) src0, src0_spad_data, (uint8_t *) dst, ne0); - } else { - hvx_mul_scalar_f32( (const uint8_t *) src0, (float)1.702, (uint8_t *) src0_spad_data, ne0); - // sigmoid - hvx_sigmoid_f32((const uint8_t *) src0_spad_data, (uint8_t *) src0_spad_data, ne0); - hvx_mul_f32((const uint8_t *) src0, src0_spad_data, (uint8_t *) dst, ne0); + hvx_mul_f32_opt((const uint8_t *) src0, src0_spad_data, (uint8_t *) dst, ne0); + } + else { + hvx_mul_scalar_f32( (const uint8_t *) src0, (float)1.702, (uint8_t *) src0_spad_data, ne0); + // sigmoid + hvx_sigmoid_f32((const uint8_t *) src0_spad_data, (uint8_t *) src0_spad_data, ne0); + hvx_mul_f32((const uint8_t *) src0, src0_spad_data, (uint8_t *) dst, ne0); + } } } From cbd4e93296f64314cab9cbbc1f587437ed262ba1 Mon Sep 17 00:00:00 2001 From: shouyud Date: Fri, 12 Dec 2025 17:17:43 -0500 Subject: [PATCH 09/14] feat: apply unaligned-load optimization on mul and mul_scalar --- ggml/src/ggml-hexagon/htp/hvx-utils.c | 152 ++++++++++++++++++++++++-- 1 file changed, 140 insertions(+), 12 deletions(-) diff --git a/ggml/src/ggml-hexagon/htp/hvx-utils.c b/ggml/src/ggml-hexagon/htp/hvx-utils.c index d6e928c96f..b0099991cd 100644 --- a/ggml/src/ggml-hexagon/htp/hvx-utils.c +++ b/ggml/src/ggml-hexagon/htp/hvx-utils.c @@ -49,6 +49,8 @@ void hvx_mul_f32(const uint8_t * restrict src0, //FARF(HIGH, "hvx_mul_f32: unaligned loop in hvx op, possibly slower execution\n"); } + + bool handled_leftover = false; if (0 == unaligned_loop) { HVX_Vector * restrict vec_in1 = (HVX_Vector *) src0; HVX_Vector * restrict vec_in2 = (HVX_Vector *) src1; @@ -60,18 +62,88 @@ void hvx_mul_f32(const uint8_t * restrict src0, *vec_out++ = Q6_Vsf_equals_Vqf32(v); } } else { + // #pragma unroll(4) + // for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { + // HVX_Vector in1 = *(HVX_UVector *) (src0 + i * SIZEOF_FP32); + // HVX_Vector in2 = *(HVX_UVector *) (src1 + i * SIZEOF_FP32); + + // HVX_Vector out = Q6_Vqf32_vmpy_VsfVsf(in1, in2); + + // *(HVX_UVector *) (dst + i * SIZEOF_FP32) = Q6_Vsf_equals_Vqf32(out); + // } + + int step_of_1 = num_elems_whole >> 5; // divby 32, because 32 float = 128 bytes per HVX vector + int leftover_size = left_over * sizeof(float); + + + HVX_Vector * restrict vec_in1 = (HVX_Vector *) src0; + HVX_Vector * restrict vec_in2 = (HVX_Vector *) src1; + HVX_UVector * restrict vec_out = (HVX_UVector *) dst; + + + HVX_Vector slinep; + HVX_Vector slinec; + HVX_Vector sline; + HVX_Vector sline2p; + HVX_Vector sline2c; + HVX_Vector sline2; + + slinep = *vec_in1++; + sline2p = *vec_in2++; #pragma unroll(4) - for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { - HVX_Vector in1 = *(HVX_UVector *) (src0 + i * SIZEOF_FP32); - HVX_Vector in2 = *(HVX_UVector *) (src1 + i * SIZEOF_FP32); + for(uint32_t i = step_of_1 -1; i> 0; i--){ + slinec = *vec_in1++; + sline2c = *vec_in2++; + sline = Q6_V_valign_VVR(slinec, slinep, (size_t) src0); + sline2 = Q6_V_valign_VVR(sline2c, sline2p, (size_t) src1); + + *((HVX_UVector 
*)(vec_out++)) =Q6_Vsf_equals_Vqf32( Q6_Vqf32_vmpy_VsfVsf(sline, sline2)); + /* Prepare slinep for next iteration */ + slinep = slinec; + sline2p = sline2c; + } + if(step_of_1 > 1){ + slinec = htp_is_aligned(vec_in1, VLEN) && left_over == 0 ? slinep : *vec_in1++; + sline2c = htp_is_aligned(vec_in2, VLEN) && left_over == 0 ? sline2p : *vec_in2++; - HVX_Vector out = Q6_Vqf32_vmpy_VsfVsf(in1, in2); + sline = Q6_V_valign_VVR(slinec, slinep, (size_t) src0); + sline2 = Q6_V_valign_VVR(sline2c, sline2p, (size_t) src1); + *((HVX_UVector *)(vec_out++)) =Q6_Vsf_equals_Vqf32( Q6_Vqf32_vmpy_VsfVsf(sline, sline2)); + /* Prepare slinep for next iteration */ + slinep = slinec; + sline2p = sline2c; + } + if(left_over > 0 ){ - *(HVX_UVector *) (dst + i * SIZEOF_FP32) = Q6_Vsf_equals_Vqf32(out); + slinec = (is_in_one_chunk(vec_in1, leftover_size, VLEN) + ? slinep + : *vec_in1++); + + sline = Q6_V_valign_VVR(slinec, slinep, (size_t) src0); + sline2c = (is_in_one_chunk(vec_in2, leftover_size, VLEN) + ? sline2p + : *vec_in2++); + sline2 = Q6_V_valign_VVR(sline2c, sline2p, (size_t) src1); + + HVX_Vector out = Q6_Vqf32_vmpy_VsfVsf(sline, sline2); + hvx_vec_store_u(vec_out, leftover_size, Q6_Vsf_equals_Vqf32(out)); + handled_leftover = true; } } - if (left_over > 0) { + // if (left_over > 0 ) { + // const float * src0f = (const float *) src0 + num_elems_whole; + // const float * src1f = (const float *) src1 + num_elems_whole; + // float * dstf = (float *) dst + num_elems_whole; + + // HVX_Vector in1 = *(HVX_UVector *) src0f; + // HVX_Vector in2 = *(HVX_UVector *) src1f; + + // HVX_Vector out = Q6_Vqf32_vmpy_VsfVsf(in1, in2); + // hvx_vec_store_u((void *) dstf, left_over * SIZEOF_FP32, Q6_Vsf_equals_Vqf32(out)); + // } + + if (left_over > 0 && !handled_leftover) { const float * src0f = (const float *) src0 + num_elems_whole; const float * src1f = (const float *) src1 + num_elems_whole; float * dstf = (float *) dst + num_elems_whole; @@ -464,7 +536,7 @@ void hvx_mul_scalar_f32(const uint8_t * restrict src, const float val, uint8_t * } HVX_Vector val_vec = hvx_vec_splat_fp32(val); - + bool handled_leftover = false; if (0 == unaligned_loop) { HVX_Vector * restrict vec_in1 = (HVX_Vector *) src; HVX_Vector * restrict vec_out = (HVX_Vector *) dst; @@ -475,17 +547,73 @@ void hvx_mul_scalar_f32(const uint8_t * restrict src, const float val, uint8_t * *vec_out++ = Q6_Vsf_equals_Vqf32(v); } } else { + // #pragma unroll(4) + // for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { + // HVX_Vector in = *(HVX_UVector *) (src + i * SIZEOF_FP32); + + // HVX_Vector out = Q6_Vqf32_vmpy_VsfVsf(in, val_vec); + + // *(HVX_UVector *) (dst + i * SIZEOF_FP32) = Q6_Vsf_equals_Vqf32(out); + // } + + int step_of_1 = num_elems >> 5; // divby 32, because 32 float = 128 bytes per HVX vector + int leftover_size = left_over * sizeof(float); + + + + HVX_Vector * input_v_ptr = (HVX_Vector *) src; + HVX_UVector * output_v_ptr = (HVX_UVector *) dst; + + + HVX_Vector slinep; + HVX_Vector slinec; + HVX_Vector sline; + + slinep = *input_v_ptr++; + #pragma unroll(4) - for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { - HVX_Vector in = *(HVX_UVector *) (src + i * SIZEOF_FP32); + for(uint32_t i = step_of_1 - 1; i > 0; i--){ + slinec = *input_v_ptr++; + sline = Q6_V_valign_VVR(slinec, slinep, (size_t) src); + *((HVX_UVector *)(output_v_ptr++)) = Q6_Vsf_equals_Vqf32( Q6_Vqf32_vmpy_VsfVsf(sline, val_vec)); + /* Prepare slinep for next iteration */ + slinep = slinec; + } - HVX_Vector out = Q6_Vqf32_vmpy_VsfVsf(in, val_vec); + if(step_of_1 > 0){ - 
*(HVX_UVector *) (dst + i * SIZEOF_FP32) = Q6_Vsf_equals_Vqf32(out); + slinec = htp_is_aligned(input_v_ptr, VLEN) && left_over == 0 ? slinep : *input_v_ptr++; + sline = Q6_V_valign_VVR(slinec, slinep, (size_t) src); + *((HVX_UVector *)(output_v_ptr++)) = Q6_Vsf_equals_Vqf32( Q6_Vqf32_vmpy_VsfVsf(sline, val_vec)); + + slinep = slinec; + } + + if(leftover_size > 0){ + slinec = (is_in_one_chunk(input_v_ptr, leftover_size, VLEN) + ? slinep + : *input_v_ptr++); + + sline = Q6_V_valign_VVR(slinec, slinep, (size_t) src); + + HVX_Vector sout = Q6_Vsf_equals_Vqf32( Q6_Vqf32_vmpy_VsfVsf(sline, val_vec)); + /* Store output */ + hvx_vec_store_u(output_v_ptr, leftover_size, sout); + handled_leftover = true; } } - if (left_over > 0) { + // if (left_over > 0 ) { + // const float * srcf = (const float *) src + num_elems_whole; + // float * dstf = (float *) dst + num_elems_whole; + + // HVX_Vector in = *(HVX_UVector *) srcf; + + // HVX_Vector out = Q6_Vqf32_vmpy_VsfVsf(in, val_vec); + // hvx_vec_store_u((void *) dstf, left_over * SIZEOF_FP32, Q6_Vsf_equals_Vqf32(out)); + // } + + if (left_over > 0 && !handled_leftover) { const float * srcf = (const float *) src + num_elems_whole; float * dstf = (float *) dst + num_elems_whole; From e51b6bf2b94a6f0addbe18f1b822efcf4ad4b498 Mon Sep 17 00:00:00 2001 From: shouyud Date: Tue, 16 Dec 2025 08:27:57 -0500 Subject: [PATCH 10/14] Revert "debug: temporarily disable unnecessary log message for debug purpose" This reverts commit 84f2f23aa9f17e2fa826db969cd825d0ab192995. --- ggml/src/ggml-hexagon/htp/hvx-utils.c | 42 +++++++++++++-------------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/ggml/src/ggml-hexagon/htp/hvx-utils.c b/ggml/src/ggml-hexagon/htp/hvx-utils.c index b0099991cd..63c7c85427 100644 --- a/ggml/src/ggml-hexagon/htp/hvx-utils.c +++ b/ggml/src/ggml-hexagon/htp/hvx-utils.c @@ -40,13 +40,13 @@ void hvx_mul_f32(const uint8_t * restrict src0, int unaligned_loop = 0; if ((0 == htp_is_aligned((void *) src0, VLEN)) || (0 == htp_is_aligned((void *) src1, VLEN)) || (0 == htp_is_aligned((void *) dst, VLEN))) { - //FARF(HIGH, "hvx_mul_f32: unaligned address in hvx op, possibly slower execution\n"); + FARF(HIGH, "hvx_mul_f32: unaligned address in hvx op, possibly slower execution\n"); unaligned_addr = 1; } if ((1 == unaligned_addr) && (num_elems_whole != 0)) { unaligned_loop = 1; - //FARF(HIGH, "hvx_mul_f32: unaligned loop in hvx op, possibly slower execution\n"); + FARF(HIGH, "hvx_mul_f32: unaligned loop in hvx op, possibly slower execution\n"); } @@ -324,13 +324,13 @@ void hvx_add_f32(const uint8_t * restrict src0, int unaligned_loop = 0; if ((0 == htp_is_aligned((void *) src0, VLEN)) || (0 == htp_is_aligned((void *) src1, VLEN)) || (0 == htp_is_aligned((void *) dst, VLEN))) { - //FARF(HIGH, "hvx_add_f32: unaligned address in hvx op, possibly slower execution\n"); + FARF(HIGH, "hvx_add_f32: unaligned address in hvx op, possibly slower execution\n"); unaligned_addr = 1; } if ((1 == unaligned_addr) && (num_elems_whole != 0)) { unaligned_loop = 1; - //FARF(HIGH, "hvx_add_f32: unaligned loop in hvx op, possibly slower execution\n"); + FARF(HIGH, "hvx_add_f32: unaligned loop in hvx op, possibly slower execution\n"); } if (0 == unaligned_loop) { @@ -464,13 +464,13 @@ void hvx_add_scalar_f32(const uint8_t * restrict src, const float val, uint8_t * int unaligned_addr = 0; int unaligned_loop = 0; if ((0 == htp_is_aligned((void *) src, VLEN)) || (0 == htp_is_aligned((void *) dst, VLEN))) { - //FARF(HIGH, "hvx_add_scalar_f32: unaligned address in hvx op, 
possibly slower execution\n"); + FARF(HIGH, "hvx_add_scalar_f32: unaligned address in hvx op, possibly slower execution\n"); unaligned_addr = 1; } if ((1 == unaligned_addr) && (num_elems_whole != 0)) { unaligned_loop = 1; - //FARF(HIGH, "hvx_add_scalar_f32: unaligned loop in hvx op, possibly slower execution\n"); + FARF(HIGH, "hvx_add_scalar_f32: unaligned loop in hvx op, possibly slower execution\n"); } static const float kInf = INFINITY; @@ -526,13 +526,13 @@ void hvx_mul_scalar_f32(const uint8_t * restrict src, const float val, uint8_t * int unaligned_addr = 0; int unaligned_loop = 0; if ((0 == htp_is_aligned((void *) src, VLEN)) || (0 == htp_is_aligned((void *) dst, VLEN))) { - //FARF(HIGH, "hvx_mul_scalar_f32: unaligned address in hvx op, possibly slower execution\n"); + FARF(HIGH, "hvx_mul_scalar_f32: unaligned address in hvx op, possibly slower execution\n"); unaligned_addr = 1; } if ((1 == unaligned_addr) && (num_elems_whole != 0)) { unaligned_loop = 1; - //FARF(HIGH, "hvx_mul_scalar_f32: unaligned loop in hvx op, possibly slower execution\n"); + FARF(HIGH, "hvx_mul_scalar_f32: unaligned loop in hvx op, possibly slower execution\n"); } HVX_Vector val_vec = hvx_vec_splat_fp32(val); @@ -635,13 +635,13 @@ void hvx_sub_f32(const uint8_t * restrict src0, int unaligned_loop = 0; if ((0 == htp_is_aligned((void *) src0, VLEN)) || (0 == htp_is_aligned((void *) src1, VLEN)) || (0 == htp_is_aligned((void *) dst, VLEN))) { - //FARF(HIGH, "hvx_sub_f32: unaligned address in hvx op, possibly slower execution\n"); + FARF(HIGH, "hvx_sub_f32: unaligned address in hvx op, possibly slower execution\n"); unaligned_addr = 1; } if ((1 == unaligned_addr) && (num_elems_whole != 0)) { unaligned_loop = 1; - //FARF(HIGH, "hvx_sub_f32: unaligned loop in hvx op, possibly slower execution\n"); + FARF(HIGH, "hvx_sub_f32: unaligned loop in hvx op, possibly slower execution\n"); } if (0 == unaligned_loop) { @@ -775,13 +775,13 @@ void hvx_sub_scalar_f32(const uint8_t * restrict src, const float val, uint8_t * int unaligned_addr = 0; int unaligned_loop = 0; if ((0 == htp_is_aligned((void *) src, VLEN)) || (0 == htp_is_aligned((void *) dst, VLEN))) { - //FARF(HIGH, "hvx_sub_scalar_f32: unaligned address in hvx op, possibly slower execution\n"); + FARF(HIGH, "hvx_sub_scalar_f32: unaligned address in hvx op, possibly slower execution\n"); unaligned_addr = 1; } if ((1 == unaligned_addr) && (num_elems_whole != 0)) { unaligned_loop = 1; - //FARF(HIGH, "hvx_sub_scalar_f32: unaligned loop in hvx op, possibly slower execution\n"); + FARF(HIGH, "hvx_sub_scalar_f32: unaligned loop in hvx op, possibly slower execution\n"); } HVX_Vector val_vec = hvx_vec_splat_fp32(val); @@ -822,7 +822,7 @@ float hvx_sum_of_squares_f32(const uint8_t * restrict src, const int num_elems) int num_elems_whole = num_elems - left_over; if (0 == htp_is_aligned((void *) src, VLEN)) { - //FARF(HIGH, "hvx_sum_of_squares_f32: unaligned address in hvx op, possibly slower execution\n"); + FARF(HIGH, "hvx_sum_of_squares_f32: unaligned address in hvx op, possibly slower execution\n"); } assert((1 == htp_is_aligned((void *) src, VLEN)) || (0 == num_elems_whole)); @@ -861,13 +861,13 @@ float hvx_self_sum_f32(const uint8_t * restrict src, const int num_elems) { int unaligned_addr = 0; int unaligned_loop = 0; if (0 == htp_is_aligned((void *) src, VLEN)) { - //FARF(HIGH, "hvx_self_sum_f32: unaligned address in hvx op, possibly slower execution\n"); + FARF(HIGH, "hvx_self_sum_f32: unaligned address in hvx op, possibly slower execution\n"); unaligned_addr = 1; } if 
((1 == unaligned_addr) && (num_elems_whole != 0)) { unaligned_loop = 1; - //FARF(HIGH, "hvx_self_sum_f32: unaligned loop in hvx op, possibly slower execution\n"); + FARF(HIGH, "hvx_self_sum_f32: unaligned loop in hvx op, possibly slower execution\n"); } HVX_Vector sum_vec = Q6_V_vsplat_R(0x00000000); @@ -910,13 +910,13 @@ void hvx_scale_f32(const uint8_t * restrict src, uint8_t * restrict dst, const i int unaligned_addr = 0; int unaligned_loop = 0; if ((0 == htp_is_aligned((void *) src, VLEN)) || (0 == htp_is_aligned((void *) dst, VLEN))) { - //FARF(HIGH, "hvx_scale_f32: unaligned address in hvx op, possibly slower execution\n"); + FARF(HIGH, "hvx_scale_f32: unaligned address in hvx op, possibly slower execution\n"); unaligned_addr = 1; } if ((1 == unaligned_addr) && (num_elems_whole != 0)) { unaligned_loop = 1; - //FARF(HIGH, "hvx_scale_f32: unaligned loop in hvx op, possibly slower execution\n"); + FARF(HIGH, "hvx_scale_f32: unaligned loop in hvx op, possibly slower execution\n"); } HVX_Vector scale_vec = hvx_vec_splat_fp32(scale); @@ -959,13 +959,13 @@ float hvx_self_max_f32(const uint8_t * restrict src, const int num_elems) { int unaligned_addr = 0; int unaligned_loop = 0; if (0 == htp_is_aligned((void *) src, VLEN)) { - //FARF(HIGH, "hvx_self_max_f32: unaligned address in hvx op, possibly slower execution\n"); + FARF(HIGH, "hvx_self_max_f32: unaligned address in hvx op, possibly slower execution\n"); unaligned_addr = 1; } if ((1 == unaligned_addr) && (num_elems_whole != 0)) { unaligned_loop = 1; - //FARF(HIGH, "hvx_self_max_f32: unaligned loop in hvx op, possibly slower execution\n"); + FARF(HIGH, "hvx_self_max_f32: unaligned loop in hvx op, possibly slower execution\n"); } HVX_Vector vec_max = hvx_vec_splat_fp32(((const float *) src)[0]); @@ -1005,7 +1005,7 @@ void hvx_min_scalar_f32(const uint8_t * restrict src, const float val, uint8_t * size_t num_elems_whole = num_elems - left_over; if ((0 == htp_is_aligned((void *) src, VLEN)) || (0 == htp_is_aligned((void *) dst, VLEN))) { - //FARF(HIGH, "hvx_min_scalar_f32: unaligned address in hvx op, possibly slower execution\n"); + FARF(HIGH, "hvx_min_scalar_f32: unaligned address in hvx op, possibly slower execution\n"); } assert((1 == htp_is_aligned((void *) src, VLEN)) || (0 == num_elems_whole)); @@ -1044,7 +1044,7 @@ void hvx_clamp_scalar_f32(const uint8_t * restrict src, size_t num_elems_whole = num_elems - left_over; if ((0 == htp_is_aligned((void *) src, VLEN)) || (0 == htp_is_aligned((void *) dst, VLEN))) { - //FARF(HIGH, "hvx_clamp_scalar_f32: unaligned address in hvx op, possibly slower execution\n"); + FARF(HIGH, "hvx_clamp_scalar_f32: unaligned address in hvx op, possibly slower execution\n"); } assert((1 == htp_is_aligned((void *) src, VLEN)) || (0 == num_elems_whole)); From 05693357c8d60daa9694df44d1797b9e6256becd Mon Sep 17 00:00:00 2001 From: shouyud Date: Tue, 16 Dec 2025 08:31:16 -0500 Subject: [PATCH 11/14] refactor: cleanup commented unused code --- ggml/src/ggml-hexagon/htp/act-ops.c | 3 -- ggml/src/ggml-hexagon/htp/hvx-utils.c | 43 ------------------------- ggml/src/ggml-hexagon/htp/hvx-utils.h | 45 --------------------------- 3 files changed, 91 deletions(-) diff --git a/ggml/src/ggml-hexagon/htp/act-ops.c b/ggml/src/ggml-hexagon/htp/act-ops.c index 97823575c1..9d3e584a84 100644 --- a/ggml/src/ggml-hexagon/htp/act-ops.c +++ b/ggml/src/ggml-hexagon/htp/act-ops.c @@ -313,17 +313,14 @@ static void unary_gelu_fp32_per_thread(const struct htp_tensor * src0, const float * restrict src0 = (float *) (data_src0 + (ib * 
src0_row_size)); float * restrict dst = (float *) (data_dst + (ib * dst_row_size)); - // gelu = 0.5 * x * (1.0 + tanh( sqrt(2/pi) * (x + 0.044715 * x^3) )) // gelu_tanh // gelu = x * sigmoid(1.702 * x) // current implementation if (1 == opt_path) { hvx_mul_scalar_f32( (const uint8_t *) src0, (float)1.702, (uint8_t *) src0_spad_data, ne0); hvx_fast_sigmoid_f32((const uint8_t *) src0_spad_data, (uint8_t *) src0_spad_data, ne0); - hvx_mul_f32_opt((const uint8_t *) src0, src0_spad_data, (uint8_t *) dst, ne0); } else { hvx_mul_scalar_f32( (const uint8_t *) src0, (float)1.702, (uint8_t *) src0_spad_data, ne0); - // sigmoid hvx_sigmoid_f32((const uint8_t *) src0_spad_data, (uint8_t *) src0_spad_data, ne0); hvx_mul_f32((const uint8_t *) src0, src0_spad_data, (uint8_t *) dst, ne0); } diff --git a/ggml/src/ggml-hexagon/htp/hvx-utils.c b/ggml/src/ggml-hexagon/htp/hvx-utils.c index 63c7c85427..e7ee589f34 100644 --- a/ggml/src/ggml-hexagon/htp/hvx-utils.c +++ b/ggml/src/ggml-hexagon/htp/hvx-utils.c @@ -62,16 +62,6 @@ void hvx_mul_f32(const uint8_t * restrict src0, *vec_out++ = Q6_Vsf_equals_Vqf32(v); } } else { - // #pragma unroll(4) - // for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { - // HVX_Vector in1 = *(HVX_UVector *) (src0 + i * SIZEOF_FP32); - // HVX_Vector in2 = *(HVX_UVector *) (src1 + i * SIZEOF_FP32); - - // HVX_Vector out = Q6_Vqf32_vmpy_VsfVsf(in1, in2); - - // *(HVX_UVector *) (dst + i * SIZEOF_FP32) = Q6_Vsf_equals_Vqf32(out); - // } - int step_of_1 = num_elems_whole >> 5; // divby 32, because 32 float = 128 bytes per HVX vector int leftover_size = left_over * sizeof(float); @@ -98,7 +88,6 @@ void hvx_mul_f32(const uint8_t * restrict src0, sline2 = Q6_V_valign_VVR(sline2c, sline2p, (size_t) src1); *((HVX_UVector *)(vec_out++)) =Q6_Vsf_equals_Vqf32( Q6_Vqf32_vmpy_VsfVsf(sline, sline2)); - /* Prepare slinep for next iteration */ slinep = slinec; sline2p = sline2c; } @@ -109,7 +98,6 @@ void hvx_mul_f32(const uint8_t * restrict src0, sline = Q6_V_valign_VVR(slinec, slinep, (size_t) src0); sline2 = Q6_V_valign_VVR(sline2c, sline2p, (size_t) src1); *((HVX_UVector *)(vec_out++)) =Q6_Vsf_equals_Vqf32( Q6_Vqf32_vmpy_VsfVsf(sline, sline2)); - /* Prepare slinep for next iteration */ slinep = slinec; sline2p = sline2c; } @@ -131,17 +119,6 @@ void hvx_mul_f32(const uint8_t * restrict src0, } } - // if (left_over > 0 ) { - // const float * src0f = (const float *) src0 + num_elems_whole; - // const float * src1f = (const float *) src1 + num_elems_whole; - // float * dstf = (float *) dst + num_elems_whole; - - // HVX_Vector in1 = *(HVX_UVector *) src0f; - // HVX_Vector in2 = *(HVX_UVector *) src1f; - - // HVX_Vector out = Q6_Vqf32_vmpy_VsfVsf(in1, in2); - // hvx_vec_store_u((void *) dstf, left_over * SIZEOF_FP32, Q6_Vsf_equals_Vqf32(out)); - // } if (left_over > 0 && !handled_leftover) { const float * src0f = (const float *) src0 + num_elems_whole; @@ -547,15 +524,6 @@ void hvx_mul_scalar_f32(const uint8_t * restrict src, const float val, uint8_t * *vec_out++ = Q6_Vsf_equals_Vqf32(v); } } else { - // #pragma unroll(4) - // for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { - // HVX_Vector in = *(HVX_UVector *) (src + i * SIZEOF_FP32); - - // HVX_Vector out = Q6_Vqf32_vmpy_VsfVsf(in, val_vec); - - // *(HVX_UVector *) (dst + i * SIZEOF_FP32) = Q6_Vsf_equals_Vqf32(out); - // } - int step_of_1 = num_elems >> 5; // divby 32, because 32 float = 128 bytes per HVX vector int leftover_size = left_over * sizeof(float); @@ -597,22 +565,11 @@ void hvx_mul_scalar_f32(const uint8_t * restrict src, const 
float val, uint8_t * sline = Q6_V_valign_VVR(slinec, slinep, (size_t) src); HVX_Vector sout = Q6_Vsf_equals_Vqf32( Q6_Vqf32_vmpy_VsfVsf(sline, val_vec)); - /* Store output */ hvx_vec_store_u(output_v_ptr, leftover_size, sout); handled_leftover = true; } } - // if (left_over > 0 ) { - // const float * srcf = (const float *) src + num_elems_whole; - // float * dstf = (float *) dst + num_elems_whole; - - // HVX_Vector in = *(HVX_UVector *) srcf; - - // HVX_Vector out = Q6_Vqf32_vmpy_VsfVsf(in, val_vec); - // hvx_vec_store_u((void *) dstf, left_over * SIZEOF_FP32, Q6_Vsf_equals_Vqf32(out)); - // } - if (left_over > 0 && !handled_leftover) { const float * srcf = (const float *) src + num_elems_whole; float * dstf = (float *) dst + num_elems_whole; diff --git a/ggml/src/ggml-hexagon/htp/hvx-utils.h b/ggml/src/ggml-hexagon/htp/hvx-utils.h index 6c713b40eb..0b24786391 100644 --- a/ggml/src/ggml-hexagon/htp/hvx-utils.h +++ b/ggml/src/ggml-hexagon/htp/hvx-utils.h @@ -1003,9 +1003,6 @@ static inline void hvx_sigmoid_f32(const uint8_t * restrict src, uint8_t * restr int step_of_1 = num_elems >> 5; // divby 32, because 32 float = 128 bytes per HVX vector int leftover = num_elems - (step_of_1 * VLEN_FP32); - // assert(remaining == 0);//TODO: handle remaining elements later - - int32_t leftover_size = leftover * sizeof(float); static const float kMinExp = -87.f; // 0 @@ -1053,7 +1050,6 @@ static inline void hvx_sigmoid_f32(const uint8_t * restrict src, uint8_t * restr sline = Q6_V_valign_VVR(slinec, slinep, (size_t) input); HVX_Vector sout = hvx_vec_fast_sigmoid_fp32_guard(sline, one, max_exp, min_exp); - /* Store output */ hvx_vec_store_u(output_v_ptr, leftover_size, sout); } @@ -1061,47 +1057,6 @@ static inline void hvx_sigmoid_f32(const uint8_t * restrict src, uint8_t * restr } - -// static inline void hvx_sigmoid_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int num_elems){ -// int step_of_1 = num_elems >> 5; // divby 32, because 32 float = 128 bytes per HVX vector -// int leftover = num_elems - (step_of_1 * VLEN_FP32); - -// // assert(remaining == 0);//TODO: handle remaining elements later - - -// int32_t leftover_size = leftover * sizeof(float); - -// static const float kMinExp = -87.f; // 0 -// static const float kMaxExp = 87.f; // 1 - -// const HVX_Vector one = hvx_vec_splat_fp32(1.f); -// const HVX_Vector max_exp = hvx_vec_splat_fp32(kMaxExp); -// const HVX_Vector min_exp = hvx_vec_splat_fp32(kMinExp); - -// const float *input = (float *)src; -// float *output = (float *)dst; - -// HVX_UVector * input_v_ptr = (HVX_UVector *) input; -// HVX_UVector * output_v_ptr = (HVX_UVector *) output; - -// // #pragma unroll(4) NOTE: this actual got slower -// for(uint32_t i = step_of_1; i> 0; i--){ -// *((HVX_UVector *)(output_v_ptr++)) = hvx_vec_fast_sigmoid_fp32_guard(*(input_v_ptr++), one, max_exp, min_exp); - -// } - - -// if(leftover> 0){ - - -// HVX_Vector sout = hvx_vec_fast_sigmoid_fp32_guard(*(input_v_ptr++), one, max_exp, min_exp); -// /* Store output */ -// hvx_vec_store_u(output_v_ptr, leftover_size, sout); -// } - - -// } - float hvx_sum_of_squares_f32(const uint8_t * restrict src, const int num_elems); void hvx_mul_f32(const uint8_t * restrict src0, const uint8_t * restrict src1, From 952877ec24732b12010c7fa7ed3fc8de4b74e718 Mon Sep 17 00:00:00 2001 From: shouyud Date: Tue, 16 Dec 2025 08:41:54 -0500 Subject: [PATCH 12/14] chore: reformat code with clang-formatter to pass cli test --- ggml/src/ggml-hexagon/ggml-hexagon.cpp | 207 ++++++++++++------------- 
ggml/src/ggml-hexagon/htp/act-ops.c | 12 +- ggml/src/ggml-hexagon/htp/htp-msg.h | 8 +- ggml/src/ggml-hexagon/htp/hvx-utils.c | 145 ++++++++--------- ggml/src/ggml-hexagon/htp/hvx-utils.h | 87 +++++------ ggml/src/ggml-hexagon/htp/main.c | 22 +-- 6 files changed, 221 insertions(+), 260 deletions(-) diff --git a/ggml/src/ggml-hexagon/ggml-hexagon.cpp b/ggml/src/ggml-hexagon/ggml-hexagon.cpp index c45b292a52..781db7facf 100644 --- a/ggml/src/ggml-hexagon/ggml-hexagon.cpp +++ b/ggml/src/ggml-hexagon/ggml-hexagon.cpp @@ -8,8 +8,8 @@ #include #include #include -#include #include +#include #ifdef _WIN32 # include @@ -53,10 +53,12 @@ static int opt_opmask = HTP_OPMASK_QUEUE | HTP_OPMASK_QUANTIZE | HTP_OPMASK_COMP static int opt_opsync = 0; // synchronous ops #define HEX_VERBOSE(...) \ - if (opt_verbose) GGML_LOG_DEBUG(__VA_ARGS__) + if (opt_verbose) \ + GGML_LOG_DEBUG(__VA_ARGS__) #define HEX_PROFILE(...) \ - if (opt_profile) GGML_LOG_INFO(__VA_ARGS__) + if (opt_profile) \ + GGML_LOG_INFO(__VA_ARGS__) static inline uint64_t hex_is_aligned(void * addr, uint32_t align) { return ((size_t) addr & (align - 1)) == 0; @@ -218,7 +220,7 @@ struct ggml_hexagon_session { void allocate(int dev_id) noexcept(false); void release() noexcept(true); - void enqueue(struct htp_general_req &req, struct dspqueue_buffer *bufs, uint32_t n_bufs, bool sync = false); + void enqueue(struct htp_general_req & req, struct dspqueue_buffer * bufs, uint32_t n_bufs, bool sync = false); void flush(); ggml_backend_buffer_type buffer_type; @@ -258,7 +260,10 @@ static inline void hex_print_op_info(const ggml_tensor * op, ggml_hexagon_sessio names, dims, types, strides, buffs, req_flags); } -void ggml_hexagon_session::enqueue(struct htp_general_req &req, struct dspqueue_buffer *bufs, uint32_t n_bufs, bool sync) { +void ggml_hexagon_session::enqueue(struct htp_general_req & req, + struct dspqueue_buffer * bufs, + uint32_t n_bufs, + bool sync) { // Bump pending flag (cleared in the session::flush once we get the responce) this->op_pending++; // atomic inc @@ -298,13 +303,13 @@ void ggml_hexagon_session::flush() { // Read response packet from queue int err = dspqueue_read(q, &flags, - HTP_MAX_PACKET_BUFFERS, // Maximum number of buffer references - &n_bufs, // Number of buffer references - bufs, // Buffer references - sizeof(rsp), // Max message length - &rsp_size, // Message length - (uint8_t *) &rsp, - 1000000); // Timeout + HTP_MAX_PACKET_BUFFERS, // Maximum number of buffer references + &n_bufs, // Number of buffer references + bufs, // Buffer references + sizeof(rsp), // Max message length + &rsp_size, // Message length + (uint8_t *) &rsp, + 1000000); // Timeout if (err == AEE_EEXPIRED) { // TODO: might need to bail out if the HTP is stuck on something @@ -354,8 +359,8 @@ struct ggml_backend_hexagon_buffer_context { int err = fastrpc_mmap(s->domain_id, this->fd, (void *) this->base, 0, this->size, FASTRPC_MAP_FD); if (err != 0) { - GGML_LOG_ERROR("ggml-hex: buffer mapping failed : domain_id %d size %zu fd %d error 0x%08x\n", - s->domain_id, this->size, this->fd, (unsigned) err); + GGML_LOG_ERROR("ggml-hex: buffer mapping failed : domain_id %d size %zu fd %d error 0x%08x\n", s->domain_id, + this->size, this->fd, (unsigned) err); return false; } @@ -386,10 +391,12 @@ struct ggml_backend_hexagon_buffer_context { size += 4 * 1024; // extra page for padding if (rpcmem_alloc2) { - this->base = (uint8_t *) rpcmem_alloc2(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS | RPCMEM_HEAP_NOREG, size); + this->base = + (uint8_t *) 
rpcmem_alloc2(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS | RPCMEM_HEAP_NOREG, size); } else { GGML_LOG_INFO("ggml-hex: %s rpcmem_alloc2 not found, falling back to rpcmem_alloc\n", sess->name.c_str()); - this->base = (uint8_t *) rpcmem_alloc(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS | RPCMEM_HEAP_NOREG, size); + this->base = + (uint8_t *) rpcmem_alloc(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS | RPCMEM_HEAP_NOREG, size); } if (!this->base) { @@ -453,7 +460,7 @@ static enum ggml_status ggml_backend_hexagon_buffer_init_tensor(ggml_backend_buf (int) ctx->repack); if (tensor->view_src != NULL && tensor->view_offs == 0) { - ; // nothing to do for the view + ; // nothing to do for the view } else { if (!ctx->mapped) { ctx->mmap(); @@ -702,8 +709,8 @@ static void repack_q4_0_q4x4x2(ggml_tensor * t, const void * data, size_t size) // Ensure we don't try to read more data than is available in the source buffer 'data' // or write more than the tensor can hold. - const size_t total_tensor_size = (size_t)nrows * row_size; - const size_t n_bytes_to_copy = size < total_tensor_size ? size : total_tensor_size; + const size_t total_tensor_size = (size_t) nrows * row_size; + const size_t n_bytes_to_copy = size < total_tensor_size ? size : total_tensor_size; // Calculate how many full rows and how many remaining bytes we need to process. const int64_t n_full_rows = n_bytes_to_copy / row_size; @@ -732,7 +739,7 @@ static void repack_q4_0_q4x4x2(ggml_tensor * t, const void * data, size_t size) // 2. Process the final, potentially partial, row if (n_rem_bytes > 0) { - const int64_t i = n_full_rows; + const int64_t i = n_full_rows; const uint8_t * src = (const uint8_t *) data + (i * row_size); uint8_t * dst = (uint8_t *) t->data + (i * row_size); @@ -762,8 +769,8 @@ static void repack_q4x4x2_q4_0(void * data, const ggml_tensor * t, size_t size) size_t row_size_rp = row_size * 2; // extra space for tmp pad (if any) // Ensure we don't try to copy more data than the tensor actually contains. - const size_t total_tensor_size = (size_t)nrows * row_size; - const size_t n_bytes_to_copy = size < total_tensor_size ? size : total_tensor_size; + const size_t total_tensor_size = (size_t) nrows * row_size; + const size_t n_bytes_to_copy = size < total_tensor_size ? size : total_tensor_size; // Calculate how many full rows and how many remaining bytes we need to process. const int64_t n_full_rows = n_bytes_to_copy / row_size; @@ -792,7 +799,7 @@ static void repack_q4x4x2_q4_0(void * data, const ggml_tensor * t, size_t size) // 2. Process the final, potentially partial, row if (n_rem_bytes > 0) { - const int64_t i = n_full_rows; + const int64_t i = n_full_rows; const uint8_t * src = (const uint8_t *) t->data + (i * row_size); uint8_t * dst = (uint8_t *) data + (i * row_size); @@ -1028,8 +1035,8 @@ static void repack_q8_0_q8x4x2(ggml_tensor * t, const void * data, size_t size) // Ensure we don't try to read more data than is available in the source buffer 'data' // or write more than the tensor can hold. - const size_t total_tensor_size = (size_t)nrows * row_size; - const size_t n_bytes_to_copy = size < total_tensor_size ? size : total_tensor_size; + const size_t total_tensor_size = (size_t) nrows * row_size; + const size_t n_bytes_to_copy = size < total_tensor_size ? size : total_tensor_size; // Calculate how many full rows and how many remaining bytes we need to process. 
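// A quick worked example of the split computed just below (illustrative numbers only,
// not values taken from this backend): with a hypothetical row_size of 100 bytes and
// n_bytes_to_copy of 250 bytes,
//     n_full_rows = 250 / 100 = 2    // whole rows repacked by the main loop
//     n_rem_bytes = 250 % 100 = 50   // tail handled by the partial-row path
// The same arithmetic as a standalone helper (hypothetical sketch, not part of this file):
//     static inline void split_rows(size_t nbytes, size_t row_size, int64_t * full, int64_t * rem) {
//         *full = (int64_t) (nbytes / row_size);   // complete rows
//         *rem  = (int64_t) (nbytes % row_size);   // leftover bytes of the final partial row
//     }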
const int64_t n_full_rows = n_bytes_to_copy / row_size; @@ -1058,7 +1065,7 @@ static void repack_q8_0_q8x4x2(ggml_tensor * t, const void * data, size_t size) // 2. Process the final, potentially partial, row if (n_rem_bytes > 0) { - const int64_t i = n_full_rows; + const int64_t i = n_full_rows; const uint8_t * src = (const uint8_t *) data + (i * row_size); uint8_t * dst = (uint8_t *) t->data + (i * row_size); @@ -1088,8 +1095,8 @@ static void repack_q8x4x2_q8_0(void * data, const ggml_tensor * t, size_t size) size_t row_size_rp = row_size * 2; // extra space for tmp pad (if any) // Ensure we don't try to copy more data than the tensor actually contains. - const size_t total_tensor_size = (size_t)nrows * row_size; - const size_t n_bytes_to_copy = size < total_tensor_size ? size : total_tensor_size; + const size_t total_tensor_size = (size_t) nrows * row_size; + const size_t n_bytes_to_copy = size < total_tensor_size ? size : total_tensor_size; // Calculate how many full rows and how many remaining bytes we need to process. const int64_t n_full_rows = n_bytes_to_copy / row_size; @@ -1118,7 +1125,7 @@ static void repack_q8x4x2_q8_0(void * data, const ggml_tensor * t, size_t size) // 2. Process the final, potentially partial, row if (n_rem_bytes > 0) { - const int64_t i = n_full_rows; + const int64_t i = n_full_rows; const uint8_t * src = (const uint8_t *) t->data + (i * row_size); uint8_t * dst = (uint8_t *) data + (i * row_size); @@ -1379,8 +1386,8 @@ static void repack_mxfp4_mxfp4x4x2(ggml_tensor * t, const void * data, size_t si // Ensure we don't try to read more data than is available in the source buffer 'data' // or write more than the tensor can hold. - const size_t total_tensor_size = (size_t)nrows * row_size; - const size_t n_bytes_to_copy = size < total_tensor_size ? size : total_tensor_size; + const size_t total_tensor_size = (size_t) nrows * row_size; + const size_t n_bytes_to_copy = size < total_tensor_size ? size : total_tensor_size; // Calculate how many full rows and how many remaining bytes we need to process. const int64_t n_full_rows = n_bytes_to_copy / row_size; @@ -1409,7 +1416,7 @@ static void repack_mxfp4_mxfp4x4x2(ggml_tensor * t, const void * data, size_t si // 2. Process the final, potentially partial, row if (n_rem_bytes > 0) { - const int64_t i = n_full_rows; + const int64_t i = n_full_rows; const uint8_t * src = (const uint8_t *) data + (i * row_size); uint8_t * dst = (uint8_t *) t->data + (i * row_size); @@ -1439,8 +1446,8 @@ static void repack_mxfp4x4x2_mxfp4(void * data, const ggml_tensor * t, size_t si size_t row_size_rp = row_size * 2; // extra space for tmp pad (if any) // Ensure we don't try to copy more data than the tensor actually contains. - const size_t total_tensor_size = (size_t)nrows * row_size; - const size_t n_bytes_to_copy = size < total_tensor_size ? size : total_tensor_size; + const size_t total_tensor_size = (size_t) nrows * row_size; + const size_t n_bytes_to_copy = size < total_tensor_size ? size : total_tensor_size; // Calculate how many full rows and how many remaining bytes we need to process. const int64_t n_full_rows = n_bytes_to_copy / row_size; @@ -1469,7 +1476,7 @@ static void repack_mxfp4x4x2_mxfp4(void * data, const ggml_tensor * t, size_t si // 2. 
Process the final, potentially partial, row if (n_rem_bytes > 0) { - const int64_t i = n_full_rows; + const int64_t i = n_full_rows; const uint8_t * src = (const uint8_t *) t->data + (i * row_size); uint8_t * dst = (uint8_t *) data + (i * row_size); @@ -1592,25 +1599,28 @@ static const char * ggml_backend_hexagon_buffer_type_name(ggml_backend_buffer_ty return static_cast(buffer_type->context)->name.c_str(); } -static ggml_backend_buffer_t ggml_backend_hexagon_buffer_type_alloc_buffer( - ggml_backend_buffer_type_t buffer_type, size_t size) { +static ggml_backend_buffer_t ggml_backend_hexagon_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buffer_type, + size_t size) { auto sess = static_cast(buffer_type->context)->sess; try { - ggml_backend_hexagon_buffer_context * ctx = new ggml_backend_hexagon_buffer_context(sess, size, false /*repack*/); + ggml_backend_hexagon_buffer_context * ctx = + new ggml_backend_hexagon_buffer_context(sess, size, false /*repack*/); return ggml_backend_buffer_init(buffer_type, ggml_backend_hexagon_buffer_interface, ctx, size); - } catch (std::exception const &exc) { + } catch (const std::exception & exc) { GGML_LOG_ERROR("ggml-hex: %s failed to allocate buffer context: %s\n", sess->name.c_str(), exc.what()); return nullptr; } } static ggml_backend_buffer_t ggml_backend_hexagon_repack_buffer_type_alloc_buffer( - ggml_backend_buffer_type_t buffer_type, size_t size) { + ggml_backend_buffer_type_t buffer_type, + size_t size) { auto sess = static_cast(buffer_type->context)->sess; try { - ggml_backend_hexagon_buffer_context * ctx = new ggml_backend_hexagon_buffer_context(sess, size, true /*repack*/); + ggml_backend_hexagon_buffer_context * ctx = + new ggml_backend_hexagon_buffer_context(sess, size, true /*repack*/); return ggml_backend_buffer_init(buffer_type, ggml_backend_hexagon_buffer_interface, ctx, size); - } catch (std::exception const &exc) { + } catch (const std::exception & exc) { GGML_LOG_ERROR("ggml-hex: %s failed to allocate buffer context: %s\n", sess->name.c_str(), exc.what()); return nullptr; } @@ -1621,7 +1631,8 @@ static size_t ggml_backend_hexagon_buffer_type_get_alignment(ggml_backend_buffer GGML_UNUSED(buffer_type); } -static size_t ggml_backend_hexagon_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const struct ggml_tensor * t) { +static size_t ggml_backend_hexagon_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, + const struct ggml_tensor * t) { return ggml_nbytes(t); } @@ -1697,8 +1708,8 @@ void ggml_hexagon_session::allocate(int dev_id) noexcept(false) { } // Save the IDs - this->session_id = n.session_id; - this->domain_id = n.effective_domain_id; + this->session_id = n.session_id; + this->domain_id = n.effective_domain_id; this->valid_session = true; } @@ -1707,16 +1718,17 @@ void ggml_hexagon_session::allocate(int dev_id) noexcept(false) { char session_uri[256]; { char htp_uri[256]; - snprintf(htp_uri, sizeof(htp_uri), "file:///libggml-htp-v%u.so?htp_iface_skel_handle_invoke&_modver=1.0", opt_arch); + snprintf(htp_uri, sizeof(htp_uri), "file:///libggml-htp-v%u.so?htp_iface_skel_handle_invoke&_modver=1.0", + opt_arch); struct remote_rpc_get_uri u = {}; - u.session_id = this->session_id; - u.domain_name = const_cast(CDSP_DOMAIN_NAME); - u.domain_name_len = strlen(CDSP_DOMAIN_NAME); - u.module_uri = const_cast(htp_uri); - u.module_uri_len = strlen(htp_uri); - u.uri = session_uri; - u.uri_len = sizeof(session_uri); + u.session_id = this->session_id; + u.domain_name = const_cast(CDSP_DOMAIN_NAME); + u.domain_name_len = 
strlen(CDSP_DOMAIN_NAME); + u.module_uri = const_cast(htp_uri); + u.module_uri_len = strlen(htp_uri); + u.uri = session_uri; + u.uri_len = sizeof(session_uri); int err = remote_session_control(FASTRPC_GET_URI, (void *) &u, sizeof(u)); if (err != AEE_SUCCESS) { @@ -1725,7 +1737,9 @@ void ggml_hexagon_session::allocate(int dev_id) noexcept(false) { snprintf(session_uri, htp_URI_domain_len, "%s%s", htp_uri, my_domain->uri); - GGML_LOG_WARN("ggml-hex: failed to get URI for session %d : error 0x%x. Falling back to single session URI: %s\n", dev_id, err, session_uri); + GGML_LOG_WARN( + "ggml-hex: failed to get URI for session %d : error 0x%x. Falling back to single session URI: %s\n", + dev_id, err, session_uri); } } @@ -1751,7 +1765,7 @@ void ggml_hexagon_session::allocate(int dev_id) noexcept(false) { this->valid_handle = true; GGML_LOG_INFO("ggml-hex: new session: %s : session-id %d domain-id %d uri %s handle 0x%lx\n", this->name.c_str(), - this->session_id, this->domain_id, session_uri, (unsigned long) this->handle); + this->session_id, this->domain_id, session_uri, (unsigned long) this->handle); // Enable FastRPC QoS mode { @@ -1841,8 +1855,8 @@ ggml_hexagon_session::ggml_hexagon_session(int dev_id, ggml_backend_dev_t dev) n buffer_type.context = nullptr; repack_buffer_type.context = nullptr; - buffer_type.device = dev; - repack_buffer_type.device = dev; + buffer_type.device = dev; + repack_buffer_type.device = dev; try { allocate(dev_id); @@ -1852,7 +1866,7 @@ ggml_hexagon_session::ggml_hexagon_session(int dev_id, ggml_backend_dev_t dev) n repack_buffer_type.iface = ggml_backend_hexagon_repack_buffer_type_interface; repack_buffer_type.context = new ggml_backend_hexagon_buffer_type_context(this->name + "-REPACK", this); - } catch (std::exception const &exc) { + } catch (const std::exception & exc) { release(); throw; } @@ -1861,8 +1875,8 @@ ggml_hexagon_session::ggml_hexagon_session(int dev_id, ggml_backend_dev_t dev) n ggml_hexagon_session::~ggml_hexagon_session() noexcept(true) { release(); - delete static_cast(buffer_type.context); - delete static_cast(repack_buffer_type.context); + delete static_cast(buffer_type.context); + delete static_cast(repack_buffer_type.context); } // ** backend interface @@ -2164,11 +2178,11 @@ static bool ggml_hexagon_supported_activations(const struct ggml_hexagon_session } // src0, src1 & dst must be mapped to the same session - if(src1){ + if (src1) { if (!hex_supported_buffer(sess, src0, src1, dst)) { return false; } - }else{ + } else { if (!hex_supported_buffer(sess, src0, dst)) { return false; } @@ -2306,11 +2320,11 @@ static size_t dspqueue_buffers_init(dspqueue_buffer * buf, const ggml_tensor * t memset(buf, 0, sizeof(*buf)); auto tensor_buf = static_cast(t->buffer->context); - buf->fd = tensor_buf->fd; - buf->ptr = t->data; - buf->offset = (uint8_t *) t->data - tensor_buf->base; - buf->size = ggml_nbytes(t); - buf->flags = (flush_host ? DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER : 0); // Flush CPU + buf->fd = tensor_buf->fd; + buf->ptr = t->data; + buf->offset = (uint8_t *) t->data - tensor_buf->base; + buf->size = ggml_nbytes(t); + buf->flags = (flush_host ? DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER : 0); // Flush CPU buf->flags |= (flush_htp ? 
DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT : 0); // Invalidate DSP return 1; } @@ -2670,8 +2684,7 @@ static void ggml_hexagon_unary(const struct ggml_tensor * op, uint32_t flags) { if (ggml_get_unary_op(dst) == GGML_UNARY_OP_SILU) { req.op = HTP_OP_UNARY_SILU; supported = true; - } - else if (ggml_get_unary_op(dst) == GGML_UNARY_OP_GELU){ + } else if (ggml_get_unary_op(dst) == GGML_UNARY_OP_GELU) { req.op = HTP_OP_UNARY_GELU; supported = true; } @@ -2902,8 +2915,7 @@ static inline bool op_reuse_src1(const ggml_tensor * op1, const ggml_tensor * op return (op0 && op0->src[1] == op1->src[1]); } -static inline bool is_compute_op(ggml_tensor *node) -{ +static inline bool is_compute_op(ggml_tensor * node) { return !(ggml_op_is_empty(node->op) || ggml_is_empty(node)); } @@ -3013,29 +3025,17 @@ struct node_info { std::vector fused; - ggml_op op() const { - return node->op; - } + ggml_op op() const { return node->op; } - const ggml_tensor * dst() const { - return fused.empty() ? node : fused.back(); - } + const ggml_tensor * dst() const { return fused.empty() ? node : fused.back(); } - const ggml_tensor * src0() const { - return node->src[0]; - } + const ggml_tensor * src0() const { return node->src[0]; } - const ggml_tensor * src1() const { - return node->src[1]; - } + const ggml_tensor * src1() const { return node->src[1]; } - bool is_empty() const { - return ggml_op_is_empty(node->op); - } + bool is_empty() const { return ggml_op_is_empty(node->op); } - void add_fused(ggml_tensor * t) { - fused.push_back(t); - } + void add_fused(ggml_tensor * t) { fused.push_back(t); } bool stackable() const { switch (this->op()) { @@ -3047,9 +3047,7 @@ struct node_info { } } - bool same_input(const node_info& n) const { - return n.src1() == this->src1(); - } + bool same_input(const node_info & n) const { return n.src1() == this->src1(); } }; static std::vector ggml_hexagon_graph_optimize_reorder(const std::vector & nodes) { @@ -3114,25 +3112,21 @@ static void ggml_backend_hexagon_graph_optimize(ggml_backend_t backend, ggml_cgr // and perform the reorder over the fused nodes. 
after the reorder is done, we unfuse for (int i = 0; i < n; i++) { node_info node = { - /*.node =*/ gf->nodes[i], - /*.fused =*/ {}, + /*.node =*/gf->nodes[i], + /*.fused =*/{}, }; // fuse only ops that start with these operations // can be expanded when needed - if (node.op() == GGML_OP_ADD || - node.op() == GGML_OP_NORM || - node.op() == GGML_OP_RMS_NORM) { + if (node.op() == GGML_OP_ADD || node.op() == GGML_OP_NORM || node.op() == GGML_OP_RMS_NORM) { ops[0] = node.op(); int f = i + 1; while (f < n && f < i + MAX_FUSE) { // conservatively allow fusing only these ops // can be expanded when needed - if (gf->nodes[f]->op != GGML_OP_ADD && - gf->nodes[f]->op != GGML_OP_MUL && - gf->nodes[f]->op != GGML_OP_NORM && - gf->nodes[f]->op != GGML_OP_RMS_NORM) { + if (gf->nodes[f]->op != GGML_OP_ADD && gf->nodes[f]->op != GGML_OP_MUL && + gf->nodes[f]->op != GGML_OP_NORM && gf->nodes[f]->op != GGML_OP_RMS_NORM) { break; } ops[f - i] = gf->nodes[f]->op; @@ -3308,8 +3302,7 @@ static bool ggml_backend_hexagon_device_supports_op(ggml_backend_dev_t dev, cons case GGML_OP_UNARY: if (ggml_get_unary_op(op) == GGML_UNARY_OP_SILU) { supp = ggml_hexagon_supported_activations(sess, op); - } - else if (ggml_get_unary_op(op) == GGML_UNARY_OP_GELU){ + } else if (ggml_get_unary_op(op) == GGML_UNARY_OP_GELU) { supp = ggml_hexagon_supported_activations(sess, op); } break; @@ -3416,7 +3409,7 @@ ggml_hexagon_registry::ggml_hexagon_registry(ggml_backend_reg_t reg) { } } - if(opt_arch < 75) { + if (opt_arch < 75) { opt_ndev = 1; GGML_LOG_WARN("ggml-hex: forcing ndev to 1 for SoCs archs lower than v75.\n"); } @@ -3425,11 +3418,11 @@ ggml_hexagon_registry::ggml_hexagon_registry(ggml_backend_reg_t reg) { // Create devices / sessions for (size_t i = 0; i < opt_ndev; i++) { - devices[i].iface = ggml_backend_hexagon_device_i; - devices[i].reg = reg; + devices[i].iface = ggml_backend_hexagon_device_i; + devices[i].reg = reg; try { devices[i].context = new ggml_hexagon_session(i, &devices[i]); - } catch (std::exception const &exc) { + } catch (const std::exception & exc) { GGML_LOG_ERROR("ggml-hex: failed to create device/session %zu\n", i); devices[i].context = nullptr; } diff --git a/ggml/src/ggml-hexagon/htp/act-ops.c b/ggml/src/ggml-hexagon/htp/act-ops.c index 9d3e584a84..273179ae2f 100644 --- a/ggml/src/ggml-hexagon/htp/act-ops.c +++ b/ggml/src/ggml-hexagon/htp/act-ops.c @@ -255,7 +255,6 @@ static void glu_swiglu_oai_fp32_per_thread(const struct htp_tensor * src0, src1->ne[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1)); } - static void unary_gelu_fp32_per_thread(const struct htp_tensor * src0, struct htp_tensor * dst, const int32_t * op_params, @@ -301,7 +300,7 @@ static void unary_gelu_fp32_per_thread(const struct htp_tensor * src0, const int BLOCK = 8; for (uint32_t ir = src0_start_row; ir < src0_end_row; ir += BLOCK) { const uint32_t block_end = MIN(ir + BLOCK, src0_end_row); - + // Prefetch next block if (block_end < src0_end_row) { const float * restrict prefetch_ptr = (float *) (data_src0 + (block_end * src0_row_size)); @@ -315,12 +314,11 @@ static void unary_gelu_fp32_per_thread(const struct htp_tensor * src0, // gelu = x * sigmoid(1.702 * x) // current implementation if (1 == opt_path) { - hvx_mul_scalar_f32( (const uint8_t *) src0, (float)1.702, (uint8_t *) src0_spad_data, ne0); + hvx_mul_scalar_f32((const uint8_t *) src0, (float) 1.702, (uint8_t *) src0_spad_data, ne0); hvx_fast_sigmoid_f32((const uint8_t *) src0_spad_data, (uint8_t *) src0_spad_data, ne0); 
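// At this point src0_spad_data holds sigmoid(1.702 * x) for the current row; the multiply
// below forms x * sigmoid(1.702 * x), i.e. the sigmoid approximation of GELU.
// Scalar reference of the same approximation (a sketch for illustration only, not code
// used by this kernel):
//     static inline float gelu_sigmoid_ref(float x) {
//         return x / (1.0f + expf(-1.702f * x));   // == x * sigmoid(1.702f * x)
//     }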
hvx_mul_f32_opt((const uint8_t *) src0, src0_spad_data, (uint8_t *) dst, ne0); - } - else { - hvx_mul_scalar_f32( (const uint8_t *) src0, (float)1.702, (uint8_t *) src0_spad_data, ne0); + } else { + hvx_mul_scalar_f32((const uint8_t *) src0, (float) 1.702, (uint8_t *) src0_spad_data, ne0); hvx_sigmoid_f32((const uint8_t *) src0_spad_data, (uint8_t *) src0_spad_data, ne0); hvx_mul_f32((const uint8_t *) src0, src0_spad_data, (uint8_t *) dst, ne0); } @@ -339,8 +337,6 @@ static void unary_gelu_fp32(unsigned int n, unsigned int i, void * data) { octx->src0_nrows_per_thread); } - - static void unary_silu_fp32_per_thread(const struct htp_tensor * src0, struct htp_tensor * dst, const int32_t * op_params, diff --git a/ggml/src/ggml-hexagon/htp/htp-msg.h b/ggml/src/ggml-hexagon/htp/htp-msg.h index a61652304a..0e893c1d96 100644 --- a/ggml/src/ggml-hexagon/htp/htp-msg.h +++ b/ggml/src/ggml-hexagon/htp/htp-msg.h @@ -120,10 +120,10 @@ static const char * htp_type_name(uint32_t t) { #define HTP_MAX_DIMS 4 struct htp_tensor { - uint32_t data; // Buffer offset in the messages, and data pointer on the NSP - uint32_t type; // Data type - uint32_t ne[HTP_MAX_DIMS]; // Number of elements - uint32_t nb[HTP_MAX_DIMS]; // Stride in bytes (see ggml.h ggml_tensor) + uint32_t data; // Buffer offset in the messages, and data pointer on the NSP + uint32_t type; // Data type + uint32_t ne[HTP_MAX_DIMS]; // Number of elements + uint32_t nb[HTP_MAX_DIMS]; // Stride in bytes (see ggml.h ggml_tensor) }; #define HTP_MAX_OP_PARAMS 64 diff --git a/ggml/src/ggml-hexagon/htp/hvx-utils.c b/ggml/src/ggml-hexagon/htp/hvx-utils.c index e7ee589f34..2ac4cfb263 100644 --- a/ggml/src/ggml-hexagon/htp/hvx-utils.c +++ b/ggml/src/ggml-hexagon/htp/hvx-utils.c @@ -49,28 +49,25 @@ void hvx_mul_f32(const uint8_t * restrict src0, FARF(HIGH, "hvx_mul_f32: unaligned loop in hvx op, possibly slower execution\n"); } - bool handled_leftover = false; if (0 == unaligned_loop) { HVX_Vector * restrict vec_in1 = (HVX_Vector *) src0; HVX_Vector * restrict vec_in2 = (HVX_Vector *) src1; HVX_Vector * restrict vec_out = (HVX_Vector *) dst; - #pragma unroll(4) +#pragma unroll(4) for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { HVX_Vector v = Q6_Vqf32_vmpy_VsfVsf(*vec_in1++, *vec_in2++); *vec_out++ = Q6_Vsf_equals_Vqf32(v); } } else { - int step_of_1 = num_elems_whole >> 5; // divby 32, because 32 float = 128 bytes per HVX vector + int step_of_1 = num_elems_whole >> 5; // divby 32, because 32 float = 128 bytes per HVX vector int leftover_size = left_over * sizeof(float); - - HVX_Vector * restrict vec_in1 = (HVX_Vector *) src0; - HVX_Vector * restrict vec_in2 = (HVX_Vector *) src1; + HVX_Vector * restrict vec_in1 = (HVX_Vector *) src0; + HVX_Vector * restrict vec_in2 = (HVX_Vector *) src1; HVX_UVector * restrict vec_out = (HVX_UVector *) dst; - HVX_Vector slinep; HVX_Vector slinec; HVX_Vector sline; @@ -78,48 +75,42 @@ void hvx_mul_f32(const uint8_t * restrict src0, HVX_Vector sline2c; HVX_Vector sline2; - slinep = *vec_in1++; + slinep = *vec_in1++; sline2p = *vec_in2++; - #pragma unroll(4) - for(uint32_t i = step_of_1 -1; i> 0; i--){ - slinec = *vec_in1++; +#pragma unroll(4) + for (uint32_t i = step_of_1 - 1; i > 0; i--) { + slinec = *vec_in1++; sline2c = *vec_in2++; - sline = Q6_V_valign_VVR(slinec, slinep, (size_t) src0); - sline2 = Q6_V_valign_VVR(sline2c, sline2p, (size_t) src1); - - *((HVX_UVector *)(vec_out++)) =Q6_Vsf_equals_Vqf32( Q6_Vqf32_vmpy_VsfVsf(sline, sline2)); - slinep = slinec; - sline2p = sline2c; + sline = Q6_V_valign_VVR(slinec, 
slinep, (size_t) src0); + sline2 = Q6_V_valign_VVR(sline2c, sline2p, (size_t) src1); + + *((HVX_UVector *) (vec_out++)) = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(sline, sline2)); + slinep = slinec; + sline2p = sline2c; } - if(step_of_1 > 1){ - slinec = htp_is_aligned(vec_in1, VLEN) && left_over == 0 ? slinep : *vec_in1++; + if (step_of_1 > 1) { + slinec = htp_is_aligned(vec_in1, VLEN) && left_over == 0 ? slinep : *vec_in1++; sline2c = htp_is_aligned(vec_in2, VLEN) && left_over == 0 ? sline2p : *vec_in2++; - sline = Q6_V_valign_VVR(slinec, slinep, (size_t) src0); - sline2 = Q6_V_valign_VVR(sline2c, sline2p, (size_t) src1); - *((HVX_UVector *)(vec_out++)) =Q6_Vsf_equals_Vqf32( Q6_Vqf32_vmpy_VsfVsf(sline, sline2)); - slinep = slinec; - sline2p = sline2c; + sline = Q6_V_valign_VVR(slinec, slinep, (size_t) src0); + sline2 = Q6_V_valign_VVR(sline2c, sline2p, (size_t) src1); + *((HVX_UVector *) (vec_out++)) = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(sline, sline2)); + slinep = slinec; + sline2p = sline2c; } - if(left_over > 0 ){ + if (left_over > 0) { + slinec = (is_in_one_chunk(vec_in1, leftover_size, VLEN) ? slinep : *vec_in1++); - slinec = (is_in_one_chunk(vec_in1, leftover_size, VLEN) - ? slinep - : *vec_in1++); - - sline = Q6_V_valign_VVR(slinec, slinep, (size_t) src0); - sline2c = (is_in_one_chunk(vec_in2, leftover_size, VLEN) - ? sline2p - : *vec_in2++); - sline2 = Q6_V_valign_VVR(sline2c, sline2p, (size_t) src1); + sline = Q6_V_valign_VVR(slinec, slinep, (size_t) src0); + sline2c = (is_in_one_chunk(vec_in2, leftover_size, VLEN) ? sline2p : *vec_in2++); + sline2 = Q6_V_valign_VVR(sline2c, sline2p, (size_t) src1); HVX_Vector out = Q6_Vqf32_vmpy_VsfVsf(sline, sline2); - hvx_vec_store_u(vec_out, leftover_size, Q6_Vsf_equals_Vqf32(out)); + hvx_vec_store_u(vec_out, leftover_size, Q6_Vsf_equals_Vqf32(out)); handled_leftover = true; } } - if (left_over > 0 && !handled_leftover) { const float * src0f = (const float *) src0 + num_elems_whole; const float * src1f = (const float *) src1 + num_elems_whole; @@ -315,13 +306,13 @@ void hvx_add_f32(const uint8_t * restrict src0, HVX_Vector * restrict vec_in2 = (HVX_Vector *) src1; HVX_Vector * restrict vec_out = (HVX_Vector *) dst; - #pragma unroll(4) +#pragma unroll(4) for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { HVX_Vector v = Q6_Vqf32_vadd_VsfVsf(*vec_in1++, *vec_in2++); *vec_out++ = Q6_Vsf_equals_Vqf32(v); } } else { - #pragma unroll(4) +#pragma unroll(4) for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { HVX_Vector in1 = *(HVX_UVector *) (src0 + i * SIZEOF_FP32); HVX_Vector in2 = *(HVX_UVector *) (src1 + i * SIZEOF_FP32); @@ -458,7 +449,7 @@ void hvx_add_scalar_f32(const uint8_t * restrict src, const float val, uint8_t * HVX_Vector * restrict vec_in1 = (HVX_Vector *) src; HVX_Vector * restrict vec_out = (HVX_Vector *) dst; - #pragma unroll(4) +#pragma unroll(4) for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { HVX_Vector in = *vec_in1++; const HVX_VectorPred pred_inf = Q6_Q_vcmp_eq_VwVw(inf, in); @@ -468,7 +459,7 @@ void hvx_add_scalar_f32(const uint8_t * restrict src, const float val, uint8_t * *vec_out++ = v; } } else { - #pragma unroll(4) +#pragma unroll(4) for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { HVX_Vector in = *(HVX_UVector *) (src + i * SIZEOF_FP32); @@ -512,60 +503,54 @@ void hvx_mul_scalar_f32(const uint8_t * restrict src, const float val, uint8_t * FARF(HIGH, "hvx_mul_scalar_f32: unaligned loop in hvx op, possibly slower execution\n"); } - HVX_Vector val_vec = hvx_vec_splat_fp32(val); - bool 
handled_leftover = false; + HVX_Vector val_vec = hvx_vec_splat_fp32(val); + bool handled_leftover = false; if (0 == unaligned_loop) { HVX_Vector * restrict vec_in1 = (HVX_Vector *) src; HVX_Vector * restrict vec_out = (HVX_Vector *) dst; - #pragma unroll(4) +#pragma unroll(4) for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { HVX_Vector v = Q6_Vqf32_vmpy_VsfVsf(*vec_in1++, val_vec); *vec_out++ = Q6_Vsf_equals_Vqf32(v); } } else { - int step_of_1 = num_elems >> 5; // divby 32, because 32 float = 128 bytes per HVX vector + int step_of_1 = num_elems >> 5; // divby 32, because 32 float = 128 bytes per HVX vector int leftover_size = left_over * sizeof(float); - - - HVX_Vector * input_v_ptr = (HVX_Vector *) src; - HVX_UVector * output_v_ptr = (HVX_UVector *) dst; - + HVX_Vector * input_v_ptr = (HVX_Vector *) src; + HVX_UVector * output_v_ptr = (HVX_UVector *) dst; HVX_Vector slinep; HVX_Vector slinec; HVX_Vector sline; - - slinep = *input_v_ptr++; - #pragma unroll(4) - for(uint32_t i = step_of_1 - 1; i > 0; i--){ - slinec = *input_v_ptr++; - sline = Q6_V_valign_VVR(slinec, slinep, (size_t) src); - *((HVX_UVector *)(output_v_ptr++)) = Q6_Vsf_equals_Vqf32( Q6_Vqf32_vmpy_VsfVsf(sline, val_vec)); + slinep = *input_v_ptr++; + +#pragma unroll(4) + for (uint32_t i = step_of_1 - 1; i > 0; i--) { + slinec = *input_v_ptr++; + sline = Q6_V_valign_VVR(slinec, slinep, (size_t) src); + *((HVX_UVector *) (output_v_ptr++)) = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(sline, val_vec)); /* Prepare slinep for next iteration */ - slinep = slinec; + slinep = slinec; } - if(step_of_1 > 0){ - + if (step_of_1 > 0) { slinec = htp_is_aligned(input_v_ptr, VLEN) && left_over == 0 ? slinep : *input_v_ptr++; - sline = Q6_V_valign_VVR(slinec, slinep, (size_t) src); - *((HVX_UVector *)(output_v_ptr++)) = Q6_Vsf_equals_Vqf32( Q6_Vqf32_vmpy_VsfVsf(sline, val_vec)); + sline = Q6_V_valign_VVR(slinec, slinep, (size_t) src); + *((HVX_UVector *) (output_v_ptr++)) = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(sline, val_vec)); slinep = slinec; } - if(leftover_size > 0){ - slinec = (is_in_one_chunk(input_v_ptr, leftover_size, VLEN) - ? slinep - : *input_v_ptr++); + if (leftover_size > 0) { + slinec = (is_in_one_chunk(input_v_ptr, leftover_size, VLEN) ? 
slinep : *input_v_ptr++); sline = Q6_V_valign_VVR(slinec, slinep, (size_t) src); - HVX_Vector sout = Q6_Vsf_equals_Vqf32( Q6_Vqf32_vmpy_VsfVsf(sline, val_vec)); - hvx_vec_store_u(output_v_ptr, leftover_size, sout); + HVX_Vector sout = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(sline, val_vec)); + hvx_vec_store_u(output_v_ptr, leftover_size, sout); handled_leftover = true; } } @@ -606,13 +591,13 @@ void hvx_sub_f32(const uint8_t * restrict src0, HVX_Vector * restrict vec_in2 = (HVX_Vector *) src1; HVX_Vector * restrict vec_out = (HVX_Vector *) dst; - #pragma unroll(4) +#pragma unroll(4) for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { HVX_Vector v = Q6_Vqf32_vsub_VsfVsf(*vec_in1++, *vec_in2++); *vec_out++ = Q6_Vsf_equals_Vqf32(v); } } else { - #pragma unroll(4) +#pragma unroll(4) for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { HVX_Vector in1 = *(HVX_UVector *) (src0 + i * SIZEOF_FP32); HVX_Vector in2 = *(HVX_UVector *) (src1 + i * SIZEOF_FP32); @@ -747,13 +732,13 @@ void hvx_sub_scalar_f32(const uint8_t * restrict src, const float val, uint8_t * HVX_Vector * restrict vec_in1 = (HVX_Vector *) src; HVX_Vector * restrict vec_out = (HVX_Vector *) dst; - #pragma unroll(4) +#pragma unroll(4) for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { HVX_Vector v = Q6_Vqf32_vsub_VsfVsf(*vec_in1++, val_vec); *vec_out++ = Q6_Vsf_equals_Vqf32(v); } } else { - #pragma unroll(4) +#pragma unroll(4) for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { HVX_Vector in = *(HVX_UVector *) (src + i * SIZEOF_FP32); @@ -789,7 +774,7 @@ float hvx_sum_of_squares_f32(const uint8_t * restrict src, const int num_elems) HVX_Vector sum_vec_acc = Q6_V_vsplat_R(0x00000000); HVX_Vector zero_vec = Q6_V_vsplat_R(0x00000000); - #pragma unroll(4) +#pragma unroll(4) for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { HVX_Vector v = Q6_Vqf32_vmpy_VsfVsf(*vec_in1, *vec_in1); sum_vec_acc = Q6_Vqf32_vadd_Vqf32Vqf32(sum_vec_acc, v); @@ -833,13 +818,13 @@ float hvx_self_sum_f32(const uint8_t * restrict src, const int num_elems) { if (0 == unaligned_loop) { HVX_Vector * vec_in = (HVX_Vector *) src; - #pragma unroll(4) +#pragma unroll(4) for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { // sum_vec = Q6_Vqf32_vadd_Vqf32Vsf(sum_vec, *vec_in++); sum_vec = Q6_Vqf32_vadd_VsfVsf(Q6_Vsf_equals_Vqf32(sum_vec), *vec_in++); } } else { - #pragma unroll(4) +#pragma unroll(4) for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { HVX_Vector in = *(HVX_UVector *) (src + i * SIZEOF_FP32); @@ -882,13 +867,13 @@ void hvx_scale_f32(const uint8_t * restrict src, uint8_t * restrict dst, const i HVX_Vector * vec_in1 = (HVX_Vector *) src; HVX_Vector * vec_out = (HVX_Vector *) dst; - #pragma unroll(4) +#pragma unroll(4) for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { HVX_Vector v = Q6_Vqf32_vmpy_VsfVsf(*vec_in1++, scale_vec); *vec_out++ = Q6_Vsf_equals_Vqf32(v); } } else { - #pragma unroll(4) +#pragma unroll(4) for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { HVX_Vector in = *(HVX_UVector *) (src + i * SIZEOF_FP32); @@ -931,12 +916,12 @@ float hvx_self_max_f32(const uint8_t * restrict src, const int num_elems) { if (0 == unaligned_loop) { HVX_Vector * restrict vec_in = (HVX_Vector *) src; - #pragma unroll(4) +#pragma unroll(4) for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { vec_max = Q6_Vsf_vmax_VsfVsf(vec_max, *vec_in++); } } else { - #pragma unroll(4) +#pragma unroll(4) for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { HVX_Vector in = *(HVX_UVector *) (src + i * SIZEOF_FP32); @@ -974,7 +959,7 @@ void 
hvx_min_scalar_f32(const uint8_t * restrict src, const float val, uint8_t * HVX_Vector * restrict vec_in = (HVX_Vector *) src; HVX_Vector * restrict vec_out = (HVX_Vector *) dst; - #pragma unroll(4) +#pragma unroll(4) for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { vec_min = Q6_Vsf_vmin_VsfVsf(vec_min, *vec_in++); *vec_out++ = Q6_Vsf_equals_Vqf32(vec_min); @@ -1012,7 +997,7 @@ void hvx_clamp_scalar_f32(const uint8_t * restrict src, HVX_Vector range_left = hvx_vec_splat_fp32(limit_left); HVX_Vector range_right = hvx_vec_splat_fp32(limit_right); - #pragma unroll(4) +#pragma unroll(4) for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { HVX_Vector in_vec = *vec_in++; HVX_Vector temp_v = in_vec; diff --git a/ggml/src/ggml-hexagon/htp/hvx-utils.h b/ggml/src/ggml-hexagon/htp/hvx-utils.h index 0b24786391..c5da167d49 100644 --- a/ggml/src/ggml-hexagon/htp/hvx-utils.h +++ b/ggml/src/ggml-hexagon/htp/hvx-utils.h @@ -23,20 +23,18 @@ typedef union { /* Q6_Vsf_equals_Vw is only available on v73+.*/ #if __HVX_ARCH__ < 73 -static inline HVX_Vector int32_to_qfloat(HVX_Vector const in) -{ - HVX_Vector const vzero = Q6_V_vzero(); - HVX_VectorPred is_zero = Q6_Q_vcmp_eq_VwVw(in, vzero); - HVX_Vector lshift = Q6_Vw_vnormamt_Vw(in); - HVX_Vector normalized = Q6_Vw_vasl_VwVw(in, lshift); - HVX_Vector vexp = Q6_Vw_vsub_VwVw(Q6_V_vsplat_R(0x7f + 30), lshift); - HVX_Vector mant = Q6_V_vand_VV(Q6_V_vsplat_R(0xFFFFFF00), normalized); - HVX_Vector ret = Q6_V_vmux_QVV(is_zero, vzero, Q6_Vw_vadd_VwVw(mant, vexp)); +static inline HVX_Vector int32_to_qfloat(const HVX_Vector in) { + const HVX_Vector vzero = Q6_V_vzero(); + HVX_VectorPred is_zero = Q6_Q_vcmp_eq_VwVw(in, vzero); + HVX_Vector lshift = Q6_Vw_vnormamt_Vw(in); + HVX_Vector normalized = Q6_Vw_vasl_VwVw(in, lshift); + HVX_Vector vexp = Q6_Vw_vsub_VwVw(Q6_V_vsplat_R(0x7f + 30), lshift); + HVX_Vector mant = Q6_V_vand_VV(Q6_V_vsplat_R(0xFFFFFF00), normalized); + HVX_Vector ret = Q6_V_vmux_QVV(is_zero, vzero, Q6_Vw_vadd_VwVw(mant, vexp)); return ret; } -static inline HVX_Vector Q6_Vsf_equals_Vw(HVX_Vector const in) -{ +static inline HVX_Vector Q6_Vsf_equals_Vw(const HVX_Vector in) { return Q6_Vsf_equals_Vqf32(int32_to_qfloat(in)); } #endif @@ -109,7 +107,7 @@ static inline void hvx_copy_fp16_aa(uint8_t * restrict dst, const uint8_t * rest uint32_t i = 0; - #pragma unroll(4) +#pragma unroll(4) for (; i < nvec; i++) { HVX_Vector v = vsrc[i]; vdst[i] = v; @@ -133,7 +131,7 @@ static inline void hvx_copy_fp16_ua(uint8_t * restrict dst, const uint8_t * rest uint32_t i = 0; - #pragma unroll(4) +#pragma unroll(4) for (; i < nvec; i++) { HVX_Vector v = vsrc[i]; vdst[i] = v; @@ -157,7 +155,7 @@ static inline void hvx_copy_fp16_au(uint8_t * restrict dst, const uint8_t * rest uint32_t i = 0; - #pragma unroll(4) +#pragma unroll(4) for (; i < nvec; i++) { HVX_Vector v = vsrc[i]; vdst[i] = v; @@ -182,7 +180,7 @@ static inline void hvx_copy_fp32_aa(uint8_t * restrict dst, const uint8_t * rest uint32_t i = 0; - #pragma unroll(4) +#pragma unroll(4) for (; i < nvec; i++) { HVX_Vector v = vsrc[i]; vdst[i] = v; @@ -206,7 +204,7 @@ static inline void hvx_copy_fp32_ua(uint8_t * restrict dst, const uint8_t * rest uint32_t i = 0; - #pragma unroll(4) +#pragma unroll(4) for (; i < nvec; i++) { HVX_Vector v = vsrc[i]; vdst[i] = v; @@ -230,7 +228,7 @@ static inline void hvx_copy_fp32_au(uint8_t * restrict dst, const uint8_t * rest uint32_t i = 0; - #pragma unroll(4) +#pragma unroll(4) for (; i < nvec; i++) { HVX_Vector v = vsrc[i]; vdst[i] = v; @@ -255,7 +253,7 @@ static inline void 
hvx_bcast_fp32_a(uint8_t * restrict dst, float elem, uint32_t uint32_t i = 0; - #pragma unroll(4) +#pragma unroll(4) for (; i < nvec; i++) { vdst[i] = velem; } @@ -265,7 +263,6 @@ static inline void hvx_bcast_fp32_a(uint8_t * restrict dst, float elem, uint32_t } } - /* Return whether 'n' elements from vector are in the one chunk of 'chunk_size'. */ static __attribute__((always_inline)) int32_t is_in_one_chunk(void * addr, uint32_t n, uint32_t chunk_size) { uint32_t left_off = (size_t) addr & (chunk_size - 1); @@ -273,8 +270,6 @@ static __attribute__((always_inline)) int32_t is_in_one_chunk(void * addr, uint3 return right_off <= chunk_size; } - - static void hvx_vec_dump_fp16_n(char * pref, HVX_Vector v, uint32_t n) { HVX_VectorAlias u = { .v = v }; @@ -992,16 +987,15 @@ static inline void hvx_fast_sigmoid_f32(const uint8_t * restrict src, uint8_t * const HVX_Vector max_exp = hvx_vec_splat_fp32(kMaxExp); const HVX_Vector min_exp = hvx_vec_splat_fp32(kMinExp); - #pragma unroll(4) +#pragma unroll(4) for (int i = 0; i < step_of_1; i++) { v_dst[i] = hvx_vec_fast_sigmoid_fp32_guard(v_src[i], one, max_exp, min_exp); } } - -static inline void hvx_sigmoid_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int num_elems){ +static inline void hvx_sigmoid_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int num_elems) { int step_of_1 = num_elems >> 5; // divby 32, because 32 float = 128 bytes per HVX vector - int leftover = num_elems - (step_of_1 * VLEN_FP32); + int leftover = num_elems - (step_of_1 * VLEN_FP32); int32_t leftover_size = leftover * sizeof(float); @@ -1012,51 +1006,44 @@ static inline void hvx_sigmoid_f32(const uint8_t * restrict src, uint8_t * restr const HVX_Vector max_exp = hvx_vec_splat_fp32(kMaxExp); const HVX_Vector min_exp = hvx_vec_splat_fp32(kMinExp); - const float *input = (float *)src; - float *output = (float *)dst; - - HVX_Vector * input_v_ptr = (HVX_Vector *) input; - HVX_UVector * output_v_ptr = (HVX_UVector *) output; + const float * input = (float *) src; + float * output = (float *) dst; + HVX_Vector * input_v_ptr = (HVX_Vector *) input; + HVX_UVector * output_v_ptr = (HVX_UVector *) output; HVX_Vector slinep; HVX_Vector slinec; HVX_Vector sline; - - slinep = *input_v_ptr++; - #pragma unroll(4) - for(uint32_t i = step_of_1 -1; i> 0; i--){ - slinec = *input_v_ptr++; - sline = Q6_V_valign_VVR(slinec, slinep, (size_t) input); - *((HVX_UVector *)(output_v_ptr++)) = hvx_vec_fast_sigmoid_fp32_guard(sline, one, max_exp, min_exp); + slinep = *input_v_ptr++; +#pragma unroll(4) + for (uint32_t i = step_of_1 - 1; i > 0; i--) { + slinec = *input_v_ptr++; + sline = Q6_V_valign_VVR(slinec, slinep, (size_t) input); + *((HVX_UVector *) (output_v_ptr++)) = hvx_vec_fast_sigmoid_fp32_guard(sline, one, max_exp, min_exp); /* Prepare slinep for next iteration */ - slinep = slinec; + slinep = slinec; } - if(step_of_1> 0){ - + if (step_of_1 > 0) { slinec = htp_is_aligned(input_v_ptr, 128) && leftover == 0 ? slinep : *input_v_ptr++; - sline = Q6_V_valign_VVR(slinec, slinep, (size_t) input); - *((HVX_UVector *)(output_v_ptr++)) = hvx_vec_fast_sigmoid_fp32_guard(sline, one, max_exp, min_exp);; + sline = Q6_V_valign_VVR(slinec, slinep, (size_t) input); + *((HVX_UVector *) (output_v_ptr++)) = hvx_vec_fast_sigmoid_fp32_guard(sline, one, max_exp, min_exp); + ; slinep = slinec; } - if(leftover> 0){ - slinec = (is_in_one_chunk(input_v_ptr, leftover_size, 128) - ? slinep - : *input_v_ptr++); + if (leftover > 0) { + slinec = (is_in_one_chunk(input_v_ptr, leftover_size, 128) ? 
slinep : *input_v_ptr++); sline = Q6_V_valign_VVR(slinec, slinep, (size_t) input); HVX_Vector sout = hvx_vec_fast_sigmoid_fp32_guard(sline, one, max_exp, min_exp); - hvx_vec_store_u(output_v_ptr, leftover_size, sout); + hvx_vec_store_u(output_v_ptr, leftover_size, sout); } - - } - float hvx_sum_of_squares_f32(const uint8_t * restrict src, const int num_elems); void hvx_mul_f32(const uint8_t * restrict src0, const uint8_t * restrict src1, diff --git a/ggml/src/ggml-hexagon/htp/main.c b/ggml/src/ggml-hexagon/htp/main.c index e30ae69502..cbfdd0472f 100644 --- a/ggml/src/ggml-hexagon/htp/main.c +++ b/ggml/src/ggml-hexagon/htp/main.c @@ -151,7 +151,7 @@ static int vtcm_acquire(struct htp_context * ctx) { qurt_thread_set_priority(qurt_thread_get_id(), ctx->thread_prio - 10); err = HAP_compute_res_acquire_cached(ctx->vtcm_rctx, 1000000); if (err != 0) { - FARF(ERROR, "Failed to acquire VTCM: 0x%08x", (unsigned)err); + FARF(ERROR, "Failed to acquire VTCM: 0x%08x", (unsigned) err); abort(); } HAP_compute_res_release_cached(ctx->vtcm_rctx); @@ -159,7 +159,7 @@ static int vtcm_acquire(struct htp_context * ctx) { err = HAP_compute_res_acquire_cached(ctx->vtcm_rctx, 1000000); if (err != 0) { - FARF(ERROR, "Failed to acquire VTCM: 0x%08x", (unsigned)err); + FARF(ERROR, "Failed to acquire VTCM: 0x%08x", (unsigned) err); abort(); } ctx->vtcm_valid = true; @@ -411,7 +411,7 @@ static void proc_matmul_req(struct htp_context * ctx, rsp_bufs[0].ptr = bufs[2].ptr; rsp_bufs[0].size = bufs[2].size; rsp_bufs[0].offset = bufs[2].offset; - rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP + rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate CPU // Setup Op context @@ -453,7 +453,7 @@ static void proc_matmul_id_req(struct htp_context * ctx, rsp_bufs[0].ptr = bufs[3].ptr; rsp_bufs[0].size = bufs[3].size; rsp_bufs[0].offset = bufs[3].offset; - rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP + rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate CPU // Setup Op context @@ -494,7 +494,7 @@ static void proc_binary_req(struct htp_context * ctx, struct htp_general_req * r rsp_bufs[0].ptr = bufs[2].ptr; rsp_bufs[0].offset = bufs[2].offset; rsp_bufs[0].size = bufs[2].size; - rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP + rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate CPU // Setup Op context @@ -533,7 +533,7 @@ static void proc_add_id_req(struct htp_context * ctx, struct htp_general_req * r rsp_bufs[0].ptr = bufs[3].ptr; rsp_bufs[0].offset = bufs[3].offset; rsp_bufs[0].size = bufs[3].size; - rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP + rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate CPU // Setup Op context @@ -574,7 +574,7 @@ static void proc_unary_req(struct htp_context * ctx, struct htp_general_req * re rsp_bufs[0].ptr = bufs[1].ptr; rsp_bufs[0].offset = bufs[1].offset; rsp_bufs[0].size = bufs[1].size; - rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP + rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate CPU // Setup Op context @@ -618,8 +618,8 @@ static void proc_activations_req(struct htp_context * ctx, rsp_bufs[0].ptr = 
bufs[write_idx].ptr; rsp_bufs[0].offset = bufs[write_idx].offset; rsp_bufs[0].size = bufs[write_idx].size; - rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP - DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate CPU + rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP + DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate CPU // Setup Op context struct htp_ops_context octx = { 0 }; @@ -674,8 +674,8 @@ static void proc_rope_req(struct htp_context * ctx, rsp_bufs[0].ptr = bufs[write_idx].ptr; rsp_bufs[0].offset = bufs[write_idx].offset; rsp_bufs[0].size = bufs[write_idx].size; - rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP - DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate CPU + rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP + DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate CPU // Setup Op context struct htp_ops_context octx = { 0 }; From cf3a65fb73b21b108b9f586add04f5250ece934d Mon Sep 17 00:00:00 2001 From: shouyud Date: Tue, 16 Dec 2025 14:28:34 -0500 Subject: [PATCH 13/14] Revert "chore: reformat code with clang-formatter to pass cli test" This reverts commit 952877ec24732b12010c7fa7ed3fc8de4b74e718. --- ggml/src/ggml-hexagon/ggml-hexagon.cpp | 207 +++++++++++++------------ ggml/src/ggml-hexagon/htp/act-ops.c | 12 +- ggml/src/ggml-hexagon/htp/htp-msg.h | 8 +- ggml/src/ggml-hexagon/htp/hvx-utils.c | 145 +++++++++-------- ggml/src/ggml-hexagon/htp/hvx-utils.h | 87 ++++++----- ggml/src/ggml-hexagon/htp/main.c | 22 +-- 6 files changed, 260 insertions(+), 221 deletions(-) diff --git a/ggml/src/ggml-hexagon/ggml-hexagon.cpp b/ggml/src/ggml-hexagon/ggml-hexagon.cpp index 781db7facf..c45b292a52 100644 --- a/ggml/src/ggml-hexagon/ggml-hexagon.cpp +++ b/ggml/src/ggml-hexagon/ggml-hexagon.cpp @@ -8,8 +8,8 @@ #include #include #include -#include #include +#include #ifdef _WIN32 # include @@ -53,12 +53,10 @@ static int opt_opmask = HTP_OPMASK_QUEUE | HTP_OPMASK_QUANTIZE | HTP_OPMASK_COMP static int opt_opsync = 0; // synchronous ops #define HEX_VERBOSE(...) \ - if (opt_verbose) \ - GGML_LOG_DEBUG(__VA_ARGS__) + if (opt_verbose) GGML_LOG_DEBUG(__VA_ARGS__) #define HEX_PROFILE(...) 
\ - if (opt_profile) \ - GGML_LOG_INFO(__VA_ARGS__) + if (opt_profile) GGML_LOG_INFO(__VA_ARGS__) static inline uint64_t hex_is_aligned(void * addr, uint32_t align) { return ((size_t) addr & (align - 1)) == 0; @@ -220,7 +218,7 @@ struct ggml_hexagon_session { void allocate(int dev_id) noexcept(false); void release() noexcept(true); - void enqueue(struct htp_general_req & req, struct dspqueue_buffer * bufs, uint32_t n_bufs, bool sync = false); + void enqueue(struct htp_general_req &req, struct dspqueue_buffer *bufs, uint32_t n_bufs, bool sync = false); void flush(); ggml_backend_buffer_type buffer_type; @@ -260,10 +258,7 @@ static inline void hex_print_op_info(const ggml_tensor * op, ggml_hexagon_sessio names, dims, types, strides, buffs, req_flags); } -void ggml_hexagon_session::enqueue(struct htp_general_req & req, - struct dspqueue_buffer * bufs, - uint32_t n_bufs, - bool sync) { +void ggml_hexagon_session::enqueue(struct htp_general_req &req, struct dspqueue_buffer *bufs, uint32_t n_bufs, bool sync) { // Bump pending flag (cleared in the session::flush once we get the responce) this->op_pending++; // atomic inc @@ -303,13 +298,13 @@ void ggml_hexagon_session::flush() { // Read response packet from queue int err = dspqueue_read(q, &flags, - HTP_MAX_PACKET_BUFFERS, // Maximum number of buffer references - &n_bufs, // Number of buffer references - bufs, // Buffer references - sizeof(rsp), // Max message length - &rsp_size, // Message length - (uint8_t *) &rsp, - 1000000); // Timeout + HTP_MAX_PACKET_BUFFERS, // Maximum number of buffer references + &n_bufs, // Number of buffer references + bufs, // Buffer references + sizeof(rsp), // Max message length + &rsp_size, // Message length + (uint8_t *) &rsp, + 1000000); // Timeout if (err == AEE_EEXPIRED) { // TODO: might need to bail out if the HTP is stuck on something @@ -359,8 +354,8 @@ struct ggml_backend_hexagon_buffer_context { int err = fastrpc_mmap(s->domain_id, this->fd, (void *) this->base, 0, this->size, FASTRPC_MAP_FD); if (err != 0) { - GGML_LOG_ERROR("ggml-hex: buffer mapping failed : domain_id %d size %zu fd %d error 0x%08x\n", s->domain_id, - this->size, this->fd, (unsigned) err); + GGML_LOG_ERROR("ggml-hex: buffer mapping failed : domain_id %d size %zu fd %d error 0x%08x\n", + s->domain_id, this->size, this->fd, (unsigned) err); return false; } @@ -391,12 +386,10 @@ struct ggml_backend_hexagon_buffer_context { size += 4 * 1024; // extra page for padding if (rpcmem_alloc2) { - this->base = - (uint8_t *) rpcmem_alloc2(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS | RPCMEM_HEAP_NOREG, size); + this->base = (uint8_t *) rpcmem_alloc2(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS | RPCMEM_HEAP_NOREG, size); } else { GGML_LOG_INFO("ggml-hex: %s rpcmem_alloc2 not found, falling back to rpcmem_alloc\n", sess->name.c_str()); - this->base = - (uint8_t *) rpcmem_alloc(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS | RPCMEM_HEAP_NOREG, size); + this->base = (uint8_t *) rpcmem_alloc(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS | RPCMEM_HEAP_NOREG, size); } if (!this->base) { @@ -460,7 +453,7 @@ static enum ggml_status ggml_backend_hexagon_buffer_init_tensor(ggml_backend_buf (int) ctx->repack); if (tensor->view_src != NULL && tensor->view_offs == 0) { - ; // nothing to do for the view + ; // nothing to do for the view } else { if (!ctx->mapped) { ctx->mmap(); @@ -709,8 +702,8 @@ static void repack_q4_0_q4x4x2(ggml_tensor * t, const void * data, size_t size) // Ensure we don't try to read more data than is available in the source buffer 'data' // or 
write more than the tensor can hold. - const size_t total_tensor_size = (size_t) nrows * row_size; - const size_t n_bytes_to_copy = size < total_tensor_size ? size : total_tensor_size; + const size_t total_tensor_size = (size_t)nrows * row_size; + const size_t n_bytes_to_copy = size < total_tensor_size ? size : total_tensor_size; // Calculate how many full rows and how many remaining bytes we need to process. const int64_t n_full_rows = n_bytes_to_copy / row_size; @@ -739,7 +732,7 @@ static void repack_q4_0_q4x4x2(ggml_tensor * t, const void * data, size_t size) // 2. Process the final, potentially partial, row if (n_rem_bytes > 0) { - const int64_t i = n_full_rows; + const int64_t i = n_full_rows; const uint8_t * src = (const uint8_t *) data + (i * row_size); uint8_t * dst = (uint8_t *) t->data + (i * row_size); @@ -769,8 +762,8 @@ static void repack_q4x4x2_q4_0(void * data, const ggml_tensor * t, size_t size) size_t row_size_rp = row_size * 2; // extra space for tmp pad (if any) // Ensure we don't try to copy more data than the tensor actually contains. - const size_t total_tensor_size = (size_t) nrows * row_size; - const size_t n_bytes_to_copy = size < total_tensor_size ? size : total_tensor_size; + const size_t total_tensor_size = (size_t)nrows * row_size; + const size_t n_bytes_to_copy = size < total_tensor_size ? size : total_tensor_size; // Calculate how many full rows and how many remaining bytes we need to process. const int64_t n_full_rows = n_bytes_to_copy / row_size; @@ -799,7 +792,7 @@ static void repack_q4x4x2_q4_0(void * data, const ggml_tensor * t, size_t size) // 2. Process the final, potentially partial, row if (n_rem_bytes > 0) { - const int64_t i = n_full_rows; + const int64_t i = n_full_rows; const uint8_t * src = (const uint8_t *) t->data + (i * row_size); uint8_t * dst = (uint8_t *) data + (i * row_size); @@ -1035,8 +1028,8 @@ static void repack_q8_0_q8x4x2(ggml_tensor * t, const void * data, size_t size) // Ensure we don't try to read more data than is available in the source buffer 'data' // or write more than the tensor can hold. - const size_t total_tensor_size = (size_t) nrows * row_size; - const size_t n_bytes_to_copy = size < total_tensor_size ? size : total_tensor_size; + const size_t total_tensor_size = (size_t)nrows * row_size; + const size_t n_bytes_to_copy = size < total_tensor_size ? size : total_tensor_size; // Calculate how many full rows and how many remaining bytes we need to process. const int64_t n_full_rows = n_bytes_to_copy / row_size; @@ -1065,7 +1058,7 @@ static void repack_q8_0_q8x4x2(ggml_tensor * t, const void * data, size_t size) // 2. Process the final, potentially partial, row if (n_rem_bytes > 0) { - const int64_t i = n_full_rows; + const int64_t i = n_full_rows; const uint8_t * src = (const uint8_t *) data + (i * row_size); uint8_t * dst = (uint8_t *) t->data + (i * row_size); @@ -1095,8 +1088,8 @@ static void repack_q8x4x2_q8_0(void * data, const ggml_tensor * t, size_t size) size_t row_size_rp = row_size * 2; // extra space for tmp pad (if any) // Ensure we don't try to copy more data than the tensor actually contains. - const size_t total_tensor_size = (size_t) nrows * row_size; - const size_t n_bytes_to_copy = size < total_tensor_size ? size : total_tensor_size; + const size_t total_tensor_size = (size_t)nrows * row_size; + const size_t n_bytes_to_copy = size < total_tensor_size ? size : total_tensor_size; // Calculate how many full rows and how many remaining bytes we need to process. 
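[Editor's note: illustrative aside, not part of the patch. It restates the clamp-and-split arithmetic used by the repack helpers in this hunk (total_tensor_size, n_bytes_to_copy, n_full_rows, n_rem_bytes) as a standalone C helper; the function name split_rows is hypothetical.]

#include <stddef.h>
#include <stdint.h>

/* Clamp the incoming byte count to the tensor's capacity, then split it into
 * whole rows plus a trailing partial row, mirroring the pattern in the repack
 * helpers around this hunk. */
static void split_rows(size_t size, int64_t nrows, size_t row_size,
                       int64_t * n_full_rows, size_t * n_rem_bytes) {
    const size_t total_tensor_size = (size_t) nrows * row_size;
    const size_t n_bytes_to_copy   = size < total_tensor_size ? size : total_tensor_size;

    *n_full_rows = (int64_t) (n_bytes_to_copy / row_size);  /* complete rows to repack */
    *n_rem_bytes = n_bytes_to_copy % row_size;              /* bytes of the final partial row */
}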
const int64_t n_full_rows = n_bytes_to_copy / row_size; @@ -1125,7 +1118,7 @@ static void repack_q8x4x2_q8_0(void * data, const ggml_tensor * t, size_t size) // 2. Process the final, potentially partial, row if (n_rem_bytes > 0) { - const int64_t i = n_full_rows; + const int64_t i = n_full_rows; const uint8_t * src = (const uint8_t *) t->data + (i * row_size); uint8_t * dst = (uint8_t *) data + (i * row_size); @@ -1386,8 +1379,8 @@ static void repack_mxfp4_mxfp4x4x2(ggml_tensor * t, const void * data, size_t si // Ensure we don't try to read more data than is available in the source buffer 'data' // or write more than the tensor can hold. - const size_t total_tensor_size = (size_t) nrows * row_size; - const size_t n_bytes_to_copy = size < total_tensor_size ? size : total_tensor_size; + const size_t total_tensor_size = (size_t)nrows * row_size; + const size_t n_bytes_to_copy = size < total_tensor_size ? size : total_tensor_size; // Calculate how many full rows and how many remaining bytes we need to process. const int64_t n_full_rows = n_bytes_to_copy / row_size; @@ -1416,7 +1409,7 @@ static void repack_mxfp4_mxfp4x4x2(ggml_tensor * t, const void * data, size_t si // 2. Process the final, potentially partial, row if (n_rem_bytes > 0) { - const int64_t i = n_full_rows; + const int64_t i = n_full_rows; const uint8_t * src = (const uint8_t *) data + (i * row_size); uint8_t * dst = (uint8_t *) t->data + (i * row_size); @@ -1446,8 +1439,8 @@ static void repack_mxfp4x4x2_mxfp4(void * data, const ggml_tensor * t, size_t si size_t row_size_rp = row_size * 2; // extra space for tmp pad (if any) // Ensure we don't try to copy more data than the tensor actually contains. - const size_t total_tensor_size = (size_t) nrows * row_size; - const size_t n_bytes_to_copy = size < total_tensor_size ? size : total_tensor_size; + const size_t total_tensor_size = (size_t)nrows * row_size; + const size_t n_bytes_to_copy = size < total_tensor_size ? size : total_tensor_size; // Calculate how many full rows and how many remaining bytes we need to process. const int64_t n_full_rows = n_bytes_to_copy / row_size; @@ -1476,7 +1469,7 @@ static void repack_mxfp4x4x2_mxfp4(void * data, const ggml_tensor * t, size_t si // 2. 
Process the final, potentially partial, row if (n_rem_bytes > 0) { - const int64_t i = n_full_rows; + const int64_t i = n_full_rows; const uint8_t * src = (const uint8_t *) t->data + (i * row_size); uint8_t * dst = (uint8_t *) data + (i * row_size); @@ -1599,28 +1592,25 @@ static const char * ggml_backend_hexagon_buffer_type_name(ggml_backend_buffer_ty return static_cast(buffer_type->context)->name.c_str(); } -static ggml_backend_buffer_t ggml_backend_hexagon_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buffer_type, - size_t size) { +static ggml_backend_buffer_t ggml_backend_hexagon_buffer_type_alloc_buffer( + ggml_backend_buffer_type_t buffer_type, size_t size) { auto sess = static_cast(buffer_type->context)->sess; try { - ggml_backend_hexagon_buffer_context * ctx = - new ggml_backend_hexagon_buffer_context(sess, size, false /*repack*/); + ggml_backend_hexagon_buffer_context * ctx = new ggml_backend_hexagon_buffer_context(sess, size, false /*repack*/); return ggml_backend_buffer_init(buffer_type, ggml_backend_hexagon_buffer_interface, ctx, size); - } catch (const std::exception & exc) { + } catch (std::exception const &exc) { GGML_LOG_ERROR("ggml-hex: %s failed to allocate buffer context: %s\n", sess->name.c_str(), exc.what()); return nullptr; } } static ggml_backend_buffer_t ggml_backend_hexagon_repack_buffer_type_alloc_buffer( - ggml_backend_buffer_type_t buffer_type, - size_t size) { + ggml_backend_buffer_type_t buffer_type, size_t size) { auto sess = static_cast(buffer_type->context)->sess; try { - ggml_backend_hexagon_buffer_context * ctx = - new ggml_backend_hexagon_buffer_context(sess, size, true /*repack*/); + ggml_backend_hexagon_buffer_context * ctx = new ggml_backend_hexagon_buffer_context(sess, size, true /*repack*/); return ggml_backend_buffer_init(buffer_type, ggml_backend_hexagon_buffer_interface, ctx, size); - } catch (const std::exception & exc) { + } catch (std::exception const &exc) { GGML_LOG_ERROR("ggml-hex: %s failed to allocate buffer context: %s\n", sess->name.c_str(), exc.what()); return nullptr; } @@ -1631,8 +1621,7 @@ static size_t ggml_backend_hexagon_buffer_type_get_alignment(ggml_backend_buffer GGML_UNUSED(buffer_type); } -static size_t ggml_backend_hexagon_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, - const struct ggml_tensor * t) { +static size_t ggml_backend_hexagon_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const struct ggml_tensor * t) { return ggml_nbytes(t); } @@ -1708,8 +1697,8 @@ void ggml_hexagon_session::allocate(int dev_id) noexcept(false) { } // Save the IDs - this->session_id = n.session_id; - this->domain_id = n.effective_domain_id; + this->session_id = n.session_id; + this->domain_id = n.effective_domain_id; this->valid_session = true; } @@ -1718,17 +1707,16 @@ void ggml_hexagon_session::allocate(int dev_id) noexcept(false) { char session_uri[256]; { char htp_uri[256]; - snprintf(htp_uri, sizeof(htp_uri), "file:///libggml-htp-v%u.so?htp_iface_skel_handle_invoke&_modver=1.0", - opt_arch); + snprintf(htp_uri, sizeof(htp_uri), "file:///libggml-htp-v%u.so?htp_iface_skel_handle_invoke&_modver=1.0", opt_arch); struct remote_rpc_get_uri u = {}; - u.session_id = this->session_id; - u.domain_name = const_cast(CDSP_DOMAIN_NAME); - u.domain_name_len = strlen(CDSP_DOMAIN_NAME); - u.module_uri = const_cast(htp_uri); - u.module_uri_len = strlen(htp_uri); - u.uri = session_uri; - u.uri_len = sizeof(session_uri); + u.session_id = this->session_id; + u.domain_name = const_cast(CDSP_DOMAIN_NAME); + u.domain_name_len = 
strlen(CDSP_DOMAIN_NAME); + u.module_uri = const_cast(htp_uri); + u.module_uri_len = strlen(htp_uri); + u.uri = session_uri; + u.uri_len = sizeof(session_uri); int err = remote_session_control(FASTRPC_GET_URI, (void *) &u, sizeof(u)); if (err != AEE_SUCCESS) { @@ -1737,9 +1725,7 @@ void ggml_hexagon_session::allocate(int dev_id) noexcept(false) { snprintf(session_uri, htp_URI_domain_len, "%s%s", htp_uri, my_domain->uri); - GGML_LOG_WARN( - "ggml-hex: failed to get URI for session %d : error 0x%x. Falling back to single session URI: %s\n", - dev_id, err, session_uri); + GGML_LOG_WARN("ggml-hex: failed to get URI for session %d : error 0x%x. Falling back to single session URI: %s\n", dev_id, err, session_uri); } } @@ -1765,7 +1751,7 @@ void ggml_hexagon_session::allocate(int dev_id) noexcept(false) { this->valid_handle = true; GGML_LOG_INFO("ggml-hex: new session: %s : session-id %d domain-id %d uri %s handle 0x%lx\n", this->name.c_str(), - this->session_id, this->domain_id, session_uri, (unsigned long) this->handle); + this->session_id, this->domain_id, session_uri, (unsigned long) this->handle); // Enable FastRPC QoS mode { @@ -1855,8 +1841,8 @@ ggml_hexagon_session::ggml_hexagon_session(int dev_id, ggml_backend_dev_t dev) n buffer_type.context = nullptr; repack_buffer_type.context = nullptr; - buffer_type.device = dev; - repack_buffer_type.device = dev; + buffer_type.device = dev; + repack_buffer_type.device = dev; try { allocate(dev_id); @@ -1866,7 +1852,7 @@ ggml_hexagon_session::ggml_hexagon_session(int dev_id, ggml_backend_dev_t dev) n repack_buffer_type.iface = ggml_backend_hexagon_repack_buffer_type_interface; repack_buffer_type.context = new ggml_backend_hexagon_buffer_type_context(this->name + "-REPACK", this); - } catch (const std::exception & exc) { + } catch (std::exception const &exc) { release(); throw; } @@ -1875,8 +1861,8 @@ ggml_hexagon_session::ggml_hexagon_session(int dev_id, ggml_backend_dev_t dev) n ggml_hexagon_session::~ggml_hexagon_session() noexcept(true) { release(); - delete static_cast(buffer_type.context); - delete static_cast(repack_buffer_type.context); + delete static_cast(buffer_type.context); + delete static_cast(repack_buffer_type.context); } // ** backend interface @@ -2178,11 +2164,11 @@ static bool ggml_hexagon_supported_activations(const struct ggml_hexagon_session } // src0, src1 & dst must be mapped to the same session - if (src1) { + if(src1){ if (!hex_supported_buffer(sess, src0, src1, dst)) { return false; } - } else { + }else{ if (!hex_supported_buffer(sess, src0, dst)) { return false; } @@ -2320,11 +2306,11 @@ static size_t dspqueue_buffers_init(dspqueue_buffer * buf, const ggml_tensor * t memset(buf, 0, sizeof(*buf)); auto tensor_buf = static_cast(t->buffer->context); - buf->fd = tensor_buf->fd; - buf->ptr = t->data; - buf->offset = (uint8_t *) t->data - tensor_buf->base; - buf->size = ggml_nbytes(t); - buf->flags = (flush_host ? DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER : 0); // Flush CPU + buf->fd = tensor_buf->fd; + buf->ptr = t->data; + buf->offset = (uint8_t *) t->data - tensor_buf->base; + buf->size = ggml_nbytes(t); + buf->flags = (flush_host ? DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER : 0); // Flush CPU buf->flags |= (flush_htp ? 
DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT : 0); // Invalidate DSP return 1; } @@ -2684,7 +2670,8 @@ static void ggml_hexagon_unary(const struct ggml_tensor * op, uint32_t flags) { if (ggml_get_unary_op(dst) == GGML_UNARY_OP_SILU) { req.op = HTP_OP_UNARY_SILU; supported = true; - } else if (ggml_get_unary_op(dst) == GGML_UNARY_OP_GELU) { + } + else if (ggml_get_unary_op(dst) == GGML_UNARY_OP_GELU){ req.op = HTP_OP_UNARY_GELU; supported = true; } @@ -2915,7 +2902,8 @@ static inline bool op_reuse_src1(const ggml_tensor * op1, const ggml_tensor * op return (op0 && op0->src[1] == op1->src[1]); } -static inline bool is_compute_op(ggml_tensor * node) { +static inline bool is_compute_op(ggml_tensor *node) +{ return !(ggml_op_is_empty(node->op) || ggml_is_empty(node)); } @@ -3025,17 +3013,29 @@ struct node_info { std::vector fused; - ggml_op op() const { return node->op; } + ggml_op op() const { + return node->op; + } - const ggml_tensor * dst() const { return fused.empty() ? node : fused.back(); } + const ggml_tensor * dst() const { + return fused.empty() ? node : fused.back(); + } - const ggml_tensor * src0() const { return node->src[0]; } + const ggml_tensor * src0() const { + return node->src[0]; + } - const ggml_tensor * src1() const { return node->src[1]; } + const ggml_tensor * src1() const { + return node->src[1]; + } - bool is_empty() const { return ggml_op_is_empty(node->op); } + bool is_empty() const { + return ggml_op_is_empty(node->op); + } - void add_fused(ggml_tensor * t) { fused.push_back(t); } + void add_fused(ggml_tensor * t) { + fused.push_back(t); + } bool stackable() const { switch (this->op()) { @@ -3047,7 +3047,9 @@ struct node_info { } } - bool same_input(const node_info & n) const { return n.src1() == this->src1(); } + bool same_input(const node_info& n) const { + return n.src1() == this->src1(); + } }; static std::vector ggml_hexagon_graph_optimize_reorder(const std::vector & nodes) { @@ -3112,21 +3114,25 @@ static void ggml_backend_hexagon_graph_optimize(ggml_backend_t backend, ggml_cgr // and perform the reorder over the fused nodes. 
after the reorder is done, we unfuse for (int i = 0; i < n; i++) { node_info node = { - /*.node =*/gf->nodes[i], - /*.fused =*/{}, + /*.node =*/ gf->nodes[i], + /*.fused =*/ {}, }; // fuse only ops that start with these operations // can be expanded when needed - if (node.op() == GGML_OP_ADD || node.op() == GGML_OP_NORM || node.op() == GGML_OP_RMS_NORM) { + if (node.op() == GGML_OP_ADD || + node.op() == GGML_OP_NORM || + node.op() == GGML_OP_RMS_NORM) { ops[0] = node.op(); int f = i + 1; while (f < n && f < i + MAX_FUSE) { // conservatively allow fusing only these ops // can be expanded when needed - if (gf->nodes[f]->op != GGML_OP_ADD && gf->nodes[f]->op != GGML_OP_MUL && - gf->nodes[f]->op != GGML_OP_NORM && gf->nodes[f]->op != GGML_OP_RMS_NORM) { + if (gf->nodes[f]->op != GGML_OP_ADD && + gf->nodes[f]->op != GGML_OP_MUL && + gf->nodes[f]->op != GGML_OP_NORM && + gf->nodes[f]->op != GGML_OP_RMS_NORM) { break; } ops[f - i] = gf->nodes[f]->op; @@ -3302,7 +3308,8 @@ static bool ggml_backend_hexagon_device_supports_op(ggml_backend_dev_t dev, cons case GGML_OP_UNARY: if (ggml_get_unary_op(op) == GGML_UNARY_OP_SILU) { supp = ggml_hexagon_supported_activations(sess, op); - } else if (ggml_get_unary_op(op) == GGML_UNARY_OP_GELU) { + } + else if (ggml_get_unary_op(op) == GGML_UNARY_OP_GELU){ supp = ggml_hexagon_supported_activations(sess, op); } break; @@ -3409,7 +3416,7 @@ ggml_hexagon_registry::ggml_hexagon_registry(ggml_backend_reg_t reg) { } } - if (opt_arch < 75) { + if(opt_arch < 75) { opt_ndev = 1; GGML_LOG_WARN("ggml-hex: forcing ndev to 1 for SoCs archs lower than v75.\n"); } @@ -3418,11 +3425,11 @@ ggml_hexagon_registry::ggml_hexagon_registry(ggml_backend_reg_t reg) { // Create devices / sessions for (size_t i = 0; i < opt_ndev; i++) { - devices[i].iface = ggml_backend_hexagon_device_i; - devices[i].reg = reg; + devices[i].iface = ggml_backend_hexagon_device_i; + devices[i].reg = reg; try { devices[i].context = new ggml_hexagon_session(i, &devices[i]); - } catch (const std::exception & exc) { + } catch (std::exception const &exc) { GGML_LOG_ERROR("ggml-hex: failed to create device/session %zu\n", i); devices[i].context = nullptr; } diff --git a/ggml/src/ggml-hexagon/htp/act-ops.c b/ggml/src/ggml-hexagon/htp/act-ops.c index 273179ae2f..9d3e584a84 100644 --- a/ggml/src/ggml-hexagon/htp/act-ops.c +++ b/ggml/src/ggml-hexagon/htp/act-ops.c @@ -255,6 +255,7 @@ static void glu_swiglu_oai_fp32_per_thread(const struct htp_tensor * src0, src1->ne[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1)); } + static void unary_gelu_fp32_per_thread(const struct htp_tensor * src0, struct htp_tensor * dst, const int32_t * op_params, @@ -300,7 +301,7 @@ static void unary_gelu_fp32_per_thread(const struct htp_tensor * src0, const int BLOCK = 8; for (uint32_t ir = src0_start_row; ir < src0_end_row; ir += BLOCK) { const uint32_t block_end = MIN(ir + BLOCK, src0_end_row); - + // Prefetch next block if (block_end < src0_end_row) { const float * restrict prefetch_ptr = (float *) (data_src0 + (block_end * src0_row_size)); @@ -314,11 +315,12 @@ static void unary_gelu_fp32_per_thread(const struct htp_tensor * src0, // gelu = x * sigmoid(1.702 * x) // current implementation if (1 == opt_path) { - hvx_mul_scalar_f32((const uint8_t *) src0, (float) 1.702, (uint8_t *) src0_spad_data, ne0); + hvx_mul_scalar_f32( (const uint8_t *) src0, (float)1.702, (uint8_t *) src0_spad_data, ne0); hvx_fast_sigmoid_f32((const uint8_t *) src0_spad_data, (uint8_t *) src0_spad_data, ne0); 
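[Editor's note: illustrative sketch, not part of the patch. This is a plain scalar reference for the approximation the HVX path in this hunk implements, gelu(x) ~= x * sigmoid(1.702 * x); it can be useful for spot-checking the vectorized kernel on the host. The function name is hypothetical.]

#include <math.h>

/* Scalar reference for the sigmoid-based GELU approximation:
 * gelu(x) ~= x * sigmoid(1.702 * x), the same formula the HVX kernel
 * evaluates via hvx_mul_scalar_f32 + hvx_fast_sigmoid_f32 + hvx_mul_f32. */
static inline float gelu_sigmoid_ref(float x) {
    const float k = 1.702f;
    const float s = 1.0f / (1.0f + expf(-k * x));  /* sigmoid(1.702 * x) */
    return x * s;
}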
hvx_mul_f32_opt((const uint8_t *) src0, src0_spad_data, (uint8_t *) dst, ne0); - } else { - hvx_mul_scalar_f32((const uint8_t *) src0, (float) 1.702, (uint8_t *) src0_spad_data, ne0); + } + else { + hvx_mul_scalar_f32( (const uint8_t *) src0, (float)1.702, (uint8_t *) src0_spad_data, ne0); hvx_sigmoid_f32((const uint8_t *) src0_spad_data, (uint8_t *) src0_spad_data, ne0); hvx_mul_f32((const uint8_t *) src0, src0_spad_data, (uint8_t *) dst, ne0); } @@ -337,6 +339,8 @@ static void unary_gelu_fp32(unsigned int n, unsigned int i, void * data) { octx->src0_nrows_per_thread); } + + static void unary_silu_fp32_per_thread(const struct htp_tensor * src0, struct htp_tensor * dst, const int32_t * op_params, diff --git a/ggml/src/ggml-hexagon/htp/htp-msg.h b/ggml/src/ggml-hexagon/htp/htp-msg.h index 0e893c1d96..a61652304a 100644 --- a/ggml/src/ggml-hexagon/htp/htp-msg.h +++ b/ggml/src/ggml-hexagon/htp/htp-msg.h @@ -120,10 +120,10 @@ static const char * htp_type_name(uint32_t t) { #define HTP_MAX_DIMS 4 struct htp_tensor { - uint32_t data; // Buffer offset in the messages, and data pointer on the NSP - uint32_t type; // Data type - uint32_t ne[HTP_MAX_DIMS]; // Number of elements - uint32_t nb[HTP_MAX_DIMS]; // Stride in bytes (see ggml.h ggml_tensor) + uint32_t data; // Buffer offset in the messages, and data pointer on the NSP + uint32_t type; // Data type + uint32_t ne[HTP_MAX_DIMS]; // Number of elements + uint32_t nb[HTP_MAX_DIMS]; // Stride in bytes (see ggml.h ggml_tensor) }; #define HTP_MAX_OP_PARAMS 64 diff --git a/ggml/src/ggml-hexagon/htp/hvx-utils.c b/ggml/src/ggml-hexagon/htp/hvx-utils.c index 2ac4cfb263..e7ee589f34 100644 --- a/ggml/src/ggml-hexagon/htp/hvx-utils.c +++ b/ggml/src/ggml-hexagon/htp/hvx-utils.c @@ -49,25 +49,28 @@ void hvx_mul_f32(const uint8_t * restrict src0, FARF(HIGH, "hvx_mul_f32: unaligned loop in hvx op, possibly slower execution\n"); } + bool handled_leftover = false; if (0 == unaligned_loop) { HVX_Vector * restrict vec_in1 = (HVX_Vector *) src0; HVX_Vector * restrict vec_in2 = (HVX_Vector *) src1; HVX_Vector * restrict vec_out = (HVX_Vector *) dst; -#pragma unroll(4) + #pragma unroll(4) for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { HVX_Vector v = Q6_Vqf32_vmpy_VsfVsf(*vec_in1++, *vec_in2++); *vec_out++ = Q6_Vsf_equals_Vqf32(v); } } else { - int step_of_1 = num_elems_whole >> 5; // divby 32, because 32 float = 128 bytes per HVX vector + int step_of_1 = num_elems_whole >> 5; // divby 32, because 32 float = 128 bytes per HVX vector int leftover_size = left_over * sizeof(float); - HVX_Vector * restrict vec_in1 = (HVX_Vector *) src0; - HVX_Vector * restrict vec_in2 = (HVX_Vector *) src1; + + HVX_Vector * restrict vec_in1 = (HVX_Vector *) src0; + HVX_Vector * restrict vec_in2 = (HVX_Vector *) src1; HVX_UVector * restrict vec_out = (HVX_UVector *) dst; + HVX_Vector slinep; HVX_Vector slinec; HVX_Vector sline; @@ -75,42 +78,48 @@ void hvx_mul_f32(const uint8_t * restrict src0, HVX_Vector sline2c; HVX_Vector sline2; - slinep = *vec_in1++; + slinep = *vec_in1++; sline2p = *vec_in2++; -#pragma unroll(4) - for (uint32_t i = step_of_1 - 1; i > 0; i--) { - slinec = *vec_in1++; + #pragma unroll(4) + for(uint32_t i = step_of_1 -1; i> 0; i--){ + slinec = *vec_in1++; sline2c = *vec_in2++; - sline = Q6_V_valign_VVR(slinec, slinep, (size_t) src0); - sline2 = Q6_V_valign_VVR(sline2c, sline2p, (size_t) src1); - - *((HVX_UVector *) (vec_out++)) = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(sline, sline2)); - slinep = slinec; - sline2p = sline2c; + sline = Q6_V_valign_VVR(slinec, 
slinep, (size_t) src0); + sline2 = Q6_V_valign_VVR(sline2c, sline2p, (size_t) src1); + + *((HVX_UVector *)(vec_out++)) =Q6_Vsf_equals_Vqf32( Q6_Vqf32_vmpy_VsfVsf(sline, sline2)); + slinep = slinec; + sline2p = sline2c; } - if (step_of_1 > 1) { - slinec = htp_is_aligned(vec_in1, VLEN) && left_over == 0 ? slinep : *vec_in1++; + if(step_of_1 > 1){ + slinec = htp_is_aligned(vec_in1, VLEN) && left_over == 0 ? slinep : *vec_in1++; sline2c = htp_is_aligned(vec_in2, VLEN) && left_over == 0 ? sline2p : *vec_in2++; - sline = Q6_V_valign_VVR(slinec, slinep, (size_t) src0); - sline2 = Q6_V_valign_VVR(sline2c, sline2p, (size_t) src1); - *((HVX_UVector *) (vec_out++)) = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(sline, sline2)); - slinep = slinec; - sline2p = sline2c; + sline = Q6_V_valign_VVR(slinec, slinep, (size_t) src0); + sline2 = Q6_V_valign_VVR(sline2c, sline2p, (size_t) src1); + *((HVX_UVector *)(vec_out++)) =Q6_Vsf_equals_Vqf32( Q6_Vqf32_vmpy_VsfVsf(sline, sline2)); + slinep = slinec; + sline2p = sline2c; } - if (left_over > 0) { - slinec = (is_in_one_chunk(vec_in1, leftover_size, VLEN) ? slinep : *vec_in1++); + if(left_over > 0 ){ - sline = Q6_V_valign_VVR(slinec, slinep, (size_t) src0); - sline2c = (is_in_one_chunk(vec_in2, leftover_size, VLEN) ? sline2p : *vec_in2++); - sline2 = Q6_V_valign_VVR(sline2c, sline2p, (size_t) src1); + slinec = (is_in_one_chunk(vec_in1, leftover_size, VLEN) + ? slinep + : *vec_in1++); + + sline = Q6_V_valign_VVR(slinec, slinep, (size_t) src0); + sline2c = (is_in_one_chunk(vec_in2, leftover_size, VLEN) + ? sline2p + : *vec_in2++); + sline2 = Q6_V_valign_VVR(sline2c, sline2p, (size_t) src1); HVX_Vector out = Q6_Vqf32_vmpy_VsfVsf(sline, sline2); - hvx_vec_store_u(vec_out, leftover_size, Q6_Vsf_equals_Vqf32(out)); + hvx_vec_store_u(vec_out, leftover_size, Q6_Vsf_equals_Vqf32(out)); handled_leftover = true; } } + if (left_over > 0 && !handled_leftover) { const float * src0f = (const float *) src0 + num_elems_whole; const float * src1f = (const float *) src1 + num_elems_whole; @@ -306,13 +315,13 @@ void hvx_add_f32(const uint8_t * restrict src0, HVX_Vector * restrict vec_in2 = (HVX_Vector *) src1; HVX_Vector * restrict vec_out = (HVX_Vector *) dst; -#pragma unroll(4) + #pragma unroll(4) for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { HVX_Vector v = Q6_Vqf32_vadd_VsfVsf(*vec_in1++, *vec_in2++); *vec_out++ = Q6_Vsf_equals_Vqf32(v); } } else { -#pragma unroll(4) + #pragma unroll(4) for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { HVX_Vector in1 = *(HVX_UVector *) (src0 + i * SIZEOF_FP32); HVX_Vector in2 = *(HVX_UVector *) (src1 + i * SIZEOF_FP32); @@ -449,7 +458,7 @@ void hvx_add_scalar_f32(const uint8_t * restrict src, const float val, uint8_t * HVX_Vector * restrict vec_in1 = (HVX_Vector *) src; HVX_Vector * restrict vec_out = (HVX_Vector *) dst; -#pragma unroll(4) + #pragma unroll(4) for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { HVX_Vector in = *vec_in1++; const HVX_VectorPred pred_inf = Q6_Q_vcmp_eq_VwVw(inf, in); @@ -459,7 +468,7 @@ void hvx_add_scalar_f32(const uint8_t * restrict src, const float val, uint8_t * *vec_out++ = v; } } else { -#pragma unroll(4) + #pragma unroll(4) for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { HVX_Vector in = *(HVX_UVector *) (src + i * SIZEOF_FP32); @@ -503,54 +512,60 @@ void hvx_mul_scalar_f32(const uint8_t * restrict src, const float val, uint8_t * FARF(HIGH, "hvx_mul_scalar_f32: unaligned loop in hvx op, possibly slower execution\n"); } - HVX_Vector val_vec = hvx_vec_splat_fp32(val); - bool handled_leftover 
= false; + HVX_Vector val_vec = hvx_vec_splat_fp32(val); + bool handled_leftover = false; if (0 == unaligned_loop) { HVX_Vector * restrict vec_in1 = (HVX_Vector *) src; HVX_Vector * restrict vec_out = (HVX_Vector *) dst; -#pragma unroll(4) + #pragma unroll(4) for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { HVX_Vector v = Q6_Vqf32_vmpy_VsfVsf(*vec_in1++, val_vec); *vec_out++ = Q6_Vsf_equals_Vqf32(v); } } else { - int step_of_1 = num_elems >> 5; // divby 32, because 32 float = 128 bytes per HVX vector + int step_of_1 = num_elems >> 5; // divby 32, because 32 float = 128 bytes per HVX vector int leftover_size = left_over * sizeof(float); - HVX_Vector * input_v_ptr = (HVX_Vector *) src; - HVX_UVector * output_v_ptr = (HVX_UVector *) dst; + + + HVX_Vector * input_v_ptr = (HVX_Vector *) src; + HVX_UVector * output_v_ptr = (HVX_UVector *) dst; + HVX_Vector slinep; HVX_Vector slinec; HVX_Vector sline; + + slinep = *input_v_ptr++; - slinep = *input_v_ptr++; - -#pragma unroll(4) - for (uint32_t i = step_of_1 - 1; i > 0; i--) { - slinec = *input_v_ptr++; - sline = Q6_V_valign_VVR(slinec, slinep, (size_t) src); - *((HVX_UVector *) (output_v_ptr++)) = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(sline, val_vec)); + #pragma unroll(4) + for(uint32_t i = step_of_1 - 1; i > 0; i--){ + slinec = *input_v_ptr++; + sline = Q6_V_valign_VVR(slinec, slinep, (size_t) src); + *((HVX_UVector *)(output_v_ptr++)) = Q6_Vsf_equals_Vqf32( Q6_Vqf32_vmpy_VsfVsf(sline, val_vec)); /* Prepare slinep for next iteration */ - slinep = slinec; + slinep = slinec; } - if (step_of_1 > 0) { + if(step_of_1 > 0){ + slinec = htp_is_aligned(input_v_ptr, VLEN) && left_over == 0 ? slinep : *input_v_ptr++; - sline = Q6_V_valign_VVR(slinec, slinep, (size_t) src); - *((HVX_UVector *) (output_v_ptr++)) = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(sline, val_vec)); + sline = Q6_V_valign_VVR(slinec, slinep, (size_t) src); + *((HVX_UVector *)(output_v_ptr++)) = Q6_Vsf_equals_Vqf32( Q6_Vqf32_vmpy_VsfVsf(sline, val_vec)); slinep = slinec; } - if (leftover_size > 0) { - slinec = (is_in_one_chunk(input_v_ptr, leftover_size, VLEN) ? slinep : *input_v_ptr++); + if(leftover_size > 0){ + slinec = (is_in_one_chunk(input_v_ptr, leftover_size, VLEN) + ? 
slinep + : *input_v_ptr++); sline = Q6_V_valign_VVR(slinec, slinep, (size_t) src); - HVX_Vector sout = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(sline, val_vec)); - hvx_vec_store_u(output_v_ptr, leftover_size, sout); + HVX_Vector sout = Q6_Vsf_equals_Vqf32( Q6_Vqf32_vmpy_VsfVsf(sline, val_vec)); + hvx_vec_store_u(output_v_ptr, leftover_size, sout); handled_leftover = true; } } @@ -591,13 +606,13 @@ void hvx_sub_f32(const uint8_t * restrict src0, HVX_Vector * restrict vec_in2 = (HVX_Vector *) src1; HVX_Vector * restrict vec_out = (HVX_Vector *) dst; -#pragma unroll(4) + #pragma unroll(4) for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { HVX_Vector v = Q6_Vqf32_vsub_VsfVsf(*vec_in1++, *vec_in2++); *vec_out++ = Q6_Vsf_equals_Vqf32(v); } } else { -#pragma unroll(4) + #pragma unroll(4) for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { HVX_Vector in1 = *(HVX_UVector *) (src0 + i * SIZEOF_FP32); HVX_Vector in2 = *(HVX_UVector *) (src1 + i * SIZEOF_FP32); @@ -732,13 +747,13 @@ void hvx_sub_scalar_f32(const uint8_t * restrict src, const float val, uint8_t * HVX_Vector * restrict vec_in1 = (HVX_Vector *) src; HVX_Vector * restrict vec_out = (HVX_Vector *) dst; -#pragma unroll(4) + #pragma unroll(4) for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { HVX_Vector v = Q6_Vqf32_vsub_VsfVsf(*vec_in1++, val_vec); *vec_out++ = Q6_Vsf_equals_Vqf32(v); } } else { -#pragma unroll(4) + #pragma unroll(4) for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { HVX_Vector in = *(HVX_UVector *) (src + i * SIZEOF_FP32); @@ -774,7 +789,7 @@ float hvx_sum_of_squares_f32(const uint8_t * restrict src, const int num_elems) HVX_Vector sum_vec_acc = Q6_V_vsplat_R(0x00000000); HVX_Vector zero_vec = Q6_V_vsplat_R(0x00000000); -#pragma unroll(4) + #pragma unroll(4) for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { HVX_Vector v = Q6_Vqf32_vmpy_VsfVsf(*vec_in1, *vec_in1); sum_vec_acc = Q6_Vqf32_vadd_Vqf32Vqf32(sum_vec_acc, v); @@ -818,13 +833,13 @@ float hvx_self_sum_f32(const uint8_t * restrict src, const int num_elems) { if (0 == unaligned_loop) { HVX_Vector * vec_in = (HVX_Vector *) src; -#pragma unroll(4) + #pragma unroll(4) for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { // sum_vec = Q6_Vqf32_vadd_Vqf32Vsf(sum_vec, *vec_in++); sum_vec = Q6_Vqf32_vadd_VsfVsf(Q6_Vsf_equals_Vqf32(sum_vec), *vec_in++); } } else { -#pragma unroll(4) + #pragma unroll(4) for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { HVX_Vector in = *(HVX_UVector *) (src + i * SIZEOF_FP32); @@ -867,13 +882,13 @@ void hvx_scale_f32(const uint8_t * restrict src, uint8_t * restrict dst, const i HVX_Vector * vec_in1 = (HVX_Vector *) src; HVX_Vector * vec_out = (HVX_Vector *) dst; -#pragma unroll(4) + #pragma unroll(4) for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { HVX_Vector v = Q6_Vqf32_vmpy_VsfVsf(*vec_in1++, scale_vec); *vec_out++ = Q6_Vsf_equals_Vqf32(v); } } else { -#pragma unroll(4) + #pragma unroll(4) for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { HVX_Vector in = *(HVX_UVector *) (src + i * SIZEOF_FP32); @@ -916,12 +931,12 @@ float hvx_self_max_f32(const uint8_t * restrict src, const int num_elems) { if (0 == unaligned_loop) { HVX_Vector * restrict vec_in = (HVX_Vector *) src; -#pragma unroll(4) + #pragma unroll(4) for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { vec_max = Q6_Vsf_vmax_VsfVsf(vec_max, *vec_in++); } } else { -#pragma unroll(4) + #pragma unroll(4) for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { HVX_Vector in = *(HVX_UVector *) (src + i * SIZEOF_FP32); @@ -959,7 +974,7 @@ void 
hvx_min_scalar_f32(const uint8_t * restrict src, const float val, uint8_t * HVX_Vector * restrict vec_in = (HVX_Vector *) src; HVX_Vector * restrict vec_out = (HVX_Vector *) dst; -#pragma unroll(4) + #pragma unroll(4) for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { vec_min = Q6_Vsf_vmin_VsfVsf(vec_min, *vec_in++); *vec_out++ = Q6_Vsf_equals_Vqf32(vec_min); @@ -997,7 +1012,7 @@ void hvx_clamp_scalar_f32(const uint8_t * restrict src, HVX_Vector range_left = hvx_vec_splat_fp32(limit_left); HVX_Vector range_right = hvx_vec_splat_fp32(limit_right); -#pragma unroll(4) + #pragma unroll(4) for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { HVX_Vector in_vec = *vec_in++; HVX_Vector temp_v = in_vec; diff --git a/ggml/src/ggml-hexagon/htp/hvx-utils.h b/ggml/src/ggml-hexagon/htp/hvx-utils.h index c5da167d49..0b24786391 100644 --- a/ggml/src/ggml-hexagon/htp/hvx-utils.h +++ b/ggml/src/ggml-hexagon/htp/hvx-utils.h @@ -23,18 +23,20 @@ typedef union { /* Q6_Vsf_equals_Vw is only available on v73+.*/ #if __HVX_ARCH__ < 73 -static inline HVX_Vector int32_to_qfloat(const HVX_Vector in) { - const HVX_Vector vzero = Q6_V_vzero(); - HVX_VectorPred is_zero = Q6_Q_vcmp_eq_VwVw(in, vzero); - HVX_Vector lshift = Q6_Vw_vnormamt_Vw(in); - HVX_Vector normalized = Q6_Vw_vasl_VwVw(in, lshift); - HVX_Vector vexp = Q6_Vw_vsub_VwVw(Q6_V_vsplat_R(0x7f + 30), lshift); - HVX_Vector mant = Q6_V_vand_VV(Q6_V_vsplat_R(0xFFFFFF00), normalized); - HVX_Vector ret = Q6_V_vmux_QVV(is_zero, vzero, Q6_Vw_vadd_VwVw(mant, vexp)); +static inline HVX_Vector int32_to_qfloat(HVX_Vector const in) +{ + HVX_Vector const vzero = Q6_V_vzero(); + HVX_VectorPred is_zero = Q6_Q_vcmp_eq_VwVw(in, vzero); + HVX_Vector lshift = Q6_Vw_vnormamt_Vw(in); + HVX_Vector normalized = Q6_Vw_vasl_VwVw(in, lshift); + HVX_Vector vexp = Q6_Vw_vsub_VwVw(Q6_V_vsplat_R(0x7f + 30), lshift); + HVX_Vector mant = Q6_V_vand_VV(Q6_V_vsplat_R(0xFFFFFF00), normalized); + HVX_Vector ret = Q6_V_vmux_QVV(is_zero, vzero, Q6_Vw_vadd_VwVw(mant, vexp)); return ret; } -static inline HVX_Vector Q6_Vsf_equals_Vw(const HVX_Vector in) { +static inline HVX_Vector Q6_Vsf_equals_Vw(HVX_Vector const in) +{ return Q6_Vsf_equals_Vqf32(int32_to_qfloat(in)); } #endif @@ -107,7 +109,7 @@ static inline void hvx_copy_fp16_aa(uint8_t * restrict dst, const uint8_t * rest uint32_t i = 0; -#pragma unroll(4) + #pragma unroll(4) for (; i < nvec; i++) { HVX_Vector v = vsrc[i]; vdst[i] = v; @@ -131,7 +133,7 @@ static inline void hvx_copy_fp16_ua(uint8_t * restrict dst, const uint8_t * rest uint32_t i = 0; -#pragma unroll(4) + #pragma unroll(4) for (; i < nvec; i++) { HVX_Vector v = vsrc[i]; vdst[i] = v; @@ -155,7 +157,7 @@ static inline void hvx_copy_fp16_au(uint8_t * restrict dst, const uint8_t * rest uint32_t i = 0; -#pragma unroll(4) + #pragma unroll(4) for (; i < nvec; i++) { HVX_Vector v = vsrc[i]; vdst[i] = v; @@ -180,7 +182,7 @@ static inline void hvx_copy_fp32_aa(uint8_t * restrict dst, const uint8_t * rest uint32_t i = 0; -#pragma unroll(4) + #pragma unroll(4) for (; i < nvec; i++) { HVX_Vector v = vsrc[i]; vdst[i] = v; @@ -204,7 +206,7 @@ static inline void hvx_copy_fp32_ua(uint8_t * restrict dst, const uint8_t * rest uint32_t i = 0; -#pragma unroll(4) + #pragma unroll(4) for (; i < nvec; i++) { HVX_Vector v = vsrc[i]; vdst[i] = v; @@ -228,7 +230,7 @@ static inline void hvx_copy_fp32_au(uint8_t * restrict dst, const uint8_t * rest uint32_t i = 0; -#pragma unroll(4) + #pragma unroll(4) for (; i < nvec; i++) { HVX_Vector v = vsrc[i]; vdst[i] = v; @@ -253,7 +255,7 @@ static inline void 
hvx_bcast_fp32_a(uint8_t * restrict dst, float elem, uint32_t uint32_t i = 0; -#pragma unroll(4) + #pragma unroll(4) for (; i < nvec; i++) { vdst[i] = velem; } @@ -263,6 +265,7 @@ static inline void hvx_bcast_fp32_a(uint8_t * restrict dst, float elem, uint32_t } } + /* Return whether 'n' elements from vector are in the one chunk of 'chunk_size'. */ static __attribute__((always_inline)) int32_t is_in_one_chunk(void * addr, uint32_t n, uint32_t chunk_size) { uint32_t left_off = (size_t) addr & (chunk_size - 1); @@ -270,6 +273,8 @@ static __attribute__((always_inline)) int32_t is_in_one_chunk(void * addr, uint3 return right_off <= chunk_size; } + + static void hvx_vec_dump_fp16_n(char * pref, HVX_Vector v, uint32_t n) { HVX_VectorAlias u = { .v = v }; @@ -987,15 +992,16 @@ static inline void hvx_fast_sigmoid_f32(const uint8_t * restrict src, uint8_t * const HVX_Vector max_exp = hvx_vec_splat_fp32(kMaxExp); const HVX_Vector min_exp = hvx_vec_splat_fp32(kMinExp); -#pragma unroll(4) + #pragma unroll(4) for (int i = 0; i < step_of_1; i++) { v_dst[i] = hvx_vec_fast_sigmoid_fp32_guard(v_src[i], one, max_exp, min_exp); } } -static inline void hvx_sigmoid_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int num_elems) { + +static inline void hvx_sigmoid_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int num_elems){ int step_of_1 = num_elems >> 5; // divby 32, because 32 float = 128 bytes per HVX vector - int leftover = num_elems - (step_of_1 * VLEN_FP32); + int leftover = num_elems - (step_of_1 * VLEN_FP32); int32_t leftover_size = leftover * sizeof(float); @@ -1006,44 +1012,51 @@ static inline void hvx_sigmoid_f32(const uint8_t * restrict src, uint8_t * restr const HVX_Vector max_exp = hvx_vec_splat_fp32(kMaxExp); const HVX_Vector min_exp = hvx_vec_splat_fp32(kMinExp); - const float * input = (float *) src; - float * output = (float *) dst; + const float *input = (float *)src; + float *output = (float *)dst; + + HVX_Vector * input_v_ptr = (HVX_Vector *) input; + HVX_UVector * output_v_ptr = (HVX_UVector *) output; - HVX_Vector * input_v_ptr = (HVX_Vector *) input; - HVX_UVector * output_v_ptr = (HVX_UVector *) output; HVX_Vector slinep; HVX_Vector slinec; HVX_Vector sline; + - slinep = *input_v_ptr++; -#pragma unroll(4) - for (uint32_t i = step_of_1 - 1; i > 0; i--) { - slinec = *input_v_ptr++; - sline = Q6_V_valign_VVR(slinec, slinep, (size_t) input); - *((HVX_UVector *) (output_v_ptr++)) = hvx_vec_fast_sigmoid_fp32_guard(sline, one, max_exp, min_exp); + slinep = *input_v_ptr++; + #pragma unroll(4) + for(uint32_t i = step_of_1 -1; i> 0; i--){ + slinec = *input_v_ptr++; + sline = Q6_V_valign_VVR(slinec, slinep, (size_t) input); + *((HVX_UVector *)(output_v_ptr++)) = hvx_vec_fast_sigmoid_fp32_guard(sline, one, max_exp, min_exp); /* Prepare slinep for next iteration */ - slinep = slinec; + slinep = slinec; } - if (step_of_1 > 0) { + if(step_of_1> 0){ + slinec = htp_is_aligned(input_v_ptr, 128) && leftover == 0 ? slinep : *input_v_ptr++; - sline = Q6_V_valign_VVR(slinec, slinep, (size_t) input); - *((HVX_UVector *) (output_v_ptr++)) = hvx_vec_fast_sigmoid_fp32_guard(sline, one, max_exp, min_exp); - ; + sline = Q6_V_valign_VVR(slinec, slinep, (size_t) input); + *((HVX_UVector *)(output_v_ptr++)) = hvx_vec_fast_sigmoid_fp32_guard(sline, one, max_exp, min_exp);; slinep = slinec; } - if (leftover > 0) { - slinec = (is_in_one_chunk(input_v_ptr, leftover_size, 128) ? slinep : *input_v_ptr++); + if(leftover> 0){ + slinec = (is_in_one_chunk(input_v_ptr, leftover_size, 128) + ? 
slinep + : *input_v_ptr++); sline = Q6_V_valign_VVR(slinec, slinep, (size_t) input); HVX_Vector sout = hvx_vec_fast_sigmoid_fp32_guard(sline, one, max_exp, min_exp); - hvx_vec_store_u(output_v_ptr, leftover_size, sout); + hvx_vec_store_u(output_v_ptr, leftover_size, sout); } + + } + float hvx_sum_of_squares_f32(const uint8_t * restrict src, const int num_elems); void hvx_mul_f32(const uint8_t * restrict src0, const uint8_t * restrict src1, diff --git a/ggml/src/ggml-hexagon/htp/main.c b/ggml/src/ggml-hexagon/htp/main.c index cbfdd0472f..e30ae69502 100644 --- a/ggml/src/ggml-hexagon/htp/main.c +++ b/ggml/src/ggml-hexagon/htp/main.c @@ -151,7 +151,7 @@ static int vtcm_acquire(struct htp_context * ctx) { qurt_thread_set_priority(qurt_thread_get_id(), ctx->thread_prio - 10); err = HAP_compute_res_acquire_cached(ctx->vtcm_rctx, 1000000); if (err != 0) { - FARF(ERROR, "Failed to acquire VTCM: 0x%08x", (unsigned) err); + FARF(ERROR, "Failed to acquire VTCM: 0x%08x", (unsigned)err); abort(); } HAP_compute_res_release_cached(ctx->vtcm_rctx); @@ -159,7 +159,7 @@ static int vtcm_acquire(struct htp_context * ctx) { err = HAP_compute_res_acquire_cached(ctx->vtcm_rctx, 1000000); if (err != 0) { - FARF(ERROR, "Failed to acquire VTCM: 0x%08x", (unsigned) err); + FARF(ERROR, "Failed to acquire VTCM: 0x%08x", (unsigned)err); abort(); } ctx->vtcm_valid = true; @@ -411,7 +411,7 @@ static void proc_matmul_req(struct htp_context * ctx, rsp_bufs[0].ptr = bufs[2].ptr; rsp_bufs[0].size = bufs[2].size; rsp_bufs[0].offset = bufs[2].offset; - rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP + rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate CPU // Setup Op context @@ -453,7 +453,7 @@ static void proc_matmul_id_req(struct htp_context * ctx, rsp_bufs[0].ptr = bufs[3].ptr; rsp_bufs[0].size = bufs[3].size; rsp_bufs[0].offset = bufs[3].offset; - rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP + rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate CPU // Setup Op context @@ -494,7 +494,7 @@ static void proc_binary_req(struct htp_context * ctx, struct htp_general_req * r rsp_bufs[0].ptr = bufs[2].ptr; rsp_bufs[0].offset = bufs[2].offset; rsp_bufs[0].size = bufs[2].size; - rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP + rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate CPU // Setup Op context @@ -533,7 +533,7 @@ static void proc_add_id_req(struct htp_context * ctx, struct htp_general_req * r rsp_bufs[0].ptr = bufs[3].ptr; rsp_bufs[0].offset = bufs[3].offset; rsp_bufs[0].size = bufs[3].size; - rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP + rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate CPU // Setup Op context @@ -574,7 +574,7 @@ static void proc_unary_req(struct htp_context * ctx, struct htp_general_req * re rsp_bufs[0].ptr = bufs[1].ptr; rsp_bufs[0].offset = bufs[1].offset; rsp_bufs[0].size = bufs[1].size; - rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP + rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate CPU // Setup Op context @@ -618,8 +618,8 @@ static void proc_activations_req(struct htp_context * ctx, rsp_bufs[0].ptr = 
 bufs[write_idx].ptr;
     rsp_bufs[0].offset = bufs[write_idx].offset;
     rsp_bufs[0].size   = bufs[write_idx].size;
-    rsp_bufs[0].flags  = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER |          // Flush HTP
-                          DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT);  // Invalidate CPU
+    rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER |         // Flush HTP
+                         DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT);  // Invalidate CPU
 
     // Setup Op context
     struct htp_ops_context octx = { 0 };
@@ -674,8 +674,8 @@ static void proc_rope_req(struct htp_context * ctx,
     rsp_bufs[0].ptr    = bufs[write_idx].ptr;
     rsp_bufs[0].offset = bufs[write_idx].offset;
     rsp_bufs[0].size   = bufs[write_idx].size;
-    rsp_bufs[0].flags  = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER |          // Flush HTP
-                          DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT);  // Invalidate CPU
+    rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER |         // Flush HTP
+                         DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT);  // Invalidate CPU
 
     // Setup Op context
     struct htp_ops_context octx = { 0 };

From 52f43fb962f86975bfff2873f4ac96f671c9a51a Mon Sep 17 00:00:00 2001
From: shouyud
Date: Tue, 16 Dec 2025 16:58:52 -0500
Subject: [PATCH 14/14] fix: fix loop overflow

---
 ggml/src/ggml-hexagon/htp/hvx-utils.c | 4 ++--
 ggml/src/ggml-hexagon/htp/hvx-utils.h | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/ggml/src/ggml-hexagon/htp/hvx-utils.c b/ggml/src/ggml-hexagon/htp/hvx-utils.c
index e7ee589f34..8e5c2c6983 100644
--- a/ggml/src/ggml-hexagon/htp/hvx-utils.c
+++ b/ggml/src/ggml-hexagon/htp/hvx-utils.c
@@ -81,7 +81,7 @@ void hvx_mul_f32(const uint8_t * restrict src0,
     slinep  = *vec_in1++;
     sline2p = *vec_in2++;
     #pragma unroll(4)
-    for(uint32_t i = step_of_1 -1; i> 0; i--){
+    for(int i = step_of_1 -1; i> 0; i--){
         slinec  = *vec_in1++;
         sline2c = *vec_in2++;
         sline   = Q6_V_valign_VVR(slinec, slinep, (size_t) src0);
@@ -540,7 +540,7 @@ void hvx_mul_scalar_f32(const uint8_t * restrict src, const float val, uint8_t *
 
     slinep = *input_v_ptr++;
     #pragma unroll(4)
-    for(uint32_t i = step_of_1 - 1; i > 0; i--){
+    for(int i = step_of_1 - 1; i > 0; i--){
         slinec = *input_v_ptr++;
         sline  = Q6_V_valign_VVR(slinec, slinep, (size_t) src);
         *((HVX_UVector *)(output_v_ptr++)) = Q6_Vsf_equals_Vqf32( Q6_Vqf32_vmpy_VsfVsf(sline, val_vec));
diff --git a/ggml/src/ggml-hexagon/htp/hvx-utils.h b/ggml/src/ggml-hexagon/htp/hvx-utils.h
index 0b24786391..6b5b65a29e 100644
--- a/ggml/src/ggml-hexagon/htp/hvx-utils.h
+++ b/ggml/src/ggml-hexagon/htp/hvx-utils.h
@@ -1026,7 +1026,7 @@ static inline void hvx_sigmoid_f32(const uint8_t * restrict src, uint8_t * restr
 
     slinep = *input_v_ptr++;
     #pragma unroll(4)
-    for(uint32_t i = step_of_1 -1; i> 0; i--){
+    for(int i = step_of_1 -1; i> 0; i--){
         slinec = *input_v_ptr++;
         sline  = Q6_V_valign_VVR(slinec, slinep, (size_t) input);
         *((HVX_UVector *)(output_v_ptr++)) = hvx_vec_fast_sigmoid_fp32_guard(sline, one, max_exp, min_exp);
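[Editor's note: illustrative demo, not part of the patch. PATCH 14/14 switches the HVX loop counters from uint32_t to int because step_of_1 - 1 wraps to UINT32_MAX when step_of_1 is 0 (fewer than 32 floats to process), so a loop that should not execute at all instead runs for billions of iterations and walks off the buffers. The standalone program below reproduces the wrap-around; it is a sketch under that assumption, not project code.]

#include <stdint.h>
#include <stdio.h>

int main(void) {
    const int step_of_1 = 0;  /* e.g. fewer than 32 floats in the row */

    uint32_t bad_iters = 0;
    for (uint32_t i = step_of_1 - 1; i > 0; i--) {  /* wraps: i starts at 0xFFFFFFFF */
        bad_iters++;
        if (bad_iters > 10) break;  /* cut the demo short; the real kernel would overrun its buffers */
    }

    uint32_t good_iters = 0;
    for (int i = step_of_1 - 1; i > 0; i--) {  /* starts at -1, condition is false, never runs */
        good_iters++;
    }

    printf("unsigned counter: %u iterations (runaway), int counter: %u iterations\n",
           bad_iters, good_iters);
    return 0;
}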