Revert "chore: reformat code with clang-formatter to pass cli test"

This reverts commit 952877ec24.
shouyud 2025-12-16 14:28:34 -05:00
parent 952877ec24
commit cf3a65fb73
6 changed files with 260 additions and 221 deletions


@@ -8,8 +8,8 @@
#include <atomic>
#include <chrono>
#include <mutex>
#include <stdexcept>
#include <string>
#include <stdexcept>
#ifdef _WIN32
# include <sal.h>
@@ -53,12 +53,10 @@ static int opt_opmask = HTP_OPMASK_QUEUE | HTP_OPMASK_QUANTIZE | HTP_OPMASK_COMP
static int opt_opsync = 0; // synchronous ops
#define HEX_VERBOSE(...) \
if (opt_verbose) \
GGML_LOG_DEBUG(__VA_ARGS__)
if (opt_verbose) GGML_LOG_DEBUG(__VA_ARGS__)
#define HEX_PROFILE(...) \
if (opt_profile) \
GGML_LOG_INFO(__VA_ARGS__)
if (opt_profile) GGML_LOG_INFO(__VA_ARGS__)
static inline uint64_t hex_is_aligned(void * addr, uint32_t align) {
return ((size_t) addr & (align - 1)) == 0;
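// Worked example of the mask check above (illustrative addresses, not from this
// change): with align = 128, 0x1000 & 0x7F == 0, so the pointer is aligned,
// while 0x1040 & 0x7F == 0x40, so it is not. The check assumes a power-of-two
// alignment, which is what the HVX paths use.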
@@ -220,7 +218,7 @@ struct ggml_hexagon_session {
void allocate(int dev_id) noexcept(false);
void release() noexcept(true);
void enqueue(struct htp_general_req & req, struct dspqueue_buffer * bufs, uint32_t n_bufs, bool sync = false);
void enqueue(struct htp_general_req &req, struct dspqueue_buffer *bufs, uint32_t n_bufs, bool sync = false);
void flush();
ggml_backend_buffer_type buffer_type;
@@ -260,10 +258,7 @@ static inline void hex_print_op_info(const ggml_tensor * op, ggml_hexagon_sessio
names, dims, types, strides, buffs, req_flags);
}
void ggml_hexagon_session::enqueue(struct htp_general_req & req,
struct dspqueue_buffer * bufs,
uint32_t n_bufs,
bool sync) {
void ggml_hexagon_session::enqueue(struct htp_general_req &req, struct dspqueue_buffer *bufs, uint32_t n_bufs, bool sync) {
// Bump pending flag (cleared in the session::flush once we get the response)
this->op_pending++; // atomic inc
@@ -303,13 +298,13 @@ void ggml_hexagon_session::flush() {
// Read response packet from queue
int err = dspqueue_read(q, &flags,
HTP_MAX_PACKET_BUFFERS, // Maximum number of buffer references
&n_bufs, // Number of buffer references
bufs, // Buffer references
sizeof(rsp), // Max message length
&rsp_size, // Message length
(uint8_t *) &rsp,
1000000); // Timeout
HTP_MAX_PACKET_BUFFERS, // Maximum number of buffer references
&n_bufs, // Number of buffer references
bufs, // Buffer references
sizeof(rsp), // Max message length
&rsp_size, // Message length
(uint8_t *) &rsp,
1000000); // Timeout
if (err == AEE_EEXPIRED) {
// TODO: might need to bail out if the HTP is stuck on something
@@ -359,8 +354,8 @@ struct ggml_backend_hexagon_buffer_context {
int err = fastrpc_mmap(s->domain_id, this->fd, (void *) this->base, 0, this->size, FASTRPC_MAP_FD);
if (err != 0) {
GGML_LOG_ERROR("ggml-hex: buffer mapping failed : domain_id %d size %zu fd %d error 0x%08x\n", s->domain_id,
this->size, this->fd, (unsigned) err);
GGML_LOG_ERROR("ggml-hex: buffer mapping failed : domain_id %d size %zu fd %d error 0x%08x\n",
s->domain_id, this->size, this->fd, (unsigned) err);
return false;
}
@@ -391,12 +386,10 @@ struct ggml_backend_hexagon_buffer_context {
size += 4 * 1024; // extra page for padding
if (rpcmem_alloc2) {
this->base =
(uint8_t *) rpcmem_alloc2(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS | RPCMEM_HEAP_NOREG, size);
this->base = (uint8_t *) rpcmem_alloc2(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS | RPCMEM_HEAP_NOREG, size);
} else {
GGML_LOG_INFO("ggml-hex: %s rpcmem_alloc2 not found, falling back to rpcmem_alloc\n", sess->name.c_str());
this->base =
(uint8_t *) rpcmem_alloc(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS | RPCMEM_HEAP_NOREG, size);
this->base = (uint8_t *) rpcmem_alloc(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS | RPCMEM_HEAP_NOREG, size);
}
if (!this->base) {
@@ -460,7 +453,7 @@ static enum ggml_status ggml_backend_hexagon_buffer_init_tensor(ggml_backend_buf
(int) ctx->repack);
if (tensor->view_src != NULL && tensor->view_offs == 0) {
; // nothing to do for the view
; // nothing to do for the view
} else {
if (!ctx->mapped) {
ctx->mmap();
@@ -709,8 +702,8 @@ static void repack_q4_0_q4x4x2(ggml_tensor * t, const void * data, size_t size)
// Ensure we don't try to read more data than is available in the source buffer 'data'
// or write more than the tensor can hold.
const size_t total_tensor_size = (size_t) nrows * row_size;
const size_t n_bytes_to_copy = size < total_tensor_size ? size : total_tensor_size;
const size_t total_tensor_size = (size_t)nrows * row_size;
const size_t n_bytes_to_copy = size < total_tensor_size ? size : total_tensor_size;
// Calculate how many full rows and how many remaining bytes we need to process.
const int64_t n_full_rows = n_bytes_to_copy / row_size;
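// Illustration of the full-row/remainder split shared by all of these repack
// helpers (hypothetical sizes, not taken from this change): with
// n_bytes_to_copy = 4096 and row_size = 720,
//   n_full_rows = 4096 / 720 = 5   // rows copied whole in the main loop
//   n_rem_bytes = 4096 % 720 = 496 // tail handled by the partial-row branch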
@@ -739,7 +732,7 @@ static void repack_q4_0_q4x4x2(ggml_tensor * t, const void * data, size_t size)
// 2. Process the final, potentially partial, row
if (n_rem_bytes > 0) {
const int64_t i = n_full_rows;
const int64_t i = n_full_rows;
const uint8_t * src = (const uint8_t *) data + (i * row_size);
uint8_t * dst = (uint8_t *) t->data + (i * row_size);
@@ -769,8 +762,8 @@ static void repack_q4x4x2_q4_0(void * data, const ggml_tensor * t, size_t size)
size_t row_size_rp = row_size * 2; // extra space for tmp pad (if any)
// Ensure we don't try to copy more data than the tensor actually contains.
const size_t total_tensor_size = (size_t) nrows * row_size;
const size_t n_bytes_to_copy = size < total_tensor_size ? size : total_tensor_size;
const size_t total_tensor_size = (size_t)nrows * row_size;
const size_t n_bytes_to_copy = size < total_tensor_size ? size : total_tensor_size;
// Calculate how many full rows and how many remaining bytes we need to process.
const int64_t n_full_rows = n_bytes_to_copy / row_size;
@@ -799,7 +792,7 @@ static void repack_q4x4x2_q4_0(void * data, const ggml_tensor * t, size_t size)
// 2. Process the final, potentially partial, row
if (n_rem_bytes > 0) {
const int64_t i = n_full_rows;
const int64_t i = n_full_rows;
const uint8_t * src = (const uint8_t *) t->data + (i * row_size);
uint8_t * dst = (uint8_t *) data + (i * row_size);
@@ -1035,8 +1028,8 @@ static void repack_q8_0_q8x4x2(ggml_tensor * t, const void * data, size_t size)
// Ensure we don't try to read more data than is available in the source buffer 'data'
// or write more than the tensor can hold.
const size_t total_tensor_size = (size_t) nrows * row_size;
const size_t n_bytes_to_copy = size < total_tensor_size ? size : total_tensor_size;
const size_t total_tensor_size = (size_t)nrows * row_size;
const size_t n_bytes_to_copy = size < total_tensor_size ? size : total_tensor_size;
// Calculate how many full rows and how many remaining bytes we need to process.
const int64_t n_full_rows = n_bytes_to_copy / row_size;
@@ -1065,7 +1058,7 @@ static void repack_q8_0_q8x4x2(ggml_tensor * t, const void * data, size_t size)
// 2. Process the final, potentially partial, row
if (n_rem_bytes > 0) {
const int64_t i = n_full_rows;
const int64_t i = n_full_rows;
const uint8_t * src = (const uint8_t *) data + (i * row_size);
uint8_t * dst = (uint8_t *) t->data + (i * row_size);
@@ -1095,8 +1088,8 @@ static void repack_q8x4x2_q8_0(void * data, const ggml_tensor * t, size_t size)
size_t row_size_rp = row_size * 2; // extra space for tmp pad (if any)
// Ensure we don't try to copy more data than the tensor actually contains.
const size_t total_tensor_size = (size_t) nrows * row_size;
const size_t n_bytes_to_copy = size < total_tensor_size ? size : total_tensor_size;
const size_t total_tensor_size = (size_t)nrows * row_size;
const size_t n_bytes_to_copy = size < total_tensor_size ? size : total_tensor_size;
// Calculate how many full rows and how many remaining bytes we need to process.
const int64_t n_full_rows = n_bytes_to_copy / row_size;
@@ -1125,7 +1118,7 @@ static void repack_q8x4x2_q8_0(void * data, const ggml_tensor * t, size_t size)
// 2. Process the final, potentially partial, row
if (n_rem_bytes > 0) {
const int64_t i = n_full_rows;
const int64_t i = n_full_rows;
const uint8_t * src = (const uint8_t *) t->data + (i * row_size);
uint8_t * dst = (uint8_t *) data + (i * row_size);
@@ -1386,8 +1379,8 @@ static void repack_mxfp4_mxfp4x4x2(ggml_tensor * t, const void * data, size_t si
// Ensure we don't try to read more data than is available in the source buffer 'data'
// or write more than the tensor can hold.
const size_t total_tensor_size = (size_t) nrows * row_size;
const size_t n_bytes_to_copy = size < total_tensor_size ? size : total_tensor_size;
const size_t total_tensor_size = (size_t)nrows * row_size;
const size_t n_bytes_to_copy = size < total_tensor_size ? size : total_tensor_size;
// Calculate how many full rows and how many remaining bytes we need to process.
const int64_t n_full_rows = n_bytes_to_copy / row_size;
@@ -1416,7 +1409,7 @@ static void repack_mxfp4_mxfp4x4x2(ggml_tensor * t, const void * data, size_t si
// 2. Process the final, potentially partial, row
if (n_rem_bytes > 0) {
const int64_t i = n_full_rows;
const int64_t i = n_full_rows;
const uint8_t * src = (const uint8_t *) data + (i * row_size);
uint8_t * dst = (uint8_t *) t->data + (i * row_size);
@@ -1446,8 +1439,8 @@ static void repack_mxfp4x4x2_mxfp4(void * data, const ggml_tensor * t, size_t si
size_t row_size_rp = row_size * 2; // extra space for tmp pad (if any)
// Ensure we don't try to copy more data than the tensor actually contains.
const size_t total_tensor_size = (size_t) nrows * row_size;
const size_t n_bytes_to_copy = size < total_tensor_size ? size : total_tensor_size;
const size_t total_tensor_size = (size_t)nrows * row_size;
const size_t n_bytes_to_copy = size < total_tensor_size ? size : total_tensor_size;
// Calculate how many full rows and how many remaining bytes we need to process.
const int64_t n_full_rows = n_bytes_to_copy / row_size;
@@ -1476,7 +1469,7 @@ static void repack_mxfp4x4x2_mxfp4(void * data, const ggml_tensor * t, size_t si
// 2. Process the final, potentially partial, row
if (n_rem_bytes > 0) {
const int64_t i = n_full_rows;
const int64_t i = n_full_rows;
const uint8_t * src = (const uint8_t *) t->data + (i * row_size);
uint8_t * dst = (uint8_t *) data + (i * row_size);
@@ -1599,28 +1592,25 @@ static const char * ggml_backend_hexagon_buffer_type_name(ggml_backend_buffer_ty
return static_cast<ggml_backend_hexagon_buffer_type_context *>(buffer_type->context)->name.c_str();
}
static ggml_backend_buffer_t ggml_backend_hexagon_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buffer_type,
size_t size) {
static ggml_backend_buffer_t ggml_backend_hexagon_buffer_type_alloc_buffer(
ggml_backend_buffer_type_t buffer_type, size_t size) {
auto sess = static_cast<ggml_backend_hexagon_buffer_type_context *>(buffer_type->context)->sess;
try {
ggml_backend_hexagon_buffer_context * ctx =
new ggml_backend_hexagon_buffer_context(sess, size, false /*repack*/);
ggml_backend_hexagon_buffer_context * ctx = new ggml_backend_hexagon_buffer_context(sess, size, false /*repack*/);
return ggml_backend_buffer_init(buffer_type, ggml_backend_hexagon_buffer_interface, ctx, size);
} catch (const std::exception & exc) {
} catch (std::exception const &exc) {
GGML_LOG_ERROR("ggml-hex: %s failed to allocate buffer context: %s\n", sess->name.c_str(), exc.what());
return nullptr;
}
}
static ggml_backend_buffer_t ggml_backend_hexagon_repack_buffer_type_alloc_buffer(
ggml_backend_buffer_type_t buffer_type,
size_t size) {
ggml_backend_buffer_type_t buffer_type, size_t size) {
auto sess = static_cast<ggml_backend_hexagon_buffer_type_context *>(buffer_type->context)->sess;
try {
ggml_backend_hexagon_buffer_context * ctx =
new ggml_backend_hexagon_buffer_context(sess, size, true /*repack*/);
ggml_backend_hexagon_buffer_context * ctx = new ggml_backend_hexagon_buffer_context(sess, size, true /*repack*/);
return ggml_backend_buffer_init(buffer_type, ggml_backend_hexagon_buffer_interface, ctx, size);
} catch (const std::exception & exc) {
} catch (std::exception const &exc) {
GGML_LOG_ERROR("ggml-hex: %s failed to allocate buffer context: %s\n", sess->name.c_str(), exc.what());
return nullptr;
}
@@ -1631,8 +1621,7 @@ static size_t ggml_backend_hexagon_buffer_type_get_alignment(ggml_backend_buffer
GGML_UNUSED(buffer_type);
}
static size_t ggml_backend_hexagon_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft,
const struct ggml_tensor * t) {
static size_t ggml_backend_hexagon_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const struct ggml_tensor * t) {
return ggml_nbytes(t);
}
@@ -1708,8 +1697,8 @@ void ggml_hexagon_session::allocate(int dev_id) noexcept(false) {
}
// Save the IDs
this->session_id = n.session_id;
this->domain_id = n.effective_domain_id;
this->session_id = n.session_id;
this->domain_id = n.effective_domain_id;
this->valid_session = true;
}
@@ -1718,17 +1707,16 @@ void ggml_hexagon_session::allocate(int dev_id) noexcept(false) {
char session_uri[256];
{
char htp_uri[256];
snprintf(htp_uri, sizeof(htp_uri), "file:///libggml-htp-v%u.so?htp_iface_skel_handle_invoke&_modver=1.0",
opt_arch);
snprintf(htp_uri, sizeof(htp_uri), "file:///libggml-htp-v%u.so?htp_iface_skel_handle_invoke&_modver=1.0", opt_arch);
struct remote_rpc_get_uri u = {};
u.session_id = this->session_id;
u.domain_name = const_cast<char *>(CDSP_DOMAIN_NAME);
u.domain_name_len = strlen(CDSP_DOMAIN_NAME);
u.module_uri = const_cast<char *>(htp_uri);
u.module_uri_len = strlen(htp_uri);
u.uri = session_uri;
u.uri_len = sizeof(session_uri);
u.session_id = this->session_id;
u.domain_name = const_cast<char *>(CDSP_DOMAIN_NAME);
u.domain_name_len = strlen(CDSP_DOMAIN_NAME);
u.module_uri = const_cast<char *>(htp_uri);
u.module_uri_len = strlen(htp_uri);
u.uri = session_uri;
u.uri_len = sizeof(session_uri);
int err = remote_session_control(FASTRPC_GET_URI, (void *) &u, sizeof(u));
if (err != AEE_SUCCESS) {
@@ -1737,9 +1725,7 @@ void ggml_hexagon_session::allocate(int dev_id) noexcept(false) {
snprintf(session_uri, htp_URI_domain_len, "%s%s", htp_uri, my_domain->uri);
GGML_LOG_WARN(
"ggml-hex: failed to get URI for session %d : error 0x%x. Falling back to single session URI: %s\n",
dev_id, err, session_uri);
GGML_LOG_WARN("ggml-hex: failed to get URI for session %d : error 0x%x. Falling back to single session URI: %s\n", dev_id, err, session_uri);
}
}
@@ -1765,7 +1751,7 @@ void ggml_hexagon_session::allocate(int dev_id) noexcept(false) {
this->valid_handle = true;
GGML_LOG_INFO("ggml-hex: new session: %s : session-id %d domain-id %d uri %s handle 0x%lx\n", this->name.c_str(),
this->session_id, this->domain_id, session_uri, (unsigned long) this->handle);
this->session_id, this->domain_id, session_uri, (unsigned long) this->handle);
// Enable FastRPC QoS mode
{
@@ -1855,8 +1841,8 @@ ggml_hexagon_session::ggml_hexagon_session(int dev_id, ggml_backend_dev_t dev) n
buffer_type.context = nullptr;
repack_buffer_type.context = nullptr;
buffer_type.device = dev;
repack_buffer_type.device = dev;
buffer_type.device = dev;
repack_buffer_type.device = dev;
try {
allocate(dev_id);
@@ -1866,7 +1852,7 @@ ggml_hexagon_session::ggml_hexagon_session(int dev_id, ggml_backend_dev_t dev) n
repack_buffer_type.iface = ggml_backend_hexagon_repack_buffer_type_interface;
repack_buffer_type.context = new ggml_backend_hexagon_buffer_type_context(this->name + "-REPACK", this);
} catch (const std::exception & exc) {
} catch (std::exception const &exc) {
release();
throw;
}
@@ -1875,8 +1861,8 @@ ggml_hexagon_session::ggml_hexagon_session(int dev_id, ggml_backend_dev_t dev) n
ggml_hexagon_session::~ggml_hexagon_session() noexcept(true) {
release();
delete static_cast<ggml_backend_hexagon_buffer_type_context *>(buffer_type.context);
delete static_cast<ggml_backend_hexagon_buffer_type_context *>(repack_buffer_type.context);
delete static_cast<ggml_backend_hexagon_buffer_type_context*>(buffer_type.context);
delete static_cast<ggml_backend_hexagon_buffer_type_context*>(repack_buffer_type.context);
}
// ** backend interface
@@ -2178,11 +2164,11 @@ static bool ggml_hexagon_supported_activations(const struct ggml_hexagon_session
}
// src0, src1 & dst must be mapped to the same session
if (src1) {
if(src1){
if (!hex_supported_buffer(sess, src0, src1, dst)) {
return false;
}
} else {
}else{
if (!hex_supported_buffer(sess, src0, dst)) {
return false;
}
@@ -2320,11 +2306,11 @@ static size_t dspqueue_buffers_init(dspqueue_buffer * buf, const ggml_tensor * t
memset(buf, 0, sizeof(*buf));
auto tensor_buf = static_cast<ggml_backend_hexagon_buffer_context *>(t->buffer->context);
buf->fd = tensor_buf->fd;
buf->ptr = t->data;
buf->offset = (uint8_t *) t->data - tensor_buf->base;
buf->size = ggml_nbytes(t);
buf->flags = (flush_host ? DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER : 0); // Flush CPU
buf->fd = tensor_buf->fd;
buf->ptr = t->data;
buf->offset = (uint8_t *) t->data - tensor_buf->base;
buf->size = ggml_nbytes(t);
buf->flags = (flush_host ? DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER : 0); // Flush CPU
buf->flags |= (flush_htp ? DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT : 0); // Invalidate DSP
return 1;
}
@@ -2684,7 +2670,8 @@ static void ggml_hexagon_unary(const struct ggml_tensor * op, uint32_t flags) {
if (ggml_get_unary_op(dst) == GGML_UNARY_OP_SILU) {
req.op = HTP_OP_UNARY_SILU;
supported = true;
} else if (ggml_get_unary_op(dst) == GGML_UNARY_OP_GELU) {
}
else if (ggml_get_unary_op(dst) == GGML_UNARY_OP_GELU){
req.op = HTP_OP_UNARY_GELU;
supported = true;
}
@@ -2915,7 +2902,8 @@ static inline bool op_reuse_src1(const ggml_tensor * op1, const ggml_tensor * op
return (op0 && op0->src[1] == op1->src[1]);
}
static inline bool is_compute_op(ggml_tensor * node) {
static inline bool is_compute_op(ggml_tensor *node)
{
return !(ggml_op_is_empty(node->op) || ggml_is_empty(node));
}
@@ -3025,17 +3013,29 @@ struct node_info {
std::vector<ggml_tensor *> fused;
ggml_op op() const { return node->op; }
ggml_op op() const {
return node->op;
}
const ggml_tensor * dst() const { return fused.empty() ? node : fused.back(); }
const ggml_tensor * dst() const {
return fused.empty() ? node : fused.back();
}
const ggml_tensor * src0() const { return node->src[0]; }
const ggml_tensor * src0() const {
return node->src[0];
}
const ggml_tensor * src1() const { return node->src[1]; }
const ggml_tensor * src1() const {
return node->src[1];
}
bool is_empty() const { return ggml_op_is_empty(node->op); }
bool is_empty() const {
return ggml_op_is_empty(node->op);
}
void add_fused(ggml_tensor * t) { fused.push_back(t); }
void add_fused(ggml_tensor * t) {
fused.push_back(t);
}
bool stackable() const {
switch (this->op()) {
@@ -3047,7 +3047,9 @@ struct node_info {
}
}
bool same_input(const node_info & n) const { return n.src1() == this->src1(); }
bool same_input(const node_info& n) const {
return n.src1() == this->src1();
}
};
static std::vector<int> ggml_hexagon_graph_optimize_reorder(const std::vector<node_info> & nodes) {
@@ -3112,21 +3114,25 @@ static void ggml_backend_hexagon_graph_optimize(ggml_backend_t backend, ggml_cgr
// and perform the reorder over the fused nodes. after the reorder is done, we unfuse
for (int i = 0; i < n; i++) {
node_info node = {
/*.node =*/gf->nodes[i],
/*.fused =*/{},
/*.node =*/ gf->nodes[i],
/*.fused =*/ {},
};
// fuse only ops that start with these operations
// can be expanded when needed
if (node.op() == GGML_OP_ADD || node.op() == GGML_OP_NORM || node.op() == GGML_OP_RMS_NORM) {
if (node.op() == GGML_OP_ADD ||
node.op() == GGML_OP_NORM ||
node.op() == GGML_OP_RMS_NORM) {
ops[0] = node.op();
int f = i + 1;
while (f < n && f < i + MAX_FUSE) {
// conservatively allow fusing only these ops
// can be expanded when needed
if (gf->nodes[f]->op != GGML_OP_ADD && gf->nodes[f]->op != GGML_OP_MUL &&
gf->nodes[f]->op != GGML_OP_NORM && gf->nodes[f]->op != GGML_OP_RMS_NORM) {
if (gf->nodes[f]->op != GGML_OP_ADD &&
gf->nodes[f]->op != GGML_OP_MUL &&
gf->nodes[f]->op != GGML_OP_NORM &&
gf->nodes[f]->op != GGML_OP_RMS_NORM) {
break;
}
ops[f - i] = gf->nodes[f]->op;
@@ -3302,7 +3308,8 @@ static bool ggml_backend_hexagon_device_supports_op(ggml_backend_dev_t dev, cons
case GGML_OP_UNARY:
if (ggml_get_unary_op(op) == GGML_UNARY_OP_SILU) {
supp = ggml_hexagon_supported_activations(sess, op);
} else if (ggml_get_unary_op(op) == GGML_UNARY_OP_GELU) {
}
else if (ggml_get_unary_op(op) == GGML_UNARY_OP_GELU){
supp = ggml_hexagon_supported_activations(sess, op);
}
break;
@@ -3409,7 +3416,7 @@ ggml_hexagon_registry::ggml_hexagon_registry(ggml_backend_reg_t reg) {
}
}
if (opt_arch < 75) {
if(opt_arch < 75) {
opt_ndev = 1;
GGML_LOG_WARN("ggml-hex: forcing ndev to 1 for SoCs archs lower than v75.\n");
}
@@ -3418,11 +3425,11 @@ ggml_hexagon_registry::ggml_hexagon_registry(ggml_backend_reg_t reg) {
// Create devices / sessions
for (size_t i = 0; i < opt_ndev; i++) {
devices[i].iface = ggml_backend_hexagon_device_i;
devices[i].reg = reg;
devices[i].iface = ggml_backend_hexagon_device_i;
devices[i].reg = reg;
try {
devices[i].context = new ggml_hexagon_session(i, &devices[i]);
} catch (const std::exception & exc) {
} catch (std::exception const &exc) {
GGML_LOG_ERROR("ggml-hex: failed to create device/session %zu\n", i);
devices[i].context = nullptr;
}


@@ -255,6 +255,7 @@ static void glu_swiglu_oai_fp32_per_thread(const struct htp_tensor * src0,
src1->ne[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1));
}
static void unary_gelu_fp32_per_thread(const struct htp_tensor * src0,
struct htp_tensor * dst,
const int32_t * op_params,
@@ -314,11 +315,12 @@ static void unary_gelu_fp32_per_thread(const struct htp_tensor * src0,
// gelu = x * sigmoid(1.702 * x) // current implementation
if (1 == opt_path) {
hvx_mul_scalar_f32((const uint8_t *) src0, (float) 1.702, (uint8_t *) src0_spad_data, ne0);
hvx_mul_scalar_f32( (const uint8_t *) src0, (float)1.702, (uint8_t *) src0_spad_data, ne0);
hvx_fast_sigmoid_f32((const uint8_t *) src0_spad_data, (uint8_t *) src0_spad_data, ne0);
hvx_mul_f32_opt((const uint8_t *) src0, src0_spad_data, (uint8_t *) dst, ne0);
} else {
hvx_mul_scalar_f32((const uint8_t *) src0, (float) 1.702, (uint8_t *) src0_spad_data, ne0);
}
else {
hvx_mul_scalar_f32( (const uint8_t *) src0, (float)1.702, (uint8_t *) src0_spad_data, ne0);
hvx_sigmoid_f32((const uint8_t *) src0_spad_data, (uint8_t *) src0_spad_data, ne0);
hvx_mul_f32((const uint8_t *) src0, src0_spad_data, (uint8_t *) dst, ne0);
}
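// A minimal scalar reference for the approximation named above, assuming
// <math.h> for expf; an illustrative stand-in for the HVX path, not the
// kernel itself: gelu(x) ~= x * sigmoid(1.702 * x) == x / (1 + exp(-1.702 * x)).
static inline float gelu_approx_ref(float x) {
    return x / (1.0f + expf(-1.702f * x));
}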
@@ -337,6 +339,8 @@ static void unary_gelu_fp32(unsigned int n, unsigned int i, void * data) {
octx->src0_nrows_per_thread);
}
static void unary_silu_fp32_per_thread(const struct htp_tensor * src0,
struct htp_tensor * dst,
const int32_t * op_params,


@@ -120,10 +120,10 @@ static const char * htp_type_name(uint32_t t) {
#define HTP_MAX_DIMS 4
struct htp_tensor {
uint32_t data; // Buffer offset in the messages, and data pointer on the NSP
uint32_t type; // Data type
uint32_t ne[HTP_MAX_DIMS]; // Number of elements
uint32_t nb[HTP_MAX_DIMS]; // Stride in bytes (see ggml.h ggml_tensor)
uint32_t data; // Buffer offset in the messages, and data pointer on the NSP
uint32_t type; // Data type
uint32_t ne[HTP_MAX_DIMS]; // Number of elements
uint32_t nb[HTP_MAX_DIMS]; // Stride in bytes (see ggml.h ggml_tensor)
};
#define HTP_MAX_OP_PARAMS 64


@@ -49,25 +49,28 @@ void hvx_mul_f32(const uint8_t * restrict src0,
FARF(HIGH, "hvx_mul_f32: unaligned loop in hvx op, possibly slower execution\n");
}
bool handled_leftover = false;
if (0 == unaligned_loop) {
HVX_Vector * restrict vec_in1 = (HVX_Vector *) src0;
HVX_Vector * restrict vec_in2 = (HVX_Vector *) src1;
HVX_Vector * restrict vec_out = (HVX_Vector *) dst;
#pragma unroll(4)
#pragma unroll(4)
for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
HVX_Vector v = Q6_Vqf32_vmpy_VsfVsf(*vec_in1++, *vec_in2++);
*vec_out++ = Q6_Vsf_equals_Vqf32(v);
}
} else {
int step_of_1 = num_elems_whole >> 5; // divby 32, because 32 float = 128 bytes per HVX vector
int step_of_1 = num_elems_whole >> 5; // divby 32, because 32 float = 128 bytes per HVX vector
int leftover_size = left_over * sizeof(float);
HVX_Vector * restrict vec_in1 = (HVX_Vector *) src0;
HVX_Vector * restrict vec_in2 = (HVX_Vector *) src1;
HVX_Vector * restrict vec_in1 = (HVX_Vector *) src0;
HVX_Vector * restrict vec_in2 = (HVX_Vector *) src1;
HVX_UVector * restrict vec_out = (HVX_UVector *) dst;
HVX_Vector slinep;
HVX_Vector slinec;
HVX_Vector sline;
@@ -75,35 +78,40 @@ void hvx_mul_f32(const uint8_t * restrict src0,
HVX_Vector sline2c;
HVX_Vector sline2;
slinep = *vec_in1++;
slinep = *vec_in1++;
sline2p = *vec_in2++;
#pragma unroll(4)
for (uint32_t i = step_of_1 - 1; i > 0; i--) {
slinec = *vec_in1++;
#pragma unroll(4)
for(uint32_t i = step_of_1 -1; i> 0; i--){
slinec = *vec_in1++;
sline2c = *vec_in2++;
sline = Q6_V_valign_VVR(slinec, slinep, (size_t) src0);
sline2 = Q6_V_valign_VVR(sline2c, sline2p, (size_t) src1);
sline = Q6_V_valign_VVR(slinec, slinep, (size_t) src0);
sline2 = Q6_V_valign_VVR(sline2c, sline2p, (size_t) src1);
*((HVX_UVector *) (vec_out++)) = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(sline, sline2));
slinep = slinec;
sline2p = sline2c;
*((HVX_UVector *)(vec_out++)) =Q6_Vsf_equals_Vqf32( Q6_Vqf32_vmpy_VsfVsf(sline, sline2));
slinep = slinec;
sline2p = sline2c;
}
if (step_of_1 > 1) {
slinec = htp_is_aligned(vec_in1, VLEN) && left_over == 0 ? slinep : *vec_in1++;
if(step_of_1 > 1){
slinec = htp_is_aligned(vec_in1, VLEN) && left_over == 0 ? slinep : *vec_in1++;
sline2c = htp_is_aligned(vec_in2, VLEN) && left_over == 0 ? sline2p : *vec_in2++;
sline = Q6_V_valign_VVR(slinec, slinep, (size_t) src0);
sline2 = Q6_V_valign_VVR(sline2c, sline2p, (size_t) src1);
*((HVX_UVector *) (vec_out++)) = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(sline, sline2));
slinep = slinec;
sline2p = sline2c;
sline = Q6_V_valign_VVR(slinec, slinep, (size_t) src0);
sline2 = Q6_V_valign_VVR(sline2c, sline2p, (size_t) src1);
*((HVX_UVector *)(vec_out++)) =Q6_Vsf_equals_Vqf32( Q6_Vqf32_vmpy_VsfVsf(sline, sline2));
slinep = slinec;
sline2p = sline2c;
}
if (left_over > 0) {
slinec = (is_in_one_chunk(vec_in1, leftover_size, VLEN) ? slinep : *vec_in1++);
if(left_over > 0 ){
sline = Q6_V_valign_VVR(slinec, slinep, (size_t) src0);
sline2c = (is_in_one_chunk(vec_in2, leftover_size, VLEN) ? sline2p : *vec_in2++);
sline2 = Q6_V_valign_VVR(sline2c, sline2p, (size_t) src1);
slinec = (is_in_one_chunk(vec_in1, leftover_size, VLEN)
? slinep
: *vec_in1++);
sline = Q6_V_valign_VVR(slinec, slinep, (size_t) src0);
sline2c = (is_in_one_chunk(vec_in2, leftover_size, VLEN)
? sline2p
: *vec_in2++);
sline2 = Q6_V_valign_VVR(sline2c, sline2p, (size_t) src1);
HVX_Vector out = Q6_Vqf32_vmpy_VsfVsf(sline, sline2);
hvx_vec_store_u(vec_out, leftover_size, Q6_Vsf_equals_Vqf32(out));
@@ -111,6 +119,7 @@ void hvx_mul_f32(const uint8_t * restrict src0,
}
}
if (left_over > 0 && !handled_leftover) {
const float * src0f = (const float *) src0 + num_elems_whole;
const float * src1f = (const float *) src1 + num_elems_whole;
@@ -306,13 +315,13 @@ void hvx_add_f32(const uint8_t * restrict src0,
HVX_Vector * restrict vec_in2 = (HVX_Vector *) src1;
HVX_Vector * restrict vec_out = (HVX_Vector *) dst;
#pragma unroll(4)
#pragma unroll(4)
for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
HVX_Vector v = Q6_Vqf32_vadd_VsfVsf(*vec_in1++, *vec_in2++);
*vec_out++ = Q6_Vsf_equals_Vqf32(v);
}
} else {
#pragma unroll(4)
#pragma unroll(4)
for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
HVX_Vector in1 = *(HVX_UVector *) (src0 + i * SIZEOF_FP32);
HVX_Vector in2 = *(HVX_UVector *) (src1 + i * SIZEOF_FP32);
@@ -449,7 +458,7 @@ void hvx_add_scalar_f32(const uint8_t * restrict src, const float val, uint8_t *
HVX_Vector * restrict vec_in1 = (HVX_Vector *) src;
HVX_Vector * restrict vec_out = (HVX_Vector *) dst;
#pragma unroll(4)
#pragma unroll(4)
for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
HVX_Vector in = *vec_in1++;
const HVX_VectorPred pred_inf = Q6_Q_vcmp_eq_VwVw(inf, in);
@@ -459,7 +468,7 @@ void hvx_add_scalar_f32(const uint8_t * restrict src, const float val, uint8_t *
*vec_out++ = v;
}
} else {
#pragma unroll(4)
#pragma unroll(4)
for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
HVX_Vector in = *(HVX_UVector *) (src + i * SIZEOF_FP32);
@@ -503,23 +512,26 @@ void hvx_mul_scalar_f32(const uint8_t * restrict src, const float val, uint8_t *
FARF(HIGH, "hvx_mul_scalar_f32: unaligned loop in hvx op, possibly slower execution\n");
}
HVX_Vector val_vec = hvx_vec_splat_fp32(val);
bool handled_leftover = false;
HVX_Vector val_vec = hvx_vec_splat_fp32(val);
bool handled_leftover = false;
if (0 == unaligned_loop) {
HVX_Vector * restrict vec_in1 = (HVX_Vector *) src;
HVX_Vector * restrict vec_out = (HVX_Vector *) dst;
#pragma unroll(4)
#pragma unroll(4)
for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
HVX_Vector v = Q6_Vqf32_vmpy_VsfVsf(*vec_in1++, val_vec);
*vec_out++ = Q6_Vsf_equals_Vqf32(v);
}
} else {
int step_of_1 = num_elems >> 5; // divby 32, because 32 float = 128 bytes per HVX vector
int step_of_1 = num_elems >> 5; // divby 32, because 32 float = 128 bytes per HVX vector
int leftover_size = left_over * sizeof(float);
HVX_Vector * input_v_ptr = (HVX_Vector *) src;
HVX_UVector * output_v_ptr = (HVX_UVector *) dst;
HVX_Vector * input_v_ptr = (HVX_Vector *) src;
HVX_UVector * output_v_ptr = (HVX_UVector *) dst;
HVX_Vector slinep;
HVX_Vector slinec;
@@ -527,29 +539,32 @@ void hvx_mul_scalar_f32(const uint8_t * restrict src, const float val, uint8_t *
slinep = *input_v_ptr++;
#pragma unroll(4)
for (uint32_t i = step_of_1 - 1; i > 0; i--) {
slinec = *input_v_ptr++;
sline = Q6_V_valign_VVR(slinec, slinep, (size_t) src);
*((HVX_UVector *) (output_v_ptr++)) = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(sline, val_vec));
#pragma unroll(4)
for(uint32_t i = step_of_1 - 1; i > 0; i--){
slinec = *input_v_ptr++;
sline = Q6_V_valign_VVR(slinec, slinep, (size_t) src);
*((HVX_UVector *)(output_v_ptr++)) = Q6_Vsf_equals_Vqf32( Q6_Vqf32_vmpy_VsfVsf(sline, val_vec));
/* Prepare slinep for next iteration */
slinep = slinec;
slinep = slinec;
}
if (step_of_1 > 0) {
if(step_of_1 > 0){
slinec = htp_is_aligned(input_v_ptr, VLEN) && left_over == 0 ? slinep : *input_v_ptr++;
sline = Q6_V_valign_VVR(slinec, slinep, (size_t) src);
*((HVX_UVector *) (output_v_ptr++)) = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(sline, val_vec));
sline = Q6_V_valign_VVR(slinec, slinep, (size_t) src);
*((HVX_UVector *)(output_v_ptr++)) = Q6_Vsf_equals_Vqf32( Q6_Vqf32_vmpy_VsfVsf(sline, val_vec));
slinep = slinec;
}
if (leftover_size > 0) {
slinec = (is_in_one_chunk(input_v_ptr, leftover_size, VLEN) ? slinep : *input_v_ptr++);
if(leftover_size > 0){
slinec = (is_in_one_chunk(input_v_ptr, leftover_size, VLEN)
? slinep
: *input_v_ptr++);
sline = Q6_V_valign_VVR(slinec, slinep, (size_t) src);
HVX_Vector sout = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(sline, val_vec));
HVX_Vector sout = Q6_Vsf_equals_Vqf32( Q6_Vqf32_vmpy_VsfVsf(sline, val_vec));
hvx_vec_store_u(output_v_ptr, leftover_size, sout);
handled_leftover = true;
}
@@ -591,13 +606,13 @@ void hvx_sub_f32(const uint8_t * restrict src0,
HVX_Vector * restrict vec_in2 = (HVX_Vector *) src1;
HVX_Vector * restrict vec_out = (HVX_Vector *) dst;
#pragma unroll(4)
#pragma unroll(4)
for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
HVX_Vector v = Q6_Vqf32_vsub_VsfVsf(*vec_in1++, *vec_in2++);
*vec_out++ = Q6_Vsf_equals_Vqf32(v);
}
} else {
#pragma unroll(4)
#pragma unroll(4)
for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
HVX_Vector in1 = *(HVX_UVector *) (src0 + i * SIZEOF_FP32);
HVX_Vector in2 = *(HVX_UVector *) (src1 + i * SIZEOF_FP32);
@@ -732,13 +747,13 @@ void hvx_sub_scalar_f32(const uint8_t * restrict src, const float val, uint8_t *
HVX_Vector * restrict vec_in1 = (HVX_Vector *) src;
HVX_Vector * restrict vec_out = (HVX_Vector *) dst;
#pragma unroll(4)
#pragma unroll(4)
for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
HVX_Vector v = Q6_Vqf32_vsub_VsfVsf(*vec_in1++, val_vec);
*vec_out++ = Q6_Vsf_equals_Vqf32(v);
}
} else {
#pragma unroll(4)
#pragma unroll(4)
for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
HVX_Vector in = *(HVX_UVector *) (src + i * SIZEOF_FP32);
@@ -774,7 +789,7 @@ float hvx_sum_of_squares_f32(const uint8_t * restrict src, const int num_elems)
HVX_Vector sum_vec_acc = Q6_V_vsplat_R(0x00000000);
HVX_Vector zero_vec = Q6_V_vsplat_R(0x00000000);
#pragma unroll(4)
#pragma unroll(4)
for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
HVX_Vector v = Q6_Vqf32_vmpy_VsfVsf(*vec_in1, *vec_in1);
sum_vec_acc = Q6_Vqf32_vadd_Vqf32Vqf32(sum_vec_acc, v);
@@ -818,13 +833,13 @@ float hvx_self_sum_f32(const uint8_t * restrict src, const int num_elems) {
if (0 == unaligned_loop) {
HVX_Vector * vec_in = (HVX_Vector *) src;
#pragma unroll(4)
#pragma unroll(4)
for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
// sum_vec = Q6_Vqf32_vadd_Vqf32Vsf(sum_vec, *vec_in++);
sum_vec = Q6_Vqf32_vadd_VsfVsf(Q6_Vsf_equals_Vqf32(sum_vec), *vec_in++);
}
} else {
#pragma unroll(4)
#pragma unroll(4)
for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
HVX_Vector in = *(HVX_UVector *) (src + i * SIZEOF_FP32);
@@ -867,13 +882,13 @@ void hvx_scale_f32(const uint8_t * restrict src, uint8_t * restrict dst, const i
HVX_Vector * vec_in1 = (HVX_Vector *) src;
HVX_Vector * vec_out = (HVX_Vector *) dst;
#pragma unroll(4)
#pragma unroll(4)
for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
HVX_Vector v = Q6_Vqf32_vmpy_VsfVsf(*vec_in1++, scale_vec);
*vec_out++ = Q6_Vsf_equals_Vqf32(v);
}
} else {
#pragma unroll(4)
#pragma unroll(4)
for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
HVX_Vector in = *(HVX_UVector *) (src + i * SIZEOF_FP32);
@@ -916,12 +931,12 @@ float hvx_self_max_f32(const uint8_t * restrict src, const int num_elems) {
if (0 == unaligned_loop) {
HVX_Vector * restrict vec_in = (HVX_Vector *) src;
#pragma unroll(4)
#pragma unroll(4)
for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
vec_max = Q6_Vsf_vmax_VsfVsf(vec_max, *vec_in++);
}
} else {
#pragma unroll(4)
#pragma unroll(4)
for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
HVX_Vector in = *(HVX_UVector *) (src + i * SIZEOF_FP32);
@@ -959,7 +974,7 @@ void hvx_min_scalar_f32(const uint8_t * restrict src, const float val, uint8_t *
HVX_Vector * restrict vec_in = (HVX_Vector *) src;
HVX_Vector * restrict vec_out = (HVX_Vector *) dst;
#pragma unroll(4)
#pragma unroll(4)
for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
vec_min = Q6_Vsf_vmin_VsfVsf(vec_min, *vec_in++);
*vec_out++ = Q6_Vsf_equals_Vqf32(vec_min);
@@ -997,7 +1012,7 @@ void hvx_clamp_scalar_f32(const uint8_t * restrict src,
HVX_Vector range_left = hvx_vec_splat_fp32(limit_left);
HVX_Vector range_right = hvx_vec_splat_fp32(limit_right);
#pragma unroll(4)
#pragma unroll(4)
for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
HVX_Vector in_vec = *vec_in++;
HVX_Vector temp_v = in_vec;


@@ -23,18 +23,20 @@ typedef union {
/* Q6_Vsf_equals_Vw is only available on v73+.*/
#if __HVX_ARCH__ < 73
static inline HVX_Vector int32_to_qfloat(const HVX_Vector in) {
const HVX_Vector vzero = Q6_V_vzero();
HVX_VectorPred is_zero = Q6_Q_vcmp_eq_VwVw(in, vzero);
HVX_Vector lshift = Q6_Vw_vnormamt_Vw(in);
HVX_Vector normalized = Q6_Vw_vasl_VwVw(in, lshift);
HVX_Vector vexp = Q6_Vw_vsub_VwVw(Q6_V_vsplat_R(0x7f + 30), lshift);
HVX_Vector mant = Q6_V_vand_VV(Q6_V_vsplat_R(0xFFFFFF00), normalized);
HVX_Vector ret = Q6_V_vmux_QVV(is_zero, vzero, Q6_Vw_vadd_VwVw(mant, vexp));
static inline HVX_Vector int32_to_qfloat(HVX_Vector const in)
{
HVX_Vector const vzero = Q6_V_vzero();
HVX_VectorPred is_zero = Q6_Q_vcmp_eq_VwVw(in, vzero);
HVX_Vector lshift = Q6_Vw_vnormamt_Vw(in);
HVX_Vector normalized = Q6_Vw_vasl_VwVw(in, lshift);
HVX_Vector vexp = Q6_Vw_vsub_VwVw(Q6_V_vsplat_R(0x7f + 30), lshift);
HVX_Vector mant = Q6_V_vand_VV(Q6_V_vsplat_R(0xFFFFFF00), normalized);
HVX_Vector ret = Q6_V_vmux_QVV(is_zero, vzero, Q6_Vw_vadd_VwVw(mant, vexp));
return ret;
}
static inline HVX_Vector Q6_Vsf_equals_Vw(const HVX_Vector in) {
static inline HVX_Vector Q6_Vsf_equals_Vw(HVX_Vector const in)
{
return Q6_Vsf_equals_Vqf32(int32_to_qfloat(in));
}
#endif
@@ -107,7 +109,7 @@ static inline void hvx_copy_fp16_aa(uint8_t * restrict dst, const uint8_t * rest
uint32_t i = 0;
#pragma unroll(4)
#pragma unroll(4)
for (; i < nvec; i++) {
HVX_Vector v = vsrc[i];
vdst[i] = v;
@@ -131,7 +133,7 @@ static inline void hvx_copy_fp16_ua(uint8_t * restrict dst, const uint8_t * rest
uint32_t i = 0;
#pragma unroll(4)
#pragma unroll(4)
for (; i < nvec; i++) {
HVX_Vector v = vsrc[i];
vdst[i] = v;
@@ -155,7 +157,7 @@ static inline void hvx_copy_fp16_au(uint8_t * restrict dst, const uint8_t * rest
uint32_t i = 0;
#pragma unroll(4)
#pragma unroll(4)
for (; i < nvec; i++) {
HVX_Vector v = vsrc[i];
vdst[i] = v;
@@ -180,7 +182,7 @@ static inline void hvx_copy_fp32_aa(uint8_t * restrict dst, const uint8_t * rest
uint32_t i = 0;
#pragma unroll(4)
#pragma unroll(4)
for (; i < nvec; i++) {
HVX_Vector v = vsrc[i];
vdst[i] = v;
@@ -204,7 +206,7 @@ static inline void hvx_copy_fp32_ua(uint8_t * restrict dst, const uint8_t * rest
uint32_t i = 0;
#pragma unroll(4)
#pragma unroll(4)
for (; i < nvec; i++) {
HVX_Vector v = vsrc[i];
vdst[i] = v;
@@ -228,7 +230,7 @@ static inline void hvx_copy_fp32_au(uint8_t * restrict dst, const uint8_t * rest
uint32_t i = 0;
#pragma unroll(4)
#pragma unroll(4)
for (; i < nvec; i++) {
HVX_Vector v = vsrc[i];
vdst[i] = v;
@@ -253,7 +255,7 @@ static inline void hvx_bcast_fp32_a(uint8_t * restrict dst, float elem, uint32_t
uint32_t i = 0;
#pragma unroll(4)
#pragma unroll(4)
for (; i < nvec; i++) {
vdst[i] = velem;
}
@@ -263,6 +265,7 @@ static inline void hvx_bcast_fp32_a(uint8_t * restrict dst, float elem, uint32_t
}
}
/* Return whether 'n' elements from vector are in the one chunk of 'chunk_size'. */
static __attribute__((always_inline)) int32_t is_in_one_chunk(void * addr, uint32_t n, uint32_t chunk_size) {
uint32_t left_off = (size_t) addr & (chunk_size - 1);
@@ -270,6 +273,8 @@ static __attribute__((always_inline)) int32_t is_in_one_chunk(void * addr, uint3
return right_off <= chunk_size;
}
static void hvx_vec_dump_fp16_n(char * pref, HVX_Vector v, uint32_t n) {
HVX_VectorAlias u = { .v = v };
@@ -987,15 +992,16 @@ static inline void hvx_fast_sigmoid_f32(const uint8_t * restrict src, uint8_t *
const HVX_Vector max_exp = hvx_vec_splat_fp32(kMaxExp);
const HVX_Vector min_exp = hvx_vec_splat_fp32(kMinExp);
#pragma unroll(4)
#pragma unroll(4)
for (int i = 0; i < step_of_1; i++) {
v_dst[i] = hvx_vec_fast_sigmoid_fp32_guard(v_src[i], one, max_exp, min_exp);
}
}
static inline void hvx_sigmoid_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int num_elems) {
static inline void hvx_sigmoid_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int num_elems){
int step_of_1 = num_elems >> 5; // divby 32, because 32 float = 128 bytes per HVX vector
int leftover = num_elems - (step_of_1 * VLEN_FP32);
int leftover = num_elems - (step_of_1 * VLEN_FP32);
int32_t leftover_size = leftover * sizeof(float);
@@ -1006,44 +1012,51 @@ static inline void hvx_sigmoid_f32(const uint8_t * restrict src, uint8_t * restr
const HVX_Vector max_exp = hvx_vec_splat_fp32(kMaxExp);
const HVX_Vector min_exp = hvx_vec_splat_fp32(kMinExp);
const float * input = (float *) src;
float * output = (float *) dst;
const float *input = (float *)src;
float *output = (float *)dst;
HVX_Vector * input_v_ptr = (HVX_Vector *) input;
HVX_UVector * output_v_ptr = (HVX_UVector *) output;
HVX_Vector * input_v_ptr = (HVX_Vector *) input;
HVX_UVector * output_v_ptr = (HVX_UVector *) output;
HVX_Vector slinep;
HVX_Vector slinec;
HVX_Vector sline;
slinep = *input_v_ptr++;
#pragma unroll(4)
for (uint32_t i = step_of_1 - 1; i > 0; i--) {
slinec = *input_v_ptr++;
sline = Q6_V_valign_VVR(slinec, slinep, (size_t) input);
*((HVX_UVector *) (output_v_ptr++)) = hvx_vec_fast_sigmoid_fp32_guard(sline, one, max_exp, min_exp);
#pragma unroll(4)
for(uint32_t i = step_of_1 -1; i> 0; i--){
slinec = *input_v_ptr++;
sline = Q6_V_valign_VVR(slinec, slinep, (size_t) input);
*((HVX_UVector *)(output_v_ptr++)) = hvx_vec_fast_sigmoid_fp32_guard(sline, one, max_exp, min_exp);
/* Prepare slinep for next iteration */
slinep = slinec;
slinep = slinec;
}
if (step_of_1 > 0) {
if(step_of_1> 0){
slinec = htp_is_aligned(input_v_ptr, 128) && leftover == 0 ? slinep : *input_v_ptr++;
sline = Q6_V_valign_VVR(slinec, slinep, (size_t) input);
*((HVX_UVector *) (output_v_ptr++)) = hvx_vec_fast_sigmoid_fp32_guard(sline, one, max_exp, min_exp);
;
sline = Q6_V_valign_VVR(slinec, slinep, (size_t) input);
*((HVX_UVector *)(output_v_ptr++)) = hvx_vec_fast_sigmoid_fp32_guard(sline, one, max_exp, min_exp);;
slinep = slinec;
}
if (leftover > 0) {
slinec = (is_in_one_chunk(input_v_ptr, leftover_size, 128) ? slinep : *input_v_ptr++);
if(leftover> 0){
slinec = (is_in_one_chunk(input_v_ptr, leftover_size, 128)
? slinep
: *input_v_ptr++);
sline = Q6_V_valign_VVR(slinec, slinep, (size_t) input);
HVX_Vector sout = hvx_vec_fast_sigmoid_fp32_guard(sline, one, max_exp, min_exp);
hvx_vec_store_u(output_v_ptr, leftover_size, sout);
}
}
float hvx_sum_of_squares_f32(const uint8_t * restrict src, const int num_elems);
void hvx_mul_f32(const uint8_t * restrict src0,
const uint8_t * restrict src1,


@@ -151,7 +151,7 @@ static int vtcm_acquire(struct htp_context * ctx) {
qurt_thread_set_priority(qurt_thread_get_id(), ctx->thread_prio - 10);
err = HAP_compute_res_acquire_cached(ctx->vtcm_rctx, 1000000);
if (err != 0) {
FARF(ERROR, "Failed to acquire VTCM: 0x%08x", (unsigned) err);
FARF(ERROR, "Failed to acquire VTCM: 0x%08x", (unsigned)err);
abort();
}
HAP_compute_res_release_cached(ctx->vtcm_rctx);
@@ -159,7 +159,7 @@ static int vtcm_acquire(struct htp_context * ctx) {
err = HAP_compute_res_acquire_cached(ctx->vtcm_rctx, 1000000);
if (err != 0) {
FARF(ERROR, "Failed to acquire VTCM: 0x%08x", (unsigned) err);
FARF(ERROR, "Failed to acquire VTCM: 0x%08x", (unsigned)err);
abort();
}
ctx->vtcm_valid = true;
@@ -411,7 +411,7 @@ static void proc_matmul_req(struct htp_context * ctx,
rsp_bufs[0].ptr = bufs[2].ptr;
rsp_bufs[0].size = bufs[2].size;
rsp_bufs[0].offset = bufs[2].offset;
rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP
rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP
DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate CPU
// Setup Op context
@@ -453,7 +453,7 @@ static void proc_matmul_id_req(struct htp_context * ctx,
rsp_bufs[0].ptr = bufs[3].ptr;
rsp_bufs[0].size = bufs[3].size;
rsp_bufs[0].offset = bufs[3].offset;
rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP
rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP
DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate CPU
// Setup Op context
@@ -494,7 +494,7 @@ static void proc_binary_req(struct htp_context * ctx, struct htp_general_req * r
rsp_bufs[0].ptr = bufs[2].ptr;
rsp_bufs[0].offset = bufs[2].offset;
rsp_bufs[0].size = bufs[2].size;
rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP
rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP
DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate CPU
// Setup Op context
@@ -533,7 +533,7 @@ static void proc_add_id_req(struct htp_context * ctx, struct htp_general_req * r
rsp_bufs[0].ptr = bufs[3].ptr;
rsp_bufs[0].offset = bufs[3].offset;
rsp_bufs[0].size = bufs[3].size;
rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP
rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP
DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate CPU
// Setup Op context
@@ -574,7 +574,7 @@ static void proc_unary_req(struct htp_context * ctx, struct htp_general_req * re
rsp_bufs[0].ptr = bufs[1].ptr;
rsp_bufs[0].offset = bufs[1].offset;
rsp_bufs[0].size = bufs[1].size;
rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP
rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP
DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate CPU
// Setup Op context
@@ -618,8 +618,8 @@ static void proc_activations_req(struct htp_context * ctx,
rsp_bufs[0].ptr = bufs[write_idx].ptr;
rsp_bufs[0].offset = bufs[write_idx].offset;
rsp_bufs[0].size = bufs[write_idx].size;
rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP
DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate CPU
rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP
DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate CPU
// Setup Op context
struct htp_ops_context octx = { 0 };
@@ -674,8 +674,8 @@ static void proc_rope_req(struct htp_context * ctx,
rsp_bufs[0].ptr = bufs[write_idx].ptr;
rsp_bufs[0].offset = bufs[write_idx].offset;
rsp_bufs[0].size = bufs[write_idx].size;
rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP
DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate CPU
rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP
DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate CPU
// Setup Op context
struct htp_ops_context octx = { 0 };