From 952877ec24732b12010c7fa7ed3fc8de4b74e718 Mon Sep 17 00:00:00 2001 From: shouyud Date: Tue, 16 Dec 2025 08:41:54 -0500 Subject: [PATCH] chore: reformat code with clang-format to pass CI test --- ggml/src/ggml-hexagon/ggml-hexagon.cpp | 207 ++++++++++++------------- ggml/src/ggml-hexagon/htp/act-ops.c | 12 +- ggml/src/ggml-hexagon/htp/htp-msg.h | 8 +- ggml/src/ggml-hexagon/htp/hvx-utils.c | 145 ++++++++--------- ggml/src/ggml-hexagon/htp/hvx-utils.h | 87 +++++------ ggml/src/ggml-hexagon/htp/main.c | 22 +-- 6 files changed, 221 insertions(+), 260 deletions(-) diff --git a/ggml/src/ggml-hexagon/ggml-hexagon.cpp b/ggml/src/ggml-hexagon/ggml-hexagon.cpp index c45b292a52..781db7facf 100644 --- a/ggml/src/ggml-hexagon/ggml-hexagon.cpp +++ b/ggml/src/ggml-hexagon/ggml-hexagon.cpp @@ -8,8 +8,8 @@ #include #include #include -#include #include +#include #ifdef _WIN32 # include @@ -53,10 +53,12 @@ static int opt_opmask = HTP_OPMASK_QUEUE | HTP_OPMASK_QUANTIZE | HTP_OPMASK_COMP static int opt_opsync = 0; // synchronous ops #define HEX_VERBOSE(...) \ - if (opt_verbose) GGML_LOG_DEBUG(__VA_ARGS__) + if (opt_verbose) \ + GGML_LOG_DEBUG(__VA_ARGS__) #define HEX_PROFILE(...) \ - if (opt_profile) GGML_LOG_INFO(__VA_ARGS__) + if (opt_profile) \ + GGML_LOG_INFO(__VA_ARGS__) static inline uint64_t hex_is_aligned(void * addr, uint32_t align) { return ((size_t) addr & (align - 1)) == 0; @@ -218,7 +220,7 @@ struct ggml_hexagon_session { void allocate(int dev_id) noexcept(false); void release() noexcept(true); - void enqueue(struct htp_general_req &req, struct dspqueue_buffer *bufs, uint32_t n_bufs, bool sync = false); + void enqueue(struct htp_general_req & req, struct dspqueue_buffer * bufs, uint32_t n_bufs, bool sync = false); void flush(); ggml_backend_buffer_type buffer_type; @@ -258,7 +260,10 @@ static inline void hex_print_op_info(const ggml_tensor * op, ggml_hexagon_sessio names, dims, types, strides, buffs, req_flags); } -void ggml_hexagon_session::enqueue(struct htp_general_req &req, struct dspqueue_buffer *bufs, uint32_t n_bufs, bool sync) { +void ggml_hexagon_session::enqueue(struct htp_general_req & req, + struct dspqueue_buffer * bufs, + uint32_t n_bufs, + bool sync) { // Bump pending flag (cleared in the session::flush once we get the responce) this->op_pending++; // atomic inc @@ -298,13 +303,13 @@ void ggml_hexagon_session::flush() { // Read response packet from queue int err = dspqueue_read(q, &flags, - HTP_MAX_PACKET_BUFFERS, // Maximum number of buffer references - &n_bufs, // Number of buffer references - bufs, // Buffer references - sizeof(rsp), // Max message length - &rsp_size, // Message length - (uint8_t *) &rsp, - 1000000); // Timeout + HTP_MAX_PACKET_BUFFERS, // Maximum number of buffer references - &n_bufs, // Number of buffer references + bufs, // Buffer references + sizeof(rsp), // Max message length + &rsp_size, // Message length + (uint8_t *) &rsp, + 1000000); // Timeout if (err == AEE_EEXPIRED) { // TODO: might need to bail out if the HTP is stuck on something @@ -354,8 +359,8 @@ struct ggml_backend_hexagon_buffer_context { int err = fastrpc_mmap(s->domain_id, this->fd, (void *) this->base, 0, this->size, FASTRPC_MAP_FD); if (err != 0) { - GGML_LOG_ERROR("ggml-hex: buffer mapping failed : domain_id %d size %zu fd %d error 0x%08x\n", - s->domain_id, this->size, this->fd, (unsigned) err); + GGML_LOG_ERROR("ggml-hex: buffer mapping failed : domain_id %d size %zu fd %d error 0x%08x\n", s->domain_id, + this->size, this->fd, (unsigned) err); return false; } @@ -386,10 +391,12 @@
struct ggml_backend_hexagon_buffer_context { size += 4 * 1024; // extra page for padding if (rpcmem_alloc2) { - this->base = (uint8_t *) rpcmem_alloc2(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS | RPCMEM_HEAP_NOREG, size); + this->base = + (uint8_t *) rpcmem_alloc2(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS | RPCMEM_HEAP_NOREG, size); } else { GGML_LOG_INFO("ggml-hex: %s rpcmem_alloc2 not found, falling back to rpcmem_alloc\n", sess->name.c_str()); - this->base = (uint8_t *) rpcmem_alloc(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS | RPCMEM_HEAP_NOREG, size); + this->base = + (uint8_t *) rpcmem_alloc(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS | RPCMEM_HEAP_NOREG, size); } if (!this->base) { @@ -453,7 +460,7 @@ static enum ggml_status ggml_backend_hexagon_buffer_init_tensor(ggml_backend_buf (int) ctx->repack); if (tensor->view_src != NULL && tensor->view_offs == 0) { - ; // nothing to do for the view + ; // nothing to do for the view } else { if (!ctx->mapped) { ctx->mmap(); @@ -702,8 +709,8 @@ static void repack_q4_0_q4x4x2(ggml_tensor * t, const void * data, size_t size) // Ensure we don't try to read more data than is available in the source buffer 'data' // or write more than the tensor can hold. - const size_t total_tensor_size = (size_t)nrows * row_size; - const size_t n_bytes_to_copy = size < total_tensor_size ? size : total_tensor_size; + const size_t total_tensor_size = (size_t) nrows * row_size; + const size_t n_bytes_to_copy = size < total_tensor_size ? size : total_tensor_size; // Calculate how many full rows and how many remaining bytes we need to process. const int64_t n_full_rows = n_bytes_to_copy / row_size; @@ -732,7 +739,7 @@ static void repack_q4_0_q4x4x2(ggml_tensor * t, const void * data, size_t size) // 2. Process the final, potentially partial, row if (n_rem_bytes > 0) { - const int64_t i = n_full_rows; + const int64_t i = n_full_rows; const uint8_t * src = (const uint8_t *) data + (i * row_size); uint8_t * dst = (uint8_t *) t->data + (i * row_size); @@ -762,8 +769,8 @@ static void repack_q4x4x2_q4_0(void * data, const ggml_tensor * t, size_t size) size_t row_size_rp = row_size * 2; // extra space for tmp pad (if any) // Ensure we don't try to copy more data than the tensor actually contains. - const size_t total_tensor_size = (size_t)nrows * row_size; - const size_t n_bytes_to_copy = size < total_tensor_size ? size : total_tensor_size; + const size_t total_tensor_size = (size_t) nrows * row_size; + const size_t n_bytes_to_copy = size < total_tensor_size ? size : total_tensor_size; // Calculate how many full rows and how many remaining bytes we need to process. const int64_t n_full_rows = n_bytes_to_copy / row_size; @@ -792,7 +799,7 @@ static void repack_q4x4x2_q4_0(void * data, const ggml_tensor * t, size_t size) // 2. Process the final, potentially partial, row if (n_rem_bytes > 0) { - const int64_t i = n_full_rows; + const int64_t i = n_full_rows; const uint8_t * src = (const uint8_t *) t->data + (i * row_size); uint8_t * dst = (uint8_t *) data + (i * row_size); @@ -1028,8 +1035,8 @@ static void repack_q8_0_q8x4x2(ggml_tensor * t, const void * data, size_t size) // Ensure we don't try to read more data than is available in the source buffer 'data' // or write more than the tensor can hold. - const size_t total_tensor_size = (size_t)nrows * row_size; - const size_t n_bytes_to_copy = size < total_tensor_size ? size : total_tensor_size; + const size_t total_tensor_size = (size_t) nrows * row_size; + const size_t n_bytes_to_copy = size < total_tensor_size ? 
size : total_tensor_size; // Calculate how many full rows and how many remaining bytes we need to process. const int64_t n_full_rows = n_bytes_to_copy / row_size; @@ -1058,7 +1065,7 @@ static void repack_q8_0_q8x4x2(ggml_tensor * t, const void * data, size_t size) // 2. Process the final, potentially partial, row if (n_rem_bytes > 0) { - const int64_t i = n_full_rows; + const int64_t i = n_full_rows; const uint8_t * src = (const uint8_t *) data + (i * row_size); uint8_t * dst = (uint8_t *) t->data + (i * row_size); @@ -1088,8 +1095,8 @@ static void repack_q8x4x2_q8_0(void * data, const ggml_tensor * t, size_t size) size_t row_size_rp = row_size * 2; // extra space for tmp pad (if any) // Ensure we don't try to copy more data than the tensor actually contains. - const size_t total_tensor_size = (size_t)nrows * row_size; - const size_t n_bytes_to_copy = size < total_tensor_size ? size : total_tensor_size; + const size_t total_tensor_size = (size_t) nrows * row_size; + const size_t n_bytes_to_copy = size < total_tensor_size ? size : total_tensor_size; // Calculate how many full rows and how many remaining bytes we need to process. const int64_t n_full_rows = n_bytes_to_copy / row_size; @@ -1118,7 +1125,7 @@ static void repack_q8x4x2_q8_0(void * data, const ggml_tensor * t, size_t size) // 2. Process the final, potentially partial, row if (n_rem_bytes > 0) { - const int64_t i = n_full_rows; + const int64_t i = n_full_rows; const uint8_t * src = (const uint8_t *) t->data + (i * row_size); uint8_t * dst = (uint8_t *) data + (i * row_size); @@ -1379,8 +1386,8 @@ static void repack_mxfp4_mxfp4x4x2(ggml_tensor * t, const void * data, size_t si // Ensure we don't try to read more data than is available in the source buffer 'data' // or write more than the tensor can hold. - const size_t total_tensor_size = (size_t)nrows * row_size; - const size_t n_bytes_to_copy = size < total_tensor_size ? size : total_tensor_size; + const size_t total_tensor_size = (size_t) nrows * row_size; + const size_t n_bytes_to_copy = size < total_tensor_size ? size : total_tensor_size; // Calculate how many full rows and how many remaining bytes we need to process. const int64_t n_full_rows = n_bytes_to_copy / row_size; @@ -1409,7 +1416,7 @@ static void repack_mxfp4_mxfp4x4x2(ggml_tensor * t, const void * data, size_t si // 2. Process the final, potentially partial, row if (n_rem_bytes > 0) { - const int64_t i = n_full_rows; + const int64_t i = n_full_rows; const uint8_t * src = (const uint8_t *) data + (i * row_size); uint8_t * dst = (uint8_t *) t->data + (i * row_size); @@ -1439,8 +1446,8 @@ static void repack_mxfp4x4x2_mxfp4(void * data, const ggml_tensor * t, size_t si size_t row_size_rp = row_size * 2; // extra space for tmp pad (if any) // Ensure we don't try to copy more data than the tensor actually contains. - const size_t total_tensor_size = (size_t)nrows * row_size; - const size_t n_bytes_to_copy = size < total_tensor_size ? size : total_tensor_size; + const size_t total_tensor_size = (size_t) nrows * row_size; + const size_t n_bytes_to_copy = size < total_tensor_size ? size : total_tensor_size; // Calculate how many full rows and how many remaining bytes we need to process. const int64_t n_full_rows = n_bytes_to_copy / row_size; @@ -1469,7 +1476,7 @@ static void repack_mxfp4x4x2_mxfp4(void * data, const ggml_tensor * t, size_t si // 2. 
Process the final, potentially partial, row if (n_rem_bytes > 0) { - const int64_t i = n_full_rows; + const int64_t i = n_full_rows; const uint8_t * src = (const uint8_t *) t->data + (i * row_size); uint8_t * dst = (uint8_t *) data + (i * row_size); @@ -1592,25 +1599,28 @@ static const char * ggml_backend_hexagon_buffer_type_name(ggml_backend_buffer_ty return static_cast(buffer_type->context)->name.c_str(); } -static ggml_backend_buffer_t ggml_backend_hexagon_buffer_type_alloc_buffer( - ggml_backend_buffer_type_t buffer_type, size_t size) { +static ggml_backend_buffer_t ggml_backend_hexagon_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buffer_type, + size_t size) { auto sess = static_cast(buffer_type->context)->sess; try { - ggml_backend_hexagon_buffer_context * ctx = new ggml_backend_hexagon_buffer_context(sess, size, false /*repack*/); + ggml_backend_hexagon_buffer_context * ctx = + new ggml_backend_hexagon_buffer_context(sess, size, false /*repack*/); return ggml_backend_buffer_init(buffer_type, ggml_backend_hexagon_buffer_interface, ctx, size); - } catch (std::exception const &exc) { + } catch (const std::exception & exc) { GGML_LOG_ERROR("ggml-hex: %s failed to allocate buffer context: %s\n", sess->name.c_str(), exc.what()); return nullptr; } } static ggml_backend_buffer_t ggml_backend_hexagon_repack_buffer_type_alloc_buffer( - ggml_backend_buffer_type_t buffer_type, size_t size) { + ggml_backend_buffer_type_t buffer_type, + size_t size) { auto sess = static_cast(buffer_type->context)->sess; try { - ggml_backend_hexagon_buffer_context * ctx = new ggml_backend_hexagon_buffer_context(sess, size, true /*repack*/); + ggml_backend_hexagon_buffer_context * ctx = + new ggml_backend_hexagon_buffer_context(sess, size, true /*repack*/); return ggml_backend_buffer_init(buffer_type, ggml_backend_hexagon_buffer_interface, ctx, size); - } catch (std::exception const &exc) { + } catch (const std::exception & exc) { GGML_LOG_ERROR("ggml-hex: %s failed to allocate buffer context: %s\n", sess->name.c_str(), exc.what()); return nullptr; } @@ -1621,7 +1631,8 @@ static size_t ggml_backend_hexagon_buffer_type_get_alignment(ggml_backend_buffer GGML_UNUSED(buffer_type); } -static size_t ggml_backend_hexagon_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const struct ggml_tensor * t) { +static size_t ggml_backend_hexagon_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, + const struct ggml_tensor * t) { return ggml_nbytes(t); } @@ -1697,8 +1708,8 @@ void ggml_hexagon_session::allocate(int dev_id) noexcept(false) { } // Save the IDs - this->session_id = n.session_id; - this->domain_id = n.effective_domain_id; + this->session_id = n.session_id; + this->domain_id = n.effective_domain_id; this->valid_session = true; } @@ -1707,16 +1718,17 @@ void ggml_hexagon_session::allocate(int dev_id) noexcept(false) { char session_uri[256]; { char htp_uri[256]; - snprintf(htp_uri, sizeof(htp_uri), "file:///libggml-htp-v%u.so?htp_iface_skel_handle_invoke&_modver=1.0", opt_arch); + snprintf(htp_uri, sizeof(htp_uri), "file:///libggml-htp-v%u.so?htp_iface_skel_handle_invoke&_modver=1.0", + opt_arch); struct remote_rpc_get_uri u = {}; - u.session_id = this->session_id; - u.domain_name = const_cast(CDSP_DOMAIN_NAME); - u.domain_name_len = strlen(CDSP_DOMAIN_NAME); - u.module_uri = const_cast(htp_uri); - u.module_uri_len = strlen(htp_uri); - u.uri = session_uri; - u.uri_len = sizeof(session_uri); + u.session_id = this->session_id; + u.domain_name = const_cast(CDSP_DOMAIN_NAME); + u.domain_name_len = 
strlen(CDSP_DOMAIN_NAME); + u.module_uri = const_cast(htp_uri); + u.module_uri_len = strlen(htp_uri); + u.uri = session_uri; + u.uri_len = sizeof(session_uri); int err = remote_session_control(FASTRPC_GET_URI, (void *) &u, sizeof(u)); if (err != AEE_SUCCESS) { @@ -1725,7 +1737,9 @@ void ggml_hexagon_session::allocate(int dev_id) noexcept(false) { snprintf(session_uri, htp_URI_domain_len, "%s%s", htp_uri, my_domain->uri); - GGML_LOG_WARN("ggml-hex: failed to get URI for session %d : error 0x%x. Falling back to single session URI: %s\n", dev_id, err, session_uri); + GGML_LOG_WARN( + "ggml-hex: failed to get URI for session %d : error 0x%x. Falling back to single session URI: %s\n", + dev_id, err, session_uri); } } @@ -1751,7 +1765,7 @@ void ggml_hexagon_session::allocate(int dev_id) noexcept(false) { this->valid_handle = true; GGML_LOG_INFO("ggml-hex: new session: %s : session-id %d domain-id %d uri %s handle 0x%lx\n", this->name.c_str(), - this->session_id, this->domain_id, session_uri, (unsigned long) this->handle); + this->session_id, this->domain_id, session_uri, (unsigned long) this->handle); // Enable FastRPC QoS mode { @@ -1841,8 +1855,8 @@ ggml_hexagon_session::ggml_hexagon_session(int dev_id, ggml_backend_dev_t dev) n buffer_type.context = nullptr; repack_buffer_type.context = nullptr; - buffer_type.device = dev; - repack_buffer_type.device = dev; + buffer_type.device = dev; + repack_buffer_type.device = dev; try { allocate(dev_id); @@ -1852,7 +1866,7 @@ ggml_hexagon_session::ggml_hexagon_session(int dev_id, ggml_backend_dev_t dev) n repack_buffer_type.iface = ggml_backend_hexagon_repack_buffer_type_interface; repack_buffer_type.context = new ggml_backend_hexagon_buffer_type_context(this->name + "-REPACK", this); - } catch (std::exception const &exc) { + } catch (const std::exception & exc) { release(); throw; } @@ -1861,8 +1875,8 @@ ggml_hexagon_session::ggml_hexagon_session(int dev_id, ggml_backend_dev_t dev) n ggml_hexagon_session::~ggml_hexagon_session() noexcept(true) { release(); - delete static_cast(buffer_type.context); - delete static_cast(repack_buffer_type.context); + delete static_cast(buffer_type.context); + delete static_cast(repack_buffer_type.context); } // ** backend interface @@ -2164,11 +2178,11 @@ static bool ggml_hexagon_supported_activations(const struct ggml_hexagon_session } // src0, src1 & dst must be mapped to the same session - if(src1){ + if (src1) { if (!hex_supported_buffer(sess, src0, src1, dst)) { return false; } - }else{ + } else { if (!hex_supported_buffer(sess, src0, dst)) { return false; } @@ -2306,11 +2320,11 @@ static size_t dspqueue_buffers_init(dspqueue_buffer * buf, const ggml_tensor * t memset(buf, 0, sizeof(*buf)); auto tensor_buf = static_cast(t->buffer->context); - buf->fd = tensor_buf->fd; - buf->ptr = t->data; - buf->offset = (uint8_t *) t->data - tensor_buf->base; - buf->size = ggml_nbytes(t); - buf->flags = (flush_host ? DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER : 0); // Flush CPU + buf->fd = tensor_buf->fd; + buf->ptr = t->data; + buf->offset = (uint8_t *) t->data - tensor_buf->base; + buf->size = ggml_nbytes(t); + buf->flags = (flush_host ? DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER : 0); // Flush CPU buf->flags |= (flush_htp ? 
DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT : 0); // Invalidate DSP return 1; } @@ -2670,8 +2684,7 @@ static void ggml_hexagon_unary(const struct ggml_tensor * op, uint32_t flags) { if (ggml_get_unary_op(dst) == GGML_UNARY_OP_SILU) { req.op = HTP_OP_UNARY_SILU; supported = true; - } - else if (ggml_get_unary_op(dst) == GGML_UNARY_OP_GELU){ + } else if (ggml_get_unary_op(dst) == GGML_UNARY_OP_GELU) { req.op = HTP_OP_UNARY_GELU; supported = true; } @@ -2902,8 +2915,7 @@ static inline bool op_reuse_src1(const ggml_tensor * op1, const ggml_tensor * op return (op0 && op0->src[1] == op1->src[1]); } -static inline bool is_compute_op(ggml_tensor *node) -{ +static inline bool is_compute_op(ggml_tensor * node) { return !(ggml_op_is_empty(node->op) || ggml_is_empty(node)); } @@ -3013,29 +3025,17 @@ struct node_info { std::vector fused; - ggml_op op() const { - return node->op; - } + ggml_op op() const { return node->op; } - const ggml_tensor * dst() const { - return fused.empty() ? node : fused.back(); - } + const ggml_tensor * dst() const { return fused.empty() ? node : fused.back(); } - const ggml_tensor * src0() const { - return node->src[0]; - } + const ggml_tensor * src0() const { return node->src[0]; } - const ggml_tensor * src1() const { - return node->src[1]; - } + const ggml_tensor * src1() const { return node->src[1]; } - bool is_empty() const { - return ggml_op_is_empty(node->op); - } + bool is_empty() const { return ggml_op_is_empty(node->op); } - void add_fused(ggml_tensor * t) { - fused.push_back(t); - } + void add_fused(ggml_tensor * t) { fused.push_back(t); } bool stackable() const { switch (this->op()) { @@ -3047,9 +3047,7 @@ struct node_info { } } - bool same_input(const node_info& n) const { - return n.src1() == this->src1(); - } + bool same_input(const node_info & n) const { return n.src1() == this->src1(); } }; static std::vector ggml_hexagon_graph_optimize_reorder(const std::vector & nodes) { @@ -3114,25 +3112,21 @@ static void ggml_backend_hexagon_graph_optimize(ggml_backend_t backend, ggml_cgr // and perform the reorder over the fused nodes. 
after the reorder is done, we unfuse for (int i = 0; i < n; i++) { node_info node = { - /*.node =*/ gf->nodes[i], - /*.fused =*/ {}, + /*.node =*/gf->nodes[i], + /*.fused =*/{}, }; // fuse only ops that start with these operations // can be expanded when needed - if (node.op() == GGML_OP_ADD || - node.op() == GGML_OP_NORM || - node.op() == GGML_OP_RMS_NORM) { + if (node.op() == GGML_OP_ADD || node.op() == GGML_OP_NORM || node.op() == GGML_OP_RMS_NORM) { ops[0] = node.op(); int f = i + 1; while (f < n && f < i + MAX_FUSE) { // conservatively allow fusing only these ops // can be expanded when needed - if (gf->nodes[f]->op != GGML_OP_ADD && - gf->nodes[f]->op != GGML_OP_MUL && - gf->nodes[f]->op != GGML_OP_NORM && - gf->nodes[f]->op != GGML_OP_RMS_NORM) { + if (gf->nodes[f]->op != GGML_OP_ADD && gf->nodes[f]->op != GGML_OP_MUL && + gf->nodes[f]->op != GGML_OP_NORM && gf->nodes[f]->op != GGML_OP_RMS_NORM) { break; } ops[f - i] = gf->nodes[f]->op; @@ -3308,8 +3302,7 @@ static bool ggml_backend_hexagon_device_supports_op(ggml_backend_dev_t dev, cons case GGML_OP_UNARY: if (ggml_get_unary_op(op) == GGML_UNARY_OP_SILU) { supp = ggml_hexagon_supported_activations(sess, op); - } - else if (ggml_get_unary_op(op) == GGML_UNARY_OP_GELU){ + } else if (ggml_get_unary_op(op) == GGML_UNARY_OP_GELU) { supp = ggml_hexagon_supported_activations(sess, op); } break; @@ -3416,7 +3409,7 @@ ggml_hexagon_registry::ggml_hexagon_registry(ggml_backend_reg_t reg) { } } - if(opt_arch < 75) { + if (opt_arch < 75) { opt_ndev = 1; GGML_LOG_WARN("ggml-hex: forcing ndev to 1 for SoCs archs lower than v75.\n"); } @@ -3425,11 +3418,11 @@ ggml_hexagon_registry::ggml_hexagon_registry(ggml_backend_reg_t reg) { // Create devices / sessions for (size_t i = 0; i < opt_ndev; i++) { - devices[i].iface = ggml_backend_hexagon_device_i; - devices[i].reg = reg; + devices[i].iface = ggml_backend_hexagon_device_i; + devices[i].reg = reg; try { devices[i].context = new ggml_hexagon_session(i, &devices[i]); - } catch (std::exception const &exc) { + } catch (const std::exception & exc) { GGML_LOG_ERROR("ggml-hex: failed to create device/session %zu\n", i); devices[i].context = nullptr; } diff --git a/ggml/src/ggml-hexagon/htp/act-ops.c b/ggml/src/ggml-hexagon/htp/act-ops.c index 9d3e584a84..273179ae2f 100644 --- a/ggml/src/ggml-hexagon/htp/act-ops.c +++ b/ggml/src/ggml-hexagon/htp/act-ops.c @@ -255,7 +255,6 @@ static void glu_swiglu_oai_fp32_per_thread(const struct htp_tensor * src0, src1->ne[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1)); } - static void unary_gelu_fp32_per_thread(const struct htp_tensor * src0, struct htp_tensor * dst, const int32_t * op_params, @@ -301,7 +300,7 @@ static void unary_gelu_fp32_per_thread(const struct htp_tensor * src0, const int BLOCK = 8; for (uint32_t ir = src0_start_row; ir < src0_end_row; ir += BLOCK) { const uint32_t block_end = MIN(ir + BLOCK, src0_end_row); - + // Prefetch next block if (block_end < src0_end_row) { const float * restrict prefetch_ptr = (float *) (data_src0 + (block_end * src0_row_size)); @@ -315,12 +314,11 @@ static void unary_gelu_fp32_per_thread(const struct htp_tensor * src0, // gelu = x * sigmoid(1.702 * x) // current implementation if (1 == opt_path) { - hvx_mul_scalar_f32( (const uint8_t *) src0, (float)1.702, (uint8_t *) src0_spad_data, ne0); + hvx_mul_scalar_f32((const uint8_t *) src0, (float) 1.702, (uint8_t *) src0_spad_data, ne0); hvx_fast_sigmoid_f32((const uint8_t *) src0_spad_data, (uint8_t *) src0_spad_data, ne0); 
hvx_mul_f32_opt((const uint8_t *) src0, src0_spad_data, (uint8_t *) dst, ne0); - } - else { - hvx_mul_scalar_f32( (const uint8_t *) src0, (float)1.702, (uint8_t *) src0_spad_data, ne0); + } else { + hvx_mul_scalar_f32((const uint8_t *) src0, (float) 1.702, (uint8_t *) src0_spad_data, ne0); hvx_sigmoid_f32((const uint8_t *) src0_spad_data, (uint8_t *) src0_spad_data, ne0); hvx_mul_f32((const uint8_t *) src0, src0_spad_data, (uint8_t *) dst, ne0); } @@ -339,8 +337,6 @@ static void unary_gelu_fp32(unsigned int n, unsigned int i, void * data) { octx->src0_nrows_per_thread); } - - static void unary_silu_fp32_per_thread(const struct htp_tensor * src0, struct htp_tensor * dst, const int32_t * op_params, diff --git a/ggml/src/ggml-hexagon/htp/htp-msg.h b/ggml/src/ggml-hexagon/htp/htp-msg.h index a61652304a..0e893c1d96 100644 --- a/ggml/src/ggml-hexagon/htp/htp-msg.h +++ b/ggml/src/ggml-hexagon/htp/htp-msg.h @@ -120,10 +120,10 @@ static const char * htp_type_name(uint32_t t) { #define HTP_MAX_DIMS 4 struct htp_tensor { - uint32_t data; // Buffer offset in the messages, and data pointer on the NSP - uint32_t type; // Data type - uint32_t ne[HTP_MAX_DIMS]; // Number of elements - uint32_t nb[HTP_MAX_DIMS]; // Stride in bytes (see ggml.h ggml_tensor) + uint32_t data; // Buffer offset in the messages, and data pointer on the NSP + uint32_t type; // Data type + uint32_t ne[HTP_MAX_DIMS]; // Number of elements + uint32_t nb[HTP_MAX_DIMS]; // Stride in bytes (see ggml.h ggml_tensor) }; #define HTP_MAX_OP_PARAMS 64 diff --git a/ggml/src/ggml-hexagon/htp/hvx-utils.c b/ggml/src/ggml-hexagon/htp/hvx-utils.c index e7ee589f34..2ac4cfb263 100644 --- a/ggml/src/ggml-hexagon/htp/hvx-utils.c +++ b/ggml/src/ggml-hexagon/htp/hvx-utils.c @@ -49,28 +49,25 @@ void hvx_mul_f32(const uint8_t * restrict src0, FARF(HIGH, "hvx_mul_f32: unaligned loop in hvx op, possibly slower execution\n"); } - bool handled_leftover = false; if (0 == unaligned_loop) { HVX_Vector * restrict vec_in1 = (HVX_Vector *) src0; HVX_Vector * restrict vec_in2 = (HVX_Vector *) src1; HVX_Vector * restrict vec_out = (HVX_Vector *) dst; - #pragma unroll(4) +#pragma unroll(4) for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { HVX_Vector v = Q6_Vqf32_vmpy_VsfVsf(*vec_in1++, *vec_in2++); *vec_out++ = Q6_Vsf_equals_Vqf32(v); } } else { - int step_of_1 = num_elems_whole >> 5; // divby 32, because 32 float = 128 bytes per HVX vector + int step_of_1 = num_elems_whole >> 5; // divby 32, because 32 float = 128 bytes per HVX vector int leftover_size = left_over * sizeof(float); - - HVX_Vector * restrict vec_in1 = (HVX_Vector *) src0; - HVX_Vector * restrict vec_in2 = (HVX_Vector *) src1; + HVX_Vector * restrict vec_in1 = (HVX_Vector *) src0; + HVX_Vector * restrict vec_in2 = (HVX_Vector *) src1; HVX_UVector * restrict vec_out = (HVX_UVector *) dst; - HVX_Vector slinep; HVX_Vector slinec; HVX_Vector sline; @@ -78,48 +75,42 @@ void hvx_mul_f32(const uint8_t * restrict src0, HVX_Vector sline2c; HVX_Vector sline2; - slinep = *vec_in1++; + slinep = *vec_in1++; sline2p = *vec_in2++; - #pragma unroll(4) - for(uint32_t i = step_of_1 -1; i> 0; i--){ - slinec = *vec_in1++; +#pragma unroll(4) + for (uint32_t i = step_of_1 - 1; i > 0; i--) { + slinec = *vec_in1++; sline2c = *vec_in2++; - sline = Q6_V_valign_VVR(slinec, slinep, (size_t) src0); - sline2 = Q6_V_valign_VVR(sline2c, sline2p, (size_t) src1); - - *((HVX_UVector *)(vec_out++)) =Q6_Vsf_equals_Vqf32( Q6_Vqf32_vmpy_VsfVsf(sline, sline2)); - slinep = slinec; - sline2p = sline2c; + sline = Q6_V_valign_VVR(slinec, 
slinep, (size_t) src0); + sline2 = Q6_V_valign_VVR(sline2c, sline2p, (size_t) src1); + + *((HVX_UVector *) (vec_out++)) = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(sline, sline2)); + slinep = slinec; + sline2p = sline2c; } - if(step_of_1 > 1){ - slinec = htp_is_aligned(vec_in1, VLEN) && left_over == 0 ? slinep : *vec_in1++; + if (step_of_1 > 1) { + slinec = htp_is_aligned(vec_in1, VLEN) && left_over == 0 ? slinep : *vec_in1++; sline2c = htp_is_aligned(vec_in2, VLEN) && left_over == 0 ? sline2p : *vec_in2++; - sline = Q6_V_valign_VVR(slinec, slinep, (size_t) src0); - sline2 = Q6_V_valign_VVR(sline2c, sline2p, (size_t) src1); - *((HVX_UVector *)(vec_out++)) =Q6_Vsf_equals_Vqf32( Q6_Vqf32_vmpy_VsfVsf(sline, sline2)); - slinep = slinec; - sline2p = sline2c; + sline = Q6_V_valign_VVR(slinec, slinep, (size_t) src0); + sline2 = Q6_V_valign_VVR(sline2c, sline2p, (size_t) src1); + *((HVX_UVector *) (vec_out++)) = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(sline, sline2)); + slinep = slinec; + sline2p = sline2c; } - if(left_over > 0 ){ + if (left_over > 0) { + slinec = (is_in_one_chunk(vec_in1, leftover_size, VLEN) ? slinep : *vec_in1++); - slinec = (is_in_one_chunk(vec_in1, leftover_size, VLEN) - ? slinep - : *vec_in1++); - - sline = Q6_V_valign_VVR(slinec, slinep, (size_t) src0); - sline2c = (is_in_one_chunk(vec_in2, leftover_size, VLEN) - ? sline2p - : *vec_in2++); - sline2 = Q6_V_valign_VVR(sline2c, sline2p, (size_t) src1); + sline = Q6_V_valign_VVR(slinec, slinep, (size_t) src0); + sline2c = (is_in_one_chunk(vec_in2, leftover_size, VLEN) ? sline2p : *vec_in2++); + sline2 = Q6_V_valign_VVR(sline2c, sline2p, (size_t) src1); HVX_Vector out = Q6_Vqf32_vmpy_VsfVsf(sline, sline2); - hvx_vec_store_u(vec_out, leftover_size, Q6_Vsf_equals_Vqf32(out)); + hvx_vec_store_u(vec_out, leftover_size, Q6_Vsf_equals_Vqf32(out)); handled_leftover = true; } } - if (left_over > 0 && !handled_leftover) { const float * src0f = (const float *) src0 + num_elems_whole; const float * src1f = (const float *) src1 + num_elems_whole; @@ -315,13 +306,13 @@ void hvx_add_f32(const uint8_t * restrict src0, HVX_Vector * restrict vec_in2 = (HVX_Vector *) src1; HVX_Vector * restrict vec_out = (HVX_Vector *) dst; - #pragma unroll(4) +#pragma unroll(4) for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { HVX_Vector v = Q6_Vqf32_vadd_VsfVsf(*vec_in1++, *vec_in2++); *vec_out++ = Q6_Vsf_equals_Vqf32(v); } } else { - #pragma unroll(4) +#pragma unroll(4) for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { HVX_Vector in1 = *(HVX_UVector *) (src0 + i * SIZEOF_FP32); HVX_Vector in2 = *(HVX_UVector *) (src1 + i * SIZEOF_FP32); @@ -458,7 +449,7 @@ void hvx_add_scalar_f32(const uint8_t * restrict src, const float val, uint8_t * HVX_Vector * restrict vec_in1 = (HVX_Vector *) src; HVX_Vector * restrict vec_out = (HVX_Vector *) dst; - #pragma unroll(4) +#pragma unroll(4) for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { HVX_Vector in = *vec_in1++; const HVX_VectorPred pred_inf = Q6_Q_vcmp_eq_VwVw(inf, in); @@ -468,7 +459,7 @@ void hvx_add_scalar_f32(const uint8_t * restrict src, const float val, uint8_t * *vec_out++ = v; } } else { - #pragma unroll(4) +#pragma unroll(4) for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { HVX_Vector in = *(HVX_UVector *) (src + i * SIZEOF_FP32); @@ -512,60 +503,54 @@ void hvx_mul_scalar_f32(const uint8_t * restrict src, const float val, uint8_t * FARF(HIGH, "hvx_mul_scalar_f32: unaligned loop in hvx op, possibly slower execution\n"); } - HVX_Vector val_vec = hvx_vec_splat_fp32(val); - bool 
handled_leftover = false; + HVX_Vector val_vec = hvx_vec_splat_fp32(val); + bool handled_leftover = false; if (0 == unaligned_loop) { HVX_Vector * restrict vec_in1 = (HVX_Vector *) src; HVX_Vector * restrict vec_out = (HVX_Vector *) dst; - #pragma unroll(4) +#pragma unroll(4) for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { HVX_Vector v = Q6_Vqf32_vmpy_VsfVsf(*vec_in1++, val_vec); *vec_out++ = Q6_Vsf_equals_Vqf32(v); } } else { - int step_of_1 = num_elems >> 5; // divby 32, because 32 float = 128 bytes per HVX vector + int step_of_1 = num_elems >> 5; // divby 32, because 32 float = 128 bytes per HVX vector int leftover_size = left_over * sizeof(float); - - - HVX_Vector * input_v_ptr = (HVX_Vector *) src; - HVX_UVector * output_v_ptr = (HVX_UVector *) dst; - + HVX_Vector * input_v_ptr = (HVX_Vector *) src; + HVX_UVector * output_v_ptr = (HVX_UVector *) dst; HVX_Vector slinep; HVX_Vector slinec; HVX_Vector sline; - - slinep = *input_v_ptr++; - #pragma unroll(4) - for(uint32_t i = step_of_1 - 1; i > 0; i--){ - slinec = *input_v_ptr++; - sline = Q6_V_valign_VVR(slinec, slinep, (size_t) src); - *((HVX_UVector *)(output_v_ptr++)) = Q6_Vsf_equals_Vqf32( Q6_Vqf32_vmpy_VsfVsf(sline, val_vec)); + slinep = *input_v_ptr++; + +#pragma unroll(4) + for (uint32_t i = step_of_1 - 1; i > 0; i--) { + slinec = *input_v_ptr++; + sline = Q6_V_valign_VVR(slinec, slinep, (size_t) src); + *((HVX_UVector *) (output_v_ptr++)) = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(sline, val_vec)); /* Prepare slinep for next iteration */ - slinep = slinec; + slinep = slinec; } - if(step_of_1 > 0){ - + if (step_of_1 > 0) { slinec = htp_is_aligned(input_v_ptr, VLEN) && left_over == 0 ? slinep : *input_v_ptr++; - sline = Q6_V_valign_VVR(slinec, slinep, (size_t) src); - *((HVX_UVector *)(output_v_ptr++)) = Q6_Vsf_equals_Vqf32( Q6_Vqf32_vmpy_VsfVsf(sline, val_vec)); + sline = Q6_V_valign_VVR(slinec, slinep, (size_t) src); + *((HVX_UVector *) (output_v_ptr++)) = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(sline, val_vec)); slinep = slinec; } - if(leftover_size > 0){ - slinec = (is_in_one_chunk(input_v_ptr, leftover_size, VLEN) - ? slinep - : *input_v_ptr++); + if (leftover_size > 0) { + slinec = (is_in_one_chunk(input_v_ptr, leftover_size, VLEN) ? 
slinep : *input_v_ptr++); sline = Q6_V_valign_VVR(slinec, slinep, (size_t) src); - HVX_Vector sout = Q6_Vsf_equals_Vqf32( Q6_Vqf32_vmpy_VsfVsf(sline, val_vec)); - hvx_vec_store_u(output_v_ptr, leftover_size, sout); + HVX_Vector sout = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(sline, val_vec)); + hvx_vec_store_u(output_v_ptr, leftover_size, sout); handled_leftover = true; } } @@ -606,13 +591,13 @@ void hvx_sub_f32(const uint8_t * restrict src0, HVX_Vector * restrict vec_in2 = (HVX_Vector *) src1; HVX_Vector * restrict vec_out = (HVX_Vector *) dst; - #pragma unroll(4) +#pragma unroll(4) for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { HVX_Vector v = Q6_Vqf32_vsub_VsfVsf(*vec_in1++, *vec_in2++); *vec_out++ = Q6_Vsf_equals_Vqf32(v); } } else { - #pragma unroll(4) +#pragma unroll(4) for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { HVX_Vector in1 = *(HVX_UVector *) (src0 + i * SIZEOF_FP32); HVX_Vector in2 = *(HVX_UVector *) (src1 + i * SIZEOF_FP32); @@ -747,13 +732,13 @@ void hvx_sub_scalar_f32(const uint8_t * restrict src, const float val, uint8_t * HVX_Vector * restrict vec_in1 = (HVX_Vector *) src; HVX_Vector * restrict vec_out = (HVX_Vector *) dst; - #pragma unroll(4) +#pragma unroll(4) for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { HVX_Vector v = Q6_Vqf32_vsub_VsfVsf(*vec_in1++, val_vec); *vec_out++ = Q6_Vsf_equals_Vqf32(v); } } else { - #pragma unroll(4) +#pragma unroll(4) for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { HVX_Vector in = *(HVX_UVector *) (src + i * SIZEOF_FP32); @@ -789,7 +774,7 @@ float hvx_sum_of_squares_f32(const uint8_t * restrict src, const int num_elems) HVX_Vector sum_vec_acc = Q6_V_vsplat_R(0x00000000); HVX_Vector zero_vec = Q6_V_vsplat_R(0x00000000); - #pragma unroll(4) +#pragma unroll(4) for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { HVX_Vector v = Q6_Vqf32_vmpy_VsfVsf(*vec_in1, *vec_in1); sum_vec_acc = Q6_Vqf32_vadd_Vqf32Vqf32(sum_vec_acc, v); @@ -833,13 +818,13 @@ float hvx_self_sum_f32(const uint8_t * restrict src, const int num_elems) { if (0 == unaligned_loop) { HVX_Vector * vec_in = (HVX_Vector *) src; - #pragma unroll(4) +#pragma unroll(4) for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { // sum_vec = Q6_Vqf32_vadd_Vqf32Vsf(sum_vec, *vec_in++); sum_vec = Q6_Vqf32_vadd_VsfVsf(Q6_Vsf_equals_Vqf32(sum_vec), *vec_in++); } } else { - #pragma unroll(4) +#pragma unroll(4) for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { HVX_Vector in = *(HVX_UVector *) (src + i * SIZEOF_FP32); @@ -882,13 +867,13 @@ void hvx_scale_f32(const uint8_t * restrict src, uint8_t * restrict dst, const i HVX_Vector * vec_in1 = (HVX_Vector *) src; HVX_Vector * vec_out = (HVX_Vector *) dst; - #pragma unroll(4) +#pragma unroll(4) for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { HVX_Vector v = Q6_Vqf32_vmpy_VsfVsf(*vec_in1++, scale_vec); *vec_out++ = Q6_Vsf_equals_Vqf32(v); } } else { - #pragma unroll(4) +#pragma unroll(4) for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { HVX_Vector in = *(HVX_UVector *) (src + i * SIZEOF_FP32); @@ -931,12 +916,12 @@ float hvx_self_max_f32(const uint8_t * restrict src, const int num_elems) { if (0 == unaligned_loop) { HVX_Vector * restrict vec_in = (HVX_Vector *) src; - #pragma unroll(4) +#pragma unroll(4) for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { vec_max = Q6_Vsf_vmax_VsfVsf(vec_max, *vec_in++); } } else { - #pragma unroll(4) +#pragma unroll(4) for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { HVX_Vector in = *(HVX_UVector *) (src + i * SIZEOF_FP32); @@ -974,7 +959,7 @@ void 
hvx_min_scalar_f32(const uint8_t * restrict src, const float val, uint8_t * HVX_Vector * restrict vec_in = (HVX_Vector *) src; HVX_Vector * restrict vec_out = (HVX_Vector *) dst; - #pragma unroll(4) +#pragma unroll(4) for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { vec_min = Q6_Vsf_vmin_VsfVsf(vec_min, *vec_in++); *vec_out++ = Q6_Vsf_equals_Vqf32(vec_min); @@ -1012,7 +997,7 @@ void hvx_clamp_scalar_f32(const uint8_t * restrict src, HVX_Vector range_left = hvx_vec_splat_fp32(limit_left); HVX_Vector range_right = hvx_vec_splat_fp32(limit_right); - #pragma unroll(4) +#pragma unroll(4) for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { HVX_Vector in_vec = *vec_in++; HVX_Vector temp_v = in_vec; diff --git a/ggml/src/ggml-hexagon/htp/hvx-utils.h b/ggml/src/ggml-hexagon/htp/hvx-utils.h index 0b24786391..c5da167d49 100644 --- a/ggml/src/ggml-hexagon/htp/hvx-utils.h +++ b/ggml/src/ggml-hexagon/htp/hvx-utils.h @@ -23,20 +23,18 @@ typedef union { /* Q6_Vsf_equals_Vw is only available on v73+.*/ #if __HVX_ARCH__ < 73 -static inline HVX_Vector int32_to_qfloat(HVX_Vector const in) -{ - HVX_Vector const vzero = Q6_V_vzero(); - HVX_VectorPred is_zero = Q6_Q_vcmp_eq_VwVw(in, vzero); - HVX_Vector lshift = Q6_Vw_vnormamt_Vw(in); - HVX_Vector normalized = Q6_Vw_vasl_VwVw(in, lshift); - HVX_Vector vexp = Q6_Vw_vsub_VwVw(Q6_V_vsplat_R(0x7f + 30), lshift); - HVX_Vector mant = Q6_V_vand_VV(Q6_V_vsplat_R(0xFFFFFF00), normalized); - HVX_Vector ret = Q6_V_vmux_QVV(is_zero, vzero, Q6_Vw_vadd_VwVw(mant, vexp)); +static inline HVX_Vector int32_to_qfloat(const HVX_Vector in) { + const HVX_Vector vzero = Q6_V_vzero(); + HVX_VectorPred is_zero = Q6_Q_vcmp_eq_VwVw(in, vzero); + HVX_Vector lshift = Q6_Vw_vnormamt_Vw(in); + HVX_Vector normalized = Q6_Vw_vasl_VwVw(in, lshift); + HVX_Vector vexp = Q6_Vw_vsub_VwVw(Q6_V_vsplat_R(0x7f + 30), lshift); + HVX_Vector mant = Q6_V_vand_VV(Q6_V_vsplat_R(0xFFFFFF00), normalized); + HVX_Vector ret = Q6_V_vmux_QVV(is_zero, vzero, Q6_Vw_vadd_VwVw(mant, vexp)); return ret; } -static inline HVX_Vector Q6_Vsf_equals_Vw(HVX_Vector const in) -{ +static inline HVX_Vector Q6_Vsf_equals_Vw(const HVX_Vector in) { return Q6_Vsf_equals_Vqf32(int32_to_qfloat(in)); } #endif @@ -109,7 +107,7 @@ static inline void hvx_copy_fp16_aa(uint8_t * restrict dst, const uint8_t * rest uint32_t i = 0; - #pragma unroll(4) +#pragma unroll(4) for (; i < nvec; i++) { HVX_Vector v = vsrc[i]; vdst[i] = v; @@ -133,7 +131,7 @@ static inline void hvx_copy_fp16_ua(uint8_t * restrict dst, const uint8_t * rest uint32_t i = 0; - #pragma unroll(4) +#pragma unroll(4) for (; i < nvec; i++) { HVX_Vector v = vsrc[i]; vdst[i] = v; @@ -157,7 +155,7 @@ static inline void hvx_copy_fp16_au(uint8_t * restrict dst, const uint8_t * rest uint32_t i = 0; - #pragma unroll(4) +#pragma unroll(4) for (; i < nvec; i++) { HVX_Vector v = vsrc[i]; vdst[i] = v; @@ -182,7 +180,7 @@ static inline void hvx_copy_fp32_aa(uint8_t * restrict dst, const uint8_t * rest uint32_t i = 0; - #pragma unroll(4) +#pragma unroll(4) for (; i < nvec; i++) { HVX_Vector v = vsrc[i]; vdst[i] = v; @@ -206,7 +204,7 @@ static inline void hvx_copy_fp32_ua(uint8_t * restrict dst, const uint8_t * rest uint32_t i = 0; - #pragma unroll(4) +#pragma unroll(4) for (; i < nvec; i++) { HVX_Vector v = vsrc[i]; vdst[i] = v; @@ -230,7 +228,7 @@ static inline void hvx_copy_fp32_au(uint8_t * restrict dst, const uint8_t * rest uint32_t i = 0; - #pragma unroll(4) +#pragma unroll(4) for (; i < nvec; i++) { HVX_Vector v = vsrc[i]; vdst[i] = v; @@ -255,7 +253,7 @@ static inline void 
hvx_bcast_fp32_a(uint8_t * restrict dst, float elem, uint32_t uint32_t i = 0; - #pragma unroll(4) +#pragma unroll(4) for (; i < nvec; i++) { vdst[i] = velem; } @@ -265,7 +263,6 @@ static inline void hvx_bcast_fp32_a(uint8_t * restrict dst, float elem, uint32_t } } - /* Return whether 'n' elements from vector are in the one chunk of 'chunk_size'. */ static __attribute__((always_inline)) int32_t is_in_one_chunk(void * addr, uint32_t n, uint32_t chunk_size) { uint32_t left_off = (size_t) addr & (chunk_size - 1); @@ -273,8 +270,6 @@ static __attribute__((always_inline)) int32_t is_in_one_chunk(void * addr, uint3 return right_off <= chunk_size; } - - static void hvx_vec_dump_fp16_n(char * pref, HVX_Vector v, uint32_t n) { HVX_VectorAlias u = { .v = v }; @@ -992,16 +987,15 @@ static inline void hvx_fast_sigmoid_f32(const uint8_t * restrict src, uint8_t * const HVX_Vector max_exp = hvx_vec_splat_fp32(kMaxExp); const HVX_Vector min_exp = hvx_vec_splat_fp32(kMinExp); - #pragma unroll(4) +#pragma unroll(4) for (int i = 0; i < step_of_1; i++) { v_dst[i] = hvx_vec_fast_sigmoid_fp32_guard(v_src[i], one, max_exp, min_exp); } } - -static inline void hvx_sigmoid_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int num_elems){ +static inline void hvx_sigmoid_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int num_elems) { int step_of_1 = num_elems >> 5; // divby 32, because 32 float = 128 bytes per HVX vector - int leftover = num_elems - (step_of_1 * VLEN_FP32); + int leftover = num_elems - (step_of_1 * VLEN_FP32); int32_t leftover_size = leftover * sizeof(float); @@ -1012,51 +1006,44 @@ static inline void hvx_sigmoid_f32(const uint8_t * restrict src, uint8_t * restr const HVX_Vector max_exp = hvx_vec_splat_fp32(kMaxExp); const HVX_Vector min_exp = hvx_vec_splat_fp32(kMinExp); - const float *input = (float *)src; - float *output = (float *)dst; - - HVX_Vector * input_v_ptr = (HVX_Vector *) input; - HVX_UVector * output_v_ptr = (HVX_UVector *) output; + const float * input = (float *) src; + float * output = (float *) dst; + HVX_Vector * input_v_ptr = (HVX_Vector *) input; + HVX_UVector * output_v_ptr = (HVX_UVector *) output; HVX_Vector slinep; HVX_Vector slinec; HVX_Vector sline; - - slinep = *input_v_ptr++; - #pragma unroll(4) - for(uint32_t i = step_of_1 -1; i> 0; i--){ - slinec = *input_v_ptr++; - sline = Q6_V_valign_VVR(slinec, slinep, (size_t) input); - *((HVX_UVector *)(output_v_ptr++)) = hvx_vec_fast_sigmoid_fp32_guard(sline, one, max_exp, min_exp); + slinep = *input_v_ptr++; +#pragma unroll(4) + for (uint32_t i = step_of_1 - 1; i > 0; i--) { + slinec = *input_v_ptr++; + sline = Q6_V_valign_VVR(slinec, slinep, (size_t) input); + *((HVX_UVector *) (output_v_ptr++)) = hvx_vec_fast_sigmoid_fp32_guard(sline, one, max_exp, min_exp); /* Prepare slinep for next iteration */ - slinep = slinec; + slinep = slinec; } - if(step_of_1> 0){ - + if (step_of_1 > 0) { slinec = htp_is_aligned(input_v_ptr, 128) && leftover == 0 ? slinep : *input_v_ptr++; - sline = Q6_V_valign_VVR(slinec, slinep, (size_t) input); - *((HVX_UVector *)(output_v_ptr++)) = hvx_vec_fast_sigmoid_fp32_guard(sline, one, max_exp, min_exp);; + sline = Q6_V_valign_VVR(slinec, slinep, (size_t) input); + *((HVX_UVector *) (output_v_ptr++)) = hvx_vec_fast_sigmoid_fp32_guard(sline, one, max_exp, min_exp); + ; slinep = slinec; } - if(leftover> 0){ - slinec = (is_in_one_chunk(input_v_ptr, leftover_size, 128) - ? slinep - : *input_v_ptr++); + if (leftover > 0) { + slinec = (is_in_one_chunk(input_v_ptr, leftover_size, 128) ? 
slinep : *input_v_ptr++); sline = Q6_V_valign_VVR(slinec, slinep, (size_t) input); HVX_Vector sout = hvx_vec_fast_sigmoid_fp32_guard(sline, one, max_exp, min_exp); - hvx_vec_store_u(output_v_ptr, leftover_size, sout); + hvx_vec_store_u(output_v_ptr, leftover_size, sout); } - - } - float hvx_sum_of_squares_f32(const uint8_t * restrict src, const int num_elems); void hvx_mul_f32(const uint8_t * restrict src0, const uint8_t * restrict src1, diff --git a/ggml/src/ggml-hexagon/htp/main.c b/ggml/src/ggml-hexagon/htp/main.c index e30ae69502..cbfdd0472f 100644 --- a/ggml/src/ggml-hexagon/htp/main.c +++ b/ggml/src/ggml-hexagon/htp/main.c @@ -151,7 +151,7 @@ static int vtcm_acquire(struct htp_context * ctx) { qurt_thread_set_priority(qurt_thread_get_id(), ctx->thread_prio - 10); err = HAP_compute_res_acquire_cached(ctx->vtcm_rctx, 1000000); if (err != 0) { - FARF(ERROR, "Failed to acquire VTCM: 0x%08x", (unsigned)err); + FARF(ERROR, "Failed to acquire VTCM: 0x%08x", (unsigned) err); abort(); } HAP_compute_res_release_cached(ctx->vtcm_rctx); @@ -159,7 +159,7 @@ static int vtcm_acquire(struct htp_context * ctx) { err = HAP_compute_res_acquire_cached(ctx->vtcm_rctx, 1000000); if (err != 0) { - FARF(ERROR, "Failed to acquire VTCM: 0x%08x", (unsigned)err); + FARF(ERROR, "Failed to acquire VTCM: 0x%08x", (unsigned) err); abort(); } ctx->vtcm_valid = true; @@ -411,7 +411,7 @@ static void proc_matmul_req(struct htp_context * ctx, rsp_bufs[0].ptr = bufs[2].ptr; rsp_bufs[0].size = bufs[2].size; rsp_bufs[0].offset = bufs[2].offset; - rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP + rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate CPU // Setup Op context @@ -453,7 +453,7 @@ static void proc_matmul_id_req(struct htp_context * ctx, rsp_bufs[0].ptr = bufs[3].ptr; rsp_bufs[0].size = bufs[3].size; rsp_bufs[0].offset = bufs[3].offset; - rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP + rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate CPU // Setup Op context @@ -494,7 +494,7 @@ static void proc_binary_req(struct htp_context * ctx, struct htp_general_req * r rsp_bufs[0].ptr = bufs[2].ptr; rsp_bufs[0].offset = bufs[2].offset; rsp_bufs[0].size = bufs[2].size; - rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP + rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate CPU // Setup Op context @@ -533,7 +533,7 @@ static void proc_add_id_req(struct htp_context * ctx, struct htp_general_req * r rsp_bufs[0].ptr = bufs[3].ptr; rsp_bufs[0].offset = bufs[3].offset; rsp_bufs[0].size = bufs[3].size; - rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP + rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate CPU // Setup Op context @@ -574,7 +574,7 @@ static void proc_unary_req(struct htp_context * ctx, struct htp_general_req * re rsp_bufs[0].ptr = bufs[1].ptr; rsp_bufs[0].offset = bufs[1].offset; rsp_bufs[0].size = bufs[1].size; - rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP + rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate CPU // Setup Op context @@ -618,8 +618,8 @@ static void proc_activations_req(struct htp_context * ctx, rsp_bufs[0].ptr = 
bufs[write_idx].ptr; rsp_bufs[0].offset = bufs[write_idx].offset; rsp_bufs[0].size = bufs[write_idx].size; - rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP - DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate CPU + rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP + DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate CPU // Setup Op context struct htp_ops_context octx = { 0 }; @@ -674,8 +674,8 @@ static void proc_rope_req(struct htp_context * ctx, rsp_bufs[0].ptr = bufs[write_idx].ptr; rsp_bufs[0].offset = bufs[write_idx].offset; rsp_bufs[0].size = bufs[write_idx].size; - rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP - DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate CPU + rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP + DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate CPU // Setup Op context struct htp_ops_context octx = { 0 };