chore: reformat code with clang-formatter to pass cli test

This commit is contained in:
parent 05693357c8
commit 952877ec24
@@ -8,8 +8,8 @@
 #include <atomic>
 #include <chrono>
 #include <mutex>
-#include <string>
 #include <stdexcept>
+#include <string>
 
 #ifdef _WIN32
 # include <sal.h>
@@ -53,10 +53,12 @@ static int opt_opmask = HTP_OPMASK_QUEUE | HTP_OPMASK_QUANTIZE | HTP_OPMASK_COMP
 static int opt_opsync = 0; // synchronous ops
 
 #define HEX_VERBOSE(...) \
-    if (opt_verbose) GGML_LOG_DEBUG(__VA_ARGS__)
+    if (opt_verbose) \
+        GGML_LOG_DEBUG(__VA_ARGS__)
 
 #define HEX_PROFILE(...) \
-    if (opt_profile) GGML_LOG_INFO(__VA_ARGS__)
+    if (opt_profile) \
+        GGML_LOG_INFO(__VA_ARGS__)
 
 static inline uint64_t hex_is_aligned(void * addr, uint32_t align) {
     return ((size_t) addr & (align - 1)) == 0;
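
Note: hex_is_aligned only works for power-of-two alignments, where masking the address with (align - 1) isolates the low bits that must be zero. A minimal standalone C++ sketch of the same check (the main() harness is illustrative, not part of the patch):

    #include <cstdint>
    #include <cstdio>

    // Power-of-two alignment test: the low log2(align) address bits must be zero.
    static inline bool is_aligned(const void * addr, uint32_t align) {
        return ((uintptr_t) addr & (align - 1)) == 0;
    }

    int main() {
        alignas(128) char buf[256];
        printf("%d %d\n", is_aligned(buf, 128), is_aligned(buf + 1, 128));  // prints: 1 0
    }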
@@ -218,7 +220,7 @@ struct ggml_hexagon_session {
     void allocate(int dev_id) noexcept(false);
     void release() noexcept(true);
 
-    void enqueue(struct htp_general_req &req, struct dspqueue_buffer *bufs, uint32_t n_bufs, bool sync = false);
+    void enqueue(struct htp_general_req & req, struct dspqueue_buffer * bufs, uint32_t n_bufs, bool sync = false);
     void flush();
 
     ggml_backend_buffer_type buffer_type;
@@ -258,7 +260,10 @@ static inline void hex_print_op_info(const ggml_tensor * op, ggml_hexagon_sessio
                 names, dims, types, strides, buffs, req_flags);
 }
 
-void ggml_hexagon_session::enqueue(struct htp_general_req &req, struct dspqueue_buffer *bufs, uint32_t n_bufs, bool sync) {
+void ggml_hexagon_session::enqueue(struct htp_general_req & req,
+                                   struct dspqueue_buffer * bufs,
+                                   uint32_t                 n_bufs,
+                                   bool                     sync) {
     // Bump pending flag (cleared in the session::flush once we get the responce)
     this->op_pending++; // atomic inc
 
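
Note: the comment above describes a producer/consumer handshake: enqueue() atomically bumps op_pending for every request, and flush() decrements it as responses come back. A minimal sketch of that counter pattern (std::atomic stands in for the session member; the free functions and main() here are illustrative):

    #include <atomic>
    #include <cstdio>

    static std::atomic<int> op_pending{0};

    static void enqueue_one()  { op_pending++; }  // as in session::enqueue
    static void complete_one() { op_pending--; }  // as in session::flush

    int main() {
        enqueue_one();
        enqueue_one();
        complete_one();
        printf("pending: %d\n", op_pending.load());  // prints: 1
    }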
@@ -298,13 +303,13 @@ void ggml_hexagon_session::flush() {
 
         // Read response packet from queue
         int err = dspqueue_read(q, &flags,
                                 HTP_MAX_PACKET_BUFFERS, // Maximum number of buffer references
                                 &n_bufs,                // Number of buffer references
                                 bufs,                   // Buffer references
                                 sizeof(rsp),            // Max message length
                                 &rsp_size,              // Message length
                                 (uint8_t *) &rsp,
                                 1000000);               // Timeout
 
         if (err == AEE_EEXPIRED) {
             // TODO: might need to bail out if the HTP is stuck on something
@@ -354,8 +359,8 @@ struct ggml_backend_hexagon_buffer_context {
 
         int err = fastrpc_mmap(s->domain_id, this->fd, (void *) this->base, 0, this->size, FASTRPC_MAP_FD);
         if (err != 0) {
-            GGML_LOG_ERROR("ggml-hex: buffer mapping failed : domain_id %d size %zu fd %d error 0x%08x\n",
-                           s->domain_id, this->size, this->fd, (unsigned) err);
+            GGML_LOG_ERROR("ggml-hex: buffer mapping failed : domain_id %d size %zu fd %d error 0x%08x\n", s->domain_id,
+                           this->size, this->fd, (unsigned) err);
             return false;
         }
 
@@ -386,10 +391,12 @@ struct ggml_backend_hexagon_buffer_context {
         size += 4 * 1024; // extra page for padding
 
         if (rpcmem_alloc2) {
-            this->base = (uint8_t *) rpcmem_alloc2(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS | RPCMEM_HEAP_NOREG, size);
+            this->base =
+                (uint8_t *) rpcmem_alloc2(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS | RPCMEM_HEAP_NOREG, size);
         } else {
             GGML_LOG_INFO("ggml-hex: %s rpcmem_alloc2 not found, falling back to rpcmem_alloc\n", sess->name.c_str());
-            this->base = (uint8_t *) rpcmem_alloc(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS | RPCMEM_HEAP_NOREG, size);
+            this->base =
+                (uint8_t *) rpcmem_alloc(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS | RPCMEM_HEAP_NOREG, size);
         }
 
         if (!this->base) {
@@ -453,7 +460,7 @@ static enum ggml_status ggml_backend_hexagon_buffer_init_tensor(ggml_backend_buf
               (int) ctx->repack);
 
     if (tensor->view_src != NULL && tensor->view_offs == 0) {
         ; // nothing to do for the view
     } else {
         if (!ctx->mapped) {
             ctx->mmap();
@@ -702,8 +709,8 @@ static void repack_q4_0_q4x4x2(ggml_tensor * t, const void * data, size_t size)
 
     // Ensure we don't try to read more data than is available in the source buffer 'data'
     // or write more than the tensor can hold.
-    const size_t total_tensor_size = (size_t)nrows * row_size;
+    const size_t total_tensor_size = (size_t) nrows * row_size;
     const size_t n_bytes_to_copy   = size < total_tensor_size ? size : total_tensor_size;
 
     // Calculate how many full rows and how many remaining bytes we need to process.
     const int64_t n_full_rows = n_bytes_to_copy / row_size;
@@ -732,7 +739,7 @@ static void repack_q4_0_q4x4x2(ggml_tensor * t, const void * data, size_t size)
 
     // 2. Process the final, potentially partial, row
     if (n_rem_bytes > 0) {
         const int64_t   i   = n_full_rows;
         const uint8_t * src = (const uint8_t *) data + (i * row_size);
         uint8_t *       dst = (uint8_t *) t->data + (i * row_size);
 
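
Note: every repack helper in this patch splits the clamped byte budget the same way: whole rows first, then one partial tail row. A standalone sketch of that arithmetic (the concrete row_size and byte counts below are made-up example inputs):

    #include <cstddef>
    #include <cstdint>
    #include <cstdio>

    int main() {
        const size_t row_size    = 18;               // bytes per packed row (example value)
        const size_t tensor_size = 10 * row_size;    // what the tensor can hold
        const size_t size        = 7 * row_size + 5; // caller-provided byte count

        // Clamp to the tensor, then split into full rows plus a remainder.
        const size_t  n_bytes_to_copy = size < tensor_size ? size : tensor_size;
        const int64_t n_full_rows     = n_bytes_to_copy / row_size;
        const int64_t n_rem_bytes     = n_bytes_to_copy % row_size;

        printf("full rows: %lld, tail bytes: %lld\n",
               (long long) n_full_rows, (long long) n_rem_bytes);  // prints: 7, 5
    }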
@@ -762,8 +769,8 @@ static void repack_q4x4x2_q4_0(void * data, const ggml_tensor * t, size_t size)
     size_t row_size_rp = row_size * 2; // extra space for tmp pad (if any)
 
     // Ensure we don't try to copy more data than the tensor actually contains.
-    const size_t total_tensor_size = (size_t)nrows * row_size;
+    const size_t total_tensor_size = (size_t) nrows * row_size;
     const size_t n_bytes_to_copy   = size < total_tensor_size ? size : total_tensor_size;
 
     // Calculate how many full rows and how many remaining bytes we need to process.
     const int64_t n_full_rows = n_bytes_to_copy / row_size;
@@ -792,7 +799,7 @@ static void repack_q4x4x2_q4_0(void * data, const ggml_tensor * t, size_t size)
 
     // 2. Process the final, potentially partial, row
     if (n_rem_bytes > 0) {
         const int64_t   i   = n_full_rows;
         const uint8_t * src = (const uint8_t *) t->data + (i * row_size);
         uint8_t *       dst = (uint8_t *) data + (i * row_size);
 
@@ -1028,8 +1035,8 @@ static void repack_q8_0_q8x4x2(ggml_tensor * t, const void * data, size_t size)
 
     // Ensure we don't try to read more data than is available in the source buffer 'data'
     // or write more than the tensor can hold.
-    const size_t total_tensor_size = (size_t)nrows * row_size;
+    const size_t total_tensor_size = (size_t) nrows * row_size;
     const size_t n_bytes_to_copy   = size < total_tensor_size ? size : total_tensor_size;
 
     // Calculate how many full rows and how many remaining bytes we need to process.
     const int64_t n_full_rows = n_bytes_to_copy / row_size;
@@ -1058,7 +1065,7 @@ static void repack_q8_0_q8x4x2(ggml_tensor * t, const void * data, size_t size)
 
     // 2. Process the final, potentially partial, row
     if (n_rem_bytes > 0) {
         const int64_t   i   = n_full_rows;
         const uint8_t * src = (const uint8_t *) data + (i * row_size);
         uint8_t *       dst = (uint8_t *) t->data + (i * row_size);
 
@@ -1088,8 +1095,8 @@ static void repack_q8x4x2_q8_0(void * data, const ggml_tensor * t, size_t size)
     size_t row_size_rp = row_size * 2; // extra space for tmp pad (if any)
 
     // Ensure we don't try to copy more data than the tensor actually contains.
-    const size_t total_tensor_size = (size_t)nrows * row_size;
+    const size_t total_tensor_size = (size_t) nrows * row_size;
     const size_t n_bytes_to_copy   = size < total_tensor_size ? size : total_tensor_size;
 
     // Calculate how many full rows and how many remaining bytes we need to process.
     const int64_t n_full_rows = n_bytes_to_copy / row_size;
@@ -1118,7 +1125,7 @@ static void repack_q8x4x2_q8_0(void * data, const ggml_tensor * t, size_t size)
 
     // 2. Process the final, potentially partial, row
     if (n_rem_bytes > 0) {
         const int64_t   i   = n_full_rows;
         const uint8_t * src = (const uint8_t *) t->data + (i * row_size);
         uint8_t *       dst = (uint8_t *) data + (i * row_size);
 
@@ -1379,8 +1386,8 @@ static void repack_mxfp4_mxfp4x4x2(ggml_tensor * t, const void * data, size_t si
 
     // Ensure we don't try to read more data than is available in the source buffer 'data'
     // or write more than the tensor can hold.
-    const size_t total_tensor_size = (size_t)nrows * row_size;
+    const size_t total_tensor_size = (size_t) nrows * row_size;
     const size_t n_bytes_to_copy   = size < total_tensor_size ? size : total_tensor_size;
 
     // Calculate how many full rows and how many remaining bytes we need to process.
     const int64_t n_full_rows = n_bytes_to_copy / row_size;
@@ -1409,7 +1416,7 @@ static void repack_mxfp4_mxfp4x4x2(ggml_tensor * t, const void * data, size_t si
 
     // 2. Process the final, potentially partial, row
     if (n_rem_bytes > 0) {
         const int64_t   i   = n_full_rows;
         const uint8_t * src = (const uint8_t *) data + (i * row_size);
         uint8_t *       dst = (uint8_t *) t->data + (i * row_size);
 
@@ -1439,8 +1446,8 @@ static void repack_mxfp4x4x2_mxfp4(void * data, const ggml_tensor * t, size_t si
     size_t row_size_rp = row_size * 2; // extra space for tmp pad (if any)
 
     // Ensure we don't try to copy more data than the tensor actually contains.
-    const size_t total_tensor_size = (size_t)nrows * row_size;
+    const size_t total_tensor_size = (size_t) nrows * row_size;
     const size_t n_bytes_to_copy   = size < total_tensor_size ? size : total_tensor_size;
 
     // Calculate how many full rows and how many remaining bytes we need to process.
     const int64_t n_full_rows = n_bytes_to_copy / row_size;
@@ -1469,7 +1476,7 @@ static void repack_mxfp4x4x2_mxfp4(void * data, const ggml_tensor * t, size_t si
 
     // 2. Process the final, potentially partial, row
     if (n_rem_bytes > 0) {
         const int64_t   i   = n_full_rows;
         const uint8_t * src = (const uint8_t *) t->data + (i * row_size);
         uint8_t *       dst = (uint8_t *) data + (i * row_size);
 
@@ -1592,25 +1599,28 @@ static const char * ggml_backend_hexagon_buffer_type_name(ggml_backend_buffer_ty
     return static_cast<ggml_backend_hexagon_buffer_type_context *>(buffer_type->context)->name.c_str();
 }
 
-static ggml_backend_buffer_t ggml_backend_hexagon_buffer_type_alloc_buffer(
-    ggml_backend_buffer_type_t buffer_type, size_t size) {
+static ggml_backend_buffer_t ggml_backend_hexagon_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buffer_type,
+                                                                           size_t                     size) {
     auto sess = static_cast<ggml_backend_hexagon_buffer_type_context *>(buffer_type->context)->sess;
     try {
-        ggml_backend_hexagon_buffer_context * ctx = new ggml_backend_hexagon_buffer_context(sess, size, false /*repack*/);
+        ggml_backend_hexagon_buffer_context * ctx =
+            new ggml_backend_hexagon_buffer_context(sess, size, false /*repack*/);
         return ggml_backend_buffer_init(buffer_type, ggml_backend_hexagon_buffer_interface, ctx, size);
-    } catch (std::exception const &exc) {
+    } catch (const std::exception & exc) {
         GGML_LOG_ERROR("ggml-hex: %s failed to allocate buffer context: %s\n", sess->name.c_str(), exc.what());
         return nullptr;
     }
 }
 
 static ggml_backend_buffer_t ggml_backend_hexagon_repack_buffer_type_alloc_buffer(
-    ggml_backend_buffer_type_t buffer_type, size_t size) {
+    ggml_backend_buffer_type_t buffer_type,
+    size_t                     size) {
     auto sess = static_cast<ggml_backend_hexagon_buffer_type_context *>(buffer_type->context)->sess;
     try {
-        ggml_backend_hexagon_buffer_context * ctx = new ggml_backend_hexagon_buffer_context(sess, size, true /*repack*/);
+        ggml_backend_hexagon_buffer_context * ctx =
+            new ggml_backend_hexagon_buffer_context(sess, size, true /*repack*/);
         return ggml_backend_buffer_init(buffer_type, ggml_backend_hexagon_buffer_interface, ctx, size);
-    } catch (std::exception const &exc) {
+    } catch (const std::exception & exc) {
         GGML_LOG_ERROR("ggml-hex: %s failed to allocate buffer context: %s\n", sess->name.c_str(), exc.what());
         return nullptr;
     }
 
@@ -1621,7 +1631,8 @@ static size_t ggml_backend_hexagon_buffer_type_get_alignment(ggml_backend_buffer
     GGML_UNUSED(buffer_type);
 }
 
-static size_t ggml_backend_hexagon_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const struct ggml_tensor * t) {
+static size_t ggml_backend_hexagon_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft,
+                                                              const struct ggml_tensor * t) {
     return ggml_nbytes(t);
 }
 
@@ -1697,8 +1708,8 @@ void ggml_hexagon_session::allocate(int dev_id) noexcept(false) {
         }
 
         // Save the IDs
         this->session_id    = n.session_id;
         this->domain_id     = n.effective_domain_id;
         this->valid_session = true;
     }
 
@@ -1707,16 +1718,17 @@ void ggml_hexagon_session::allocate(int dev_id) noexcept(false) {
     char session_uri[256];
     {
         char htp_uri[256];
-        snprintf(htp_uri, sizeof(htp_uri), "file:///libggml-htp-v%u.so?htp_iface_skel_handle_invoke&_modver=1.0", opt_arch);
+        snprintf(htp_uri, sizeof(htp_uri), "file:///libggml-htp-v%u.so?htp_iface_skel_handle_invoke&_modver=1.0",
+                 opt_arch);
 
         struct remote_rpc_get_uri u = {};
         u.session_id      = this->session_id;
         u.domain_name     = const_cast<char *>(CDSP_DOMAIN_NAME);
         u.domain_name_len = strlen(CDSP_DOMAIN_NAME);
         u.module_uri      = const_cast<char *>(htp_uri);
         u.module_uri_len  = strlen(htp_uri);
         u.uri             = session_uri;
         u.uri_len         = sizeof(session_uri);
 
         int err = remote_session_control(FASTRPC_GET_URI, (void *) &u, sizeof(u));
         if (err != AEE_SUCCESS) {
@@ -1725,7 +1737,9 @@ void ggml_hexagon_session::allocate(int dev_id) noexcept(false) {
 
             snprintf(session_uri, htp_URI_domain_len, "%s%s", htp_uri, my_domain->uri);
 
-            GGML_LOG_WARN("ggml-hex: failed to get URI for session %d : error 0x%x. Falling back to single session URI: %s\n", dev_id, err, session_uri);
+            GGML_LOG_WARN(
+                "ggml-hex: failed to get URI for session %d : error 0x%x. Falling back to single session URI: %s\n",
+                dev_id, err, session_uri);
         }
     }
 
@@ -1751,7 +1765,7 @@ void ggml_hexagon_session::allocate(int dev_id) noexcept(false) {
     this->valid_handle = true;
 
     GGML_LOG_INFO("ggml-hex: new session: %s : session-id %d domain-id %d uri %s handle 0x%lx\n", this->name.c_str(),
                   this->session_id, this->domain_id, session_uri, (unsigned long) this->handle);
 
     // Enable FastRPC QoS mode
     {
@@ -1841,8 +1855,8 @@ ggml_hexagon_session::ggml_hexagon_session(int dev_id, ggml_backend_dev_t dev) n
     buffer_type.context        = nullptr;
     repack_buffer_type.context = nullptr;
 
     buffer_type.device        = dev;
     repack_buffer_type.device = dev;
 
     try {
         allocate(dev_id);
@@ -1852,7 +1866,7 @@ ggml_hexagon_session::ggml_hexagon_session(int dev_id, ggml_backend_dev_t dev) n
 
         repack_buffer_type.iface   = ggml_backend_hexagon_repack_buffer_type_interface;
         repack_buffer_type.context = new ggml_backend_hexagon_buffer_type_context(this->name + "-REPACK", this);
-    } catch (std::exception const &exc) {
+    } catch (const std::exception & exc) {
         release();
         throw;
     }
@@ -1861,8 +1875,8 @@ ggml_hexagon_session::ggml_hexagon_session(int dev_id, ggml_backend_dev_t dev) n
 ggml_hexagon_session::~ggml_hexagon_session() noexcept(true) {
     release();
 
-    delete static_cast<ggml_backend_hexagon_buffer_type_context*>(buffer_type.context);
-    delete static_cast<ggml_backend_hexagon_buffer_type_context*>(repack_buffer_type.context);
+    delete static_cast<ggml_backend_hexagon_buffer_type_context *>(buffer_type.context);
+    delete static_cast<ggml_backend_hexagon_buffer_type_context *>(repack_buffer_type.context);
 }
 
 // ** backend interface
@@ -2164,11 +2178,11 @@ static bool ggml_hexagon_supported_activations(const struct ggml_hexagon_session
     }
 
     // src0, src1 & dst must be mapped to the same session
-    if(src1){
+    if (src1) {
         if (!hex_supported_buffer(sess, src0, src1, dst)) {
             return false;
         }
-    }else{
+    } else {
         if (!hex_supported_buffer(sess, src0, dst)) {
             return false;
         }
@@ -2306,11 +2320,11 @@ static size_t dspqueue_buffers_init(dspqueue_buffer * buf, const ggml_tensor * t
 
     memset(buf, 0, sizeof(*buf));
     auto tensor_buf = static_cast<ggml_backend_hexagon_buffer_context *>(t->buffer->context);
     buf->fd         = tensor_buf->fd;
     buf->ptr        = t->data;
     buf->offset     = (uint8_t *) t->data - tensor_buf->base;
     buf->size       = ggml_nbytes(t);
     buf->flags      = (flush_host ? DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER : 0);        // Flush CPU
     buf->flags     |= (flush_htp ? DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT : 0); // Invalidate DSP
     return 1;
 }
@@ -2670,8 +2684,7 @@ static void ggml_hexagon_unary(const struct ggml_tensor * op, uint32_t flags) {
     if (ggml_get_unary_op(dst) == GGML_UNARY_OP_SILU) {
         req.op    = HTP_OP_UNARY_SILU;
         supported = true;
-    }
-    else if (ggml_get_unary_op(dst) == GGML_UNARY_OP_GELU){
+    } else if (ggml_get_unary_op(dst) == GGML_UNARY_OP_GELU) {
         req.op    = HTP_OP_UNARY_GELU;
         supported = true;
     }
@@ -2902,8 +2915,7 @@ static inline bool op_reuse_src1(const ggml_tensor * op1, const ggml_tensor * op
     return (op0 && op0->src[1] == op1->src[1]);
 }
 
-static inline bool is_compute_op(ggml_tensor *node)
-{
+static inline bool is_compute_op(ggml_tensor * node) {
     return !(ggml_op_is_empty(node->op) || ggml_is_empty(node));
 }
 
@@ -3013,29 +3025,17 @@ struct node_info {
 
     std::vector<ggml_tensor *> fused;
 
-    ggml_op op() const {
-        return node->op;
-    }
+    ggml_op op() const { return node->op; }
 
-    const ggml_tensor * dst() const {
-        return fused.empty() ? node : fused.back();
-    }
+    const ggml_tensor * dst() const { return fused.empty() ? node : fused.back(); }
 
-    const ggml_tensor * src0() const {
-        return node->src[0];
-    }
+    const ggml_tensor * src0() const { return node->src[0]; }
 
-    const ggml_tensor * src1() const {
-        return node->src[1];
-    }
+    const ggml_tensor * src1() const { return node->src[1]; }
 
-    bool is_empty() const {
-        return ggml_op_is_empty(node->op);
-    }
+    bool is_empty() const { return ggml_op_is_empty(node->op); }
 
-    void add_fused(ggml_tensor * t) {
-        fused.push_back(t);
-    }
+    void add_fused(ggml_tensor * t) { fused.push_back(t); }
 
     bool stackable() const {
         switch (this->op()) {
@@ -3047,9 +3047,7 @@ struct node_info {
         }
     }
 
-    bool same_input(const node_info& n) const {
-        return n.src1() == this->src1();
-    }
+    bool same_input(const node_info & n) const { return n.src1() == this->src1(); }
 };
 
 static std::vector<int> ggml_hexagon_graph_optimize_reorder(const std::vector<node_info> & nodes) {
@@ -3114,25 +3112,21 @@ static void ggml_backend_hexagon_graph_optimize(ggml_backend_t backend, ggml_cgr
     // and perform the reorder over the fused nodes. after the reorder is done, we unfuse
     for (int i = 0; i < n; i++) {
         node_info node = {
-            /*.node =*/ gf->nodes[i],
-            /*.fused =*/ {},
+            /*.node =*/gf->nodes[i],
+            /*.fused =*/{},
         };
 
         // fuse only ops that start with these operations
        // can be expanded when needed
-        if (node.op() == GGML_OP_ADD ||
-            node.op() == GGML_OP_NORM ||
-            node.op() == GGML_OP_RMS_NORM) {
+        if (node.op() == GGML_OP_ADD || node.op() == GGML_OP_NORM || node.op() == GGML_OP_RMS_NORM) {
             ops[0] = node.op();
 
             int f = i + 1;
             while (f < n && f < i + MAX_FUSE) {
                 // conservatively allow fusing only these ops
                 // can be expanded when needed
-                if (gf->nodes[f]->op != GGML_OP_ADD &&
-                    gf->nodes[f]->op != GGML_OP_MUL &&
-                    gf->nodes[f]->op != GGML_OP_NORM &&
-                    gf->nodes[f]->op != GGML_OP_RMS_NORM) {
+                if (gf->nodes[f]->op != GGML_OP_ADD && gf->nodes[f]->op != GGML_OP_MUL &&
+                    gf->nodes[f]->op != GGML_OP_NORM && gf->nodes[f]->op != GGML_OP_RMS_NORM) {
                     break;
                 }
                 ops[f - i] = gf->nodes[f]->op;
@@ -3308,8 +3302,7 @@ static bool ggml_backend_hexagon_device_supports_op(ggml_backend_dev_t dev, cons
         case GGML_OP_UNARY:
             if (ggml_get_unary_op(op) == GGML_UNARY_OP_SILU) {
                 supp = ggml_hexagon_supported_activations(sess, op);
-            }
-            else if (ggml_get_unary_op(op) == GGML_UNARY_OP_GELU){
+            } else if (ggml_get_unary_op(op) == GGML_UNARY_OP_GELU) {
                 supp = ggml_hexagon_supported_activations(sess, op);
             }
             break;
@@ -3416,7 +3409,7 @@ ggml_hexagon_registry::ggml_hexagon_registry(ggml_backend_reg_t reg) {
         }
     }
 
-    if(opt_arch < 75) {
+    if (opt_arch < 75) {
        opt_ndev = 1;
        GGML_LOG_WARN("ggml-hex: forcing ndev to 1 for SoCs archs lower than v75.\n");
     }
@@ -3425,11 +3418,11 @@ ggml_hexagon_registry::ggml_hexagon_registry(ggml_backend_reg_t reg) {
 
     // Create devices / sessions
     for (size_t i = 0; i < opt_ndev; i++) {
         devices[i].iface = ggml_backend_hexagon_device_i;
         devices[i].reg   = reg;
         try {
             devices[i].context = new ggml_hexagon_session(i, &devices[i]);
-        } catch (std::exception const &exc) {
+        } catch (const std::exception & exc) {
             GGML_LOG_ERROR("ggml-hex: failed to create device/session %zu\n", i);
             devices[i].context = nullptr;
         }
@@ -255,7 +255,6 @@ static void glu_swiglu_oai_fp32_per_thread(const struct htp_tensor * src0,
             src1->ne[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1));
 }
 
-
 static void unary_gelu_fp32_per_thread(const struct htp_tensor * src0,
                                        struct htp_tensor *       dst,
                                        const int32_t *           op_params,
@@ -301,7 +300,7 @@ static void unary_gelu_fp32_per_thread(const struct htp_tensor * src0,
     const int BLOCK = 8;
     for (uint32_t ir = src0_start_row; ir < src0_end_row; ir += BLOCK) {
         const uint32_t block_end = MIN(ir + BLOCK, src0_end_row);
 
         // Prefetch next block
         if (block_end < src0_end_row) {
             const float * restrict prefetch_ptr = (float *) (data_src0 + (block_end * src0_row_size));
@@ -315,12 +314,11 @@ static void unary_gelu_fp32_per_thread(const struct htp_tensor * src0,
 
         // gelu = x * sigmoid(1.702 * x) // current implementation
         if (1 == opt_path) {
-            hvx_mul_scalar_f32( (const uint8_t *) src0, (float)1.702, (uint8_t *) src0_spad_data, ne0);
+            hvx_mul_scalar_f32((const uint8_t *) src0, (float) 1.702, (uint8_t *) src0_spad_data, ne0);
             hvx_fast_sigmoid_f32((const uint8_t *) src0_spad_data, (uint8_t *) src0_spad_data, ne0);
             hvx_mul_f32_opt((const uint8_t *) src0, src0_spad_data, (uint8_t *) dst, ne0);
-        }
-        else {
-            hvx_mul_scalar_f32( (const uint8_t *) src0, (float)1.702, (uint8_t *) src0_spad_data, ne0);
+        } else {
+            hvx_mul_scalar_f32((const uint8_t *) src0, (float) 1.702, (uint8_t *) src0_spad_data, ne0);
             hvx_sigmoid_f32((const uint8_t *) src0_spad_data, (uint8_t *) src0_spad_data, ne0);
             hvx_mul_f32((const uint8_t *) src0, src0_spad_data, (uint8_t *) dst, ne0);
         }
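
Note: the comment in this hunk names the sigmoid-based approximation GELU(x) ≈ x * sigmoid(1.702 * x); the two branches differ only in which sigmoid kernel they call. A scalar reference for the same math (illustrative; the HVX code evaluates it 32 floats at a time):

    #include <math.h>
    #include <stdio.h>

    // Scalar reference for the approximation used above.
    static float gelu_approx(float x) {
        float s = 1.0f / (1.0f + expf(-1.702f * x));  // sigmoid(1.702 * x)
        return x * s;
    }

    int main(void) {
        printf("%f\n", gelu_approx(1.0f));  // ~0.8458 (exact GELU(1) is ~0.8413)
    }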
@@ -339,8 +337,6 @@ static void unary_gelu_fp32(unsigned int n, unsigned int i, void * data) {
                                octx->src0_nrows_per_thread);
 }
 
-
-
 static void unary_silu_fp32_per_thread(const struct htp_tensor * src0,
                                        struct htp_tensor *       dst,
                                        const int32_t *           op_params,
@@ -120,10 +120,10 @@ static const char * htp_type_name(uint32_t t) {
 #define HTP_MAX_DIMS 4
 
 struct htp_tensor {
     uint32_t data;             // Buffer offset in the messages, and data pointer on the NSP
     uint32_t type;             // Data type
     uint32_t ne[HTP_MAX_DIMS]; // Number of elements
     uint32_t nb[HTP_MAX_DIMS]; // Stride in bytes (see ggml.h ggml_tensor)
 };
 
 #define HTP_MAX_OP_PARAMS 64
@@ -49,28 +49,25 @@ void hvx_mul_f32(const uint8_t * restrict src0,
         FARF(HIGH, "hvx_mul_f32: unaligned loop in hvx op, possibly slower execution\n");
     }
 
-
     bool handled_leftover = false;
     if (0 == unaligned_loop) {
         HVX_Vector * restrict vec_in1 = (HVX_Vector *) src0;
         HVX_Vector * restrict vec_in2 = (HVX_Vector *) src1;
         HVX_Vector * restrict vec_out = (HVX_Vector *) dst;
 
 #pragma unroll(4)
         for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
             HVX_Vector v = Q6_Vqf32_vmpy_VsfVsf(*vec_in1++, *vec_in2++);
             *vec_out++   = Q6_Vsf_equals_Vqf32(v);
         }
     } else {
         int step_of_1     = num_elems_whole >> 5; // divby 32, because 32 float = 128 bytes per HVX vector
         int leftover_size = left_over * sizeof(float);
 
-        HVX_Vector * restrict vec_in1 = (HVX_Vector *) src0;
-        HVX_Vector * restrict vec_in2 = (HVX_Vector *) src1;
-
-        HVX_UVector * restrict vec_out = (HVX_UVector *) dst;
+        HVX_Vector * restrict  vec_in1 = (HVX_Vector *) src0;
+        HVX_Vector * restrict  vec_in2 = (HVX_Vector *) src1;
+        HVX_UVector * restrict vec_out = (HVX_UVector *) dst;
 
         HVX_Vector slinep;
         HVX_Vector slinec;
         HVX_Vector sline;
 
@@ -78,48 +75,42 @@ void hvx_mul_f32(const uint8_t * restrict src0,
         HVX_Vector sline2c;
         HVX_Vector sline2;
 
         slinep  = *vec_in1++;
         sline2p = *vec_in2++;
 #pragma unroll(4)
-        for(uint32_t i = step_of_1 -1; i> 0; i--){
+        for (uint32_t i = step_of_1 - 1; i > 0; i--) {
             slinec  = *vec_in1++;
             sline2c = *vec_in2++;
             sline   = Q6_V_valign_VVR(slinec, slinep, (size_t) src0);
             sline2  = Q6_V_valign_VVR(sline2c, sline2p, (size_t) src1);
 
-            *((HVX_UVector *)(vec_out++)) =Q6_Vsf_equals_Vqf32( Q6_Vqf32_vmpy_VsfVsf(sline, sline2));
+            *((HVX_UVector *) (vec_out++)) = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(sline, sline2));
             slinep  = slinec;
             sline2p = sline2c;
         }
-        if(step_of_1 > 1){
+        if (step_of_1 > 1) {
             slinec  = htp_is_aligned(vec_in1, VLEN) && left_over == 0 ? slinep : *vec_in1++;
             sline2c = htp_is_aligned(vec_in2, VLEN) && left_over == 0 ? sline2p : *vec_in2++;
 
             sline  = Q6_V_valign_VVR(slinec, slinep, (size_t) src0);
             sline2 = Q6_V_valign_VVR(sline2c, sline2p, (size_t) src1);
-            *((HVX_UVector *)(vec_out++)) =Q6_Vsf_equals_Vqf32( Q6_Vqf32_vmpy_VsfVsf(sline, sline2));
+            *((HVX_UVector *) (vec_out++)) = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(sline, sline2));
             slinep  = slinec;
             sline2p = sline2c;
         }
-        if(left_over > 0 ){
-            slinec = (is_in_one_chunk(vec_in1, leftover_size, VLEN)
-                        ? slinep
-                        : *vec_in1++);
-            sline = Q6_V_valign_VVR(slinec, slinep, (size_t) src0);
-            sline2c = (is_in_one_chunk(vec_in2, leftover_size, VLEN)
-                        ? sline2p
-                        : *vec_in2++);
-            sline2 = Q6_V_valign_VVR(sline2c, sline2p, (size_t) src1);
+        if (left_over > 0) {
+            slinec  = (is_in_one_chunk(vec_in1, leftover_size, VLEN) ? slinep : *vec_in1++);
+            sline   = Q6_V_valign_VVR(slinec, slinep, (size_t) src0);
+            sline2c = (is_in_one_chunk(vec_in2, leftover_size, VLEN) ? sline2p : *vec_in2++);
+            sline2  = Q6_V_valign_VVR(sline2c, sline2p, (size_t) src1);
 
             HVX_Vector out = Q6_Vqf32_vmpy_VsfVsf(sline, sline2);
             hvx_vec_store_u(vec_out, leftover_size, Q6_Vsf_equals_Vqf32(out));
             handled_leftover = true;
         }
     }
 
     if (left_over > 0 && !handled_leftover) {
         const float * src0f = (const float *) src0 + num_elems_whole;
         const float * src1f = (const float *) src1 + num_elems_whole;
 
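
Note: the hunk above shows the usual SIMD shape of these kernels: an aligned main loop over whole vectors, a valign-based realignment path, and one masked store (hvx_vec_store_u) for the tail. A portable scalar sketch of the whole-vectors-plus-leftover structure (plain C, illustrative only; the 32-float width mirrors VLEN_FP32):

    #include <stdio.h>

    // Elementwise multiply: whole 32-float "vectors" first, then the leftover tail.
    static void mul_f32(const float * src0, const float * src1, float * dst, int n) {
        const int VLEN_FP32 = 32;                    // floats per 128-byte HVX vector
        const int whole     = n & ~(VLEN_FP32 - 1);  // largest multiple of 32 <= n

        for (int i = 0; i < whole; i += VLEN_FP32) { // vectorizable main loop
            for (int j = 0; j < VLEN_FP32; j++) {
                dst[i + j] = src0[i + j] * src1[i + j];
            }
        }
        for (int i = whole; i < n; i++) {            // leftover elements
            dst[i] = src0[i] * src1[i];
        }
    }

    int main(void) {
        float a[70], b[70], c[70];
        for (int i = 0; i < 70; i++) { a[i] = (float) i; b[i] = 2.0f; }
        mul_f32(a, b, c, 70);
        printf("%f %f\n", c[31], c[69]);  // prints: 62.0 138.0
    }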
@@ -315,13 +306,13 @@ void hvx_add_f32(const uint8_t * restrict src0,
         HVX_Vector * restrict vec_in2 = (HVX_Vector *) src1;
         HVX_Vector * restrict vec_out = (HVX_Vector *) dst;
 
 #pragma unroll(4)
         for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
             HVX_Vector v = Q6_Vqf32_vadd_VsfVsf(*vec_in1++, *vec_in2++);
             *vec_out++   = Q6_Vsf_equals_Vqf32(v);
         }
     } else {
 #pragma unroll(4)
         for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
             HVX_Vector in1 = *(HVX_UVector *) (src0 + i * SIZEOF_FP32);
             HVX_Vector in2 = *(HVX_UVector *) (src1 + i * SIZEOF_FP32);
@@ -458,7 +449,7 @@ void hvx_add_scalar_f32(const uint8_t * restrict src, const float val, uint8_t *
         HVX_Vector * restrict vec_in1 = (HVX_Vector *) src;
         HVX_Vector * restrict vec_out = (HVX_Vector *) dst;
 
 #pragma unroll(4)
         for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
             HVX_Vector in = *vec_in1++;
             const HVX_VectorPred pred_inf = Q6_Q_vcmp_eq_VwVw(inf, in);
@@ -468,7 +459,7 @@ void hvx_add_scalar_f32(const uint8_t * restrict src, const float val, uint8_t *
             *vec_out++ = v;
         }
     } else {
 #pragma unroll(4)
         for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
             HVX_Vector in = *(HVX_UVector *) (src + i * SIZEOF_FP32);
 
@@ -512,60 +503,54 @@ void hvx_mul_scalar_f32(const uint8_t * restrict src, const float val, uint8_t *
         FARF(HIGH, "hvx_mul_scalar_f32: unaligned loop in hvx op, possibly slower execution\n");
     }
 
     HVX_Vector val_vec          = hvx_vec_splat_fp32(val);
     bool       handled_leftover = false;
     if (0 == unaligned_loop) {
         HVX_Vector * restrict vec_in1 = (HVX_Vector *) src;
         HVX_Vector * restrict vec_out = (HVX_Vector *) dst;
 
 #pragma unroll(4)
         for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
             HVX_Vector v = Q6_Vqf32_vmpy_VsfVsf(*vec_in1++, val_vec);
             *vec_out++   = Q6_Vsf_equals_Vqf32(v);
         }
     } else {
         int step_of_1     = num_elems >> 5; // divby 32, because 32 float = 128 bytes per HVX vector
         int leftover_size = left_over * sizeof(float);
 
-        HVX_Vector * input_v_ptr = (HVX_Vector *) src;
-        HVX_UVector * output_v_ptr = (HVX_UVector *) dst;
-
+        HVX_Vector *  input_v_ptr  = (HVX_Vector *) src;
+        HVX_UVector * output_v_ptr = (HVX_UVector *) dst;
 
         HVX_Vector slinep;
         HVX_Vector slinec;
         HVX_Vector sline;
 
         slinep = *input_v_ptr++;
 
 #pragma unroll(4)
-        for(uint32_t i = step_of_1 - 1; i > 0; i--){
-            slinec = *input_v_ptr++;
-            sline = Q6_V_valign_VVR(slinec, slinep, (size_t) src);
-            *((HVX_UVector *)(output_v_ptr++)) = Q6_Vsf_equals_Vqf32( Q6_Vqf32_vmpy_VsfVsf(sline, val_vec));
+        for (uint32_t i = step_of_1 - 1; i > 0; i--) {
+            slinec = *input_v_ptr++;
+            sline  = Q6_V_valign_VVR(slinec, slinep, (size_t) src);
+            *((HVX_UVector *) (output_v_ptr++)) = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(sline, val_vec));
             /* Prepare slinep for next iteration */
             slinep = slinec;
         }
 
-        if(step_of_1 > 0){
+        if (step_of_1 > 0) {
             slinec = htp_is_aligned(input_v_ptr, VLEN) && left_over == 0 ? slinep : *input_v_ptr++;
             sline  = Q6_V_valign_VVR(slinec, slinep, (size_t) src);
-            *((HVX_UVector *)(output_v_ptr++)) = Q6_Vsf_equals_Vqf32( Q6_Vqf32_vmpy_VsfVsf(sline, val_vec));
+            *((HVX_UVector *) (output_v_ptr++)) = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(sline, val_vec));
 
             slinep = slinec;
         }
 
-        if(leftover_size > 0){
-            slinec = (is_in_one_chunk(input_v_ptr, leftover_size, VLEN)
-                        ? slinep
-                        : *input_v_ptr++);
+        if (leftover_size > 0) {
+            slinec = (is_in_one_chunk(input_v_ptr, leftover_size, VLEN) ? slinep : *input_v_ptr++);
 
             sline = Q6_V_valign_VVR(slinec, slinep, (size_t) src);
 
-            HVX_Vector sout = Q6_Vsf_equals_Vqf32( Q6_Vqf32_vmpy_VsfVsf(sline, val_vec));
+            HVX_Vector sout = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(sline, val_vec));
             hvx_vec_store_u(output_v_ptr, leftover_size, sout);
             handled_leftover = true;
         }
     }
 
@@ -606,13 +591,13 @@ void hvx_sub_f32(const uint8_t * restrict src0,
         HVX_Vector * restrict vec_in2 = (HVX_Vector *) src1;
         HVX_Vector * restrict vec_out = (HVX_Vector *) dst;
 
 #pragma unroll(4)
         for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
             HVX_Vector v = Q6_Vqf32_vsub_VsfVsf(*vec_in1++, *vec_in2++);
             *vec_out++   = Q6_Vsf_equals_Vqf32(v);
         }
     } else {
 #pragma unroll(4)
         for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
             HVX_Vector in1 = *(HVX_UVector *) (src0 + i * SIZEOF_FP32);
             HVX_Vector in2 = *(HVX_UVector *) (src1 + i * SIZEOF_FP32);
@@ -747,13 +732,13 @@ void hvx_sub_scalar_f32(const uint8_t * restrict src, const float val, uint8_t *
         HVX_Vector * restrict vec_in1 = (HVX_Vector *) src;
         HVX_Vector * restrict vec_out = (HVX_Vector *) dst;
 
 #pragma unroll(4)
         for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
             HVX_Vector v = Q6_Vqf32_vsub_VsfVsf(*vec_in1++, val_vec);
             *vec_out++   = Q6_Vsf_equals_Vqf32(v);
         }
     } else {
 #pragma unroll(4)
         for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
             HVX_Vector in = *(HVX_UVector *) (src + i * SIZEOF_FP32);
 
@@ -789,7 +774,7 @@ float hvx_sum_of_squares_f32(const uint8_t * restrict src, const int num_elems)
         HVX_Vector sum_vec_acc = Q6_V_vsplat_R(0x00000000);
         HVX_Vector zero_vec    = Q6_V_vsplat_R(0x00000000);
 
 #pragma unroll(4)
         for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
             HVX_Vector v = Q6_Vqf32_vmpy_VsfVsf(*vec_in1, *vec_in1);
             sum_vec_acc  = Q6_Vqf32_vadd_Vqf32Vqf32(sum_vec_acc, v);
@@ -833,13 +818,13 @@ float hvx_self_sum_f32(const uint8_t * restrict src, const int num_elems) {
     if (0 == unaligned_loop) {
         HVX_Vector * vec_in = (HVX_Vector *) src;
 
 #pragma unroll(4)
         for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
             // sum_vec = Q6_Vqf32_vadd_Vqf32Vsf(sum_vec, *vec_in++);
             sum_vec = Q6_Vqf32_vadd_VsfVsf(Q6_Vsf_equals_Vqf32(sum_vec), *vec_in++);
         }
     } else {
 #pragma unroll(4)
         for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
             HVX_Vector in = *(HVX_UVector *) (src + i * SIZEOF_FP32);
 
@@ -882,13 +867,13 @@ void hvx_scale_f32(const uint8_t * restrict src, uint8_t * restrict dst, const i
         HVX_Vector * vec_in1 = (HVX_Vector *) src;
         HVX_Vector * vec_out = (HVX_Vector *) dst;
 
 #pragma unroll(4)
         for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
             HVX_Vector v = Q6_Vqf32_vmpy_VsfVsf(*vec_in1++, scale_vec);
             *vec_out++   = Q6_Vsf_equals_Vqf32(v);
         }
     } else {
 #pragma unroll(4)
         for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
             HVX_Vector in = *(HVX_UVector *) (src + i * SIZEOF_FP32);
 
@@ -931,12 +916,12 @@ float hvx_self_max_f32(const uint8_t * restrict src, const int num_elems) {
     if (0 == unaligned_loop) {
         HVX_Vector * restrict vec_in = (HVX_Vector *) src;
 
 #pragma unroll(4)
         for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
             vec_max = Q6_Vsf_vmax_VsfVsf(vec_max, *vec_in++);
         }
     } else {
 #pragma unroll(4)
         for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
             HVX_Vector in = *(HVX_UVector *) (src + i * SIZEOF_FP32);
 
@@ -974,7 +959,7 @@ void hvx_min_scalar_f32(const uint8_t * restrict src, const float val, uint8_t *
         HVX_Vector * restrict vec_in  = (HVX_Vector *) src;
         HVX_Vector * restrict vec_out = (HVX_Vector *) dst;
 
 #pragma unroll(4)
         for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
             vec_min    = Q6_Vsf_vmin_VsfVsf(vec_min, *vec_in++);
             *vec_out++ = Q6_Vsf_equals_Vqf32(vec_min);
@@ -1012,7 +997,7 @@ void hvx_clamp_scalar_f32(const uint8_t * restrict src,
     HVX_Vector range_left  = hvx_vec_splat_fp32(limit_left);
     HVX_Vector range_right = hvx_vec_splat_fp32(limit_right);
 
 #pragma unroll(4)
     for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
         HVX_Vector in_vec = *vec_in++;
         HVX_Vector temp_v = in_vec;
 
@@ -23,20 +23,18 @@ typedef union {
 
 /* Q6_Vsf_equals_Vw is only available on v73+.*/
 #if __HVX_ARCH__ < 73
-static inline HVX_Vector int32_to_qfloat(HVX_Vector const in)
-{
-    HVX_Vector const vzero = Q6_V_vzero();
+static inline HVX_Vector int32_to_qfloat(const HVX_Vector in) {
+    const HVX_Vector vzero = Q6_V_vzero();
     HVX_VectorPred is_zero = Q6_Q_vcmp_eq_VwVw(in, vzero);
     HVX_Vector lshift = Q6_Vw_vnormamt_Vw(in);
     HVX_Vector normalized = Q6_Vw_vasl_VwVw(in, lshift);
     HVX_Vector vexp = Q6_Vw_vsub_VwVw(Q6_V_vsplat_R(0x7f + 30), lshift);
     HVX_Vector mant = Q6_V_vand_VV(Q6_V_vsplat_R(0xFFFFFF00), normalized);
     HVX_Vector ret = Q6_V_vmux_QVV(is_zero, vzero, Q6_Vw_vadd_VwVw(mant, vexp));
 
     return ret;
 }
 
-static inline HVX_Vector Q6_Vsf_equals_Vw(HVX_Vector const in)
-{
+static inline HVX_Vector Q6_Vsf_equals_Vw(const HVX_Vector in) {
     return Q6_Vsf_equals_Vqf32(int32_to_qfloat(in));
 }
 #endif
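Aside on the pre-v73 shim above: int32_to_qfloat normalizes the integer so its significant bits sit at the top of the word (Q6_Vw_vnormamt_Vw counts the redundant sign bits), keeps the top 24 bits as the mantissa, and packs a biased exponent of (0x7f + 30 - lshift) into the low byte; Q6_Vsf_equals_Vqf32 then converts that qf32 word to an IEEE single. A scalar model of the value one lane computes, assuming GCC/Clang builtins (int32_to_f32_model is an illustrative name, not part of the source; inputs needing more than 24 significant bits are truncated, matching the & 0xFFFFFF00 above):

    #include <math.h>
    #include <stdint.h>

    /* Scalar sketch of int32_to_qfloat + Q6_Vsf_equals_Vqf32 for one lane:
     * normalize, truncate to the top 24 bits, then undo the shift. */
    static float int32_to_f32_model(int32_t w) {
        if (w == 0) {
            return 0.0f;                               /* the vmux zero path */
        }
        int     lshift = __builtin_clrsb(w);           /* redundant sign bits, like vnormamt */
        int32_t norm   = (int32_t) ((uint32_t) w << lshift);
        int32_t mant   = norm & (int32_t) 0xFFFFFF00;  /* keep the top 24 bits */
        return ldexpf((float) mant, -lshift);          /* undo the normalizing shift */
    }

For example, w = 5 gives lshift = 28, norm = 0x50000000, and ldexpf(0x50000000, -28) = 5.0f.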
@@ -109,7 +107,7 @@ static inline void hvx_copy_fp16_aa(uint8_t * restrict dst, const uint8_t * rest
 
     uint32_t i = 0;
 
 #pragma unroll(4)
     for (; i < nvec; i++) {
         HVX_Vector v = vsrc[i];
         vdst[i] = v;
@@ -133,7 +131,7 @@ static inline void hvx_copy_fp16_ua(uint8_t * restrict dst, const uint8_t * rest
 
     uint32_t i = 0;
 
 #pragma unroll(4)
     for (; i < nvec; i++) {
         HVX_Vector v = vsrc[i];
         vdst[i] = v;
@@ -157,7 +155,7 @@ static inline void hvx_copy_fp16_au(uint8_t * restrict dst, const uint8_t * rest
 
     uint32_t i = 0;
 
 #pragma unroll(4)
     for (; i < nvec; i++) {
         HVX_Vector v = vsrc[i];
         vdst[i] = v;
@@ -182,7 +180,7 @@ static inline void hvx_copy_fp32_aa(uint8_t * restrict dst, const uint8_t * rest
 
     uint32_t i = 0;
 
 #pragma unroll(4)
     for (; i < nvec; i++) {
         HVX_Vector v = vsrc[i];
         vdst[i] = v;
@@ -206,7 +204,7 @@ static inline void hvx_copy_fp32_ua(uint8_t * restrict dst, const uint8_t * rest
 
     uint32_t i = 0;
 
 #pragma unroll(4)
     for (; i < nvec; i++) {
         HVX_Vector v = vsrc[i];
         vdst[i] = v;
@@ -230,7 +228,7 @@ static inline void hvx_copy_fp32_au(uint8_t * restrict dst, const uint8_t * rest
 
     uint32_t i = 0;
 
 #pragma unroll(4)
     for (; i < nvec; i++) {
         HVX_Vector v = vsrc[i];
         vdst[i] = v;
@@ -255,7 +253,7 @@ static inline void hvx_bcast_fp32_a(uint8_t * restrict dst, float elem, uint32_t
 
     uint32_t i = 0;
 
 #pragma unroll(4)
     for (; i < nvec; i++) {
         vdst[i] = velem;
     }
@@ -265,7 +263,6 @@ static inline void hvx_bcast_fp32_a(uint8_t * restrict dst, float elem, uint32_t
     }
 }
 
-
 /* Return whether 'n' elements from vector are in the one chunk of 'chunk_size'. */
 static __attribute__((always_inline)) int32_t is_in_one_chunk(void * addr, uint32_t n, uint32_t chunk_size) {
     uint32_t left_off = (size_t) addr & (chunk_size - 1);
@@ -273,8 +270,6 @@ static __attribute__((always_inline)) int32_t is_in_one_chunk(void * addr, uint3
     return right_off <= chunk_size;
 }
 
-
-
 static void hvx_vec_dump_fp16_n(char * pref, HVX_Vector v, uint32_t n) {
     HVX_VectorAlias u = { .v = v };
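is_in_one_chunk is the guard used by the unaligned tails further down: a span of n bytes starting at addr stays inside a single power-of-two chunk exactly when the offset into the chunk plus the span length does not exceed chunk_size. A standalone sketch that mirrors the helper (the addresses are made up for illustration):

    #include <assert.h>
    #include <stddef.h>
    #include <stdint.h>

    static int32_t in_one_chunk(const void * addr, uint32_t n, uint32_t chunk_size) {
        uint32_t left_off  = (size_t) addr & (chunk_size - 1);  /* offset inside the chunk */
        uint32_t right_off = left_off + n;                      /* one past the last byte */
        return right_off <= chunk_size;
    }

    int main(void) {
        assert(in_one_chunk((void *) 96, 32, 128));   /* ends exactly on the 128B boundary */
        assert(!in_one_chunk((void *) 96, 40, 128));  /* spills into the next vector */
        return 0;
    }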
@@ -992,16 +987,15 @@ static inline void hvx_fast_sigmoid_f32(const uint8_t * restrict src, uint8_t *
     const HVX_Vector max_exp = hvx_vec_splat_fp32(kMaxExp);
     const HVX_Vector min_exp = hvx_vec_splat_fp32(kMinExp);
 
 #pragma unroll(4)
     for (int i = 0; i < step_of_1; i++) {
         v_dst[i] = hvx_vec_fast_sigmoid_fp32_guard(v_src[i], one, max_exp, min_exp);
     }
 }
 
-static inline void hvx_sigmoid_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int num_elems){
+static inline void hvx_sigmoid_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int num_elems) {
     int step_of_1 = num_elems >> 5;  // divby 32, because 32 float = 128 bytes per HVX vector
     int leftover = num_elems - (step_of_1 * VLEN_FP32);
 
     int32_t leftover_size = leftover * sizeof(float);
@@ -1012,51 +1006,44 @@ static inline void hvx_sigmoid_f32(const uint8_t * restrict src, uint8_t * restr
     const HVX_Vector max_exp = hvx_vec_splat_fp32(kMaxExp);
     const HVX_Vector min_exp = hvx_vec_splat_fp32(kMinExp);
 
-    const float *input = (float *)src;
-    float *output = (float *)dst;
+    const float * input = (float *) src;
+    float * output = (float *) dst;
 
     HVX_Vector * input_v_ptr = (HVX_Vector *) input;
     HVX_UVector * output_v_ptr = (HVX_UVector *) output;
 
     HVX_Vector slinep;
     HVX_Vector slinec;
     HVX_Vector sline;
 
     slinep = *input_v_ptr++;
 #pragma unroll(4)
-    for(uint32_t i = step_of_1 -1; i> 0; i--){
+    for (uint32_t i = step_of_1 - 1; i > 0; i--) {
         slinec = *input_v_ptr++;
         sline = Q6_V_valign_VVR(slinec, slinep, (size_t) input);
-        *((HVX_UVector *)(output_v_ptr++)) = hvx_vec_fast_sigmoid_fp32_guard(sline, one, max_exp, min_exp);
+        *((HVX_UVector *) (output_v_ptr++)) = hvx_vec_fast_sigmoid_fp32_guard(sline, one, max_exp, min_exp);
         /* Prepare slinep for next iteration */
         slinep = slinec;
     }
 
-    if(step_of_1> 0){
+    if (step_of_1 > 0) {
         slinec = htp_is_aligned(input_v_ptr, 128) && leftover == 0 ? slinep : *input_v_ptr++;
         sline = Q6_V_valign_VVR(slinec, slinep, (size_t) input);
-        *((HVX_UVector *)(output_v_ptr++)) = hvx_vec_fast_sigmoid_fp32_guard(sline, one, max_exp, min_exp);;
+        *((HVX_UVector *) (output_v_ptr++)) = hvx_vec_fast_sigmoid_fp32_guard(sline, one, max_exp, min_exp);
+        ;
 
         slinep = slinec;
     }
-    if(leftover> 0){
-        slinec = (is_in_one_chunk(input_v_ptr, leftover_size, 128)
-                      ? slinep
-                      : *input_v_ptr++);
+    if (leftover > 0) {
+        slinec = (is_in_one_chunk(input_v_ptr, leftover_size, 128) ? slinep : *input_v_ptr++);
 
         sline = Q6_V_valign_VVR(slinec, slinep, (size_t) input);
 
         HVX_Vector sout = hvx_vec_fast_sigmoid_fp32_guard(sline, one, max_exp, min_exp);
         hvx_vec_store_u(output_v_ptr, leftover_size, sout);
     }
 }
 
 float hvx_sum_of_squares_f32(const uint8_t * restrict src, const int num_elems);
 void hvx_mul_f32(const uint8_t * restrict src0,
                  const uint8_t * restrict src1,
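The loop in this hunk is the standard HVX alignment pipeline: src may not be 128-byte aligned, so each output vector is stitched from two consecutive aligned loads with Q6_V_valign_VVR, which selects the 128-byte window starting at src's misalignment offset out of the (slinep, slinec) pair. Carrying slinep across iterations keeps the cost at one aligned load per vector, and is_in_one_chunk guards the tail so the last load never reads past the buffer. A byte-wise model of the valign step, as a sketch with illustrative names (not the intrinsic's actual implementation):

    #include <stdint.h>
    #include <string.h>

    /* Model of Q6_V_valign_VVR for 128-byte vectors: take the 128 bytes
     * starting at offset (addr & 127) out of the concatenation prev|cur. */
    typedef struct { uint8_t b[128]; } vec128;

    static vec128 valign_model(vec128 cur, vec128 prev, size_t addr) {
        uint8_t pair[256];
        vec128  out;
        size_t  off = addr & 127;           /* misalignment of the source pointer */
        memcpy(pair, prev.b, 128);          /* earlier aligned block */
        memcpy(pair + 128, cur.b, 128);     /* later aligned block */
        memcpy(out.b, pair + off, 128);     /* the unaligned 128-byte window */
        return out;
    }

When src is already aligned, off is 0 and the result is simply prev, which is why the aligned case degenerates to the plain loop in hvx_fast_sigmoid_f32 above.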
@@ -151,7 +151,7 @@ static int vtcm_acquire(struct htp_context * ctx) {
     qurt_thread_set_priority(qurt_thread_get_id(), ctx->thread_prio - 10);
     err = HAP_compute_res_acquire_cached(ctx->vtcm_rctx, 1000000);
     if (err != 0) {
-        FARF(ERROR, "Failed to acquire VTCM: 0x%08x", (unsigned)err);
+        FARF(ERROR, "Failed to acquire VTCM: 0x%08x", (unsigned) err);
         abort();
     }
     HAP_compute_res_release_cached(ctx->vtcm_rctx);
@@ -159,7 +159,7 @@ static int vtcm_acquire(struct htp_context * ctx) {
 
     err = HAP_compute_res_acquire_cached(ctx->vtcm_rctx, 1000000);
     if (err != 0) {
-        FARF(ERROR, "Failed to acquire VTCM: 0x%08x", (unsigned)err);
+        FARF(ERROR, "Failed to acquire VTCM: 0x%08x", (unsigned) err);
         abort();
     }
     ctx->vtcm_valid = true;
@@ -411,7 +411,7 @@ static void proc_matmul_req(struct htp_context * ctx,
     rsp_bufs[0].ptr = bufs[2].ptr;
     rsp_bufs[0].size = bufs[2].size;
     rsp_bufs[0].offset = bufs[2].offset;
     rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER |          // Flush HTP
                          DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT);  // Invalidate CPU
 
     // Setup Op context
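This response-buffer setup repeats in every op handler below: the descriptor points back at the op's output buffer, and the two flags request the cache maintenance that makes DSP-written data visible to the CPU (flush the sender's cache, invalidate the recipient's). A minimal fragment of the same idea, assuming the SDK's struct dspqueue_buffer layout (fd/ptr/offset/size/flags) and a bufs[] array in scope as in the handlers here, so this is a sketch rather than a standalone program:

    // Hedged sketch: describe the output buffer for a response packet so the
    // CPU side observes what the DSP wrote. Mirrors the handlers in this file.
    struct dspqueue_buffer rsp_buf = { 0 };
    rsp_buf.fd     = bufs[2].fd;      // same DMA-BUF handle as the request (assumed field)
    rsp_buf.ptr    = bufs[2].ptr;
    rsp_buf.offset = bufs[2].offset;
    rsp_buf.size   = bufs[2].size;
    rsp_buf.flags  = DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER           // write back HTP cache
                   | DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT;  // drop stale CPU lines

Only the buffer index (and for the activation and rope ops, a computed write_idx) differs between the handlers that follow.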
@@ -453,7 +453,7 @@ static void proc_matmul_id_req(struct htp_context * ctx,
     rsp_bufs[0].ptr = bufs[3].ptr;
     rsp_bufs[0].size = bufs[3].size;
     rsp_bufs[0].offset = bufs[3].offset;
     rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER |          // Flush HTP
                          DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT);  // Invalidate CPU
 
     // Setup Op context
@@ -494,7 +494,7 @@ static void proc_binary_req(struct htp_context * ctx, struct htp_general_req * r
     rsp_bufs[0].ptr = bufs[2].ptr;
     rsp_bufs[0].offset = bufs[2].offset;
     rsp_bufs[0].size = bufs[2].size;
     rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER |          // Flush HTP
                          DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT);  // Invalidate CPU
 
     // Setup Op context
@@ -533,7 +533,7 @@ static void proc_add_id_req(struct htp_context * ctx, struct htp_general_req * r
     rsp_bufs[0].ptr = bufs[3].ptr;
     rsp_bufs[0].offset = bufs[3].offset;
     rsp_bufs[0].size = bufs[3].size;
     rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER |          // Flush HTP
                          DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT);  // Invalidate CPU
 
     // Setup Op context
@@ -574,7 +574,7 @@ static void proc_unary_req(struct htp_context * ctx, struct htp_general_req * re
     rsp_bufs[0].ptr = bufs[1].ptr;
     rsp_bufs[0].offset = bufs[1].offset;
     rsp_bufs[0].size = bufs[1].size;
     rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER |          // Flush HTP
                          DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT);  // Invalidate CPU
 
     // Setup Op context
@@ -618,8 +618,8 @@ static void proc_activations_req(struct htp_context * ctx,
     rsp_bufs[0].ptr = bufs[write_idx].ptr;
     rsp_bufs[0].offset = bufs[write_idx].offset;
     rsp_bufs[0].size = bufs[write_idx].size;
     rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER |          // Flush HTP
                          DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT);  // Invalidate CPU
 
     // Setup Op context
     struct htp_ops_context octx = { 0 };
@@ -674,8 +674,8 @@ static void proc_rope_req(struct htp_context * ctx,
     rsp_bufs[0].ptr = bufs[write_idx].ptr;
     rsp_bufs[0].offset = bufs[write_idx].offset;
     rsp_bufs[0].size = bufs[write_idx].size;
     rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER |          // Flush HTP
                          DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT);  // Invalidate CPU
 
     // Setup Op context
     struct htp_ops_context octx = { 0 };