refactor: simplify tensor operation initialization and buffer management in hexagon implementation
This commit is contained in:
parent
5f9dfe64cf
commit
5e27b7f402
|
|
@ -7,6 +7,7 @@
|
||||||
|
|
||||||
#include <atomic>
|
#include <atomic>
|
||||||
#include <chrono>
|
#include <chrono>
|
||||||
|
#include <cstddef>
|
||||||
#include <mutex>
|
#include <mutex>
|
||||||
#include <stdexcept>
|
#include <stdexcept>
|
||||||
#include <string>
|
#include <string>
|
||||||
|
|
@ -2296,9 +2297,7 @@ static void hex_dump_dspbuf(const struct ggml_tensor * t, const dspqueue_buffer
|
||||||
(unsigned int) d->size);
|
(unsigned int) d->size);
|
||||||
}
|
}
|
||||||
|
|
||||||
typedef size_t (*init_dsp_req_and_buffer_t)(htp_general_req * req,
|
typedef size_t (*init_dsp_req_and_buffer_t)(htp_general_req * req, dspqueue_buffer (&bufs)[4], const ggml_tensor * op);
|
||||||
dspqueue_buffer (&bufs)[4],
|
|
||||||
const struct ggml_tensor * op);
|
|
||||||
|
|
||||||
template <bool _IsSrc0Constant, init_dsp_req_and_buffer_t init_req>
|
template <bool _IsSrc0Constant, init_dsp_req_and_buffer_t init_req>
|
||||||
static void ggml_hexagon_op_generic(const struct ggml_tensor * op, uint32_t flags) {
|
static void ggml_hexagon_op_generic(const struct ggml_tensor * op, uint32_t flags) {
|
||||||
|
|
@ -2326,210 +2325,151 @@ static void ggml_hexagon_op_generic(const struct ggml_tensor * op, uint32_t flag
|
||||||
}
|
}
|
||||||
|
|
||||||
dspqueue_buffer bufs[4];
|
dspqueue_buffer bufs[4];
|
||||||
init_req(&req, buf, op);
|
const size_t n_bufs = init_req(&req, bufs, op);
|
||||||
|
|
||||||
auto * sess = get_session_from_tensor(src0);
|
auto * sess = get_session_from_tensor(src0);
|
||||||
if (opt_verbose) {
|
if (opt_verbose) {
|
||||||
hex_print_op_info(op, sess, req.flags);
|
hex_print_op_info(op, sess, req.flags);
|
||||||
if (opt_verbose > 1) {
|
if (opt_verbose > 1) {
|
||||||
hex_dump_dspbuf(src0, &bufs[0]);
|
hex_dump_dspbuf(src0, &bufs[0]);
|
||||||
hex_dump_dspbuf(src1, &bufs[1]);
|
if (src1) {
|
||||||
hex_dump_dspbuf(src2, &bufs[2]);
|
hex_dump_dspbuf(src1, &bufs[1]);
|
||||||
|
}
|
||||||
|
if (src2) {
|
||||||
|
hex_dump_dspbuf(src2, &bufs[2]);
|
||||||
|
}
|
||||||
hex_dump_dspbuf(dst, &bufs[3]);
|
hex_dump_dspbuf(dst, &bufs[3]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if ((opt_opmask & HTP_OPMASK_QUEUE)) {
|
if ((opt_opmask & HTP_OPMASK_QUEUE)) {
|
||||||
sess->enqueue(req, bufs, 4, opt_opsync);
|
sess->enqueue(req, bufs, n_bufs, opt_opsync);
|
||||||
}
|
}
|
||||||
|
|
||||||
t2 = ggml_time_us();
|
t2 = ggml_time_us();
|
||||||
|
|
||||||
HEX_PROFILE(
|
if (src1) {
|
||||||
"ggml-hex: %s %s %s %u:%u:%u:%u x %s %u:%u:%u:%u -> %s %u:%u:%u:%u : op-usec %u op-cycles %u op-pkts %u (%f) "
|
HEX_PROFILE(
|
||||||
"call-usec %llu\n",
|
"ggml-hex: %s %s %s %u:%u:%u:%u x %s %u:%u:%u:%u -> %s %u:%u:%u:%u : op-usec %u op-cycles %u op-pkts %u "
|
||||||
sess->name.c_str(), ggml_op_name(node->op), src0->name, (uint32_t) src0->ne[0], (uint32_t) src0->ne[1],
|
"(%f) call-usec %llu\n",
|
||||||
(uint32_t) src0->ne[2], (uint32_t) src0->ne[3], src1->name, (uint32_t) src1->ne[0], (uint32_t) src1->ne[1],
|
sess->name.c_str(), ggml_op_name(op->op), src0->name, (uint32_t) src0->ne[0], (uint32_t) src0->ne[1],
|
||||||
(uint32_t) src1->ne[2], (uint32_t) src1->ne[3], dst->name, (uint32_t) dst->ne[0], (uint32_t) dst->ne[1],
|
(uint32_t) src0->ne[2], (uint32_t) src0->ne[3], src1->name, (uint32_t) src1->ne[0], (uint32_t) src1->ne[1],
|
||||||
(uint32_t) dst->ne[2], (uint32_t) dst->ne[3], sess->prof_usecs, sess->prof_cycles, sess->prof_pkts,
|
(uint32_t) src1->ne[2], (uint32_t) src1->ne[3], dst->name, (uint32_t) dst->ne[0], (uint32_t) dst->ne[1],
|
||||||
(float) sess->prof_cycles / sess->prof_pkts, (unsigned long long) t2 - t1);
|
(uint32_t) dst->ne[2], (uint32_t) dst->ne[3], sess->prof_usecs, sess->prof_cycles, sess->prof_pkts,
|
||||||
|
(float) sess->prof_cycles / sess->prof_pkts, (unsigned long long) t2 - t1);
|
||||||
|
} else {
|
||||||
|
HEX_PROFILE(
|
||||||
|
"ggml-hex: %s %s %s %u:%u:%u:%u -> %s %u:%u:%u:%u : op-usec %u op-cycles %u op-pkts %u (%f) call-usec "
|
||||||
|
"%llu\n",
|
||||||
|
sess->name.c_str(), ggml_op_name(op->op), src0->name, (uint32_t) src0->ne[0], (uint32_t) src0->ne[1],
|
||||||
|
(uint32_t) src0->ne[2], (uint32_t) src0->ne[3], dst->name, (uint32_t) dst->ne[0], (uint32_t) dst->ne[1],
|
||||||
|
(uint32_t) dst->ne[2], (uint32_t) dst->ne[3], sess->prof_usecs, sess->prof_cycles, sess->prof_pkts,
|
||||||
|
(float) sess->prof_cycles / sess->prof_pkts, (unsigned long long) t2 - t1);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
template <bool _IsSrc0Constant> static void ggml_hexagon_binary(const struct ggml_tensor * op, uint32_t flags) {
|
template <bool _IsSrc0Constant> static void ggml_hexagon_binary(const struct ggml_tensor * op, uint32_t flags) {
|
||||||
const struct ggml_tensor * node = op;
|
constexpr const auto init_func = [](htp_general_req * req, dspqueue_buffer(&bufs)[4],
|
||||||
const struct ggml_tensor * src0 = node->src[0];
|
const ggml_tensor * op) -> size_t {
|
||||||
const struct ggml_tensor * src1 = node->src[1];
|
const struct ggml_tensor * node = op;
|
||||||
const struct ggml_tensor * dst = node;
|
const struct ggml_tensor * src0 = node->src[0];
|
||||||
|
const struct ggml_tensor * src1 = node->src[1];
|
||||||
|
const struct ggml_tensor * dst = node;
|
||||||
|
|
||||||
uint64_t t1 = 0;
|
switch (node->op) {
|
||||||
uint64_t t2 = 0;
|
case GGML_OP_MUL_MAT:
|
||||||
|
req->op = HTP_OP_MUL_MAT;
|
||||||
t1 = ggml_time_us();
|
break;
|
||||||
|
case GGML_OP_MUL:
|
||||||
// Construct HTP message
|
req->op = HTP_OP_MUL;
|
||||||
htp_general_req req;
|
break;
|
||||||
req.flags = flags;
|
case GGML_OP_ADD:
|
||||||
|
req->op = HTP_OP_ADD;
|
||||||
// Use opmask to override flags
|
break;
|
||||||
if (!(opt_opmask & HTP_OPMASK_QUANTIZE)) {
|
case GGML_OP_SUB:
|
||||||
req.flags |= HTP_OPFLAGS_SKIP_QUANTIZE;
|
req->op = HTP_OP_SUB;
|
||||||
}
|
break;
|
||||||
if (!(opt_opmask & HTP_OPMASK_COMPUTE)) {
|
default:
|
||||||
req.flags |= HTP_OPFLAGS_SKIP_COMPUTE;
|
GGML_ABORT("ggml-hex: binary : unsupported op:%d\n", node->op);
|
||||||
}
|
break;
|
||||||
|
|
||||||
switch (node->op) {
|
|
||||||
case GGML_OP_MUL_MAT:
|
|
||||||
req.op = HTP_OP_MUL_MAT;
|
|
||||||
break;
|
|
||||||
case GGML_OP_MUL:
|
|
||||||
req.op = HTP_OP_MUL;
|
|
||||||
break;
|
|
||||||
case GGML_OP_ADD:
|
|
||||||
req.op = HTP_OP_ADD;
|
|
||||||
break;
|
|
||||||
case GGML_OP_SUB:
|
|
||||||
req.op = HTP_OP_SUB;
|
|
||||||
break;
|
|
||||||
default:
|
|
||||||
GGML_ABORT("ggml-hex: binary : unsupported op:%d\n", node->op);
|
|
||||||
}
|
|
||||||
|
|
||||||
init_htp_tensor(&req.src0, src0);
|
|
||||||
init_htp_tensor(&req.src1, src1);
|
|
||||||
init_htp_tensor(&req.dst, dst);
|
|
||||||
|
|
||||||
dspqueue_buffer bufs[3];
|
|
||||||
|
|
||||||
// Buffer 0 (src0): Weights (mulmat) or First Operand (binary op).
|
|
||||||
// If constant (e.g. weights), no cache management is needed.
|
|
||||||
// Otherwise (CPU writes, DSP reads), we flush CPU caches and invalidate DSP caches.
|
|
||||||
// Note: On platforms with I/O coherency, the framework skips cache ops automatically.
|
|
||||||
dspqueue_buffers_init(bufs, src0, _IsSrc0Constant ? DSP_BUFFER_TYPE_CONSTANT : DSP_BUFFER_TYPE_CPU_WRITE_DSP_READ);
|
|
||||||
|
|
||||||
// Buffer 1 (src1): Input Activations (mulmat) or Second Operand (binary op).
|
|
||||||
// CPU writes, DSP reads: flush CPU caches and invalidate DSP caches.
|
|
||||||
dspqueue_buffers_init(&bufs[1], src1, DSP_BUFFER_TYPE_CPU_WRITE_DSP_READ);
|
|
||||||
|
|
||||||
// Buffer 2 (dst): Output Activations.
|
|
||||||
// DSP writes, CPU reads.
|
|
||||||
// We flush CPU caches to ensure consistency before DSP writes.
|
|
||||||
// DSP cache maintenance is handled in the response message.
|
|
||||||
dspqueue_buffers_init(&bufs[2], dst, DSP_BUFFER_TYPE_DSP_WRITE_CPU_READ);
|
|
||||||
auto * sess = get_session_from_tensor(src0);
|
|
||||||
|
|
||||||
if (opt_verbose) {
|
|
||||||
hex_print_op_info(op, sess, req.flags);
|
|
||||||
if (opt_verbose > 1) {
|
|
||||||
hex_dump_dspbuf(src0, &bufs[0]);
|
|
||||||
hex_dump_dspbuf(src1, &bufs[1]);
|
|
||||||
hex_dump_dspbuf(dst, &bufs[2]);
|
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
if ((opt_opmask & HTP_OPMASK_QUEUE)) {
|
init_htp_tensor(&req->src0, src0);
|
||||||
sess->enqueue(req, bufs, 3, opt_opsync);
|
init_htp_tensor(&req->src1, src1);
|
||||||
}
|
init_htp_tensor(&req->dst, dst);
|
||||||
|
|
||||||
t2 = ggml_time_us();
|
// Buffer 0 (src0): Weights (mulmat) or First Operand (binary op).
|
||||||
|
// If constant (e.g. weights), no cache management is needed.
|
||||||
|
// Otherwise (CPU writes, DSP reads), we flush CPU caches and invalidate DSP caches.
|
||||||
|
// Note: On platforms with I/O coherency, the framework skips cache ops automatically.
|
||||||
|
size_t n_bufs = dspqueue_buffers_init(
|
||||||
|
bufs, src0, _IsSrc0Constant ? DSP_BUFFER_TYPE_CONSTANT : DSP_BUFFER_TYPE_CPU_WRITE_DSP_READ);
|
||||||
|
|
||||||
HEX_PROFILE(
|
// Buffer 1 (src1): Input Activations (mulmat) or Second Operand (binary op).
|
||||||
"ggml-hex: %s %s %s %u:%u:%u:%u x %s %u:%u:%u:%u -> %s %u:%u:%u:%u : op-usec %u op-cycles %u op-pkts %u (%f) "
|
// CPU writes, DSP reads: flush CPU caches and invalidate DSP caches.
|
||||||
"call-usec %llu\n",
|
n_bufs += dspqueue_buffers_init(&bufs[n_bufs], src1, DSP_BUFFER_TYPE_CPU_WRITE_DSP_READ);
|
||||||
sess->name.c_str(), ggml_op_name(node->op), src0->name, (uint32_t) src0->ne[0], (uint32_t) src0->ne[1],
|
|
||||||
(uint32_t) src0->ne[2], (uint32_t) src0->ne[3], src1->name, (uint32_t) src1->ne[0], (uint32_t) src1->ne[1],
|
// Buffer 2 (dst): Output Activations.
|
||||||
(uint32_t) src1->ne[2], (uint32_t) src1->ne[3], dst->name, (uint32_t) dst->ne[0], (uint32_t) dst->ne[1],
|
// DSP writes, CPU reads.
|
||||||
(uint32_t) dst->ne[2], (uint32_t) dst->ne[3], sess->prof_usecs, sess->prof_cycles, sess->prof_pkts,
|
// We flush CPU caches to ensure consistency before DSP writes.
|
||||||
(float) sess->prof_cycles / sess->prof_pkts, (unsigned long long) t2 - t1);
|
// DSP cache maintenance is handled in the response message.
|
||||||
|
n_bufs += dspqueue_buffers_init(&bufs[n_bufs], dst, DSP_BUFFER_TYPE_DSP_WRITE_CPU_READ);
|
||||||
|
|
||||||
|
return n_bufs;
|
||||||
|
};
|
||||||
|
|
||||||
|
ggml_hexagon_op_generic<_IsSrc0Constant, init_func>(op, flags);
|
||||||
}
|
}
|
||||||
|
|
||||||
template <bool _IsSrc0Constant> static void ggml_hexagon_binary_id(const struct ggml_tensor * op, uint32_t flags) {
|
template <bool _IsSrc0Constant> static void ggml_hexagon_binary_id(const struct ggml_tensor * op, uint32_t flags) {
|
||||||
const struct ggml_tensor * node = op;
|
constexpr const auto init_func = [](htp_general_req * req, dspqueue_buffer(&bufs)[4],
|
||||||
const struct ggml_tensor * src0 = node->src[0];
|
const ggml_tensor * op) -> size_t {
|
||||||
const struct ggml_tensor * src1 = node->src[1];
|
const struct ggml_tensor * node = op;
|
||||||
const struct ggml_tensor * src2 = node->src[2];
|
const struct ggml_tensor * src0 = node->src[0];
|
||||||
const struct ggml_tensor * dst = node;
|
const struct ggml_tensor * src1 = node->src[1];
|
||||||
|
const struct ggml_tensor * src2 = node->src[2];
|
||||||
|
const struct ggml_tensor * dst = node;
|
||||||
|
|
||||||
uint64_t t1 = 0;
|
switch (node->op) {
|
||||||
uint64_t t2 = 0;
|
case GGML_OP_MUL_MAT_ID:
|
||||||
|
req->op = HTP_OP_MUL_MAT_ID;
|
||||||
t1 = ggml_time_us();
|
break;
|
||||||
|
case GGML_OP_ADD_ID:
|
||||||
// Construct HTP message
|
req->op = HTP_OP_ADD_ID;
|
||||||
htp_general_req req;
|
break;
|
||||||
req.flags = flags;
|
default:
|
||||||
|
GGML_ABORT("ggml-hex: unsupported op:%d\n", node->op);
|
||||||
// Use opmask to override flags
|
|
||||||
if (!(opt_opmask & HTP_OPMASK_QUANTIZE)) {
|
|
||||||
req.flags |= HTP_OPFLAGS_SKIP_QUANTIZE;
|
|
||||||
}
|
|
||||||
if (!(opt_opmask & HTP_OPMASK_COMPUTE)) {
|
|
||||||
req.flags |= HTP_OPFLAGS_SKIP_COMPUTE;
|
|
||||||
}
|
|
||||||
|
|
||||||
switch (node->op) {
|
|
||||||
case GGML_OP_MUL_MAT_ID:
|
|
||||||
req.op = HTP_OP_MUL_MAT_ID;
|
|
||||||
break;
|
|
||||||
case GGML_OP_ADD_ID:
|
|
||||||
req.op = HTP_OP_ADD_ID;
|
|
||||||
break;
|
|
||||||
default:
|
|
||||||
GGML_ABORT("ggml-hex: unsupported op:%d\n", node->op);
|
|
||||||
}
|
|
||||||
|
|
||||||
init_htp_tensor(&req.src0, src0);
|
|
||||||
init_htp_tensor(&req.src1, src1);
|
|
||||||
init_htp_tensor(&req.src2, src2);
|
|
||||||
init_htp_tensor(&req.dst, dst);
|
|
||||||
|
|
||||||
dspqueue_buffer bufs[4];
|
|
||||||
|
|
||||||
// Buffer 0 (src0): Weights (mulmat) or Input Activations (other op).
|
|
||||||
// If constant, no cache management is needed.
|
|
||||||
// Otherwise (CPU writes, DSP reads), we flush CPU caches and invalidate DSP caches.
|
|
||||||
dspqueue_buffers_init(bufs, src0, _IsSrc0Constant ? DSP_BUFFER_TYPE_CONSTANT : DSP_BUFFER_TYPE_CPU_WRITE_DSP_READ);
|
|
||||||
|
|
||||||
// Buffer 1 (src1): Input Activations (mulmat) or Experts Bias (other op).
|
|
||||||
// CPU writes, DSP reads: flush CPU caches and invalidate DSP caches.
|
|
||||||
dspqueue_buffers_init(&bufs[1], src1, DSP_BUFFER_TYPE_CPU_WRITE_DSP_READ);
|
|
||||||
|
|
||||||
// Buffer 2 (src2): Expert IDs (mulmat) or Activated Experts (other op).
|
|
||||||
// CPU writes, DSP reads: flush CPU caches and invalidate DSP caches.
|
|
||||||
dspqueue_buffers_init(&bufs[2], src2, DSP_BUFFER_TYPE_CPU_WRITE_DSP_READ);
|
|
||||||
|
|
||||||
// Buffer 3 (dst): Output Activations.
|
|
||||||
// DSP writes, CPU reads.
|
|
||||||
// We flush CPU caches to ensure consistency before DSP writes.
|
|
||||||
// DSP cache maintenance is handled in the response message.
|
|
||||||
dspqueue_buffers_init(&bufs[3], dst, DSP_BUFFER_TYPE_DSP_WRITE_CPU_READ);
|
|
||||||
|
|
||||||
auto * sess = get_session_from_tensor(src0);
|
|
||||||
|
|
||||||
if (opt_verbose) {
|
|
||||||
hex_print_op_info(op, sess, req.flags);
|
|
||||||
if (opt_verbose > 1) {
|
|
||||||
hex_dump_dspbuf(src0, &bufs[0]);
|
|
||||||
hex_dump_dspbuf(src1, &bufs[1]);
|
|
||||||
hex_dump_dspbuf(src2, &bufs[2]);
|
|
||||||
hex_dump_dspbuf(dst, &bufs[3]);
|
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
if ((opt_opmask & HTP_OPMASK_QUEUE)) {
|
init_htp_tensor(&req->src0, src0);
|
||||||
sess->enqueue(req, bufs, 4, opt_opsync);
|
init_htp_tensor(&req->src1, src1);
|
||||||
}
|
init_htp_tensor(&req->src2, src2);
|
||||||
|
init_htp_tensor(&req->dst, dst);
|
||||||
|
|
||||||
t2 = ggml_time_us();
|
// Buffer 0 (src0): Weights (mulmat) or Input Activations (other op).
|
||||||
|
// If constant, no cache management is needed.
|
||||||
|
// Otherwise (CPU writes, DSP reads), we flush CPU caches and invalidate DSP caches.
|
||||||
|
size_t n_bufs = dspqueue_buffers_init(
|
||||||
|
bufs, src0, _IsSrc0Constant ? DSP_BUFFER_TYPE_CONSTANT : DSP_BUFFER_TYPE_CPU_WRITE_DSP_READ);
|
||||||
|
|
||||||
HEX_PROFILE(
|
// Buffer 1 (src1): Input Activations (mulmat) or Experts Bias (other op).
|
||||||
"ggml-hex: %s %s %s %u:%u:%u:%u x %s %u:%u:%u:%u -> %s %u:%u:%u:%u : op-usec %u op-cycles %u op-pkts %u (%f) "
|
// CPU writes, DSP reads: flush CPU caches and invalidate DSP caches.
|
||||||
"call-usec %llu\n",
|
n_bufs += dspqueue_buffers_init(&bufs[1], src1, DSP_BUFFER_TYPE_CPU_WRITE_DSP_READ);
|
||||||
sess->name.c_str(), ggml_op_name(node->op), src0->name, (uint32_t) src0->ne[0], (uint32_t) src0->ne[1],
|
|
||||||
(uint32_t) src0->ne[2], (uint32_t) src0->ne[3], src1->name, (uint32_t) src1->ne[0], (uint32_t) src1->ne[1],
|
// Buffer 2 (src2): Expert IDs (mulmat) or Activated Experts (other op).
|
||||||
(uint32_t) src1->ne[2], (uint32_t) src1->ne[3], dst->name, (uint32_t) dst->ne[0], (uint32_t) dst->ne[1],
|
// CPU writes, DSP reads: flush CPU caches and invalidate DSP caches.
|
||||||
(uint32_t) dst->ne[2], (uint32_t) dst->ne[3], sess->prof_usecs, sess->prof_cycles, sess->prof_pkts,
|
n_bufs += dspqueue_buffers_init(&bufs[2], src2, DSP_BUFFER_TYPE_CPU_WRITE_DSP_READ);
|
||||||
(float) sess->prof_cycles / sess->prof_pkts, (unsigned long long) t2 - t1);
|
|
||||||
|
// Buffer 3 (dst): Output Activations.
|
||||||
|
// DSP writes, CPU reads.
|
||||||
|
// We flush CPU caches to ensure consistency before DSP writes.
|
||||||
|
// DSP cache maintenance is handled in the response message.
|
||||||
|
n_bufs += dspqueue_buffers_init(&bufs[3], dst, DSP_BUFFER_TYPE_DSP_WRITE_CPU_READ);
|
||||||
|
|
||||||
|
return n_bufs;
|
||||||
|
};
|
||||||
|
|
||||||
|
ggml_hexagon_op_generic<_IsSrc0Constant, init_func>(op, flags);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void ggml_hexagon_unary(const struct ggml_tensor * op, uint32_t flags) {
|
static void ggml_hexagon_unary(const struct ggml_tensor * op, uint32_t flags) {
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue