diff --git a/ggml/src/ggml-hexagon/ggml-hexagon.cpp b/ggml/src/ggml-hexagon/ggml-hexagon.cpp index 8cf57f84bf..d481598f28 100644 --- a/ggml/src/ggml-hexagon/ggml-hexagon.cpp +++ b/ggml/src/ggml-hexagon/ggml-hexagon.cpp @@ -7,6 +7,7 @@ #include #include +#include #include #include #include @@ -2296,9 +2297,7 @@ static void hex_dump_dspbuf(const struct ggml_tensor * t, const dspqueue_buffer (unsigned int) d->size); } -typedef size_t (*init_dsp_req_and_buffer_t)(htp_general_req * req, - dspqueue_buffer (&bufs)[4], - const struct ggml_tensor * op); +typedef size_t (*init_dsp_req_and_buffer_t)(htp_general_req * req, dspqueue_buffer (&bufs)[4], const ggml_tensor * op); template static void ggml_hexagon_op_generic(const struct ggml_tensor * op, uint32_t flags) { @@ -2326,210 +2325,151 @@ static void ggml_hexagon_op_generic(const struct ggml_tensor * op, uint32_t flag } dspqueue_buffer bufs[4]; - init_req(&req, buf, op); + const size_t n_bufs = init_req(&req, bufs, op); auto * sess = get_session_from_tensor(src0); if (opt_verbose) { hex_print_op_info(op, sess, req.flags); if (opt_verbose > 1) { hex_dump_dspbuf(src0, &bufs[0]); - hex_dump_dspbuf(src1, &bufs[1]); - hex_dump_dspbuf(src2, &bufs[2]); + if (src1) { + hex_dump_dspbuf(src1, &bufs[1]); + } + if (src2) { + hex_dump_dspbuf(src2, &bufs[2]); + } hex_dump_dspbuf(dst, &bufs[3]); } } if ((opt_opmask & HTP_OPMASK_QUEUE)) { - sess->enqueue(req, bufs, 4, opt_opsync); + sess->enqueue(req, bufs, n_bufs, opt_opsync); } t2 = ggml_time_us(); - HEX_PROFILE( - "ggml-hex: %s %s %s %u:%u:%u:%u x %s %u:%u:%u:%u -> %s %u:%u:%u:%u : op-usec %u op-cycles %u op-pkts %u (%f) " - "call-usec %llu\n", - sess->name.c_str(), ggml_op_name(node->op), src0->name, (uint32_t) src0->ne[0], (uint32_t) src0->ne[1], - (uint32_t) src0->ne[2], (uint32_t) src0->ne[3], src1->name, (uint32_t) src1->ne[0], (uint32_t) src1->ne[1], - (uint32_t) src1->ne[2], (uint32_t) src1->ne[3], dst->name, (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], - (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], sess->prof_usecs, sess->prof_cycles, sess->prof_pkts, - (float) sess->prof_cycles / sess->prof_pkts, (unsigned long long) t2 - t1); + if (src1) { + HEX_PROFILE( + "ggml-hex: %s %s %s %u:%u:%u:%u x %s %u:%u:%u:%u -> %s %u:%u:%u:%u : op-usec %u op-cycles %u op-pkts %u " + "(%f) call-usec %llu\n", + sess->name.c_str(), ggml_op_name(op->op), src0->name, (uint32_t) src0->ne[0], (uint32_t) src0->ne[1], + (uint32_t) src0->ne[2], (uint32_t) src0->ne[3], src1->name, (uint32_t) src1->ne[0], (uint32_t) src1->ne[1], + (uint32_t) src1->ne[2], (uint32_t) src1->ne[3], dst->name, (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], + (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], sess->prof_usecs, sess->prof_cycles, sess->prof_pkts, + (float) sess->prof_cycles / sess->prof_pkts, (unsigned long long) t2 - t1); + } else { + HEX_PROFILE( + "ggml-hex: %s %s %s %u:%u:%u:%u -> %s %u:%u:%u:%u : op-usec %u op-cycles %u op-pkts %u (%f) call-usec " + "%llu\n", + sess->name.c_str(), ggml_op_name(op->op), src0->name, (uint32_t) src0->ne[0], (uint32_t) src0->ne[1], + (uint32_t) src0->ne[2], (uint32_t) src0->ne[3], dst->name, (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], + (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], sess->prof_usecs, sess->prof_cycles, sess->prof_pkts, + (float) sess->prof_cycles / sess->prof_pkts, (unsigned long long) t2 - t1); + } } template static void ggml_hexagon_binary(const struct ggml_tensor * op, uint32_t flags) { - const struct ggml_tensor * node = op; - const struct ggml_tensor * src0 = node->src[0]; - const struct ggml_tensor * src1 = node->src[1]; - const struct ggml_tensor * dst = node; + constexpr const auto init_func = [](htp_general_req * req, dspqueue_buffer(&bufs)[4], + const ggml_tensor * op) -> size_t { + const struct ggml_tensor * node = op; + const struct ggml_tensor * src0 = node->src[0]; + const struct ggml_tensor * src1 = node->src[1]; + const struct ggml_tensor * dst = node; - uint64_t t1 = 0; - uint64_t t2 = 0; - - t1 = ggml_time_us(); - - // Construct HTP message - htp_general_req req; - req.flags = flags; - - // Use opmask to override flags - if (!(opt_opmask & HTP_OPMASK_QUANTIZE)) { - req.flags |= HTP_OPFLAGS_SKIP_QUANTIZE; - } - if (!(opt_opmask & HTP_OPMASK_COMPUTE)) { - req.flags |= HTP_OPFLAGS_SKIP_COMPUTE; - } - - switch (node->op) { - case GGML_OP_MUL_MAT: - req.op = HTP_OP_MUL_MAT; - break; - case GGML_OP_MUL: - req.op = HTP_OP_MUL; - break; - case GGML_OP_ADD: - req.op = HTP_OP_ADD; - break; - case GGML_OP_SUB: - req.op = HTP_OP_SUB; - break; - default: - GGML_ABORT("ggml-hex: binary : unsupported op:%d\n", node->op); - } - - init_htp_tensor(&req.src0, src0); - init_htp_tensor(&req.src1, src1); - init_htp_tensor(&req.dst, dst); - - dspqueue_buffer bufs[3]; - - // Buffer 0 (src0): Weights (mulmat) or First Operand (binary op). - // If constant (e.g. weights), no cache management is needed. - // Otherwise (CPU writes, DSP reads), we flush CPU caches and invalidate DSP caches. - // Note: On platforms with I/O coherency, the framework skips cache ops automatically. - dspqueue_buffers_init(bufs, src0, _IsSrc0Constant ? DSP_BUFFER_TYPE_CONSTANT : DSP_BUFFER_TYPE_CPU_WRITE_DSP_READ); - - // Buffer 1 (src1): Input Activations (mulmat) or Second Operand (binary op). - // CPU writes, DSP reads: flush CPU caches and invalidate DSP caches. - dspqueue_buffers_init(&bufs[1], src1, DSP_BUFFER_TYPE_CPU_WRITE_DSP_READ); - - // Buffer 2 (dst): Output Activations. - // DSP writes, CPU reads. - // We flush CPU caches to ensure consistency before DSP writes. - // DSP cache maintenance is handled in the response message. - dspqueue_buffers_init(&bufs[2], dst, DSP_BUFFER_TYPE_DSP_WRITE_CPU_READ); - auto * sess = get_session_from_tensor(src0); - - if (opt_verbose) { - hex_print_op_info(op, sess, req.flags); - if (opt_verbose > 1) { - hex_dump_dspbuf(src0, &bufs[0]); - hex_dump_dspbuf(src1, &bufs[1]); - hex_dump_dspbuf(dst, &bufs[2]); + switch (node->op) { + case GGML_OP_MUL_MAT: + req->op = HTP_OP_MUL_MAT; + break; + case GGML_OP_MUL: + req->op = HTP_OP_MUL; + break; + case GGML_OP_ADD: + req->op = HTP_OP_ADD; + break; + case GGML_OP_SUB: + req->op = HTP_OP_SUB; + break; + default: + GGML_ABORT("ggml-hex: binary : unsupported op:%d\n", node->op); + break; } - } - if ((opt_opmask & HTP_OPMASK_QUEUE)) { - sess->enqueue(req, bufs, 3, opt_opsync); - } + init_htp_tensor(&req->src0, src0); + init_htp_tensor(&req->src1, src1); + init_htp_tensor(&req->dst, dst); - t2 = ggml_time_us(); + // Buffer 0 (src0): Weights (mulmat) or First Operand (binary op). + // If constant (e.g. weights), no cache management is needed. + // Otherwise (CPU writes, DSP reads), we flush CPU caches and invalidate DSP caches. + // Note: On platforms with I/O coherency, the framework skips cache ops automatically. + size_t n_bufs = dspqueue_buffers_init( + bufs, src0, _IsSrc0Constant ? DSP_BUFFER_TYPE_CONSTANT : DSP_BUFFER_TYPE_CPU_WRITE_DSP_READ); - HEX_PROFILE( - "ggml-hex: %s %s %s %u:%u:%u:%u x %s %u:%u:%u:%u -> %s %u:%u:%u:%u : op-usec %u op-cycles %u op-pkts %u (%f) " - "call-usec %llu\n", - sess->name.c_str(), ggml_op_name(node->op), src0->name, (uint32_t) src0->ne[0], (uint32_t) src0->ne[1], - (uint32_t) src0->ne[2], (uint32_t) src0->ne[3], src1->name, (uint32_t) src1->ne[0], (uint32_t) src1->ne[1], - (uint32_t) src1->ne[2], (uint32_t) src1->ne[3], dst->name, (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], - (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], sess->prof_usecs, sess->prof_cycles, sess->prof_pkts, - (float) sess->prof_cycles / sess->prof_pkts, (unsigned long long) t2 - t1); + // Buffer 1 (src1): Input Activations (mulmat) or Second Operand (binary op). + // CPU writes, DSP reads: flush CPU caches and invalidate DSP caches. + n_bufs += dspqueue_buffers_init(&bufs[n_bufs], src1, DSP_BUFFER_TYPE_CPU_WRITE_DSP_READ); + + // Buffer 2 (dst): Output Activations. + // DSP writes, CPU reads. + // We flush CPU caches to ensure consistency before DSP writes. + // DSP cache maintenance is handled in the response message. + n_bufs += dspqueue_buffers_init(&bufs[n_bufs], dst, DSP_BUFFER_TYPE_DSP_WRITE_CPU_READ); + + return n_bufs; + }; + + ggml_hexagon_op_generic<_IsSrc0Constant, init_func>(op, flags); } template static void ggml_hexagon_binary_id(const struct ggml_tensor * op, uint32_t flags) { - const struct ggml_tensor * node = op; - const struct ggml_tensor * src0 = node->src[0]; - const struct ggml_tensor * src1 = node->src[1]; - const struct ggml_tensor * src2 = node->src[2]; - const struct ggml_tensor * dst = node; + constexpr const auto init_func = [](htp_general_req * req, dspqueue_buffer(&bufs)[4], + const ggml_tensor * op) -> size_t { + const struct ggml_tensor * node = op; + const struct ggml_tensor * src0 = node->src[0]; + const struct ggml_tensor * src1 = node->src[1]; + const struct ggml_tensor * src2 = node->src[2]; + const struct ggml_tensor * dst = node; - uint64_t t1 = 0; - uint64_t t2 = 0; - - t1 = ggml_time_us(); - - // Construct HTP message - htp_general_req req; - req.flags = flags; - - // Use opmask to override flags - if (!(opt_opmask & HTP_OPMASK_QUANTIZE)) { - req.flags |= HTP_OPFLAGS_SKIP_QUANTIZE; - } - if (!(opt_opmask & HTP_OPMASK_COMPUTE)) { - req.flags |= HTP_OPFLAGS_SKIP_COMPUTE; - } - - switch (node->op) { - case GGML_OP_MUL_MAT_ID: - req.op = HTP_OP_MUL_MAT_ID; - break; - case GGML_OP_ADD_ID: - req.op = HTP_OP_ADD_ID; - break; - default: - GGML_ABORT("ggml-hex: unsupported op:%d\n", node->op); - } - - init_htp_tensor(&req.src0, src0); - init_htp_tensor(&req.src1, src1); - init_htp_tensor(&req.src2, src2); - init_htp_tensor(&req.dst, dst); - - dspqueue_buffer bufs[4]; - - // Buffer 0 (src0): Weights (mulmat) or Input Activations (other op). - // If constant, no cache management is needed. - // Otherwise (CPU writes, DSP reads), we flush CPU caches and invalidate DSP caches. - dspqueue_buffers_init(bufs, src0, _IsSrc0Constant ? DSP_BUFFER_TYPE_CONSTANT : DSP_BUFFER_TYPE_CPU_WRITE_DSP_READ); - - // Buffer 1 (src1): Input Activations (mulmat) or Experts Bias (other op). - // CPU writes, DSP reads: flush CPU caches and invalidate DSP caches. - dspqueue_buffers_init(&bufs[1], src1, DSP_BUFFER_TYPE_CPU_WRITE_DSP_READ); - - // Buffer 2 (src2): Expert IDs (mulmat) or Activated Experts (other op). - // CPU writes, DSP reads: flush CPU caches and invalidate DSP caches. - dspqueue_buffers_init(&bufs[2], src2, DSP_BUFFER_TYPE_CPU_WRITE_DSP_READ); - - // Buffer 3 (dst): Output Activations. - // DSP writes, CPU reads. - // We flush CPU caches to ensure consistency before DSP writes. - // DSP cache maintenance is handled in the response message. - dspqueue_buffers_init(&bufs[3], dst, DSP_BUFFER_TYPE_DSP_WRITE_CPU_READ); - - auto * sess = get_session_from_tensor(src0); - - if (opt_verbose) { - hex_print_op_info(op, sess, req.flags); - if (opt_verbose > 1) { - hex_dump_dspbuf(src0, &bufs[0]); - hex_dump_dspbuf(src1, &bufs[1]); - hex_dump_dspbuf(src2, &bufs[2]); - hex_dump_dspbuf(dst, &bufs[3]); + switch (node->op) { + case GGML_OP_MUL_MAT_ID: + req->op = HTP_OP_MUL_MAT_ID; + break; + case GGML_OP_ADD_ID: + req->op = HTP_OP_ADD_ID; + break; + default: + GGML_ABORT("ggml-hex: unsupported op:%d\n", node->op); } - } - if ((opt_opmask & HTP_OPMASK_QUEUE)) { - sess->enqueue(req, bufs, 4, opt_opsync); - } + init_htp_tensor(&req->src0, src0); + init_htp_tensor(&req->src1, src1); + init_htp_tensor(&req->src2, src2); + init_htp_tensor(&req->dst, dst); - t2 = ggml_time_us(); + // Buffer 0 (src0): Weights (mulmat) or Input Activations (other op). + // If constant, no cache management is needed. + // Otherwise (CPU writes, DSP reads), we flush CPU caches and invalidate DSP caches. + size_t n_bufs = dspqueue_buffers_init( + bufs, src0, _IsSrc0Constant ? DSP_BUFFER_TYPE_CONSTANT : DSP_BUFFER_TYPE_CPU_WRITE_DSP_READ); - HEX_PROFILE( - "ggml-hex: %s %s %s %u:%u:%u:%u x %s %u:%u:%u:%u -> %s %u:%u:%u:%u : op-usec %u op-cycles %u op-pkts %u (%f) " - "call-usec %llu\n", - sess->name.c_str(), ggml_op_name(node->op), src0->name, (uint32_t) src0->ne[0], (uint32_t) src0->ne[1], - (uint32_t) src0->ne[2], (uint32_t) src0->ne[3], src1->name, (uint32_t) src1->ne[0], (uint32_t) src1->ne[1], - (uint32_t) src1->ne[2], (uint32_t) src1->ne[3], dst->name, (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], - (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], sess->prof_usecs, sess->prof_cycles, sess->prof_pkts, - (float) sess->prof_cycles / sess->prof_pkts, (unsigned long long) t2 - t1); + // Buffer 1 (src1): Input Activations (mulmat) or Experts Bias (other op). + // CPU writes, DSP reads: flush CPU caches and invalidate DSP caches. + n_bufs += dspqueue_buffers_init(&bufs[1], src1, DSP_BUFFER_TYPE_CPU_WRITE_DSP_READ); + + // Buffer 2 (src2): Expert IDs (mulmat) or Activated Experts (other op). + // CPU writes, DSP reads: flush CPU caches and invalidate DSP caches. + n_bufs += dspqueue_buffers_init(&bufs[2], src2, DSP_BUFFER_TYPE_CPU_WRITE_DSP_READ); + + // Buffer 3 (dst): Output Activations. + // DSP writes, CPU reads. + // We flush CPU caches to ensure consistency before DSP writes. + // DSP cache maintenance is handled in the response message. + n_bufs += dspqueue_buffers_init(&bufs[3], dst, DSP_BUFFER_TYPE_DSP_WRITE_CPU_READ); + + return n_bufs; + }; + + ggml_hexagon_op_generic<_IsSrc0Constant, init_func>(op, flags); } static void ggml_hexagon_unary(const struct ggml_tensor * op, uint32_t flags) {