From d12247a60d59eb5dcf30b24db1085b2692bece1f Mon Sep 17 00:00:00 2001 From: chraac Date: Mon, 24 Nov 2025 12:35:45 +0800 Subject: [PATCH] refactor: replace ggml_hexagon_mul_mat with template-based binary operation for improved flexibility --- ggml/src/ggml-hexagon/ggml-hexagon.cpp | 81 +++----------------------- 1 file changed, 7 insertions(+), 74 deletions(-) diff --git a/ggml/src/ggml-hexagon/ggml-hexagon.cpp b/ggml/src/ggml-hexagon/ggml-hexagon.cpp index 72a82a8911..8b7641efa0 100644 --- a/ggml/src/ggml-hexagon/ggml-hexagon.cpp +++ b/ggml/src/ggml-hexagon/ggml-hexagon.cpp @@ -2322,76 +2322,6 @@ static void hex_dump_dspbuf(const struct ggml_tensor * t, const dspqueue_buffer (unsigned int) d->size); } -static void ggml_hexagon_mul_mat(const struct ggml_tensor * op, uint32_t flags) { - const struct ggml_tensor * src0 = op->src[0]; - const struct ggml_tensor * src1 = op->src[1]; - const struct ggml_tensor * dst = op; - - uint64_t t1, t2; - t1 = ggml_time_us(); - - // Construct HTP message - htp_general_req req; - req.op = HTP_OP_MUL_MAT; - req.flags = flags; - - init_htp_tensor(&req.src0, src0); - init_htp_tensor(&req.src1, src1); - init_htp_tensor(&req.dst, dst); - - // Use opmask to override flags - if (!(opt_opmask & HTP_OPMASK_QUANTIZE)) { - req.flags |= HTP_OPFLAGS_SKIP_QUANTIZE; - } - if (!(opt_opmask & HTP_OPMASK_COMPUTE)) { - req.flags |= HTP_OPFLAGS_SKIP_COMPUTE; - } - - dspqueue_buffer bufs[3]; - - // First buffer Weights. - // The content is static, there is no need to do any cache management - dspqueue_buffers_init(bufs, src0, false, false); - - // Second buffer Input Activations. This is a buffer that the CPU - // writes and the DSP reads, so we'll need to flush CPU caches and - // invalidate DSP ones. On platforms with I/O coherency support the - // framework will automatically skip cache operations where possible. - dspqueue_buffers_init(&bufs[1], src1, true, true); - - // Third buffer Output Activations. 
We'll handle DSP - cache maintenance in the response message but need to flush - CPU caches to ensure any previously written dirty lines are - written out before writes from the DSP start. - dspqueue_buffers_init(&bufs[2], dst, true, false); - - auto * sess = get_session_from_tensor(src0); - - if (opt_verbose) { - hex_print_op_info(op, sess, req.flags); - if (opt_verbose > 1) { - hex_dump_dspbuf(src0, &bufs[0]); - hex_dump_dspbuf(src1, &bufs[1]); - hex_dump_dspbuf(dst, &bufs[2]); - } - } - - if ((opt_opmask & HTP_OPMASK_QUEUE)) { - sess->enqueue(req, bufs, 3, opt_opsync); - } - - t2 = ggml_time_us(); - - HEX_PROFILE( - "ggml-hex: %s %s %s %u:%u:%u:%u x %s %u:%u:%u:%u -> %s %u:%u:%u:%u : op-usec %u op-cycles %u op-pkts %u (%f) " - "call-usec %llu\n", - sess->name.c_str(), ggml_op_name(op->op), src0->name, (uint32_t) src0->ne[0], (uint32_t) src0->ne[1], - (uint32_t) src0->ne[2], (uint32_t) src0->ne[3], src1->name, (uint32_t) src1->ne[0], (uint32_t) src1->ne[1], - (uint32_t) src1->ne[2], (uint32_t) src1->ne[3], dst->name, (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], - (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], sess->prof_usecs, sess->prof_cycles, sess->prof_pkts, - (float) sess->prof_cycles / sess->prof_pkts, (unsigned long long) t2 - t1); -} - static void ggml_hexagon_mul_mat_id(const struct ggml_tensor * op, uint32_t flags) { const struct ggml_tensor * src0 = op->src[0]; const struct ggml_tensor * src1 = op->src[1]; @@ -2471,7 +2401,7 @@ static void ggml_hexagon_mul_mat_id(const struct ggml_tensor * op, uint32_t flag (float) sess->prof_cycles / sess->prof_pkts, (unsigned long long) t2 - t1); } -static void ggml_hexagon_binary(const struct ggml_tensor * op, uint32_t flags) { +template <bool _IsSrc0Constant> static void ggml_hexagon_binary(const struct ggml_tensor * op, uint32_t flags) { const struct ggml_tensor * node = op; const struct ggml_tensor * src0 = node->src[0]; const struct ggml_tensor * src1 = node->src[1]; @@ -2495,6 +2425,9 @@ static void ggml_hexagon_binary(const
struct ggml_tensor * op, uint32_t flags) { } switch (node->op) { + case GGML_OP_MUL_MAT: + req.op = HTP_OP_MUL_MAT; + break; case GGML_OP_MUL: req.op = HTP_OP_MUL; break; @@ -2518,7 +2451,7 @@ static void ggml_hexagon_binary(const struct ggml_tensor * op, uint32_t flags) { // need to flush CPU caches and invalidate DSP ones. On platforms // with I/O coherency support the framework will automatically skip // cache operations where possible. - dspqueue_buffers_init(bufs, src0, true, true); + dspqueue_buffers_init(bufs, src0, !_IsSrc0Constant, !_IsSrc0Constant); // Second buffer = Second Operand of Binary op // This is a buffer that the CPU writes and the DSP reads, so we'll @@ -2938,7 +2871,7 @@ static ggml_status ggml_backend_hexagon_graph_compute(ggml_backend_t backend, gg switch (node->op) { case GGML_OP_MUL_MAT: - ggml_hexagon_mul_mat(node, flags); + ggml_hexagon_binary<true>(node, flags); prev_quant_op = node; break; case GGML_OP_MUL_MAT_ID: @@ -2948,7 +2881,7 @@ static ggml_status ggml_backend_hexagon_graph_compute(ggml_backend_t backend, gg case GGML_OP_MUL: case GGML_OP_ADD: case GGML_OP_SUB: - ggml_hexagon_binary(node, flags); + ggml_hexagon_binary<false>(node, flags); break; case GGML_OP_ADD_ID: ggml_hexagon_add_id(node, flags);