From 186053c9a8e1c683af17f87b137cb9654d07c9c9 Mon Sep 17 00:00:00 2001
From: chraac <chraac@gmail.com>
Date: Wed, 26 Nov 2025 15:48:20 +0800
Subject: [PATCH] refactor: streamline hexagon operation initialization and
 buffer management

---
 ggml/src/ggml-hexagon/ggml-hexagon.cpp | 182 ++++++++++++-------------
 1 file changed, 89 insertions(+), 93 deletions(-)

diff --git a/ggml/src/ggml-hexagon/ggml-hexagon.cpp b/ggml/src/ggml-hexagon/ggml-hexagon.cpp
index d481598f28..1277e698ad 100644
--- a/ggml/src/ggml-hexagon/ggml-hexagon.cpp
+++ b/ggml/src/ggml-hexagon/ggml-hexagon.cpp
@@ -2299,8 +2299,8 @@ static void hex_dump_dspbuf(const struct ggml_tensor * t, const dspqueue_buffer
 
 typedef size_t (*init_dsp_req_and_buffer_t)(htp_general_req * req, dspqueue_buffer (&bufs)[4], const ggml_tensor * op);
 
-template <bool _IsSrc0Constant, auto _InitFunc>
-static void ggml_hexagon_op_generic(const struct ggml_tensor * op, uint32_t flags) {
+template <bool _IsSrc0Constant, init_dsp_req_and_buffer_t _InitFunc>
+static inline void ggml_hexagon_op_generic(const struct ggml_tensor * op, uint32_t flags) {
     const struct ggml_tensor * node = op;
     const struct ggml_tensor * src0 = node->src[0];
     const struct ggml_tensor * src1 = node->src[1];
@@ -2368,108 +2368,104 @@ static void ggml_hexagon_op_generic(const struct ggml_tensor * op, uint32_t flag
     }
 }
 
-template <bool _IsSrc0Constant> static void ggml_hexagon_binary(const struct ggml_tensor * op, uint32_t flags) {
-    constexpr const auto init_func = [](htp_general_req * req, dspqueue_buffer(&bufs)[4],
-                                        const ggml_tensor * op) -> size_t {
-        const struct ggml_tensor * node = op;
-        const struct ggml_tensor * src0 = node->src[0];
-        const struct ggml_tensor * src1 = node->src[1];
-        const struct ggml_tensor * dst  = node;
+template <bool _IsSrc0Constant>
+static inline size_t init_binary_req_and_bufs(htp_general_req * req,
+                                              dspqueue_buffer (&bufs)[4],
+                                              const ggml_tensor * op) {
+    const struct ggml_tensor * node = op;
+    const struct ggml_tensor * src0 = node->src[0];
+    const struct ggml_tensor * src1 = node->src[1];
+    const struct ggml_tensor * dst  = node;
 
-        switch (node->op) {
-            case GGML_OP_MUL_MAT:
-                req->op = HTP_OP_MUL_MAT;
-                break;
-            case GGML_OP_MUL:
-                req->op = HTP_OP_MUL;
-                break;
-            case GGML_OP_ADD:
-                req->op = HTP_OP_ADD;
-                break;
-            case GGML_OP_SUB:
-                req->op = HTP_OP_SUB;
-                break;
-            default:
-                GGML_ABORT("ggml-hex: binary : unsupported op:%d\n", node->op);
-                break;
-        }
+    switch (node->op) {
+        case GGML_OP_MUL_MAT:
+            req->op = HTP_OP_MUL_MAT;
+            break;
+        case GGML_OP_MUL:
+            req->op = HTP_OP_MUL;
+            break;
+        case GGML_OP_ADD:
+            req->op = HTP_OP_ADD;
+            break;
+        case GGML_OP_SUB:
+            req->op = HTP_OP_SUB;
+            break;
+        default:
+            GGML_ABORT("ggml-hex: binary : unsupported op:%d\n", node->op);
+            break;
+    }
 
-        init_htp_tensor(&req->src0, src0);
-        init_htp_tensor(&req->src1, src1);
-        init_htp_tensor(&req->dst, dst);
+    init_htp_tensor(&req->src0, src0);
+    init_htp_tensor(&req->src1, src1);
+    init_htp_tensor(&req->dst, dst);
 
-        // Buffer 0 (src0): Weights (mulmat) or First Operand (binary op).
-        // If constant (e.g. weights), no cache management is needed.
-        // Otherwise (CPU writes, DSP reads), we flush CPU caches and invalidate DSP caches.
-        // Note: On platforms with I/O coherency, the framework skips cache ops automatically.
-        size_t n_bufs = dspqueue_buffers_init(
-            bufs, src0, _IsSrc0Constant ? DSP_BUFFER_TYPE_CONSTANT : DSP_BUFFER_TYPE_CPU_WRITE_DSP_READ);
+    // Buffer 0 (src0): Weights (mulmat) or First Operand (binary op).
+    // If constant (e.g. weights), no cache management is needed.
+    // Otherwise (CPU writes, DSP reads), we flush CPU caches and invalidate DSP caches.
+    // Note: On platforms with I/O coherency, the framework skips cache ops automatically.
+    size_t n_bufs = dspqueue_buffers_init(
+        bufs, src0, _IsSrc0Constant ? DSP_BUFFER_TYPE_CONSTANT : DSP_BUFFER_TYPE_CPU_WRITE_DSP_READ);
 
-        // Buffer 1 (src1): Input Activations (mulmat) or Second Operand (binary op).
-        // CPU writes, DSP reads: flush CPU caches and invalidate DSP caches.
-        n_bufs += dspqueue_buffers_init(&bufs[n_bufs], src1, DSP_BUFFER_TYPE_CPU_WRITE_DSP_READ);
+    // Buffer 1 (src1): Input Activations (mulmat) or Second Operand (binary op).
+    // CPU writes, DSP reads: flush CPU caches and invalidate DSP caches.
+    n_bufs += dspqueue_buffers_init(&bufs[n_bufs], src1, DSP_BUFFER_TYPE_CPU_WRITE_DSP_READ);
 
-        // Buffer 2 (dst): Output Activations.
-        // DSP writes, CPU reads.
-        // We flush CPU caches to ensure consistency before DSP writes.
-        // DSP cache maintenance is handled in the response message.
-        n_bufs += dspqueue_buffers_init(&bufs[n_bufs], dst, DSP_BUFFER_TYPE_DSP_WRITE_CPU_READ);
+    // Buffer 2 (dst): Output Activations.
+    // DSP writes, CPU reads.
+    // We flush CPU caches to ensure consistency before DSP writes.
+    // DSP cache maintenance is handled in the response message.
+    n_bufs += dspqueue_buffers_init(&bufs[n_bufs], dst, DSP_BUFFER_TYPE_DSP_WRITE_CPU_READ);
 
-        return n_bufs;
-    };
-
-    ggml_hexagon_op_generic<_IsSrc0Constant, init_func>(op, flags);
+    return n_bufs;
 }
 
-template <bool _IsSrc0Constant> static void ggml_hexagon_binary_id(const struct ggml_tensor * op, uint32_t flags) {
-    constexpr const auto init_func = [](htp_general_req * req, dspqueue_buffer(&bufs)[4],
-                                        const ggml_tensor * op) -> size_t {
-        const struct ggml_tensor * node = op;
-        const struct ggml_tensor * src0 = node->src[0];
-        const struct ggml_tensor * src1 = node->src[1];
-        const struct ggml_tensor * src2 = node->src[2];
-        const struct ggml_tensor * dst  = node;
+template <bool _IsSrc0Constant>
+static inline size_t init_binary_id_req_and_bufs(htp_general_req * req,
+                                                 dspqueue_buffer (&bufs)[4],
+                                                 const ggml_tensor * op) {
+    const struct ggml_tensor * node = op;
+    const struct ggml_tensor * src0 = node->src[0];
+    const struct ggml_tensor * src1 = node->src[1];
+    const struct ggml_tensor * src2 = node->src[2];
+    const struct ggml_tensor * dst  = node;
 
-        switch (node->op) {
-            case GGML_OP_MUL_MAT_ID:
-                req->op = HTP_OP_MUL_MAT_ID;
-                break;
-            case GGML_OP_ADD_ID:
-                req->op = HTP_OP_ADD_ID;
-                break;
-            default:
-                GGML_ABORT("ggml-hex: unsupported op:%d\n", node->op);
-        }
+    switch (node->op) {
+        case GGML_OP_MUL_MAT_ID:
+            req->op = HTP_OP_MUL_MAT_ID;
+            break;
+        case GGML_OP_ADD_ID:
+            req->op = HTP_OP_ADD_ID;
+            break;
+        default:
+            GGML_ABORT("ggml-hex: unsupported op:%d\n", node->op);
+    }
 
-        init_htp_tensor(&req->src0, src0);
-        init_htp_tensor(&req->src1, src1);
-        init_htp_tensor(&req->src2, src2);
-        init_htp_tensor(&req->dst, dst);
+    init_htp_tensor(&req->src0, src0);
+    init_htp_tensor(&req->src1, src1);
+    init_htp_tensor(&req->src2, src2);
+    init_htp_tensor(&req->dst, dst);
 
-        // Buffer 0 (src0): Weights (mulmat) or Input Activations (other op).
-        // If constant, no cache management is needed.
-        // Otherwise (CPU writes, DSP reads), we flush CPU caches and invalidate DSP caches.
-        size_t n_bufs = dspqueue_buffers_init(
-            bufs, src0, _IsSrc0Constant ? DSP_BUFFER_TYPE_CONSTANT : DSP_BUFFER_TYPE_CPU_WRITE_DSP_READ);
+    // Buffer 0 (src0): Weights (mulmat) or Input Activations (other op).
+    // If constant, no cache management is needed.
+    // Otherwise (CPU writes, DSP reads), we flush CPU caches and invalidate DSP caches.
+    size_t n_bufs = dspqueue_buffers_init(
+        bufs, src0, _IsSrc0Constant ? DSP_BUFFER_TYPE_CONSTANT : DSP_BUFFER_TYPE_CPU_WRITE_DSP_READ);
 
-        // Buffer 1 (src1): Input Activations (mulmat) or Experts Bias (other op).
-        // CPU writes, DSP reads: flush CPU caches and invalidate DSP caches.
-        n_bufs += dspqueue_buffers_init(&bufs[1], src1, DSP_BUFFER_TYPE_CPU_WRITE_DSP_READ);
+    // Buffer 1 (src1): Input Activations (mulmat) or Experts Bias (other op).
+    // CPU writes, DSP reads: flush CPU caches and invalidate DSP caches.
+    n_bufs += dspqueue_buffers_init(&bufs[1], src1, DSP_BUFFER_TYPE_CPU_WRITE_DSP_READ);
 
-        // Buffer 2 (src2): Expert IDs (mulmat) or Activated Experts (other op).
-        // CPU writes, DSP reads: flush CPU caches and invalidate DSP caches.
-        n_bufs += dspqueue_buffers_init(&bufs[2], src2, DSP_BUFFER_TYPE_CPU_WRITE_DSP_READ);
+    // Buffer 2 (src2): Expert IDs (mulmat) or Activated Experts (other op).
+    // CPU writes, DSP reads: flush CPU caches and invalidate DSP caches.
+    n_bufs += dspqueue_buffers_init(&bufs[2], src2, DSP_BUFFER_TYPE_CPU_WRITE_DSP_READ);
 
-        // Buffer 3 (dst): Output Activations.
-        // DSP writes, CPU reads.
-        // We flush CPU caches to ensure consistency before DSP writes.
-        // DSP cache maintenance is handled in the response message.
-        n_bufs += dspqueue_buffers_init(&bufs[3], dst, DSP_BUFFER_TYPE_DSP_WRITE_CPU_READ);
+    // Buffer 3 (dst): Output Activations.
+    // DSP writes, CPU reads.
+    // We flush CPU caches to ensure consistency before DSP writes.
+    // DSP cache maintenance is handled in the response message.
+    n_bufs += dspqueue_buffers_init(&bufs[3], dst, DSP_BUFFER_TYPE_DSP_WRITE_CPU_READ);
 
-        return n_bufs;
-    };
-
-    ggml_hexagon_op_generic<_IsSrc0Constant, init_func>(op, flags);
+    return n_bufs;
 }
 
 static void ggml_hexagon_unary(const struct ggml_tensor * op, uint32_t flags) {
@@ -2775,20 +2771,20 @@ static ggml_status ggml_backend_hexagon_graph_compute(ggml_backend_t backend, gg
 
         switch (node->op) {
             case GGML_OP_MUL_MAT:
-                ggml_hexagon_binary<true>(node, flags);
+                ggml_hexagon_op_generic<true, init_binary_req_and_bufs<true>>(node, flags);
                 prev_quant_op = node;
                 break;
             case GGML_OP_MUL_MAT_ID:
-                ggml_hexagon_binary_id<true>(node, flags);
+                ggml_hexagon_op_generic<true, init_binary_id_req_and_bufs<true>>(node, flags);
                 prev_quant_op = node;
                 break;
             case GGML_OP_MUL:
            case GGML_OP_ADD:
            case GGML_OP_SUB:
-                ggml_hexagon_binary<false>(node, flags);
+                ggml_hexagon_op_generic<false, init_binary_req_and_bufs<false>>(node, flags);
                 break;
            case GGML_OP_ADD_ID:
-                ggml_hexagon_binary_id<false>(node, flags);
+                ggml_hexagon_op_generic<false, init_binary_id_req_and_bufs<false>>(node, flags);
                 break;
             case GGML_OP_RMS_NORM:
                 ggml_hexagon_unary(node, flags);