From d058fc43734410686ca1a5cc02ab99b2a47f0b14 Mon Sep 17 00:00:00 2001 From: ngdxzy Date: Tue, 16 Dec 2025 16:14:15 -0500 Subject: [PATCH] feat: add atomic lock to ops context --- ggml/src/ggml-hexagon/htp/htp-ops.h | 2 ++ ggml/src/ggml-hexagon/htp/matmul-ops.c | 5 +++++ 2 files changed, 7 insertions(+) diff --git a/ggml/src/ggml-hexagon/htp/htp-ops.h b/ggml/src/ggml-hexagon/htp/htp-ops.h index e87657436f..070bd6fcaf 100644 --- a/ggml/src/ggml-hexagon/htp/htp-ops.h +++ b/ggml/src/ggml-hexagon/htp/htp-ops.h @@ -50,6 +50,8 @@ struct htp_ops_context { struct fastdiv_values src1_div21; // fastdiv values for ne2 * ne1 uint32_t flags; + + atomic_uint shared_atomic_lock; }; int op_matmul(struct htp_ops_context * octx); diff --git a/ggml/src/ggml-hexagon/htp/matmul-ops.c b/ggml/src/ggml-hexagon/htp/matmul-ops.c index 346f0bd339..beaa4fea14 100644 --- a/ggml/src/ggml-hexagon/htp/matmul-ops.c +++ b/ggml/src/ggml-hexagon/htp/matmul-ops.c @@ -1712,6 +1712,7 @@ static void quantize_fp32_q8x4x2(const struct htp_tensor * src, static void htp_quantize_fp32_q8x4x2(unsigned int n, unsigned int i, void * data) { struct htp_ops_context * octx = data; quantize_fp32_q8x4x2(&octx->src1, octx->src1_spad.data, &octx->src0_spad, n, i, octx->src1_nrows_per_thread); + atomic_fetch_add(&octx->shared_atomic_lock, 1); } // ** matmul callbacks for worker_pool @@ -2027,6 +2028,8 @@ int op_matmul(struct htp_ops_context * octx) { octx->src0_nrows_per_thread = (src0_nrows + octx->n_threads - 1) / octx->n_threads; octx->src0_nrows_per_thread += (octx->src0_nrows_per_thread & 1); // round up to even + atomic_store(&octx->shared_atomic_lock, 0); + if (need_quant) { // Run quant jobs const uint32_t n_quant_jobs = MIN(src1_nrows, octx->n_threads); @@ -2034,6 +2037,8 @@ int op_matmul(struct htp_ops_context * octx) { worker_pool_run_func(octx->ctx->worker_pool, quant_job_func, octx, n_quant_jobs); } + FARF(HIGH, "matmul-%s : quant jobs finished! Atomic lock: %u\n", op_type, atomic_load(&octx->shared_atomic_lock)); + if (!(octx->flags & HTP_OPFLAGS_SKIP_COMPUTE)) { // Run matmul jobs const uint32_t n_matmul_jobs = octx->n_threads;