From d058fc43734410686ca1a5cc02ab99b2a47f0b14 Mon Sep 17 00:00:00 2001
From: ngdxzy <zhenyu_xu@uri.edu>
Date: Tue, 16 Dec 2025 16:14:15 -0500
Subject: [PATCH] feat: add atomic lock to ops context

---
 ggml/src/ggml-hexagon/htp/htp-ops.h    | 2 ++
 ggml/src/ggml-hexagon/htp/matmul-ops.c | 5 +++++
 2 files changed, 7 insertions(+)

diff --git a/ggml/src/ggml-hexagon/htp/htp-ops.h b/ggml/src/ggml-hexagon/htp/htp-ops.h
index e87657436f..070bd6fcaf 100644
--- a/ggml/src/ggml-hexagon/htp/htp-ops.h
+++ b/ggml/src/ggml-hexagon/htp/htp-ops.h
@@ -50,6 +50,8 @@ struct htp_ops_context {
     struct fastdiv_values src1_div21; // fastdiv values for ne2 * ne1
 
     uint32_t flags;
+
+    atomic_uint shared_atomic_lock;
 };
 
 int op_matmul(struct htp_ops_context * octx);
diff --git a/ggml/src/ggml-hexagon/htp/matmul-ops.c b/ggml/src/ggml-hexagon/htp/matmul-ops.c
index 346f0bd339..beaa4fea14 100644
--- a/ggml/src/ggml-hexagon/htp/matmul-ops.c
+++ b/ggml/src/ggml-hexagon/htp/matmul-ops.c
@@ -1712,6 +1712,7 @@ static void quantize_fp32_q8x4x2(const struct htp_tensor * src,
 static void htp_quantize_fp32_q8x4x2(unsigned int n, unsigned int i, void * data) {
     struct htp_ops_context * octx = data;
     quantize_fp32_q8x4x2(&octx->src1, octx->src1_spad.data, &octx->src0_spad, n, i, octx->src1_nrows_per_thread);
+    atomic_fetch_add(&octx->shared_atomic_lock, 1);  // +1 once this call's quantize step has returned; op_matmul zeroes the counter before the quant jobs and logs it after them
 }
 
 // ** matmul callbacks for worker_pool
@@ -2027,6 +2028,8 @@ int op_matmul(struct htp_ops_context * octx) {
     octx->src0_nrows_per_thread = (src0_nrows + octx->n_threads - 1) / octx->n_threads;
     octx->src0_nrows_per_thread += (octx->src0_nrows_per_thread & 1);  // round up to even
 
+    atomic_store(&octx->shared_atomic_lock, 0);
+
     if (need_quant) {
         // Run quant jobs
         const uint32_t n_quant_jobs = MIN(src1_nrows, octx->n_threads);
@@ -2034,6 +2037,8 @@ int op_matmul(struct htp_ops_context * octx) {
         worker_pool_run_func(octx->ctx->worker_pool, quant_job_func, octx, n_quant_jobs);
     }
 
+    FARF(HIGH, "matmul-%s : quant jobs finished! Atomic lock: %u\n", op_type, atomic_load(&octx->shared_atomic_lock));
+
     if (!(octx->flags & HTP_OPFLAGS_SKIP_COMPUTE)) {
         // Run matmul jobs
         const uint32_t n_matmul_jobs = octx->n_threads;