From afaeb543bc36a1c2c64948d5ccc95b08bc06ef07 Mon Sep 17 00:00:00 2001 From: chraac Date: Wed, 17 Dec 2025 17:24:50 +0800 Subject: [PATCH 01/10] refactoring: enhance memory management with tracking buffer allocation --- ggml/src/ggml-hexagon/ggml-hexagon.cpp | 39 +++++++++++++++++++++----- 1 file changed, 32 insertions(+), 7 deletions(-) diff --git a/ggml/src/ggml-hexagon/ggml-hexagon.cpp b/ggml/src/ggml-hexagon/ggml-hexagon.cpp index 13b96d61f8..b909487a2b 100644 --- a/ggml/src/ggml-hexagon/ggml-hexagon.cpp +++ b/ggml/src/ggml-hexagon/ggml-hexagon.cpp @@ -50,6 +50,8 @@ static int opt_profile = 0; static int opt_hostbuf = 1; static int opt_experimental = 0; +static const size_t kMaxMemPerSessInBytes = 2ULL * 1024 * 1024 * 1024; // 2GB + // Enable all stages by default static int opt_opmask = HTP_OPMASK_QUEUE | HTP_OPMASK_QUANTIZE | HTP_OPMASK_COMPUTE; static int opt_opsync = 0; // synchronous ops @@ -140,6 +142,7 @@ struct ggml_hexagon_session { uint32_t prof_usecs; uint32_t prof_cycles; uint32_t prof_pkts; + uint64_t avail_mem_bytes = kMaxMemPerSessInBytes; // available memory for allocations }; void ggml_hexagon_session::enqueue(struct htp_general_req &req, struct dspqueue_buffer *bufs, uint32_t n_bufs, bool sync) { @@ -267,13 +270,15 @@ struct ggml_backend_hexagon_buffer_context { } ggml_backend_hexagon_buffer_context(ggml_hexagon_session * sess, size_t size, bool repack) { - size += 4 * 1024; // extra page for padding + size = get_padded_buffer_size(size); // add padding for alignment if (rpcmem_alloc2) { - this->base = (uint8_t *) rpcmem_alloc2(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS | RPCMEM_HEAP_NOREG, size); + this->base = + (uint8_t *) rpcmem_alloc2(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS | RPCMEM_HEAP_NOREG, size); } else { GGML_LOG_INFO("ggml-hex: %s rpcmem_alloc2 not found, falling back to rpcmem_alloc\n", sess->name.c_str()); - this->base = (uint8_t *) rpcmem_alloc(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS | RPCMEM_HEAP_NOREG, size); + 
this->base = + (uint8_t *) rpcmem_alloc(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS | RPCMEM_HEAP_NOREG, size); } if (!this->base) { @@ -296,6 +301,7 @@ struct ggml_backend_hexagon_buffer_context { this->size = size; this->mapped = false; this->repack = repack; + sess->avail_mem_bytes -= size; } ~ggml_backend_hexagon_buffer_context() { @@ -304,8 +310,11 @@ struct ggml_backend_hexagon_buffer_context { rpcmem_free(this->base); this->base = NULL; } + this->sess->avail_mem_bytes += this->size; } + static size_t get_padded_buffer_size(size_t size) { return size + 4 * 1024; } + ggml_hexagon_session * sess; // primary session uint8_t * base; size_t size; @@ -1479,8 +1488,15 @@ static const char * ggml_backend_hexagon_buffer_type_name(ggml_backend_buffer_ty static ggml_backend_buffer_t ggml_backend_hexagon_buffer_type_alloc_buffer( ggml_backend_buffer_type_t buffer_type, size_t size) { auto sess = static_cast(buffer_type->context)->sess; + if (sess->avail_mem_bytes < ggml_backend_hexagon_buffer_context::get_padded_buffer_size(size)) { + GGML_LOG_INFO("ggml-hex: %s insufficient memory to allocate buffer of size %zu bytes (available %zu bytes)\n", + sess->name.c_str(), size, sess->avail_mem_bytes); + return nullptr; + } + try { - ggml_backend_hexagon_buffer_context * ctx = new ggml_backend_hexagon_buffer_context(sess, size, false /*repack*/); + ggml_backend_hexagon_buffer_context * ctx = + new ggml_backend_hexagon_buffer_context(sess, size, false /*repack*/); return ggml_backend_buffer_init(buffer_type, ggml_backend_hexagon_buffer_interface, ctx, size); } catch (const std::exception & exc) { GGML_LOG_ERROR("ggml-hex: %s failed to allocate buffer context: %s\n", sess->name.c_str(), exc.what()); @@ -1489,8 +1505,15 @@ static ggml_backend_buffer_t ggml_backend_hexagon_buffer_type_alloc_buffer( } static ggml_backend_buffer_t ggml_backend_hexagon_repack_buffer_type_alloc_buffer( - ggml_backend_buffer_type_t buffer_type, size_t size) { + ggml_backend_buffer_type_t buffer_type, + 
size_t size) { auto sess = static_cast(buffer_type->context)->sess; + if (sess->avail_mem_bytes < ggml_backend_hexagon_buffer_context::get_padded_buffer_size(size)) { + GGML_LOG_INFO("ggml-hex: %s insufficient memory to allocate repack buffer of size %zu bytes (available %zu bytes)\n", + sess->name.c_str(), size, sess->avail_mem_bytes); + return nullptr; + } + try { ggml_backend_hexagon_buffer_context * ctx = new ggml_backend_hexagon_buffer_context(sess, size, true /*repack*/); return ggml_backend_buffer_init(buffer_type, ggml_backend_hexagon_buffer_interface, ctx, size); @@ -2681,9 +2704,11 @@ static const char * ggml_backend_hexagon_device_get_description(ggml_backend_dev } static void ggml_backend_hexagon_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) { + auto sess = static_cast(dev->context); + // ~2GB per session for now - *free = 2ULL * 1024 * 1024 * 1024; - *total = *free; + *free = sess->avail_mem_bytes; + *total = kMaxMemPerSessInBytes; GGML_UNUSED(dev); } From 7ef467ce20876c98610a4dab52709163587c073e Mon Sep 17 00:00:00 2001 From: chraac Date: Fri, 19 Dec 2025 11:15:35 +0800 Subject: [PATCH 02/10] wip --- ggml/src/ggml-hexagon/htp/matmul-ops.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ggml/src/ggml-hexagon/htp/matmul-ops.c b/ggml/src/ggml-hexagon/htp/matmul-ops.c index f14523d485..312dfc40be 100644 --- a/ggml/src/ggml-hexagon/htp/matmul-ops.c +++ b/ggml/src/ggml-hexagon/htp/matmul-ops.c @@ -924,8 +924,8 @@ static void vec_dot_f16_f32(const int n, float * restrict s, const void * restri const HVX_UVector * restrict vx = (const HVX_UVector * restrict) x; const HVX_UVectorPair * restrict vy = (const HVX_UVectorPair * restrict) y; - uint32_t nv0 = n / 64; // num full fp16 hvx vectors - uint32_t nv1 = n % 64; // leftover elements + uint32_t nv0 = n / VLEN_FP16; // num full fp16 hvx vectors + uint32_t nv1 = n % VLEN_FP16; // leftover elements // for some reason we need volatile here so that the compiler 
doesn't try anything funky volatile HVX_Vector rsum = Q6_V_vsplat_R(0); From e0b1435b506307c52c3710075f307cf763eeadcb Mon Sep 17 00:00:00 2001 From: chraac Date: Fri, 19 Dec 2025 20:55:49 +0800 Subject: [PATCH 03/10] refactoring: improve code formatting and alignment in matmul operations --- ggml/src/ggml-hexagon/htp/matmul-ops.c | 98 ++++++++++---------------- 1 file changed, 39 insertions(+), 59 deletions(-) diff --git a/ggml/src/ggml-hexagon/htp/matmul-ops.c b/ggml/src/ggml-hexagon/htp/matmul-ops.c index 312dfc40be..a88e4ad977 100644 --- a/ggml/src/ggml-hexagon/htp/matmul-ops.c +++ b/ggml/src/ggml-hexagon/htp/matmul-ops.c @@ -485,8 +485,8 @@ static void vec_dot_q4x4x2_q8x4x2_rx2(const int n, } // Convert into fp32 and reduce - r0_sum = hvx_vec_fp32_reduce_sum(Q6_Vsf_equals_Vqf32(r0_sum)); - r1_sum = hvx_vec_fp32_reduce_sum(Q6_Vsf_equals_Vqf32(r1_sum)); + r0_sum = hvx_vec_fp32_reduce_sum(Q6_Vsf_equals_Vqf32(r0_sum)); + r1_sum = hvx_vec_fp32_reduce_sum(Q6_Vsf_equals_Vqf32(r1_sum)); HVX_VectorPair p0 = Q6_W_vshuff_VVR(r1_sum, r0_sum, 4); hvx_vec_store_u(&s[0], 8, Q6_V_lo_W(p0)); @@ -658,8 +658,8 @@ static void vec_dot_q8x4x2_q8x4x2_rx2(const int n, } // Convert into fp32 and reduce - r0_sum = hvx_vec_fp32_reduce_sum(Q6_Vsf_equals_Vqf32(r0_sum)); - r1_sum = hvx_vec_fp32_reduce_sum(Q6_Vsf_equals_Vqf32(r1_sum)); + r0_sum = hvx_vec_fp32_reduce_sum(Q6_Vsf_equals_Vqf32(r0_sum)); + r1_sum = hvx_vec_fp32_reduce_sum(Q6_Vsf_equals_Vqf32(r1_sum)); HVX_VectorPair p0 = Q6_W_vshuff_VVR(r1_sum, r0_sum, 4); hvx_vec_store_u(&s[0], 8, Q6_V_lo_W(p0)); @@ -900,8 +900,8 @@ static void vec_dot_mxfp4x4x2_q8x4x2_rx2(const int n, } // Convert into fp32 and reduce - r0_sum = hvx_vec_fp32_reduce_sum(Q6_Vsf_equals_Vqf32(r0_sum)); - r1_sum = hvx_vec_fp32_reduce_sum(Q6_Vsf_equals_Vqf32(r1_sum)); + r0_sum = hvx_vec_fp32_reduce_sum(Q6_Vsf_equals_Vqf32(r0_sum)); + r1_sum = hvx_vec_fp32_reduce_sum(Q6_Vsf_equals_Vqf32(r1_sum)); HVX_VectorPair p0 = Q6_W_vshuff_VVR(r1_sum, r0_sum, 4); 
hvx_vec_store_u(&s[0], 8, Q6_V_lo_W(p0)); @@ -909,18 +909,6 @@ static void vec_dot_mxfp4x4x2_q8x4x2_rx2(const int n, #if 1 static void vec_dot_f16_f32(const int n, float * restrict s, const void * restrict x, const void * restrict y) { - if (0) { - float rsum = 0; - const __fp16 * restrict vx = (const __fp16 * restrict) x; - const float * restrict vy = (const float * restrict) y; - - for (uint32_t i = 0; i < n; i++) { - rsum += (float)vx[i] * vy[i]; - } - *s = rsum; - return; - } - const HVX_UVector * restrict vx = (const HVX_UVector * restrict) x; const HVX_UVectorPair * restrict vy = (const HVX_UVectorPair * restrict) y; @@ -929,12 +917,10 @@ static void vec_dot_f16_f32(const int n, float * restrict s, const void * restri // for some reason we need volatile here so that the compiler doesn't try anything funky volatile HVX_Vector rsum = Q6_V_vsplat_R(0); - float r_sum_scalar = 0.0f; - uint32_t i = 0; + uint32_t i = 0; for (i = 0; i < nv0; i++) { HVX_VectorPair yp = vy[i]; - HVX_Vector x = vx[i]; HVX_VectorPair xp = Q6_Wqf32_vmpy_VhfVhf(Q6_Vh_vshuff_Vh(x), Q6_Vh_vsplat_R(0x3C00)); // mul by 1.0 @@ -948,43 +934,37 @@ static void vec_dot_f16_f32(const int n, float * restrict s, const void * restri } if (nv1) { - // HVX_VectorPair yp = vy[i]; + HVX_VectorPair yp = vy[i]; + HVX_Vector x = vx[i]; + HVX_VectorPair xp = Q6_Wqf32_vmpy_VhfVhf(Q6_Vh_vshuff_Vh(x), Q6_Vh_vsplat_R(0x3C00)); // mul by 1.0 - // HVX_Vector x = vx[i]; - // HVX_VectorPair xp = Q6_Wqf32_vmpy_VhfVhf(Q6_Vh_vshuff_Vh(x), Q6_Vh_vsplat_R(0x3C00)); // mul by 1.0 + HVX_Vector l_x; + HVX_Vector l_y; + if (nv1 >= 32) { + volatile HVX_Vector hi = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(Q6_V_lo_W(xp)), Q6_V_lo_W(yp)); + rsum = Q6_Vqf32_vadd_Vqf32Vqf32(rsum, hi); + nv1 -= 32; + l_x = Q6_V_hi_W(xp); + l_y = Q6_V_hi_W(yp); + } else { + l_x = Q6_V_lo_W(xp); + l_y = Q6_V_lo_W(yp); + } - // if (nv1 >= 32) { - // volatile HVX_Vector hi = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(Q6_V_hi_W(xp)), Q6_V_hi_W(yp)); - // 
rsum = Q6_Vqf32_vadd_Vqf32Vqf32(rsum, hi); - // nv1 -= 32; - // } - - // rsum = hvx_vec_qf32_reduce_sum(rsum); - - // if (nv1) { - // volatile HVX_Vector lo = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(Q6_V_lo_W(xp)), Q6_V_lo_W(yp)); - // HVX_Vector sum = hvx_vec_qf32_reduce_sum_n(lo, nv1); - // rsum = Q6_Vqf32_vadd_Vqf32Vqf32(rsum, sum); - // } - - //process the remainder using scalar loop - rsum = hvx_vec_qf32_reduce_sum(rsum); - const __fp16 * restrict sx = (const __fp16 * restrict) x; - const float * restrict sy = (const float * restrict) y; - - for (uint32_t i = nv0 * 64; i < n; i++) { - r_sum_scalar += (float) sx[i] * sy[i]; + if (nv1) { + volatile HVX_Vector lo = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(l_x), l_y); + HVX_Vector sum = Q6_V_valign_VVR(lo, Q6_V_vzero(), nv1 * sizeof(float)); + rsum = Q6_Vqf32_vadd_Vqf32Vqf32(rsum, sum); } // hvx_vec_dump_fp16("X", x); // hvx_vec_dump_fp16("Y", y); // hvx_vec_dump_fp32("SUM", Q6_Vsf_equals_Vqf32(sum)); // hvx_vec_dump_fp32("RSUM", Q6_Vsf_equals_Vqf32(rsum)); - } else { - rsum = hvx_vec_qf32_reduce_sum(rsum); } - *s = hvx_vec_get_fp32(Q6_Vsf_equals_Vqf32(rsum)) + r_sum_scalar; + rsum = hvx_vec_qf32_reduce_sum(rsum); + *s = hvx_vec_get_fp32(Q6_Vsf_equals_Vqf32(rsum)); # ifdef HTP_DEBUG { @@ -1120,8 +1100,8 @@ static void matmul(struct htp_matmul_type * mt, const uint8_t * restrict src0_row = (const uint8_t *) src0->data; - // Prefill spad with src0 rows - #pragma unroll(4) +// Prefill spad with src0 rows +#pragma unroll(4) for (uint32_t ir0 = src0_start_row; ir0 < src0_end_row_x2; ir0 += 2) { const int is0 = (ir0 - src0_start_row); if (is0 >= HTP_SPAD_SRC0_NROWS) { @@ -1135,7 +1115,7 @@ static void matmul(struct htp_matmul_type * mt, for (uint32_t ir0 = src0_start_row; ir0 < src0_end_row_x2; ir0 += 2) { const uint8_t * ss0 = dma_queue_pop(dma_queue).dst; - #pragma unroll(2) +#pragma unroll(2) for (uint32_t ir1 = 0; ir1 < src1_nrows; ++ir1) { const uint8_t * restrict src1_col = (const uint8_t *) (src1_data + ir1 * 
src1_row_size); float * restrict dst_row = (float *) (dst->data + (ir1 * dst_row_size)); @@ -1159,7 +1139,7 @@ static void matmul(struct htp_matmul_type * mt, src0_row_size_padded, src0_row_size, 1); const uint8_t * ss0 = dma_queue_pop(dma_queue).dst; - #pragma unroll(2) +#pragma unroll(2) for (uint32_t ir1 = 0; ir1 < src1_nrows; ++ir1) { const uint8_t * restrict src1_col = (const uint8_t *) (src1_data + ir1 * src1_row_size); float * restrict dst_row = (float *) (dst->data + (ir1 * dst_row_size)); @@ -1222,8 +1202,8 @@ static void matvec(struct htp_matmul_type * mt, const uint8_t * restrict src1_col = (const uint8_t *) src1_data; float * restrict dst_col = (float *) dst->data; - // Prefill spad with 2x src0 rows - #pragma unroll(2) +// Prefill spad with 2x src0 rows +#pragma unroll(2) for (uint32_t ir0 = src0_start_row; ir0 < src0_end_row_x2; ir0 += 2) { const uint32_t is0 = (ir0 - src0_start_row); if (is0 >= HTP_SPAD_SRC0_NROWS) { @@ -1336,8 +1316,8 @@ static void matmul_id(struct htp_matmul_type * mt, const uint8_t * src0_row = (const uint8_t *) src0->data + (0 + cur_a * nb02 + 0); - // Prefill spad with src0 rows - #pragma unroll(4) +// Prefill spad with src0 rows +#pragma unroll(4) for (uint32_t ir0 = src0_start_row; ir0 < src0_end_row_x2; ir0 += 2) { const int is0 = (ir0 - src0_start_row); if (is0 >= HTP_SPAD_SRC0_NROWS) { @@ -1460,8 +1440,8 @@ static void matvec_id(struct htp_matmul_type * mt, const uint8_t * restrict src1_col = (const uint8_t *) src1_data; float * restrict dst_row = (float *) (dst->data + ie1 * nb1); - // Prefill spad with src0 rows - #pragma unroll(4) +// Prefill spad with src0 rows +#pragma unroll(4) for (uint32_t ir0 = src0_start_row; ir0 < src0_end_row_x2; ir0 += 2) { const int is0 = (ir0 - src0_start_row); if (is0 >= HTP_SPAD_SRC0_NROWS) { @@ -2347,7 +2327,7 @@ int op_matmul_id(struct htp_ops_context * octx) { assert(i02 >= 0 && i02 < n_as); - MMID_MATRIX_ROW(i02, matrix_row_counts[i02]) = (struct mmid_row_mapping) { id, iid1 }; + 
MMID_MATRIX_ROW(i02, matrix_row_counts[i02]) = (struct mmid_row_mapping){ id, iid1 }; matrix_row_counts[i02] += 1; } } From 398aa853116e3959a8406a3fc63831b3b473fd28 Mon Sep 17 00:00:00 2001 From: chraac Date: Fri, 19 Dec 2025 21:04:52 +0800 Subject: [PATCH 04/10] wip --- ggml/src/ggml-hexagon/htp/matmul-ops.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/ggml/src/ggml-hexagon/htp/matmul-ops.c b/ggml/src/ggml-hexagon/htp/matmul-ops.c index a88e4ad977..e2b40da18f 100644 --- a/ggml/src/ggml-hexagon/htp/matmul-ops.c +++ b/ggml/src/ggml-hexagon/htp/matmul-ops.c @@ -916,13 +916,14 @@ static void vec_dot_f16_f32(const int n, float * restrict s, const void * restri uint32_t nv1 = n % VLEN_FP16; // leftover elements // for some reason we need volatile here so that the compiler doesn't try anything funky + const HVX_Vector zero = Q6_Vh_vsplat_R(0x3C00); // 1.0 in fp16 volatile HVX_Vector rsum = Q6_V_vsplat_R(0); uint32_t i = 0; for (i = 0; i < nv0; i++) { HVX_VectorPair yp = vy[i]; HVX_Vector x = vx[i]; - HVX_VectorPair xp = Q6_Wqf32_vmpy_VhfVhf(Q6_Vh_vshuff_Vh(x), Q6_Vh_vsplat_R(0x3C00)); // mul by 1.0 + HVX_VectorPair xp = Q6_Wqf32_vmpy_VhfVhf(Q6_Vh_vshuff_Vh(x), zero); // mul by 1.0 //NOTE: need volatile here to prevent compiler optimization // Seem compiler cannot guarantee read-after-write?? 
@@ -936,7 +937,7 @@ static void vec_dot_f16_f32(const int n, float * restrict s, const void * restri if (nv1) { HVX_VectorPair yp = vy[i]; HVX_Vector x = vx[i]; - HVX_VectorPair xp = Q6_Wqf32_vmpy_VhfVhf(Q6_Vh_vshuff_Vh(x), Q6_Vh_vsplat_R(0x3C00)); // mul by 1.0 + HVX_VectorPair xp = Q6_Wqf32_vmpy_VhfVhf(Q6_Vh_vshuff_Vh(x), zero); // mul by 1.0 HVX_Vector l_x; HVX_Vector l_y; From 500c627fbc565ef25ce1c49b63293a9a11cae0ec Mon Sep 17 00:00:00 2001 From: chraac Date: Fri, 19 Dec 2025 21:26:40 +0800 Subject: [PATCH 05/10] wip --- ggml/src/ggml-hexagon/htp/matmul-ops.c | 27 ++++++++++++-------------- 1 file changed, 12 insertions(+), 15 deletions(-) diff --git a/ggml/src/ggml-hexagon/htp/matmul-ops.c b/ggml/src/ggml-hexagon/htp/matmul-ops.c index e2b40da18f..1eeee823c3 100644 --- a/ggml/src/ggml-hexagon/htp/matmul-ops.c +++ b/ggml/src/ggml-hexagon/htp/matmul-ops.c @@ -912,23 +912,20 @@ static void vec_dot_f16_f32(const int n, float * restrict s, const void * restri const HVX_UVector * restrict vx = (const HVX_UVector * restrict) x; const HVX_UVectorPair * restrict vy = (const HVX_UVectorPair * restrict) y; - uint32_t nv0 = n / VLEN_FP16; // num full fp16 hvx vectors - uint32_t nv1 = n % VLEN_FP16; // leftover elements + uint32_t nv0 = n / VLEN_FP16; // num full fp16 hvx vectors + uint32_t nv1 = n % VLEN_FP16; // leftover elements - // for some reason we need volatile here so that the compiler doesn't try anything funky - const HVX_Vector zero = Q6_Vh_vsplat_R(0x3C00); // 1.0 in fp16 - volatile HVX_Vector rsum = Q6_V_vsplat_R(0); - uint32_t i = 0; + const HVX_Vector zero = Q6_Vh_vsplat_R(0x3C00); // 1.0 in fp16 + HVX_Vector rsum = Q6_V_vsplat_R(0); + uint32_t i = 0; for (i = 0; i < nv0; i++) { HVX_VectorPair yp = vy[i]; HVX_Vector x = vx[i]; HVX_VectorPair xp = Q6_Wqf32_vmpy_VhfVhf(Q6_Vh_vshuff_Vh(x), zero); // mul by 1.0 - //NOTE: need volatile here to prevent compiler optimization - // Seem compiler cannot guarantee read-after-write?? 
- volatile HVX_Vector hi = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(Q6_V_hi_W(xp)), Q6_V_hi_W(yp)); - volatile HVX_Vector lo = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(Q6_V_lo_W(xp)), Q6_V_lo_W(yp)); + HVX_Vector hi = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(Q6_V_hi_W(xp)), Q6_V_hi_W(yp)); + HVX_Vector lo = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(Q6_V_lo_W(xp)), Q6_V_lo_W(yp)); HVX_Vector sum = Q6_Vqf32_vadd_Vqf32Vqf32(hi, lo); rsum = Q6_Vqf32_vadd_Vqf32Vqf32(rsum, sum); @@ -942,8 +939,8 @@ static void vec_dot_f16_f32(const int n, float * restrict s, const void * restri HVX_Vector l_x; HVX_Vector l_y; if (nv1 >= 32) { - volatile HVX_Vector hi = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(Q6_V_lo_W(xp)), Q6_V_lo_W(yp)); - rsum = Q6_Vqf32_vadd_Vqf32Vqf32(rsum, hi); + HVX_Vector hi = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(Q6_V_lo_W(xp)), Q6_V_lo_W(yp)); + rsum = Q6_Vqf32_vadd_Vqf32Vqf32(rsum, hi); nv1 -= 32; l_x = Q6_V_hi_W(xp); l_y = Q6_V_hi_W(yp); @@ -953,9 +950,9 @@ static void vec_dot_f16_f32(const int n, float * restrict s, const void * restri } if (nv1) { - volatile HVX_Vector lo = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(l_x), l_y); - HVX_Vector sum = Q6_V_valign_VVR(lo, Q6_V_vzero(), nv1 * sizeof(float)); - rsum = Q6_Vqf32_vadd_Vqf32Vqf32(rsum, sum); + HVX_Vector lo = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(l_x), l_y); + HVX_Vector sum = Q6_V_valign_VVR(lo, Q6_V_vzero(), nv1 * sizeof(float)); + rsum = Q6_Vqf32_vadd_Vqf32Vqf32(rsum, sum); } // hvx_vec_dump_fp16("X", x); From 29171361b5ff11beba4a79b48ac1864b7e68db41 Mon Sep 17 00:00:00 2001 From: chraac Date: Sat, 20 Dec 2025 21:59:32 +0800 Subject: [PATCH 06/10] wip --- ggml/src/ggml-hexagon/htp/matmul-ops.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ggml/src/ggml-hexagon/htp/matmul-ops.c b/ggml/src/ggml-hexagon/htp/matmul-ops.c index 1eeee823c3..6e2d46b112 100644 --- a/ggml/src/ggml-hexagon/htp/matmul-ops.c +++ b/ggml/src/ggml-hexagon/htp/matmul-ops.c @@ -938,10 +938,10 @@ static 
void vec_dot_f16_f32(const int n, float * restrict s, const void * restri HVX_Vector l_x; HVX_Vector l_y; - if (nv1 >= 32) { + if (nv1 >= VLEN_FP32) { HVX_Vector hi = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(Q6_V_lo_W(xp)), Q6_V_lo_W(yp)); rsum = Q6_Vqf32_vadd_Vqf32Vqf32(rsum, hi); - nv1 -= 32; + nv1 -= VLEN_FP32; l_x = Q6_V_hi_W(xp); l_y = Q6_V_hi_W(yp); } else { From ea450209d05acc72f1adc61b4478803f234bfed2 Mon Sep 17 00:00:00 2001 From: chraac Date: Sat, 20 Dec 2025 23:06:02 +0800 Subject: [PATCH 07/10] opt: use qf32 internal precision for vec_dot_f16_f32 --- ggml/src/ggml-hexagon/htp/matmul-ops.c | 38 ++++++++++++++------------ 1 file changed, 21 insertions(+), 17 deletions(-) diff --git a/ggml/src/ggml-hexagon/htp/matmul-ops.c b/ggml/src/ggml-hexagon/htp/matmul-ops.c index 6e2d46b112..bf0edd7fa9 100644 --- a/ggml/src/ggml-hexagon/htp/matmul-ops.c +++ b/ggml/src/ggml-hexagon/htp/matmul-ops.c @@ -915,17 +915,20 @@ static void vec_dot_f16_f32(const int n, float * restrict s, const void * restri uint32_t nv0 = n / VLEN_FP16; // num full fp16 hvx vectors uint32_t nv1 = n % VLEN_FP16; // leftover elements - const HVX_Vector zero = Q6_Vh_vsplat_R(0x3C00); // 1.0 in fp16 + const HVX_Vector one = Q6_Vh_vsplat_R(0x3C00); // 1.0 in fp16 + const HVX_Vector zero = Q6_V_vsplat_R(0); // 0.0 in fp16 HVX_Vector rsum = Q6_V_vsplat_R(0); uint32_t i = 0; for (i = 0; i < nv0; i++) { - HVX_VectorPair yp = vy[i]; - HVX_Vector x = vx[i]; - HVX_VectorPair xp = Q6_Wqf32_vmpy_VhfVhf(Q6_Vh_vshuff_Vh(x), zero); // mul by 1.0 + HVX_VectorPair yp = vy[i]; + HVX_Vector x = vx[i]; + HVX_VectorPair xp = Q6_Wqf32_vmpy_VhfVhf(Q6_Vh_vshuff_Vh(x), one); // mul by 1.0 + HVX_Vector y_hi = Q6_Vqf32_vadd_VsfVsf(Q6_V_hi_W(yp), zero); // convert to qf32 + HVX_Vector y_lo = Q6_Vqf32_vadd_VsfVsf(Q6_V_lo_W(yp), zero); // convert to qf32 - HVX_Vector hi = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(Q6_V_hi_W(xp)), Q6_V_hi_W(yp)); - HVX_Vector lo = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(Q6_V_lo_W(xp)), 
Q6_V_lo_W(yp)); + HVX_Vector hi = Q6_Vqf32_vmpy_Vqf32Vqf32(Q6_V_hi_W(xp), y_hi); + HVX_Vector lo = Q6_Vqf32_vmpy_Vqf32Vqf32(Q6_V_lo_W(xp), y_lo); HVX_Vector sum = Q6_Vqf32_vadd_Vqf32Vqf32(hi, lo); rsum = Q6_Vqf32_vadd_Vqf32Vqf32(rsum, sum); @@ -934,24 +937,25 @@ static void vec_dot_f16_f32(const int n, float * restrict s, const void * restri if (nv1) { HVX_VectorPair yp = vy[i]; HVX_Vector x = vx[i]; - HVX_VectorPair xp = Q6_Wqf32_vmpy_VhfVhf(Q6_Vh_vshuff_Vh(x), zero); // mul by 1.0 + HVX_VectorPair xp = Q6_Wqf32_vmpy_VhfVhf(Q6_Vh_vshuff_Vh(x), one); // mul by 1.0 - HVX_Vector l_x; - HVX_Vector l_y; + HVX_Vector leftover_x; + HVX_Vector leftover_y; if (nv1 >= VLEN_FP32) { - HVX_Vector hi = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(Q6_V_lo_W(xp)), Q6_V_lo_W(yp)); - rsum = Q6_Vqf32_vadd_Vqf32Vqf32(rsum, hi); + HVX_Vector y_lo = Q6_Vqf32_vadd_VsfVsf(Q6_V_lo_W(yp), zero); // convert to qf32 + HVX_Vector lo = Q6_Vqf32_vmpy_Vqf32Vqf32(Q6_V_lo_W(xp), y_lo); + rsum = Q6_Vqf32_vadd_Vqf32Vqf32(rsum, lo); nv1 -= VLEN_FP32; - l_x = Q6_V_hi_W(xp); - l_y = Q6_V_hi_W(yp); + leftover_x = Q6_V_hi_W(xp); + leftover_y = Q6_Vqf32_vadd_VsfVsf(Q6_V_hi_W(yp), zero); // convert to qf32 } else { - l_x = Q6_V_lo_W(xp); - l_y = Q6_V_lo_W(yp); + leftover_x = Q6_V_lo_W(xp); + leftover_y = Q6_Vqf32_vadd_VsfVsf(Q6_V_lo_W(yp), zero); // convert to qf32 } if (nv1) { - HVX_Vector lo = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(l_x), l_y); - HVX_Vector sum = Q6_V_valign_VVR(lo, Q6_V_vzero(), nv1 * sizeof(float)); + HVX_Vector lo = Q6_Vqf32_vmpy_Vqf32Vqf32(leftover_x, leftover_y); + HVX_Vector sum = Q6_V_valign_VVR(lo, zero, nv1 * sizeof(float)); rsum = Q6_Vqf32_vadd_Vqf32Vqf32(rsum, sum); } From cb0a8ff4e7dab37c118379543ee597c14452febc Mon Sep 17 00:00:00 2001 From: chraac Date: Mon, 22 Dec 2025 01:26:14 +0800 Subject: [PATCH 08/10] add unroll marker --- ggml/src/ggml-hexagon/htp/matmul-ops.c | 1 + 1 file changed, 1 insertion(+) diff --git a/ggml/src/ggml-hexagon/htp/matmul-ops.c 
b/ggml/src/ggml-hexagon/htp/matmul-ops.c index bf0edd7fa9..5d7ec4d88d 100644 --- a/ggml/src/ggml-hexagon/htp/matmul-ops.c +++ b/ggml/src/ggml-hexagon/htp/matmul-ops.c @@ -920,6 +920,7 @@ static void vec_dot_f16_f32(const int n, float * restrict s, const void * restri HVX_Vector rsum = Q6_V_vsplat_R(0); uint32_t i = 0; +#pragma unroll(2) for (i = 0; i < nv0; i++) { HVX_VectorPair yp = vy[i]; HVX_Vector x = vx[i]; From 2058f28b3ea1be8f9ec836f490e49b841f09ec08 Mon Sep 17 00:00:00 2001 From: chraac Date: Mon, 22 Dec 2025 12:35:46 +0800 Subject: [PATCH 09/10] Revert "opt: use qf32 internal precision for vec_dot_f16_f32" This reverts commit 8600ecd20d6c902fe16271d6af1e59504eff4a27 (pre-rebase hash; in this series the reverted change is PATCH 07/10, commit ea450209d05acc72f1adc61b4478803f234bfed2). --- ggml/src/ggml-hexagon/htp/matmul-ops.c | 38 ++++++++++++-------------- 1 file changed, 17 insertions(+), 21 deletions(-) diff --git a/ggml/src/ggml-hexagon/htp/matmul-ops.c b/ggml/src/ggml-hexagon/htp/matmul-ops.c index 5d7ec4d88d..04794227ed 100644 --- a/ggml/src/ggml-hexagon/htp/matmul-ops.c +++ b/ggml/src/ggml-hexagon/htp/matmul-ops.c @@ -915,21 +915,18 @@ static void vec_dot_f16_f32(const int n, float * restrict s, const void * restri uint32_t nv0 = n / VLEN_FP16; // num full fp16 hvx vectors uint32_t nv1 = n % VLEN_FP16; // leftover elements - const HVX_Vector one = Q6_Vh_vsplat_R(0x3C00); // 1.0 in fp16 - const HVX_Vector zero = Q6_V_vsplat_R(0); // 0.0 in fp16 + const HVX_Vector zero = Q6_Vh_vsplat_R(0x3C00); // 1.0 in fp16 HVX_Vector rsum = Q6_V_vsplat_R(0); uint32_t i = 0; #pragma unroll(2) for (i = 0; i < nv0; i++) { - HVX_VectorPair yp = vy[i]; - HVX_Vector x = vx[i]; - HVX_VectorPair xp = Q6_Wqf32_vmpy_VhfVhf(Q6_Vh_vshuff_Vh(x), one); // mul by 1.0 - HVX_Vector y_hi = Q6_Vqf32_vadd_VsfVsf(Q6_V_hi_W(yp), zero); // convert to qf32 - HVX_Vector y_lo = Q6_Vqf32_vadd_VsfVsf(Q6_V_lo_W(yp), zero); // convert to qf32 + HVX_VectorPair yp = vy[i]; + HVX_Vector x = vx[i]; + HVX_VectorPair xp = Q6_Wqf32_vmpy_VhfVhf(Q6_Vh_vshuff_Vh(x), zero); // mul by 1.0 - HVX_Vector hi =
Q6_Vqf32_vmpy_Vqf32Vqf32(Q6_V_hi_W(xp), y_hi); - HVX_Vector lo = Q6_Vqf32_vmpy_Vqf32Vqf32(Q6_V_lo_W(xp), y_lo); + HVX_Vector hi = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(Q6_V_hi_W(xp)), Q6_V_hi_W(yp)); + HVX_Vector lo = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(Q6_V_lo_W(xp)), Q6_V_lo_W(yp)); HVX_Vector sum = Q6_Vqf32_vadd_Vqf32Vqf32(hi, lo); rsum = Q6_Vqf32_vadd_Vqf32Vqf32(rsum, sum); @@ -938,25 +935,24 @@ static void vec_dot_f16_f32(const int n, float * restrict s, const void * restri if (nv1) { HVX_VectorPair yp = vy[i]; HVX_Vector x = vx[i]; - HVX_VectorPair xp = Q6_Wqf32_vmpy_VhfVhf(Q6_Vh_vshuff_Vh(x), one); // mul by 1.0 + HVX_VectorPair xp = Q6_Wqf32_vmpy_VhfVhf(Q6_Vh_vshuff_Vh(x), zero); // mul by 1.0 - HVX_Vector leftover_x; - HVX_Vector leftover_y; + HVX_Vector l_x; + HVX_Vector l_y; if (nv1 >= VLEN_FP32) { - HVX_Vector y_lo = Q6_Vqf32_vadd_VsfVsf(Q6_V_lo_W(yp), zero); // convert to qf32 - HVX_Vector lo = Q6_Vqf32_vmpy_Vqf32Vqf32(Q6_V_lo_W(xp), y_lo); - rsum = Q6_Vqf32_vadd_Vqf32Vqf32(rsum, lo); + HVX_Vector hi = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(Q6_V_lo_W(xp)), Q6_V_lo_W(yp)); + rsum = Q6_Vqf32_vadd_Vqf32Vqf32(rsum, hi); nv1 -= VLEN_FP32; - leftover_x = Q6_V_hi_W(xp); - leftover_y = Q6_Vqf32_vadd_VsfVsf(Q6_V_hi_W(yp), zero); // convert to qf32 + l_x = Q6_V_hi_W(xp); + l_y = Q6_V_hi_W(yp); } else { - leftover_x = Q6_V_lo_W(xp); - leftover_y = Q6_Vqf32_vadd_VsfVsf(Q6_V_lo_W(yp), zero); // convert to qf32 + l_x = Q6_V_lo_W(xp); + l_y = Q6_V_lo_W(yp); } if (nv1) { - HVX_Vector lo = Q6_Vqf32_vmpy_Vqf32Vqf32(leftover_x, leftover_y); - HVX_Vector sum = Q6_V_valign_VVR(lo, zero, nv1 * sizeof(float)); + HVX_Vector lo = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(l_x), l_y); + HVX_Vector sum = Q6_V_valign_VVR(lo, Q6_V_vzero(), nv1 * sizeof(float)); rsum = Q6_Vqf32_vadd_Vqf32Vqf32(rsum, sum); } From 5e3db77f6e9392091c80b21a0ec4b43544162637 Mon Sep 17 00:00:00 2001 From: Hongrui Chen Date: Thu, 25 Dec 2025 13:57:17 +0800 Subject: [PATCH 10/10] wip --- 
ggml/src/ggml-hexagon/htp/matmul-ops.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ggml/src/ggml-hexagon/htp/matmul-ops.c b/ggml/src/ggml-hexagon/htp/matmul-ops.c index 04794227ed..7ad7a75f0d 100644 --- a/ggml/src/ggml-hexagon/htp/matmul-ops.c +++ b/ggml/src/ggml-hexagon/htp/matmul-ops.c @@ -915,7 +915,7 @@ static void vec_dot_f16_f32(const int n, float * restrict s, const void * restri uint32_t nv0 = n / VLEN_FP16; // num full fp16 hvx vectors uint32_t nv1 = n % VLEN_FP16; // leftover elements - const HVX_Vector zero = Q6_Vh_vsplat_R(0x3C00); // 1.0 in fp16 + const HVX_Vector one = Q6_Vh_vsplat_R(0x3C00); // 1.0 in fp16 HVX_Vector rsum = Q6_V_vsplat_R(0); uint32_t i = 0; @@ -923,7 +923,7 @@ static void vec_dot_f16_f32(const int n, float * restrict s, const void * restri for (i = 0; i < nv0; i++) { HVX_VectorPair yp = vy[i]; HVX_Vector x = vx[i]; - HVX_VectorPair xp = Q6_Wqf32_vmpy_VhfVhf(Q6_Vh_vshuff_Vh(x), zero); // mul by 1.0 + HVX_VectorPair xp = Q6_Wqf32_vmpy_VhfVhf(Q6_Vh_vshuff_Vh(x), one); // mul by 1.0 HVX_Vector hi = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(Q6_V_hi_W(xp)), Q6_V_hi_W(yp)); HVX_Vector lo = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(Q6_V_lo_W(xp)), Q6_V_lo_W(yp)); @@ -935,7 +935,7 @@ static void vec_dot_f16_f32(const int n, float * restrict s, const void * restri if (nv1) { HVX_VectorPair yp = vy[i]; HVX_Vector x = vx[i]; - HVX_VectorPair xp = Q6_Wqf32_vmpy_VhfVhf(Q6_Vh_vshuff_Vh(x), zero); // mul by 1.0 + HVX_VectorPair xp = Q6_Wqf32_vmpy_VhfVhf(Q6_Vh_vshuff_Vh(x), one); // mul by 1.0 HVX_Vector l_x; HVX_Vector l_y;