diff --git a/ggml/src/ggml-hexagon/ggml-hexagon.cpp b/ggml/src/ggml-hexagon/ggml-hexagon.cpp index 4b8a16c363..8bcf5291c1 100644 --- a/ggml/src/ggml-hexagon/ggml-hexagon.cpp +++ b/ggml/src/ggml-hexagon/ggml-hexagon.cpp @@ -45,6 +45,7 @@ static int opt_verbose = 0; static int opt_profile = 0; static int opt_hostbuf = 1; // hostbuf ON by default static int opt_experimental = 0; +static int opt_use_hmx = 1; // when set, enable HMX; when 0, use HVX only // Enable all stages by default static int opt_opmask = HTP_OPMASK_QUEUE | HTP_OPMASK_QUANTIZE | HTP_OPMASK_COMPUTE; @@ -1693,7 +1694,7 @@ void ggml_hexagon_session::allocate(int dev_id) noexcept(false) { // Start the DSP-side service. We need to pass the queue ID to the // DSP in a FastRPC call; the DSP side will import the queue and start // listening for packets in a callback. - err = htp_iface_start(this->handle, dev_id, this->queue_id, opt_nhvx); + err = htp_iface_start(this->handle, dev_id, this->queue_id, opt_nhvx, opt_use_hmx); if (err != 0) { GGML_LOG_ERROR("ggml-hex: failed to start session: 0x%08x\n", (unsigned) err); throw std::runtime_error("ggml-hex: iface start failed (see log for details)"); @@ -3372,6 +3373,7 @@ static void ggml_hexagon_init(ggml_backend_reg * reg) { const char * str_profile = getenv("GGML_HEXAGON_PROFILE"); const char * str_etm = getenv("GGML_HEXAGON_ETM"); const char * str_nhvx = getenv("GGML_HEXAGON_NHVX"); + const char * str_use_hmx = getenv("GGML_HEXAGON_USE_HMX"); const char * str_ndev = getenv("GGML_HEXAGON_NDEV"); const char * str_arch = getenv("GGML_HEXAGON_ARCH"); @@ -3381,8 +3383,9 @@ static void ggml_hexagon_init(ggml_backend_reg * reg) { opt_opmask = str_opmask ? strtoul(str_opmask, NULL, 0) : opt_opmask; opt_opsync = str_opsync ? atoi(str_opsync) : 0; opt_profile = str_profile ? atoi(str_profile) : 0; - opt_etm = str_etm ? atoi(str_etm) : 0; + opt_etm = str_etm ? atoi(str_etm) : 0; opt_nhvx = str_nhvx ? strtoul(str_nhvx, NULL, 0) : opt_nhvx; + opt_use_hmx = str_use_hmx ? 
atoi(str_use_hmx) : opt_use_hmx; opt_ndev = str_ndev ? strtoul(str_ndev, NULL, 0) : opt_ndev; if (opt_ndev > GGML_HEXAGON_MAX_SESSIONS) { diff --git a/ggml/src/ggml-hexagon/htp/CMakeLists.txt b/ggml/src/ggml-hexagon/htp/CMakeLists.txt index a490a2ce9a..6ddfe4252f 100644 --- a/ggml/src/ggml-hexagon/htp/CMakeLists.txt +++ b/ggml/src/ggml-hexagon/htp/CMakeLists.txt @@ -40,6 +40,24 @@ target_compile_definitions(${HTP_LIB} PRIVATE $,FARF_HIGH=1,> FP32_QUANTIZE_GROUP_SIZE=${GGML_HEXAGON_FP32_QUANTIZE_GROUP_SIZE}) +# HMX acceleration: available on v73+ architectures +set(HTP_HMX_VERSIONS v73 v75 v79 v81) +list(FIND HTP_HMX_VERSIONS ${DSP_VERSION} _hmx_idx) + +if (_hmx_idx GREATER_EQUAL 0) + target_sources(${HTP_LIB} PRIVATE + hmx-matmul-ops.c + ) + + # -mhmx enables HMX instruction set (needed by files that include hmx-utils.h) + set_source_files_properties( + hmx-matmul-ops.c + PROPERTIES COMPILE_OPTIONS "-mhmx" + ) + + target_compile_definitions(${HTP_LIB} PRIVATE HTP_HAS_HMX=1) +endif() + build_idl(htp_iface.idl ${HTP_LIB}) set_target_properties(${HTP_LIB} PROPERTIES EXPORT_COMPILE_COMMANDS ON) diff --git a/ggml/src/ggml-hexagon/htp/hex-dma.h b/ggml/src/ggml-hexagon/htp/hex-dma.h index 350ab9d966..9811a07599 100644 --- a/ggml/src/ggml-hexagon/htp/hex-dma.h +++ b/ggml/src/ggml-hexagon/htp/hex-dma.h @@ -175,6 +175,86 @@ static inline uint32_t dma_queue_capacity(dma_queue * q) { return q->capacity; } +// --------------------------------------------------------------------------- +// Overflow-safe DMA push: all UDMA type1 descriptor fields (roiwidth, +// roiheight, srcstride, dststride) are 16-bit, max 65535. This helper +// transparently handles values that exceed the 16-bit limit and submits +// chained DMA transactions. +// +// Case 1 (fast path): all params fit in 16 bits -> direct dma_queue_push. +// Case 2 (contiguous block): width == srcstride == dststride. Reshape the +// flat transfer into a 2D descriptor with sub_width <= 65535. 
Produces a +// single descriptor, preserving async DMA behavior. +// Case 3 (stride overflow): srcstride or dststride > 65535. Issue rows +// one at a time. The first N-1 rows are pushed+popped synchronously; +// the last row is left async so the caller can pop it. +// --------------------------------------------------------------------------- +#define UDMA_MAX_FIELD_VAL 65535u + +static inline bool dma_queue_push_chained(dma_queue *q, dma_ptr dptr, size_t dst_stride, size_t src_stride, size_t width, size_t nrows) { + // Fast path: everything fits in 16 bits. + if (__builtin_expect( + width <= UDMA_MAX_FIELD_VAL && + nrows <= UDMA_MAX_FIELD_VAL && + src_stride <= UDMA_MAX_FIELD_VAL && + dst_stride <= UDMA_MAX_FIELD_VAL, 1)) { + return dma_queue_push(q, dptr, dst_stride, src_stride, width, nrows); + } + + // Case 2: contiguous block (width == src_stride == dst_stride). + // Reshape total bytes into sub_width * sub_nrows where sub_width <= 65535. + if (width == src_stride && width == dst_stride) { + size_t total = width * nrows; + + // Pick the largest 128-byte-aligned sub_width that divides total evenly. + size_t sub_width = UDMA_MAX_FIELD_VAL & ~(size_t)127; // 65408 + while (sub_width > 0 && total % sub_width != 0) { + sub_width -= 128; + } + if (sub_width == 0) { + // Fallback: use original width (must fit) with adjusted nrows. + // This shouldn't happen for 128-aligned DMA sizes. + sub_width = width; + } + size_t sub_nrows = total / sub_width; + + // Handle sub_nrows > 65535 by issuing chunked descriptors. 
+ const uint8_t *src = (const uint8_t *)dptr.src; + uint8_t *dst = (uint8_t *)dptr.dst; + size_t rows_done = 0; + while (rows_done < sub_nrows) { + size_t chunk = sub_nrows - rows_done; + if (chunk > UDMA_MAX_FIELD_VAL) chunk = UDMA_MAX_FIELD_VAL; + + dma_ptr p = dma_make_ptr(dst + rows_done * sub_width, src + rows_done * sub_width); + if (!dma_queue_push(q, p, sub_width, sub_width, sub_width, chunk)) + return false; + + rows_done += chunk; + // Complete all chunks without waiting except the last one, so the + // caller's single dma_queue_pop drains the final descriptor. + if (rows_done < sub_nrows) + dma_queue_pop_nowait(q); + } + return true; + } + + // Case 3: stride overflow — fall back to row-by-row. + { + const uint8_t *src = (const uint8_t *)dptr.src; + uint8_t *dst = (uint8_t *)dptr.dst; + for (size_t r = 0; r < nrows; ++r) { + dma_ptr p = dma_make_ptr(dst + r * dst_stride, + src + r * src_stride); + if (!dma_queue_push(q, p, 0, 0, width, 1)) + return false; + if (r + 1 < nrows) + dma_queue_pop_nowait(q); + } + return true; + } +} + #ifdef __cplusplus } // extern "C" #endif diff --git a/ggml/src/ggml-hexagon/htp/hex-utils.h b/ggml/src/ggml-hexagon/htp/hex-utils.h index fb8a25a3f2..8ed1456bc5 100644 --- a/ggml/src/ggml-hexagon/htp/hex-utils.h +++ b/ggml/src/ggml-hexagon/htp/hex-utils.h @@ -29,10 +29,22 @@ static inline uint64_t hex_get_pktcnt() { return pktcnt; } -static inline int32_t hex_is_aligned(void * addr, uint32_t align) { +static inline size_t hmx_ceil_div(size_t num, size_t den) { + return (num + den - 1) / den; +} + +static inline int32_t hex_is_aligned(const void * addr, uint32_t align) { return ((size_t) addr & (align - 1)) == 0; } +static inline size_t hex_align_up(size_t v, size_t align) { + return hmx_ceil_div(v, align) * align; +} + +static inline size_t hex_align_down(size_t v, size_t align) { + return (v / align) * align; +} + static inline int32_t hex_is_one_chunk(void * addr, uint32_t n, uint32_t chunk_size) { uint32_t left_off = 
(size_t) addr & (chunk_size - 1); uint32_t right_off = left_off + n; @@ -43,6 +55,14 @@ static inline uint32_t hex_round_up(uint32_t n, uint32_t m) { return m * ((n + m - 1) / m); } +static inline size_t hex_smin(size_t a, size_t b) { + return a < b ? a : b; +} + +static inline size_t hex_smax(size_t a, size_t b) { + return a > b ? a : b; +} + static inline void hex_l2fetch(const void * p, uint32_t width, uint32_t stride, uint32_t height) { const uint64_t control = Q6_P_combine_RR(stride, Q6_R_combine_RlRl(width, height)); Q6_l2fetch_AP((void *) p, control); diff --git a/ggml/src/ggml-hexagon/htp/hmx-matmul-ops.c b/ggml/src/ggml-hexagon/htp/hmx-matmul-ops.c new file mode 100644 index 0000000000..c703a04942 --- /dev/null +++ b/ggml/src/ggml-hexagon/htp/hmx-matmul-ops.c @@ -0,0 +1,1528 @@ +#pragma clang diagnostic ignored "-Wgnu-zero-variadic-macro-arguments" +#pragma clang diagnostic ignored "-Wunused-function" +#pragma clang diagnostic ignored "-Wunused-variable" +#pragma clang diagnostic ignored "-Wunused-but-set-variable" + +#include +#include +#include +#include +#include + +#include +#include + +#define GGML_COMMON_DECL_C +#include "ggml-common.h" + +#include "hex-dma.h" +#include "hvx-utils.h" +#include "hvx-dump.h" +#include "worker-pool.h" +#include "htp-ctx.h" +#include "htp-msg.h" + +#include "hmx-utils.h" +#include "hmx-ops.h" +#include "hmx-profile.h" + +static const __fp16 q4_0_to_fp16_lut[64] __attribute__((aligned(VLEN))) = { + -8, 0, -7, 0, -6, 0, -5, 0, -4, 0, -3, 0, -2, 0, -1, 0, 0, 0, 1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0, 7, 0, +}; + +static const __fp16 iq4_nl_to_fp16_lut[64] __attribute__((aligned(VLEN))) = { + -127, 0, -104, 0, -83, 0, -65, 0, -49, 0, -35, 0, -22, 0, -10, 0, + 1, 0, 13, 0, 25, 0, 38, 0, 53, 0, 69, 0, 89, 0, 113, 0, +}; + +// vscatter offsets for fused dequant+transpose: write K-values directly to [K][N] tile. +// word[i] = i*128 maps K-row-pair i to byte offset i*128 in the tile. +// Column offset (n*4) is added at runtime. 
Only entries 0..15 are used (masked by predicate). +static const int32_t weight_transpose_scatter_offsets[32] __attribute__((aligned(VLEN))) = { + 0*128, 1*128, 2*128, 3*128, 4*128, 5*128, 6*128, 7*128, + 8*128, 9*128, 10*128, 11*128, 12*128, 13*128, 14*128, 15*128, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +}; + +// Scales per x4x2 logical block: 8 × sizeof(__fp16) = 16 bytes +#define HMX_X4X2_SCALES_PER_BLK 8 +#define HMX_X4X2_DBLK_SIZE 16 // 8 * 2 bytes + +static inline void swap_ptr(void **p1, void **p2) { + void *t = *p1; + *p1 = *p2; + *p2 = t; +} + +typedef struct { + uint8_t *dst; + const uint8_t *src; + dma_queue *dma; + size_t n_rows; + size_t src_stride; // DDR row stride (full row_stride) + size_t dst_stride; // VTCM sub-block row stride + size_t quant_off; // quant byte offset in each DDR row + size_t quant_width; // quant bytes to copy per row + size_t scale_off; // scale byte offset in each DDR row + size_t scale_width; // scale bytes to copy per row +} qweight_fetch_task_state_t; + +// Compute the byte stride of one row in x4x2 format. +// Numerically equals ggml_row_size(type, k) when k is 256-aligned, because +// x4x2 packing has the same density as block_q4_0 / block_q8_0. +// Layout per row: [quants: nb*128 (Q4) or nb*256 (Q8)][scales: nb*16 bytes] +// Total per row = nb * (128+16) = 144*nb (Q4) or nb * (256+16) = 272*nb (Q8). +// Callers must ensure k is a multiple of 256 (enforced by proc_hmx_matmul_req). 
+static inline size_t get_x4x2_row_stride(int weight_type, int k) { + int nb = (k + QK_Q4_0x4x2 - 1) / QK_Q4_0x4x2; + switch (weight_type) { + case HTP_TYPE_Q4_0: + case HTP_TYPE_IQ4_NL: + return (size_t)nb * (QK_Q4_0x4x2 / 2 + HMX_X4X2_DBLK_SIZE); // 144 * nb + case HTP_TYPE_Q8_0: + return (size_t)nb * (QK_Q8_0x4x2 + HMX_X4X2_DBLK_SIZE); // 272 * nb + default: + return 0; + } +} + +// --- Overflow-safe arithmetic for VTCM budget calculation --- + +static inline bool hmx_mul_overflow(size_t a, size_t b, size_t *out) { + if (a != 0 && b > SIZE_MAX / a) return true; + *out = a * b; + return false; +} + +static inline bool hmx_add_overflow(size_t a, size_t b, size_t *out) { + if (a > SIZE_MAX - b) return true; + *out = a + b; + return false; +} + +// Search for optimal (mc, nc) chunk sizes that maximize mc * nc within VTCM budget. +// +// Cost model: total = nc * per_n_cost + mc * per_m_cost + mc * nc * per_mn_cost + overhead +// per_n_cost: bytes per nc column (weight + scratch buffers) +// per_m_cost: bytes per mc row (activation) +// per_mn_cost: bytes per mc*nc element (output) +// overhead: fixed bytes (scales 256B, eye_tile 2048B, etc.) +// +// Algorithm: nc sweeps from n_max down by 32, analytically solving for mc_max. +// Returns 0 on success, -1 if VTCM is insufficient. 
+static int hmx_compute_chunks( + size_t vtcm_total, size_t overhead, + size_t per_n_cost, size_t per_m_cost, size_t per_mn_cost, + int m, int n, + size_t *m_chunk_out, size_t *n_chunk_out, + size_t *total_out) +{ + if (m <= 0 || n <= 0) return -1; + if (vtcm_total <= overhead) return -1; + if (per_n_cost == 0 || per_m_cost == 0 || per_mn_cost == 0) return -1; + + const size_t usable = vtcm_total - overhead; + size_t best_mn = 0, best_m = 0, best_n = 0; + + const size_t n_max = hex_align_down((size_t)n, HMX_FP16_TILE_N_COLS); + for (size_t nc = n_max; nc >= HMX_FP16_TILE_N_COLS; nc -= HMX_FP16_TILE_N_COLS) { + // Early exit: if nc * m_max cannot beat best, smaller nc won't either + if (nc * hex_align_down((size_t)m, HMX_FP16_TILE_N_ROWS) <= best_mn) + break; + + size_t n_fixed = 0, ncmn = 0, mc_denom = 0; + if (hmx_mul_overflow(nc, per_n_cost, &n_fixed)) continue; + if (n_fixed >= usable) goto next_nc; + + if (hmx_mul_overflow(nc, per_mn_cost, &ncmn)) goto next_nc; + if (hmx_add_overflow(per_m_cost, ncmn, &mc_denom) || mc_denom == 0) goto next_nc; + + { + size_t remain = usable - n_fixed; + size_t mc = remain / mc_denom; + mc = hex_align_down(mc, HMX_FP16_TILE_N_ROWS); + mc = hex_smin(mc, (size_t)m); + + if (mc > 0 && mc * nc > best_mn) { + best_mn = mc * nc; + best_m = mc; + best_n = nc; + } + } + +next_nc: + if (nc == HMX_FP16_TILE_N_COLS) break; // avoid size_t underflow + } + + if (best_m == 0 || best_n == 0) return -1; + + // Compute exact total (with overflow checks) + size_t t0 = 0, t1 = 0, t2 = 0, mn = 0, total = 0; + if (hmx_mul_overflow(best_n, per_n_cost, &t0)) return -1; + if (hmx_mul_overflow(best_m, per_m_cost, &t1)) return -1; + if (hmx_mul_overflow(best_m, best_n, &mn)) return -1; + if (hmx_mul_overflow(mn, per_mn_cost, &t2)) return -1; + if (hmx_add_overflow(t0, t1, &total)) return -1; + if (hmx_add_overflow(total, t2, &total)) return -1; + if (hmx_add_overflow(total, overhead, &total)) return -1; + + *m_chunk_out = best_m; + *n_chunk_out = best_n; 
+ *total_out = total; + return 0; +} + +// forward declaration – defined after transfer_activation_chunk_fp32_to_fp16 +void transfer_activation_chunk_threaded(struct htp_context *ctx, __fp16 *dst, const float *src, int n_rows, int k_block, int k_stride); + +// Scatter row-major FP16 weight (already in VTCM scratch) directly into transposed [K][N] tiles. +// vtcm_src: [n_cols][k] row-major fp16 in VTCM scratch buffer +// vtcm_dst: [n_col_tiles][n_k_tiles][HMX_FP16_TILE_N_ELMS] tile-major interleaved fp16 +static void interleave_fp16_weight_chunk_to_tiles(__fp16 *restrict vtcm_dst, + const __fp16 *restrict vtcm_src, + int n_cols, int k) { + assert(n_cols % HMX_FP16_TILE_N_COLS == 0); + assert(k % HMX_FP16_TILE_N_COLS == 0); + + const int n_k_tiles = k / HMX_FP16_TILE_N_COLS; + const HVX_Vector v_scat_base = hvx_vmem(weight_transpose_scatter_offsets); + const HVX_Vector v_scat_step = Q6_V_vsplat_R(4); + const HVX_VectorPred q_mask64 = Q6_Q_vsetq_R(64); + + for (int r = 0; r < n_cols; r += 2) { + int ct = r / HMX_FP16_TILE_N_ROWS; // N-dimension tile index + int local_r = r % HMX_FP16_TILE_N_ROWS; // intra-tile row index + const bool next_row_valid = (r + 1) < n_cols; + + // Offset vectors for N-columns local_r and local_r+1, reused across K-tiles. + HVX_Vector v_off0 = Q6_Vw_vadd_VwVw(v_scat_base, Q6_V_vsplat_R(local_r * 4)); + HVX_Vector v_off1 = Q6_Vw_vadd_VwVw(v_off0, v_scat_step); + + for (int c = 0; c < k; c += HMX_FP16_TILE_N_COLS) { + int kt = c / HMX_FP16_TILE_N_COLS; + int tile_idx = ct * n_k_tiles + kt; + __fp16 *tile_base = vtcm_dst + tile_idx * HMX_FP16_TILE_N_ELMS; + + HVX_Vector v0 = hvx_vmemu(vtcm_src + r * k + c); + HVX_Vector v1 = next_row_valid ? 
hvx_vmemu(vtcm_src + (r + 1) * k + c) : Q6_V_vzero(); + + Q6_vscatter_QRMVwV(q_mask64, (size_t)tile_base, HMX_FP16_TILE_SIZE - 1, v_off0, v0); + Q6_vscatter_QRMVwV(q_mask64, (size_t)tile_base, HMX_FP16_TILE_SIZE - 1, v_off1, v1); + } + } +} + +// --- x4x2 format dequantizers --- + +// Dequantize one x4x2 Q4_0 group (32 elements from 32 packed bytes) -> 32 FP16 in first 64 bytes. +// In x4x2, sub-blocks 0..3 use lower nibbles, sub-blocks 4..7 use upper nibbles +// of the same 32 packed bytes. +static inline HVX_Vector dequantize_x4x2_q4_0_group_hvx( + const uint8_t *packed_32, bool upper_nibbles, + const __fp16 *scale, const HVX_Vector vlut_cvt) { + HVX_Vector vq = hvx_vmemu(packed_32); + const HVX_Vector mask_h4 = Q6_Vb_vsplat_R(0x0F); + HVX_Vector v_scales = hvx_vec_splat_f16(*scale); + // q4x4x2 stores two int4 values per byte. Keep only the selected nibble. + HVX_Vector v_quants = upper_nibbles ? Q6_Vub_vlsr_VubR(vq, 4) : vq; + v_quants = Q6_V_vand_VV(v_quants, mask_h4); + // Shuffle before LUT + v_quants = Q6_Vb_vshuff_Vb(v_quants); + // Use standard vlut16 (not _nomatch) to avoid stale-register NaN. + // _nomatch retains the previous destination-register value for colliding + // indices, but the C intrinsic doesn't model the implicit read so the + // compiler may allocate a register containing garbage/NaN. + HVX_VectorPair vp = Q6_Wh_vlut16_VbVhR(v_quants, vlut_cvt, 0); + HVX_Vector v_hf = Q6_V_lo_W(vp); + + return Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(v_hf, v_scales)); +} + +// Batch-dequantize 4 contiguous x4x2 Q4_0 groups (4x32 = 128 packed bytes) using +// full HVX vector width. One vmemu + one vlut16 replaces 4 separate calls. +// Output: out[0..3] each hold 32 FP16 values in the first 64 bytes. 
+static inline void dequantize_x4x2_q4_0_x4groups_hvx( + const uint8_t *packed_128, bool upper_nibbles, + const __fp16 *scales_4, const HVX_Vector vlut_cvt, + HVX_Vector out[4]) { + // Load all 128 packed bytes (4 contiguous 32-byte groups) + HVX_Vector vq = hvx_vmemu(packed_128); + const HVX_Vector mask_h4 = Q6_Vb_vsplat_R(0x0F); + HVX_Vector v_quants = upper_nibbles ? Q6_Vub_vlsr_VubR(vq, 4) : vq; + v_quants = Q6_V_vand_VV(v_quants, mask_h4); + + // Shuffle before LUT + v_quants = Q6_Vb_vshuff_Vb(v_quants); + + // Full-width vlut16: 128 byte lookups -> 128 fp16 results in a VectorPair + HVX_VectorPair vp = Q6_Wh_vlut16_VbVhR(v_quants, vlut_cvt, 0); + HVX_Vector v_lo = Q6_V_lo_W(vp); // [group0: 32 fp16 | group1: 32 fp16] + HVX_Vector v_hi = Q6_V_hi_W(vp); // [group2: 32 fp16 | group3: 32 fp16] + + // Build per-group scale vectors: first 64 bytes use scale_a, last 64 use scale_b + HVX_VectorPred q64 = Q6_Q_vsetq_R(64); + HVX_Vector v_sc01 = Q6_V_vmux_QVV(q64, hvx_vec_splat_f16(scales_4[0]), hvx_vec_splat_f16(scales_4[1])); + HVX_Vector v_sc23 = Q6_V_vmux_QVV(q64, hvx_vec_splat_f16(scales_4[2]), hvx_vec_splat_f16(scales_4[3])); + + v_lo = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(v_lo, v_sc01)); + v_hi = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(v_hi, v_sc23)); + + // Extract individual groups: scatter uses q_mask64 so only first 64 bytes matter + out[0] = v_lo; // group0 already in [0:63] + out[1] = Q6_V_vror_VR(v_lo, 64); // group1 rotated to [0:63] + out[2] = v_hi; // group2 already in [0:63] + out[3] = Q6_V_vror_VR(v_hi, 64); // group3 rotated to [0:63] +} + +// Dequantize one x4x2 Q8_0 group (32 int8 quants) -> 32 FP16 in first 64 bytes. 
+static inline HVX_Vector dequantize_x4x2_q8_0_group_hvx( + const int8_t *quants_32, const __fp16 *scale) { + HVX_Vector vq = hvx_vmemu(quants_32); + HVX_Vector v_scales = hvx_vec_splat_f16(*scale); + HVX_Vector v0 = Q6_V_lo_W(Q6_Wh_vunpack_Vb(vq)); + HVX_Vector v_hf = Q6_Vhf_equals_Vh(v0); + return Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(v_hf, v_scales)); +} + +// Dequantize a tile range from x4x2 weight data (already in VTCM) to tile-major FP16. +// Input: vtcm_src has n_cols rows of x4x2 data, each row_stride bytes. +// Output: vtcm_dst in tile-major FP16 layout. +static void dequantize_x4x2_weight_to_fp16_tiles_task( + __fp16 *restrict vtcm_dst, + const uint8_t *restrict vtcm_src, + int n_cols, int k_block, + size_t row_stride, int weight_type, + int start_tile, int end_tile) { + + const int n_k_tiles = k_block / HMX_FP16_TILE_N_COLS; + const bool is_q4 = (weight_type == HTP_TYPE_Q4_0 || weight_type == HTP_TYPE_IQ4_NL); + const int qrow_size = is_q4 ? (k_block / 2) : k_block; + + const HVX_Vector vlut_cvt = (weight_type == HTP_TYPE_IQ4_NL) + ? hvx_vmem(iq4_nl_to_fp16_lut) : hvx_vmem(q4_0_to_fp16_lut); + + // vscatter setup: write dequantized K-values directly to transposed [K][N] tile positions. + // Each int32 element holds a K-row-pair (2 adjacent fp16 values). word[i] at offset i*128 + // maps to K-rows 2i and 2i+1. Column offset (n*4) added per row. 
+ const HVX_Vector v_scat_base = hvx_vmem(weight_transpose_scatter_offsets); + const HVX_Vector v_scat_step = Q6_V_vsplat_R(4); // 4 bytes = 1 column step + const HVX_VectorPred q_mask64 = Q6_Q_vsetq_R(64); // first 16 words (64 bytes) + + for (int t = start_tile; t < end_tile; ) { + int ct = t / n_k_tiles; // column tile index + int kt = t % n_k_tiles; // K tile index + + // --- Batch-4 fast path for Q4: process 4 contiguous K-tiles with one vlut16 per row --- + if (is_q4 && (kt % 4 == 0) && (t + 4 <= end_tile) && ((t + 3) / n_k_tiles == ct)) { + int blk_idx = (kt * 32) / QK_Q4_0x4x2; + int sub_blk_base = ((kt * 32) % QK_Q4_0x4x2) / 32; // 0 or 4 + bool upper = (sub_blk_base >= 4); + int packed_off = blk_idx * (QK_Q4_0x4x2 / 2); // 128 contiguous packed bytes + int scale_off = qrow_size + blk_idx * HMX_X4X2_DBLK_SIZE + + sub_blk_base * (int)sizeof(__fp16); // 4 consecutive scales + + __fp16 *tile_bases[4]; + for (int g = 0; g < 4; g++) { tile_bases[g] = vtcm_dst + (t + g) * HMX_FP16_TILE_N_ELMS; } + + HVX_Vector v_off = v_scat_base; + for (int r = 0; r < HMX_FP16_TILE_N_ROWS; r += 2) { + int row0 = ct * HMX_FP16_TILE_N_COLS + r; + int row1 = row0 + 1; + const uint8_t *r0 = vtcm_src + row0 * row_stride; + const uint8_t *r1 = vtcm_src + row1 * row_stride; + + HVX_Vector v0[4], v1[4]; + dequantize_x4x2_q4_0_x4groups_hvx(r0 + packed_off, upper, (const __fp16 *)(r0 + scale_off), vlut_cvt, v0); + if (row1 < n_cols) { + dequantize_x4x2_q4_0_x4groups_hvx(r1 + packed_off, upper, (const __fp16 *)(r1 + scale_off), vlut_cvt, v1); + } else { + v1[0] = v1[1] = v1[2] = v1[3] = Q6_V_vzero(); + } + + for (int g = 0; g < 4; g++) { Q6_vscatter_QRMVwV(q_mask64, (size_t)tile_bases[g], HMX_FP16_TILE_SIZE - 1, v_off, v0[g]); } + v_off = Q6_Vw_vadd_VwVw(v_off, v_scat_step); + for (int g = 0; g < 4; g++) { Q6_vscatter_QRMVwV(q_mask64, (size_t)tile_bases[g], HMX_FP16_TILE_SIZE - 1, v_off, v1[g]); } + v_off = Q6_Vw_vadd_VwVw(v_off, v_scat_step); + } + + for (int g = 0; g < 4; g++) { (void) 
*(volatile HVX_Vector *)(tile_bases[g]); } + + t += 4; + continue; + } + + // --- Single-tile fallback --- + __fp16 *tile_base = vtcm_dst + t * HMX_FP16_TILE_N_ELMS; + + if (is_q4) { + int blk_idx = (kt * 32) / QK_Q4_0x4x2; + int sub_blk = ((kt * 32) % QK_Q4_0x4x2) / 32; + bool upper = (sub_blk >= 4); + int byte_off = blk_idx * (QK_Q4_0x4x2 / 2) + (upper ? (sub_blk - 4) : sub_blk) * 32; + int scale_off = qrow_size + blk_idx * HMX_X4X2_DBLK_SIZE + sub_blk * (int)sizeof(__fp16); + + HVX_Vector v_off = v_scat_base; // reset to column 0 + for (int r = 0; r < HMX_FP16_TILE_N_ROWS; r += 2) { + int row0 = ct * HMX_FP16_TILE_N_COLS + r; + int row1 = row0 + 1; + + const uint8_t *r0 = vtcm_src + row0 * row_stride; + const uint8_t *r1 = vtcm_src + row1 * row_stride; + + HVX_Vector v0 = dequantize_x4x2_q4_0_group_hvx( + r0 + byte_off, upper, (const __fp16 *)(r0 + scale_off), vlut_cvt); + HVX_Vector v1 = (row1 < n_cols) + ? dequantize_x4x2_q4_0_group_hvx( + r1 + byte_off, upper, (const __fp16 *)(r1 + scale_off), vlut_cvt) + : Q6_V_vzero(); + + Q6_vscatter_QRMVwV(q_mask64, (size_t)tile_base, HMX_FP16_TILE_SIZE - 1, v_off, v0); + v_off = Q6_Vw_vadd_VwVw(v_off, v_scat_step); + Q6_vscatter_QRMVwV(q_mask64, (size_t)tile_base, HMX_FP16_TILE_SIZE - 1, v_off, v1); + v_off = Q6_Vw_vadd_VwVw(v_off, v_scat_step); + } + (void) *(volatile HVX_Vector *)(tile_base); + } else { + // Q8_0 + int blk_idx = (kt * 32) / QK_Q8_0x4x2; + int sub_blk = ((kt * 32) % QK_Q8_0x4x2) / 32; + int byte_off = blk_idx * QK_Q8_0x4x2 + sub_blk * 32; + int scale_off = qrow_size + blk_idx * HMX_X4X2_DBLK_SIZE + sub_blk * (int)sizeof(__fp16); + + HVX_Vector v_off = v_scat_base; // reset to column 0 + for (int r = 0; r < HMX_FP16_TILE_N_ROWS; r += 2) { + int row0 = ct * HMX_FP16_TILE_N_COLS + r; + int row1 = row0 + 1; + + const uint8_t *r0 = vtcm_src + row0 * row_stride; + const uint8_t *r1 = vtcm_src + row1 * row_stride; + + HVX_Vector v0 = dequantize_x4x2_q8_0_group_hvx( + (const int8_t *)(r0 + byte_off), (const 
__fp16 *)(r0 + scale_off)); + HVX_Vector v1 = (row1 < n_cols) + ? dequantize_x4x2_q8_0_group_hvx( + (const int8_t *)(r1 + byte_off), (const __fp16 *)(r1 + scale_off)) + : Q6_V_vzero(); + + Q6_vscatter_QRMVwV(q_mask64, (size_t)tile_base, HMX_FP16_TILE_SIZE - 1, v_off, v0); + v_off = Q6_Vw_vadd_VwVw(v_off, v_scat_step); + Q6_vscatter_QRMVwV(q_mask64, (size_t)tile_base, HMX_FP16_TILE_SIZE - 1, v_off, v1); + v_off = Q6_Vw_vadd_VwVw(v_off, v_scat_step); + } + (void) *(volatile HVX_Vector *)(tile_base); + } + ++t; + } + + // Drain HVX scatter write buffer: a vmem load on the same HW thread retires + // all pending scatter entries to VTCM. Without this, the main thread's HMX + // reads may see stale data because atomic_fetch_sub (release) only orders + // regular stores, not the HVX scatter buffer. + if (start_tile < end_tile) { + (void) *(volatile HVX_Vector *)(vtcm_dst + (end_tile - 1) * HMX_FP16_TILE_N_ELMS); + } +} + +typedef struct { + __fp16 *dst; + const uint8_t *src; + int n_cols; + int k_block; + size_t row_stride; + int weight_type; + int n_tot_tiles; + int n_tiles_per_task; + int n_tasks; +} x4x2_dequantize_state_t; + +static void dequantize_x4x2_worker_loop(unsigned int n, unsigned int i, void *data) { + x4x2_dequantize_state_t *state = (x4x2_dequantize_state_t *)data; + + for (unsigned int task_id = i; task_id < (unsigned int)state->n_tasks; task_id += n) { + int start = task_id * state->n_tiles_per_task; + int end = hex_smin(start + state->n_tiles_per_task, state->n_tot_tiles); + + dequantize_x4x2_weight_to_fp16_tiles_task( + state->dst, state->src, state->n_cols, state->k_block, + state->row_stride, state->weight_type, start, end); + } +} + +static void dequantize_x4x2_weight_chunk_to_fp16_tiles( + struct htp_context *ctx, __fp16 *vtcm_dst, + const void *vtcm_src, int n_cols, int k_block, + size_t row_stride, int weight_type) { + + assert(n_cols % HMX_FP16_TILE_N_COLS == 0); + assert(k_block % HMX_FP16_TILE_N_COLS == 0); + + int n_col_tiles = n_cols / 
HMX_FP16_TILE_N_COLS; + int n_k_tiles = k_block / HMX_FP16_TILE_N_COLS; + int n_tot_tiles = n_col_tiles * n_k_tiles; + + size_t n_tiles_per_task = hmx_ceil_div(n_tot_tiles, ctx->n_threads); + + x4x2_dequantize_state_t state; + state.n_tasks = (n_tot_tiles + n_tiles_per_task - 1) / n_tiles_per_task; + state.n_tot_tiles = n_tot_tiles; + state.n_tiles_per_task = n_tiles_per_task; + state.dst = vtcm_dst; + state.src = (const uint8_t *)vtcm_src; + state.n_cols = n_cols; + state.k_block = k_block; + state.row_stride = row_stride; + state.weight_type = weight_type; + + worker_pool_run_func(ctx->worker_pool, dequantize_x4x2_worker_loop, &state, ctx->n_threads); +} + +// --- End x4x2 dequantizers --- + +// requires external HMX lock +static void core_dot_chunk_fp16(__fp16 *output, const __fp16 *activation, const __fp16 *weight, const __fp16 *scales, + int n_row_tiles, int n_col_tiles, int n_dot_tiles) { + hmx_set_output_scales(scales); + + for (int r = 0; r < n_row_tiles; ++r) { + for (int c = 0; c < n_col_tiles; ++c) { + Q6_mxclracc_hf(); + + const __fp16 *row_tiles = activation + r * n_dot_tiles * HMX_FP16_TILE_N_ELMS; + const __fp16 *col_tiles = weight + c * n_dot_tiles * HMX_FP16_TILE_N_ELMS; + + for (int k = 0; k < n_dot_tiles; ++k) { + int offset = k * HMX_FP16_TILE_N_ELMS; + hmx_load_tile_pair_fp16(row_tiles + offset, col_tiles + offset); + } + + __fp16 *out_tile = output + (r * n_col_tiles + c) * HMX_FP16_TILE_N_ELMS; + hmx_consume_accumulator_fp16(out_tile); + } + } +} + +static void transfer_output_chunk_fp16_to_fp32(float *restrict dst, const __fp16 *restrict vtcm_src, int n_rows, int n_cols, int n) { + assert(n_cols % HMX_FP16_TILE_N_COLS == 0); + const int n_col_tiles = n_cols / HMX_FP16_TILE_N_COLS; + + const HVX_Vector one = hvx_vec_splat_f16(1.0); + + for (int r = 0; r < n_rows; r += 2) { + int r0 = r / HMX_FP16_TILE_N_ROWS; + int r1 = r % HMX_FP16_TILE_N_ROWS; + + #pragma unroll(4) + for (int c = 0; c < n_cols; c += HMX_FP16_TILE_N_COLS) { + int c0 = c / 
HMX_FP16_TILE_N_COLS; + + const __fp16 *tile = vtcm_src + (r0 * n_col_tiles + c0) * HMX_FP16_TILE_N_ELMS; + + HVX_Vector v = ((const HVX_Vector *) tile)[r1 / 2]; + HVX_VectorPair vp = Q6_Wqf32_vmpy_VhfVhf(v, one); + + volatile HVX_Vector *pv_out0 = (volatile HVX_Vector *) (dst + (r * n + c + 0)); + volatile HVX_Vector *pv_out1 = (volatile HVX_Vector *) (dst + (r * n + c + n)); // next row in global memory + + *pv_out0 = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(vp)); + if (r + 1 < n_rows) { + *pv_out1 = Q6_Vsf_equals_Vqf32(Q6_V_hi_W(vp)); + } + } + } +} + +typedef struct { + const __fp16 *vtcm_src; + float *dst; + int n_tasks; + int n_tot_chunks; + int n_chunks_per_task; + int n_cols; + int n; // DDR row stride (total output columns) +} output_transfer_task_state_t; + +static void transfer_output_chunk_worker_fn(unsigned int n, unsigned int i, void *data) { + output_transfer_task_state_t *st = (output_transfer_task_state_t *) data; + + for (unsigned int task_id = i; task_id < (unsigned int)st->n_tasks; task_id += n) { + int chunk_idx = task_id * st->n_chunks_per_task; + size_t chunk_size = hex_smin(st->n_tot_chunks - chunk_idx, st->n_chunks_per_task); + + float *dst = st->dst + chunk_idx * st->n; + const __fp16 *vtcm_src = st->vtcm_src + chunk_idx * st->n_cols; + transfer_output_chunk_fp16_to_fp32(dst, vtcm_src, chunk_size, st->n_cols, st->n); + } +} + +static void transfer_output_chunk_threaded(struct htp_context *ctx, float *dst, const __fp16 *vtcm_src, + int n_rows, int n_cols, int n) { + assert(n_cols % HMX_FP16_TILE_N_COLS == 0); + + size_t n_tot_chunks = n_rows; + size_t n_chunks_per_task = 32; // must be multiple of HMX_FP16_TILE_N_ROWS (32) + + output_transfer_task_state_t state; + state.n_tasks = (n_tot_chunks + n_chunks_per_task - 1) / n_chunks_per_task; + state.n_tot_chunks = n_tot_chunks; + state.n_chunks_per_task = n_chunks_per_task; + state.dst = dst; + state.vtcm_src = vtcm_src; + state.n_cols = n_cols; + state.n = n; + + worker_pool_run_func(ctx->worker_pool, 
transfer_output_chunk_worker_fn, &state, ctx->n_threads); +} + +static inline int hmx_matmul_batch_r2(const hmx_matmul_w16a32_batched_params_t *params) { + return params->ne02 > 0 ? params->ne12 / params->ne02 : 1; +} + +static inline int hmx_matmul_batch_r3(const hmx_matmul_w16a32_batched_params_t *params) { + return params->ne03 > 0 ? params->ne13 / params->ne03 : 1; +} + +static inline const __fp16 *hmx_matmul_weight_batch_ptr(const hmx_matmul_w16a32_batched_params_t *params, + int dst_b2, int dst_b3) { + const int r2 = hmx_matmul_batch_r2(params); + const int r3 = hmx_matmul_batch_r3(params); + return (const __fp16 *) ((const uint8_t *) params->permuted_weight + + (size_t) (dst_b2 / r2) * params->src0_nb2 + + (size_t) (dst_b3 / r3) * params->src0_nb3); +} + +static inline const float *hmx_matmul_activation_batch_ptr(const hmx_matmul_w16a32_batched_params_t *params, + int dst_b2, int dst_b3) { + return (const float *) ((const uint8_t *) params->activation + + (size_t) dst_b2 * params->src1_nb2 + + (size_t) dst_b3 * params->src1_nb3); +} + +static inline float *hmx_matmul_dst_batch_ptr(const hmx_matmul_w16a32_batched_params_t *params, + int dst_b2, int dst_b3) { + return (float *) ((uint8_t *) params->dst + + (size_t) dst_b2 * params->dst_nb2 + + (size_t) dst_b3 * params->dst_nb3); +} + +static int hmx_mat_mul_permuted_w16a32_batched_legacy(struct htp_context *ctx, + const hmx_matmul_w16a32_batched_params_t *params) { + int ret = 0; + for (int b3 = 0; b3 < params->ne13 && ret == 0; ++b3) { + for (int b2 = 0; b2 < params->ne12 && ret == 0; ++b2) { + ret = hmx_mat_mul_permuted_w16a32(ctx, + hmx_matmul_dst_batch_ptr(params, b2, b3), + hmx_matmul_activation_batch_ptr(params, b2, b3), + hmx_matmul_weight_batch_ptr(params, b2, b3), + params->m, params->k, params->n, + params->act_stride, params->weight_stride); + } + } + return ret; +} + +int hmx_mat_mul_permuted_w16a32_batched(struct htp_context *ctx, const hmx_matmul_w16a32_batched_params_t *params) { + if (!ctx || 
// Batched fp16-weight / fp32-activation HMX matmul with GQA weight reuse.
// Validates the batch descriptor, then either runs the grouped path (one
// interleaved weight chunk reused across all q_heads that share a kv_head)
// or falls back to the per-batch legacy loop. Returns 0 on success, -1 on
// invalid parameters or VTCM exhaustion in paths with no fallback.
int hmx_mat_mul_permuted_w16a32_batched(struct htp_context *ctx, const hmx_matmul_w16a32_batched_params_t *params) {
    // --- parameter validation ---
    if (!ctx || !params || !params->dst || !params->activation || !params->permuted_weight) { return -1; }
    if (!params->m || !params->k || !params->n) { return -1; }
    if (params->act_stride < params->k || params->weight_stride < params->k || params->dst_stride < params->n) { return -1; }
    if (params->ne02 <= 0 || params->ne03 <= 0 || params->ne12 <= 0 || params->ne13 <= 0) { return -1; }
    if (params->ne12 % params->ne02 != 0 || params->ne13 % params->ne03 != 0) { return -1; }
    if (params->k % 32 != 0 || params->n % 32 != 0) { return -1; }

    if (!hex_is_aligned(params->dst, VLEN) ||
        !hex_is_aligned(params->activation, VLEN) ||
        !hex_is_aligned(params->permuted_weight, VLEN)) {
        return -1;
    }

    const int group_size = hmx_matmul_batch_r2(params);

    if (group_size <= 1) {
        FARF(MEDIUM, "%s: no dim2 GQA reuse (group=%d), using legacy batched loop", __func__, group_size);
        return hmx_mat_mul_permuted_w16a32_batched_legacy(ctx, params);
    }

    // Grouped path: reuse interleaved weight across all q_heads sharing a
    // kv_head. Each q_head gets its own activation buffer in VTCM (so
    // activation is loaded once per m_chunk and reused across all n_chunks),
    // and each q_head is computed individually to avoid tile-major packing
    // issues. m_chunk_n_rows is always a multiple of 32 (from
    // hmx_compute_chunks), so per-head tile arrays don't overlap.
    const size_t vtcm_budget  = ctx->vtcm_scratch_size;
    const size_t vec_dot_size = params->k * sizeof(__fp16);

    // When the activation has a large stride (e.g. permuted Q tensor with
    // act_stride >> k), HVX vector loads from strided DDR thrash L2 cache.
    // Allocate an F32 scratch buffer in VTCM and use 2D DMA to gather
    // strided rows into a contiguous block before the F32->F16 conversion.
    const bool   use_dma_activation = (params->act_stride > params->k);
    const size_t f32_scratch_per_m  = use_dma_activation ? (size_t) params->k * sizeof(float) : 0;

    size_t m_chunk_n_rows = 0, n_chunk_n_cols = 0, vtcm_used = 0;
    if (hmx_compute_chunks(vtcm_budget, /*overhead=*/256,
                           /*per_n=*/3 * vec_dot_size,
                           /*per_m=*/group_size * vec_dot_size + f32_scratch_per_m,
                           /*per_mn=*/sizeof(__fp16),
                           params->m, params->n,
                           &m_chunk_n_rows, &n_chunk_n_cols, &vtcm_used) != 0) {
        FARF(HIGH, "%s: grouped path does not fit VTCM, falling back to legacy batched loop", __func__);
        return hmx_mat_mul_permuted_w16a32_batched_legacy(ctx, params);
    }

    // --- VTCM layout: weight | per-head activation | output | 2x DMA scratch | scales | [f32 scratch] ---
    const size_t act_head_stride      = m_chunk_n_rows * (size_t) params->k; // fp16 elements between heads
    const size_t weight_area_size     = hex_align_up(n_chunk_n_cols * vec_dot_size, HMX_FP16_TILE_SIZE);
    const size_t activation_area_size = hex_align_up(group_size * m_chunk_n_rows * vec_dot_size, HMX_FP16_TILE_SIZE);
    const size_t output_area_size     = hex_align_up(m_chunk_n_rows * n_chunk_n_cols * sizeof(__fp16), HMX_FP16_TILE_SIZE);
    const size_t scratch_area_size    = hex_align_up(n_chunk_n_cols * vec_dot_size, HMX_FP16_TILE_SIZE);
    const size_t f32_scratch_size     = use_dma_activation
        ? hex_align_up(m_chunk_n_rows * (size_t) params->k * sizeof(float), HMX_FP16_TILE_SIZE) : 0;

    uint8_t *vtcm_ptr        = (uint8_t *) ctx->vtcm_base;
    __fp16  *vtcm_weight     = (__fp16 *) vtcm_seq_alloc(&vtcm_ptr, weight_area_size);
    __fp16  *vtcm_activation = (__fp16 *) vtcm_seq_alloc(&vtcm_ptr, activation_area_size);
    __fp16  *vtcm_output     = (__fp16 *) vtcm_seq_alloc(&vtcm_ptr, output_area_size);
    void    *vtcm_scratch0   = vtcm_seq_alloc(&vtcm_ptr, scratch_area_size);
    void    *vtcm_scratch1   = vtcm_seq_alloc(&vtcm_ptr, scratch_area_size);
    __fp16  *vtcm_scales     = (__fp16 *) vtcm_seq_alloc(&vtcm_ptr, 256);
    float   *vtcm_f32_act    = use_dma_activation ? (float *) vtcm_seq_alloc(&vtcm_ptr, f32_scratch_size) : NULL;

    if ((size_t) (vtcm_ptr - (uint8_t *) ctx->vtcm_base) > vtcm_budget) {
        FARF(HIGH, "%s: grouped layout overflowed VTCM, falling back to legacy batched loop", __func__);
        return hmx_mat_mul_permuted_w16a32_batched_legacy(ctx, params);
    }

    hmx_init_column_scales(vtcm_scales, Q6_V_vsplat_R(0x3c00)); // fp16: 1.0

    FARF(MEDIUM, "%s: grouped path m=%d k=%d n=%d group=%d streams=%d mc=%zu nc=%zu vtcm=%zu/%zu",
         __func__, params->m, params->k, params->n, group_size, params->ne13,
         m_chunk_n_rows, n_chunk_n_cols,
         (size_t) (vtcm_ptr - (uint8_t *) ctx->vtcm_base), vtcm_budget);

    TIMER_DEFINE(activation_load);
    TIMER_DEFINE(weight_load);
    TIMER_DEFINE(hmx_core);
    TIMER_DEFINE(output_store);
    TIMER_DEFINE(total);

    TIMER_START(total);

    const size_t fp16_row_bytes   = (size_t) params->k * sizeof(__fp16);
    const size_t weight_row_bytes = (size_t) params->weight_stride * sizeof(__fp16);

    for (int b3 = 0; b3 < params->ne13; ++b3) {
        for (int b2_base = 0; b2_base < params->ne12; b2_base += group_size) {
            const __fp16 *weight_group = hmx_matmul_weight_batch_ptr(params, b2_base, b3);

            for (size_t mr = 0; mr < (size_t) params->m; mr += m_chunk_n_rows) {
                const size_t n_rows = hex_smin((size_t) params->m - mr, m_chunk_n_rows);

                // Pre-load activations for all heads in the group (once per m_chunk).
                // When the source is strided (permuted Q), use 2D DMA to gather
                // contiguous rows into a VTCM scratch buffer first, then HVX
                // converts from the contiguous VTCM buffer. This avoids L2 cache
                // thrashing from HVX loads at large strides.
                TIMER_START(activation_load);
                for (int g = 0; g < group_size; ++g) {
                    const float *activation_chunk = hmx_matmul_activation_batch_ptr(params, b2_base + g, b3) + mr * params->act_stride;
                    __fp16      *vtcm_act_g       = vtcm_activation + (size_t) g * act_head_stride;
                    if (use_dma_activation) {
                        const size_t row_bytes    = (size_t) params->k * sizeof(float);
                        const size_t stride_bytes = (size_t) params->act_stride * sizeof(float);
                        dma_queue_push_chained(ctx->dma[0],
                                               dma_make_ptr(vtcm_f32_act, activation_chunk),
                                               row_bytes, stride_bytes, row_bytes, n_rows);
                        dma_queue_pop(ctx->dma[0]); // synchronous gather
                        transfer_activation_chunk_threaded(ctx, vtcm_act_g,
                                                           vtcm_f32_act, (int) n_rows,
                                                           params->k, params->k);
                    } else {
                        transfer_activation_chunk_threaded(ctx, vtcm_act_g,
                                                           activation_chunk, (int) n_rows,
                                                           params->k, params->act_stride);
                    }
                }
                TIMER_STOP(activation_load);

                // Double-buffered weight streaming: prefetch chunk i+1 while
                // interleaving/consuming chunk i.
                void *buf_curr = vtcm_scratch0;
                void *buf_next = vtcm_scratch1;

                {
                    const size_t n_cols_first = hex_smin((size_t) params->n, n_chunk_n_cols);
                    dma_queue_push_chained(ctx->dma[0], dma_make_ptr(buf_curr, weight_group),
                                           fp16_row_bytes, weight_row_bytes, fp16_row_bytes, n_cols_first);
                }

                HAP_compute_res_hmx_lock(ctx->vtcm_rctx);

                for (size_t nc = 0; nc < (size_t) params->n; nc += n_chunk_n_cols) {
                    const size_t n_cols = hex_smin((size_t) params->n - nc, n_chunk_n_cols);

                    TIMER_START(weight_load);
                    {
                        dma_queue_pop(ctx->dma[0]); // wait for the current weight chunk

                        const size_t nc_next = nc + n_chunk_n_cols;
                        if (nc_next < (size_t) params->n) {
                            const size_t  n_cols_next       = hex_smin((size_t) params->n - nc_next, n_chunk_n_cols);
                            const __fp16 *next_weight_chunk = weight_group + nc_next * params->weight_stride;

                            dma_queue_push_chained(ctx->dma[0], dma_make_ptr(buf_next, next_weight_chunk),
                                                   fp16_row_bytes, weight_row_bytes, fp16_row_bytes, n_cols_next);
                        }

                        interleave_fp16_weight_chunk_to_tiles(vtcm_weight, (const __fp16 *) buf_curr, n_cols, params->k);
                        swap_ptr(&buf_curr, &buf_next);
                    }
                    TIMER_STOP(weight_load);

                    // Reuse the interleaved weight for every q_head in this GQA group
                    for (int g = 0; g < group_size; ++g) {
                        TIMER_START(hmx_core);
                        {
                            const __fp16 *vtcm_act_g  = vtcm_activation + (size_t) g * act_head_stride;
                            const int     n_row_tiles = hmx_ceil_div((int) n_rows, HMX_FP16_TILE_N_ROWS);
                            const int     n_col_tiles = hmx_ceil_div((int) n_cols, HMX_FP16_TILE_N_COLS);
                            core_dot_chunk_fp16(vtcm_output, vtcm_act_g, vtcm_weight, vtcm_scales,
                                                n_row_tiles, n_col_tiles, params->k / 32);
                        }
                        TIMER_STOP(hmx_core);

                        TIMER_START(output_store);
                        {
                            float *output = hmx_matmul_dst_batch_ptr(params, b2_base + g, b3) + mr * params->dst_stride + nc;
                            transfer_output_chunk_threaded(ctx, output, vtcm_output, (int) n_rows, (int) n_cols, params->dst_stride);
                        }
                        TIMER_STOP(output_store);
                    }
                }

                HAP_compute_res_hmx_unlock(ctx->vtcm_rctx);
            }
        }
    }

    TIMER_STOP(total);

#if defined(ENABLE_PROFILE_TIMERS)
    FARF(HIGH, "%s: %lld us, m=%d k=%d n=%d group=%d", __func__, TIMER_US(total),
         params->m, params->k, params->n, group_size);
    FARF(HIGH, " activation_load: %lld us, weight_load: %lld us, hmx_core: %lld us, output_store: %lld us",
         TIMER_US(activation_load), TIMER_US(weight_load), TIMER_US(hmx_core), TIMER_US(output_store));
#endif

    return 0;
}
// Single fp16-weight / fp32-activation HMX matmul on permuted (column-major
// by rows of k) weights. dst is [m, n] fp32, activation [m, k] fp32 with row
// stride act_stride, permuted_weight n rows of k fp16 with row stride
// weight_stride. Returns 0 on success, -1 on bad args or insufficient VTCM.
int hmx_mat_mul_permuted_w16a32(struct htp_context *ctx, float *restrict dst, const float *restrict activation,
                                const __fp16 *restrict permuted_weight, int m, int k, int n,
                                int act_stride, int weight_stride) {
    if (!dst || !activation || !permuted_weight || !m || !n || !k) { return -1; }
    if (act_stride < k || weight_stride < k) { return -1; }
    if (k % 32 != 0 || n % 32 != 0) { return -1; }

    if (!hex_is_aligned(dst, VLEN) || !hex_is_aligned(activation, VLEN) || !hex_is_aligned(permuted_weight, VLEN)) {
        return -1;
    }

    // --- Dynamic VTCM layout ---
    const size_t vtcm_budget  = ctx->vtcm_scratch_size;
    const size_t vec_dot_size = k * sizeof(__fp16);

    // DMA-based activation gather for strided tensors (see batched path comment).
    const bool   use_dma_activation = (act_stride > k);
    const size_t f32_scratch_per_m  = use_dma_activation ? (size_t) k * sizeof(float) : 0;

    size_t m_chunk_n_rows = 0, n_chunk_n_cols = 0, vtcm_used = 0;
    if (hmx_compute_chunks(vtcm_budget,
                           /*overhead=*/ 256,
                           /*per_n=*/ 3 * vec_dot_size,                 // W + S0 + S1
                           /*per_m=*/ vec_dot_size + f32_scratch_per_m, // A + optional F32 scratch
                           /*per_mn=*/ sizeof(__fp16),                  // O
                           m, n,
                           &m_chunk_n_rows, &n_chunk_n_cols, &vtcm_used) != 0) {
        FARF(HIGH, "%s: VTCM too small (m=%d k=%d n=%d budget=%zu)", __func__, m, k, n, vtcm_budget);
        return -1;
    }

    const size_t weight_area_size     = hex_align_up(n_chunk_n_cols * vec_dot_size, HMX_FP16_TILE_SIZE);
    const size_t activation_area_size = hex_align_up(m_chunk_n_rows * vec_dot_size, HMX_FP16_TILE_SIZE);
    const size_t output_area_size     = hex_align_up(m_chunk_n_rows * n_chunk_n_cols * sizeof(__fp16), HMX_FP16_TILE_SIZE);
    const size_t scratch_area_size    = hex_align_up(n_chunk_n_cols * vec_dot_size, HMX_FP16_TILE_SIZE);
    const size_t f32_scratch_size     = use_dma_activation
        ? hex_align_up(m_chunk_n_rows * (size_t) k * sizeof(float), HMX_FP16_TILE_SIZE) : 0;

    // VTCM layout: weight | activation | output | scratch0 | scratch1 | scales | [f32_scratch]
    uint8_t *vtcm_ptr        = (uint8_t *) ctx->vtcm_base;
    __fp16  *vtcm_weight     = (__fp16 *) vtcm_seq_alloc(&vtcm_ptr, weight_area_size);
    __fp16  *vtcm_activation = (__fp16 *) vtcm_seq_alloc(&vtcm_ptr, activation_area_size);
    __fp16  *vtcm_output     = (__fp16 *) vtcm_seq_alloc(&vtcm_ptr, output_area_size);
    void    *vtcm_scratch0   = vtcm_seq_alloc(&vtcm_ptr, scratch_area_size);
    void    *vtcm_scratch1   = vtcm_seq_alloc(&vtcm_ptr, scratch_area_size);
    __fp16  *vtcm_scales     = (__fp16 *) vtcm_seq_alloc(&vtcm_ptr, 256);
    float   *vtcm_f32_act    = use_dma_activation ? (float *) vtcm_seq_alloc(&vtcm_ptr, f32_scratch_size) : NULL;

    if ((size_t) (vtcm_ptr - (uint8_t *) ctx->vtcm_base) > vtcm_budget) {
        FARF(ERROR, "%s: vtcm overflow: used=%zu limit=%zu", __func__,
             (size_t) (vtcm_ptr - (uint8_t *) ctx->vtcm_base), vtcm_budget);
        return -1;
    }

    hmx_init_column_scales(vtcm_scales, Q6_V_vsplat_R(0x3c00)); // fp16: 1.0

    FARF(MEDIUM, "%s: m=%d k=%d n=%d mc=%zu nc=%zu vtcm=%zu/%zu",
         __func__, m, k, n, m_chunk_n_rows, n_chunk_n_cols,
         (size_t) (vtcm_ptr - (uint8_t *) ctx->vtcm_base), vtcm_budget);

    TIMER_DEFINE(activation_load);
    TIMER_DEFINE(weight_load);
    TIMER_DEFINE(hmx_core);
    TIMER_DEFINE(output_store);

    TIMER_DEFINE(total);
    TIMER_START(total);

    HAP_compute_res_hmx_lock(ctx->vtcm_rctx);

    for (size_t mr = 0; mr < m; mr += m_chunk_n_rows) {
        // transfer activation matrix chunk into VTCM
        size_t n_rows = hex_smin(m - mr, m_chunk_n_rows);

        TIMER_START(activation_load);
        {
            const float *activation_chunk = activation + mr * act_stride;
            if (use_dma_activation) {
                // Gather strided DDR rows into contiguous VTCM via 2D DMA,
                // then convert F32 -> F16 from the contiguous copy.
                const size_t row_bytes    = (size_t) k * sizeof(float);
                const size_t stride_bytes = (size_t) act_stride * sizeof(float);
                dma_queue_push_chained(ctx->dma[0],
                                       dma_make_ptr(vtcm_f32_act, activation_chunk),
                                       row_bytes, stride_bytes, row_bytes, n_rows);
                dma_queue_pop(ctx->dma[0]);
                transfer_activation_chunk_threaded(ctx, vtcm_activation,
                                                   vtcm_f32_act, n_rows, k, k);
            } else {
                transfer_activation_chunk_threaded(ctx, vtcm_activation,
                                                   activation_chunk, n_rows, k, act_stride);
            }
        }
        TIMER_STOP(activation_load);

        const size_t fp16_row_bytes   = (size_t) k * sizeof(__fp16);
        const size_t weight_row_bytes = (size_t) weight_stride * sizeof(__fp16);

        void *buf_curr = vtcm_scratch0;
        void *buf_next = vtcm_scratch1;

        // issue async DMA for the first weight chunk
        // NOTE: use 2D DMA (n_cols rows x fp16_row_bytes) to avoid 16-bit roiwidth overflow.
        // The source rows can be strided (e.g. KV-cache K after ggml_permute).
        {
            const size_t n_cols_first = hex_smin(n, n_chunk_n_cols);

            dma_queue_push_chained(ctx->dma[0], dma_make_ptr(buf_curr, permuted_weight),
                                   fp16_row_bytes, weight_row_bytes, fp16_row_bytes, n_cols_first);
        }

        for (size_t nc = 0; nc < n; nc += n_chunk_n_cols) {
            size_t n_cols = hex_smin(n - nc, n_chunk_n_cols);

            TIMER_START(weight_load);
            {
                dma_queue_pop(ctx->dma[0]); // wait until current weight chunk is ready

                // issue async DMA for the next weight chunk (double buffering)
                const size_t nc_next = nc + n_chunk_n_cols;
                if (nc_next < n) {
                    const size_t  n_cols_next       = hex_smin(n - nc_next, n_chunk_n_cols);
                    const __fp16 *next_weight_chunk = permuted_weight + nc_next * weight_stride;

                    dma_queue_push_chained(ctx->dma[0], dma_make_ptr(buf_next, next_weight_chunk),
                                           fp16_row_bytes, weight_row_bytes, fp16_row_bytes, n_cols_next);
                }

                // interleave row-major fp16 from scratch into tile-major in vtcm_weight
                interleave_fp16_weight_chunk_to_tiles(vtcm_weight, (const __fp16 *) buf_curr, n_cols, k);

                swap_ptr(&buf_curr, &buf_next);
            }
            TIMER_STOP(weight_load);

            TIMER_START(hmx_core);
            {
                const int n_row_tiles = hmx_ceil_div(n_rows, HMX_FP16_TILE_N_ROWS);
                const int n_col_tiles = hmx_ceil_div(n_cols, HMX_FP16_TILE_N_COLS);
                core_dot_chunk_fp16(vtcm_output, vtcm_activation, vtcm_weight, vtcm_scales, n_row_tiles, n_col_tiles, k / 32);
            }
            TIMER_STOP(hmx_core);

            TIMER_START(output_store);
            {
                float *output = dst + (mr * n + nc);
                transfer_output_chunk_threaded(ctx, output, vtcm_output, n_rows, n_cols, n);
            }
            TIMER_STOP(output_store);
        }
    }

    HAP_compute_res_hmx_unlock(ctx->vtcm_rctx);

    TIMER_STOP(total);

#if defined(ENABLE_PROFILE_TIMERS)
    FARF(HIGH, "%s: %lld us, m=%d k=%d n=%d", __func__, TIMER_US(total), m, k, n);
    FARF(HIGH, " activation_load: %lld us, weight_load: %lld us, hmx_core: %lld us, output_store: %lld us",
         TIMER_US(activation_load), TIMER_US(weight_load), TIMER_US(hmx_core), TIMER_US(output_store));
    {
        size_t weight_size = (size_t) k * n * sizeof(__fp16);
        float  bandwidth   = 1e-3f * weight_size / (float) TIMER_US(weight_load);
        FARF(HIGH, " weight load bandwidth: %.2f GB/s", bandwidth);
    }
#endif

    return 0;
}

// Out-stationary quantized matmul, defined later in this file.
int mat_mul_qk_0_d16a32_out_stationary(struct htp_context *ctx, float *restrict out, const float *restrict x, const uint8_t *restrict w, int m,
                                       int k, int n, int w_type);
// Quantized-weight (Q4_0 / Q8_0 / IQ4_NL x4x2 layout) HMX matmul with fp32
// activation. Dispatches between three execution strategies:
//  - out-stationary (large m, k > n): K-blocked accumulation in VTCM
//  - 4-stage pipeline (large m, k <= n): DMA / dequant / HMX / store overlap
//  - sequential (small m): simple double-buffered weight streaming
// Returns 0 on success, -1 on invalid args or insufficient VTCM.
int hmx_mat_mul_permuted_qk_0_d16a32(struct htp_context *ctx, float *restrict dst, const float *restrict activation,
                                     const uint8_t *restrict permuted_weight, int m, int k, int n,
                                     int weight_type) {
    if (!dst || !activation || !permuted_weight || !m || !n || !k) { return -1; }
    if (k % 32 != 0 || n % 32 != 0) { return -1; }

    if (!hex_is_aligned(dst, VLEN) || !hex_is_aligned(activation, VLEN) || !hex_is_aligned(permuted_weight, VLEN)) {
        return -1;
    }

    // for large m, k (e.g. prefill FFN Down), use out-stationary version
    if (m >= 128 && k > n && n > 1024) {
        FARF(MEDIUM, "hmx_matmul_qk: OUT-STATIONARY path m=%d k=%d n=%d type=%d (K_BLOCK=512, %d K-iters with fp16 intermediate)",
             m, k, n, weight_type, (k + 511) / 512);
        return mat_mul_qk_0_d16a32_out_stationary(ctx, dst, activation, permuted_weight, m, k, n, weight_type);
    }

    size_t row_stride = get_x4x2_row_stride(weight_type, k);
    if (row_stride == 0) {
        return -1;
    }

    FARF(MEDIUM, "hmx_matmul_qk: STANDARD path m=%d k=%d n=%d type=%d", m, k, n, weight_type);

    // --- Dynamic VTCM layout ---
    const size_t vtcm_budget  = ctx->vtcm_scratch_size;
    const size_t vec_dot_size = k * sizeof(__fp16);
    const bool   use_pipeline = (m >= 128) && (k <= n);

    // Select cost parameters based on execution path
    size_t per_n_cost, per_mn_cost;
    if (use_pipeline) {
        per_n_cost  = row_stride + 2 * vec_dot_size; // Q + S0 + S1 (dequant bufs)
        per_mn_cost = 2 * sizeof(__fp16);            // O x 2 (output double buffer)
    } else {
        per_n_cost  = vec_dot_size + 2 * row_stride; // W + S0 + S1 (x4x2 DMA bufs)
        per_mn_cost = sizeof(__fp16);                // O x 1
    }

    size_t m_chunk_n_rows = 0, n_chunk_n_cols = 0, vtcm_used = 0;
    if (hmx_compute_chunks(vtcm_budget, /*overhead=*/256,
                           per_n_cost, /*per_m=*/vec_dot_size, per_mn_cost,
                           m, n, &m_chunk_n_rows, &n_chunk_n_cols, &vtcm_used) != 0) {
        FARF(HIGH, "%s: VTCM too small (m=%d k=%d n=%d pipe=%d budget=%zu)",
             __func__, m, k, n, use_pipeline, vtcm_budget);
        return -1;
    }

    // Compute precise buffer sizes per execution path
    const size_t weight_area_size = hex_align_up(
        n_chunk_n_cols * (use_pipeline ? row_stride : vec_dot_size), HMX_FP16_TILE_SIZE);
    const size_t activation_area_size = hex_align_up(m_chunk_n_rows * vec_dot_size, HMX_FP16_TILE_SIZE);
    const size_t output_area_size = hex_align_up(
        m_chunk_n_rows * n_chunk_n_cols * sizeof(__fp16), HMX_FP16_TILE_SIZE);

    size_t scratch0_size, scratch1_size, scratch2_size;
    if (use_pipeline) {
        scratch0_size = hex_align_up(n_chunk_n_cols * vec_dot_size, HMX_FP16_TILE_SIZE); // dequant buf 0
        scratch1_size = scratch0_size;                                                   // dequant buf 1
        scratch2_size = output_area_size;                                                // output buf 1
    } else {
        scratch0_size = hex_align_up(n_chunk_n_cols * row_stride, HMX_FP16_TILE_SIZE);   // x4x2 DMA buf 0
        scratch1_size = scratch0_size;                                                   // x4x2 DMA buf 1
        scratch2_size = 0;                                                               // unused
    }

    uint8_t *vtcm_ptr        = (uint8_t *) ctx->vtcm_base;
    __fp16  *vtcm_weight     = (__fp16 *) vtcm_seq_alloc(&vtcm_ptr, weight_area_size);
    __fp16  *vtcm_activation = (__fp16 *) vtcm_seq_alloc(&vtcm_ptr, activation_area_size);
    __fp16  *vtcm_output     = (__fp16 *) vtcm_seq_alloc(&vtcm_ptr, output_area_size);
    void    *vtcm_scratch0   = vtcm_seq_alloc(&vtcm_ptr, scratch0_size);
    void    *vtcm_scratch1   = vtcm_seq_alloc(&vtcm_ptr, scratch1_size);
    void    *vtcm_scratch2   = scratch2_size ? vtcm_seq_alloc(&vtcm_ptr, scratch2_size) : NULL;
    __fp16  *vtcm_scales     = (__fp16 *) vtcm_seq_alloc(&vtcm_ptr, 256);

    if ((size_t) (vtcm_ptr - (uint8_t *) ctx->vtcm_base) > vtcm_budget) {
        FARF(ERROR, "%s: vtcm overflow: used=%zu limit=%zu", __func__,
             (size_t) (vtcm_ptr - (uint8_t *) ctx->vtcm_base), vtcm_budget);
        return -1;
    }

    hmx_init_column_scales(vtcm_scales, Q6_V_vsplat_R(0x3c00)); // fp16: 1.0

    FARF(MEDIUM, "%s: m=%d k=%d n=%d wtype=%d pipe=%d mc=%zu nc=%zu vtcm=%zu/%zu",
         __func__, m, k, n, weight_type, use_pipeline,
         m_chunk_n_rows, n_chunk_n_cols,
         (size_t) (vtcm_ptr - (uint8_t *) ctx->vtcm_base), vtcm_budget);

    TIMER_DEFINE(activation_load);
    TIMER_DEFINE(weight_load);
    TIMER_DEFINE(hmx_core);
    TIMER_DEFINE(output_store);

    TIMER_DEFINE(total);
    TIMER_START(total);

    FARF(MEDIUM, "hmx_matmul_qk: %s mc=%zu nc=%zu vtcm=%zu/%zu",
         use_pipeline ? "PIPELINE" : "SEQUENTIAL", m_chunk_n_rows, n_chunk_n_cols,
         (size_t) (vtcm_ptr - (uint8_t *) ctx->vtcm_base), vtcm_budget);

    HAP_compute_res_hmx_lock(ctx->vtcm_rctx);

    if (!use_pipeline) {
        for (size_t mr = 0; mr < m; mr += m_chunk_n_rows) {
            // transfer activation matrix chunk into VTCM
            size_t n_rows = hex_smin(m - mr, m_chunk_n_rows);

            TIMER_START(activation_load);
            {
                const float *activation_chunk = activation + mr * k;
                transfer_activation_chunk_threaded(ctx, vtcm_activation, activation_chunk, n_rows, k, k);
            }
            TIMER_STOP(activation_load);

            void *buf_curr = vtcm_scratch0;
            void *buf_next = vtcm_scratch1;

            // issue async DDR data transfer for the first weight chunk
            // NOTE: use 2D DMA (n_cols rows x row_stride bytes) instead of 1D
            // because UDMA roiwidth is 16-bit and total size can exceed 65535.
            {
                const size_t n_cols_first = hex_smin(n, n_chunk_n_cols);
                dma_queue_push_chained(ctx->dma[0], dma_make_ptr(buf_curr, permuted_weight), row_stride, row_stride, row_stride, n_cols_first);
            }

            for (size_t nc = 0; nc < n; nc += n_chunk_n_cols) {
                size_t n_cols = hex_smin(n - nc, n_chunk_n_cols);

                TIMER_START(weight_load);
                {
                    dma_queue_pop(ctx->dma[0]); // wait until current weight chunk become ready

                    const size_t nc_next = nc + n_chunk_n_cols;
                    if (nc_next < n) {
                        const size_t n_cols_next = hex_smin(n - nc_next, n_chunk_n_cols);

                        const uint8_t *next_weight_chunk = permuted_weight + nc_next * row_stride;

                        dma_queue_push_chained(ctx->dma[0], dma_make_ptr(buf_next, next_weight_chunk), row_stride, row_stride, row_stride, n_cols_next);
                    }

                    // Dequant + vscatter writes directly to [K, N] transposed tiles.
                    // HMX computes C = A x B, where A=[M,K] activation, B=[K,N] weight.
                    dequantize_x4x2_weight_chunk_to_fp16_tiles(ctx, vtcm_weight, buf_curr, n_cols, k, row_stride, weight_type);

                    swap_ptr(&buf_curr, &buf_next);
                }
                TIMER_STOP(weight_load);

                TIMER_START(hmx_core);
                {
                    const int n_row_tiles = hmx_ceil_div(n_rows, HMX_FP16_TILE_N_ROWS);
                    const int n_col_tiles = hmx_ceil_div(n_cols, HMX_FP16_TILE_N_COLS);
                    core_dot_chunk_fp16(vtcm_output, vtcm_activation, vtcm_weight, vtcm_scales, n_row_tiles, n_col_tiles, k / 32);
                }
                TIMER_STOP(hmx_core);

                TIMER_START(output_store);
                {
                    float *output = dst + (mr * n + nc);
                    transfer_output_chunk_threaded(ctx, output, vtcm_output, n_rows, n_cols, n);
                }
                TIMER_STOP(output_store);
            }
        }
    } else {
        // 4-stage pipeline: DMA load (A), dequantize (B), HMX matmul (C), store (D)
        // stage B and D (dequantize and store) are expected to be on the critical path
        //
        // A --> B: vtcm_qweight, 1 buffer
        // B --> C: vtcm_weight0/vtcm_weight1, 2 buffers
        // C --> D: vtcm_output0/vtcm_output1, 2 buffers
        //
        // LD ||A3|   | B3 ||
        // MM ||   C2     ||
        // ST || D1 |      ||
        int n_chunk_cnt = hmx_ceil_div(n, n_chunk_n_cols);

        for (size_t mr = 0; mr < m; mr += m_chunk_n_rows) {
            const size_t n_rows = hex_smin(m - mr, m_chunk_n_rows);

            void *vtcm_qweight        = vtcm_weight;
            void *vtcm_weight_bufs[2] = { vtcm_scratch0, vtcm_scratch1 };
            void *vtcm_output_bufs[2] = { vtcm_output, vtcm_scratch2 };

            // prologue: A0
            const size_t n_cols_A0 = hex_smin(n - 0 * n_chunk_n_cols, n_chunk_n_cols);
            {
                // Use 2D DMA (n_cols rows x row_stride) to avoid 16-bit roiwidth overflow.
                const uint8_t *qweight_chunk_A0 = permuted_weight;
                dma_queue_push_chained(ctx->dma[0], dma_make_ptr(vtcm_qweight, qweight_chunk_A0), row_stride, row_stride, row_stride, n_cols_A0);
            }

            {
                const float *activation_chunk = activation + mr * k;
                transfer_activation_chunk_threaded(ctx, vtcm_activation, activation_chunk, n_rows, k, k);
            }

            // prologue: B0, A1, C0, B1
            {
                // B0
                dma_queue_pop(ctx->dma[0]);
                dequantize_x4x2_weight_chunk_to_fp16_tiles(ctx, vtcm_weight_bufs[0], vtcm_qweight, n_cols_A0, k, row_stride, weight_type);

                // A1
                const size_t n_cols_A1 = hex_smin(n - 1 * n_chunk_n_cols, n_chunk_n_cols);
                if (1 < n_chunk_cnt) {
                    const uint8_t *qweight_chunk_A1 = permuted_weight + n_chunk_n_cols * row_stride;
                    dma_queue_push_chained(ctx->dma[0], dma_make_ptr(vtcm_qweight, qweight_chunk_A1), row_stride, row_stride, row_stride, n_cols_A1);
                }

                // C0
                core_dot_chunk_fp16((__fp16 *) vtcm_output_bufs[0], (__fp16 *) vtcm_activation, (__fp16 *) vtcm_weight_bufs[0], vtcm_scales,
                                    hmx_ceil_div(n_rows, HMX_FP16_TILE_N_ROWS), hmx_ceil_div(n_cols_A0, HMX_FP16_TILE_N_COLS), k / HMX_FP16_TILE_N_ROWS);

                // B1
                if (1 < n_chunk_cnt) {
                    dma_queue_pop(ctx->dma[0]);
                    dequantize_x4x2_weight_chunk_to_fp16_tiles(ctx, vtcm_weight_bufs[1], vtcm_qweight, n_cols_A1, k, row_stride, weight_type);
                }
            }

            // main loop
            for (int i = 0; i < n_chunk_cnt; ++i) {
                const size_t nc    = i * n_chunk_n_cols;
                const size_t nc_p1 = nc + 1 * n_chunk_n_cols;
                const size_t nc_p2 = nc + 2 * n_chunk_n_cols;

                const size_t n_cols    = hex_smin(n - nc, n_chunk_n_cols);
                const size_t n_cols_p1 = hex_smin(n - nc_p1, n_chunk_n_cols);
                const size_t n_cols_p2 = hex_smin(n - nc_p2, n_chunk_n_cols);

                // issue A_{i+2}
                if (i + 2 < n_chunk_cnt) {
                    const uint8_t *qweight_chunk_p2 = permuted_weight + nc_p2 * row_stride;
                    dma_queue_push_chained(ctx->dma[0], dma_make_ptr(vtcm_qweight, qweight_chunk_p2), row_stride, row_stride, row_stride, n_cols_p2);
                }

                // wait for HMX (C_{i}) -- C_{i} is done
                // result of B_{i+1} (input of C_{i+1}) should be ready now

                // issue C_{i+1}
                if (i + 1 < n_chunk_cnt) {
                    core_dot_chunk_fp16((__fp16 *) vtcm_output_bufs[(i + 1) % 2], (__fp16 *) vtcm_activation, (__fp16 *) vtcm_weight_bufs[(i + 1) % 2], vtcm_scales,
                                        hmx_ceil_div(n_rows, HMX_FP16_TILE_N_ROWS), hmx_ceil_div(n_cols_p1, HMX_FP16_TILE_N_COLS), k / HMX_FP16_TILE_N_ROWS);
                }

                // compute D_{i}
                float *output_chunk = dst + (mr * n + nc);
                transfer_output_chunk_threaded(ctx, output_chunk, vtcm_output_bufs[i % 2], n_rows, n_cols, n);

                // wait for DMA (A_{i+2}), compute B_{i+2}
                if (i + 2 < n_chunk_cnt) {
                    dma_queue_pop(ctx->dma[0]);
                    dequantize_x4x2_weight_chunk_to_fp16_tiles(ctx, vtcm_weight_bufs[(i + 2) % 2], vtcm_qweight, n_cols_p2, k, row_stride, weight_type);
                }
            }
        }
    }

    HAP_compute_res_hmx_unlock(ctx->vtcm_rctx);

    TIMER_STOP(total);

#if defined(ENABLE_PROFILE_TIMERS)
    FARF(HIGH, "%s: %lld us, m=%d k=%d n=%d pipeline=%d", __func__, TIMER_US(total), m, k, n, use_pipeline);
    if (!use_pipeline) {
        FARF(HIGH, " activation_load: %lld us, weight_load: %lld us, hmx_core: %lld us, output_store: %lld us",
             TIMER_US(activation_load), TIMER_US(weight_load), TIMER_US(hmx_core), TIMER_US(output_store));
        size_t weight_size = (size_t) n * row_stride;
        float  bandwidth   = 1e-3f * weight_size / (float) TIMER_US(weight_load);
        FARF(HIGH, " weight load bandwidth: %.2f GB/s", bandwidth);
    }
#endif

    return 0;
}
// C += AB
// Tile-level multiply-accumulate: for each (i, j) output tile, clear the HMX
// accumulator, optionally pre-load the existing C tile (multiplied by the
// identity tile so the accumulator starts at C), feed n_dot_tiles A/B tile
// pairs, then write the accumulator back to C. zero_init skips the pre-load
// on the first K-block of an out-stationary accumulation.
void core_mma_chunk_fp16(__fp16 *c, const __fp16 *a, const __fp16 *b, const __fp16 *col_scales, const __fp16 *eye_tile,
                         int n_row_tiles, int n_col_tiles, int n_dot_tiles, bool zero_init) {

    hmx_set_output_scales(col_scales);

    for (int i = 0; i < n_row_tiles; ++i) {
        for (int j = 0; j < n_col_tiles; ++j) {
            Q6_mxclracc_hf();

            const __fp16 *row_tiles = a + i * n_dot_tiles * HMX_FP16_TILE_N_ELMS;
            const __fp16 *col_tiles = b + j * n_dot_tiles * HMX_FP16_TILE_N_ELMS;

            __fp16 *accum_tile = c + (i * n_col_tiles + j) * HMX_FP16_TILE_N_ELMS;
            if (!zero_init) {
                // seed accumulator with the current C tile: C_tile x I
                hmx_load_tile_pair_fp16(accum_tile, eye_tile);
            }

            for (int k = 0; k < n_dot_tiles; ++k) {
                int offset = k * HMX_FP16_TILE_N_ELMS;
                hmx_load_tile_pair_fp16(row_tiles + offset, col_tiles + offset);
            }

            hmx_consume_accumulator_fp16(accum_tile);
        }
    }
}

// Convert a row-major fp32 activation chunk (row stride k_stride) into
// tile-major fp16 in VTCM. Processes two source rows per iteration because
// the fp32->fp16 shuffle packs a row pair into one HVX vector; odd trailing
// rows are padded with zeros.
static void transfer_activation_chunk_fp32_to_fp16(__fp16 *restrict vtcm_dst, const float *restrict src, int n_rows,
                                                   int k_block, int k_stride) {
    for (int r = 0; r < n_rows; r += 2) {
        int r0 = r / HMX_FP16_TILE_N_ROWS; // tile row index
        int r1 = r % HMX_FP16_TILE_N_ROWS; // intra-tile row idx

        const bool next_row_valid = (r + 1) < n_rows;

        // FIX: promote row * stride to size_t -- 32-bit int products overflow
        // (UB) for large row counts with big strides.
        const HVX_Vector *pv_in0 = (const HVX_Vector *) (src + (size_t) (r + 0) * k_stride);
        const HVX_Vector *pv_in1 = (const HVX_Vector *) (src + (size_t) (r + 1) * k_stride);
        for (int c = 0; c < k_block; c += 32) {
            HVX_Vector v0 = *pv_in0++;
            HVX_Vector v1 = next_row_valid ? *pv_in1++ : Q6_V_vzero(); // zero-pad odd tail row

            HVX_Vector v_out = hvx_vec_f32_to_f16_shuff(v0, v1);

            // compute output position
            int c0       = c / HMX_FP16_TILE_N_COLS; // tile column index
            int tile_idx = r0 * (k_block / HMX_FP16_TILE_N_COLS) + c0;

            HVX_Vector *tile = (HVX_Vector *) (vtcm_dst + tile_idx * HMX_FP16_TILE_N_ELMS);
            tile[r1 / 2] = v_out;
        }
    }
}

// Shared state for the multi-threaded fp32 -> fp16 activation transfer.
typedef struct {
    __fp16      *dst;               // tile-major fp16 destination in VTCM
    const float *src;               // row-major fp32 source
    int          n_tasks;           // number of row-chunk tasks
    int          n_tot_chunks;      // total rows (one chunk == one row)
    int          n_chunks_per_task; // rows per task
    int          k_block;           // columns to convert per row
    int          k_stride;          // source row stride in elements
} activation_transfer_task_state_t;

// Worker entry: strided task distribution across the pool.
static void transfer_activation_chunk_worker_fn(unsigned int n, unsigned int i, void *data) {
    activation_transfer_task_state_t *st = (activation_transfer_task_state_t *) data;

    for (unsigned int task_id = i; task_id < (unsigned int) st->n_tasks; task_id += n) {
        // one chunk: one row
        const int    chunk_idx  = task_id * st->n_chunks_per_task;
        const size_t chunk_size = hex_smin(st->n_tot_chunks - chunk_idx, st->n_chunks_per_task);

        // FIX: size_t promotion -- chunk_idx * k_block / k_stride can overflow
        // 32-bit int for large activations.
        __fp16      *dst = st->dst + (size_t) chunk_idx * st->k_block;
        const float *src = st->src + (size_t) chunk_idx * st->k_stride;
        transfer_activation_chunk_fp32_to_fp16(dst, src, chunk_size, st->k_block, st->k_stride);
    }
}

// Fan the fp32 -> fp16 activation conversion out across the worker pool.
void transfer_activation_chunk_threaded(struct htp_context *ctx, __fp16 *dst, const float *src, int n_rows, int k_block, int k_stride) {
    assert(k_block % HMX_FP16_TILE_N_COLS == 0 && k_stride % HMX_FP16_TILE_N_COLS == 0);
    assert(VLEN == 32 * sizeof(float));

    size_t n_tot_chunks      = n_rows;
    size_t n_chunks_per_task = 32; // must be multiple of 32 to ensure correct destination address

    activation_transfer_task_state_t state;
    state.n_tasks           = (n_tot_chunks + n_chunks_per_task - 1) / n_chunks_per_task;
    state.n_tot_chunks      = n_tot_chunks;
    state.n_chunks_per_task = n_chunks_per_task;
    state.dst               = dst;
    state.src               = src;
    state.k_block           = k_block;
    state.k_stride          = k_stride;

    worker_pool_run_func(ctx->worker_pool, transfer_activation_chunk_worker_fn, &state, ctx->n_threads);
}

int
mat_mul_qk_0_d16a32_out_stationary(struct htp_context *ctx, float *restrict out, const float *restrict x, const uint8_t *restrict w, int m, + int k, int n, int weight_type) { + // Runtime check -- k >= 16384 exceeds 2D DMA limit + if (k >= 16384) { + FARF(HIGH, "%s: k=%d exceeds 2D DMA limit", __func__, k); + return -1; + } + // assume k % 32 == 0 && n % 32 == 0 + const size_t row_stride = get_x4x2_row_stride(weight_type, k); + if (row_stride == 0) { + return -1; + } + + const size_t vtcm_budget = ctx->vtcm_scratch_size; + + const size_t M_BLOCK_SIZE = 512; + const size_t N_BLOCK_SIZE = 512; + const size_t K_BLOCK_SIZE = 512; + + // Compute precise buffer sizes + const size_t sub_row_stride_alloc = get_x4x2_row_stride(weight_type, K_BLOCK_SIZE); + const size_t weight_size = hex_align_up(N_BLOCK_SIZE * K_BLOCK_SIZE * sizeof(__fp16), HMX_FP16_TILE_SIZE); + const size_t act_size = hex_align_up(M_BLOCK_SIZE * K_BLOCK_SIZE * sizeof(__fp16), HMX_FP16_TILE_SIZE); + const size_t out_size = hex_align_up(M_BLOCK_SIZE * N_BLOCK_SIZE * sizeof(__fp16), HMX_FP16_TILE_SIZE); + const size_t scratch0_sz = hex_align_up(N_BLOCK_SIZE * sub_row_stride_alloc, HMX_FP16_TILE_SIZE); + const size_t scratch1_sz = hex_align_up(M_BLOCK_SIZE * K_BLOCK_SIZE * sizeof(float), HMX_FP16_TILE_SIZE); + + const size_t total_vtcm = weight_size + act_size + out_size + scratch0_sz + scratch1_sz + HMX_FP16_TILE_SIZE + 256; + if (total_vtcm > vtcm_budget) { + FARF(HIGH, "%s: VTCM too small: need %zu have %zu (m=%d k=%d n=%d)", __func__, total_vtcm, vtcm_budget, m, k, n); + return -1; + } + + uint8_t *vtcm_ptr = (uint8_t *) ctx->vtcm_base; + __fp16 *vtcm_weight = (__fp16 *) vtcm_seq_alloc(&vtcm_ptr, weight_size); + __fp16 *vtcm_activation = (__fp16 *) vtcm_seq_alloc(&vtcm_ptr, act_size); + __fp16 *vtcm_output = (__fp16 *) vtcm_seq_alloc(&vtcm_ptr, out_size); + uint8_t *vtcm_scratch0 = vtcm_seq_alloc(&vtcm_ptr, scratch0_sz); + uint8_t *vtcm_scratch1 = vtcm_seq_alloc(&vtcm_ptr, scratch1_sz); + __fp16 
*vtcm_eye_tile = (__fp16 *) vtcm_seq_alloc(&vtcm_ptr, HMX_FP16_TILE_SIZE); + __fp16 *vtcm_scales = (__fp16 *) vtcm_seq_alloc(&vtcm_ptr, 256); + assert((size_t)(vtcm_ptr - (uint8_t *)ctx->vtcm_base) <= vtcm_budget); + + FARF(MEDIUM, "%s: m=%d k=%d n=%d wtype=%d vtcm=%zu/%zu", + __func__, m, k, n, weight_type, + (size_t)(vtcm_ptr - (uint8_t *)ctx->vtcm_base), vtcm_budget); + + // initialize eye tile (32x32 identity matrix) + { + HVX_Vector v; + v = Q6_V_vzero(); + v = Q6_Vw_vinsert_VwR(v, 0x3c000000); + v = Q6_V_vror_VR(v, VLEN - 4); + v = Q6_Vw_vinsert_VwR(v, 0x00003c00); + for (int i = 0; i < 16; ++i) { + ((HVX_Vector *) vtcm_eye_tile)[i] = v; + v = Q6_V_vror_VR(v, VLEN - 8); + } + } + hmx_init_column_scales(vtcm_scales, Q6_V_vsplat_R(0x3c00)); // fp16: 1.0 + + TIMER_DEFINE(fetch); + TIMER_DEFINE(act_load); + TIMER_DEFINE(wt_dequant); + TIMER_DEFINE(core); + + HAP_compute_res_hmx_lock(ctx->vtcm_rctx); + + for (size_t mr = 0; mr < m; mr += M_BLOCK_SIZE) { + size_t m_blk_sz = hex_smin(m - mr, M_BLOCK_SIZE); + for (size_t nc = 0; nc < n; nc += N_BLOCK_SIZE) { + size_t n_blk_sz = hex_smin(n - nc, N_BLOCK_SIZE); + + const int n_row_tiles = hmx_ceil_div(m_blk_sz, HMX_FP16_TILE_N_ROWS); + const int n_col_tiles = hmx_ceil_div(n_blk_sz, HMX_FP16_TILE_N_COLS); + + for (size_t kk = 0; kk < k; kk += K_BLOCK_SIZE) { + size_t k_blk_sz = hex_smin(k - kk, K_BLOCK_SIZE); + + TIMER_START(fetch); + // fetch activation block into VTCM + { + const float *activation_block = x + mr * k + kk; + + dma_queue_push_chained(ctx->dma[0], + dma_make_ptr(vtcm_scratch1, activation_block), + k_blk_sz * sizeof(float), + k * sizeof(float), + k_blk_sz * sizeof(float), + m_blk_sz); + } + + // fetch weight block into VTCM (x4x2 sub-block: quants + scales) + { + qweight_fetch_task_state_t s; + + const bool is_q4 = (weight_type == HTP_TYPE_Q4_0 || weight_type == HTP_TYPE_IQ4_NL); + const int blk_start = kk / QK_Q4_0x4x2; + const int nb_sub = (k_blk_sz + QK_Q4_0x4x2 - 1) / QK_Q4_0x4x2; + const int 
full_qrow = is_q4 ? (k / 2) : k; + const size_t sub_row_stride = get_x4x2_row_stride(weight_type, k_blk_sz); + + s.dst = vtcm_scratch0; + s.src = w + nc * row_stride; + s.n_rows = n_blk_sz; + s.src_stride = row_stride; + s.dst_stride = sub_row_stride; + s.quant_off = is_q4 ? (blk_start * (QK_Q4_0x4x2 / 2)) : (blk_start * QK_Q8_0x4x2); + s.quant_width = is_q4 ? (nb_sub * (QK_Q4_0x4x2 / 2)) : (nb_sub * QK_Q8_0x4x2); + s.scale_off = full_qrow + blk_start * HMX_X4X2_DBLK_SIZE; + s.scale_width = nb_sub * HMX_X4X2_DBLK_SIZE; + + // 2D DMA: quants sub-range + dma_queue_push_chained(ctx->dma[0], dma_make_ptr(s.dst, s.src + s.quant_off), + s.dst_stride, s.src_stride, s.quant_width, s.n_rows); + // 2D DMA: scales sub-range + dma_queue_push_chained(ctx->dma[0], dma_make_ptr(s.dst + s.quant_width, s.src + s.scale_off), + s.dst_stride, s.src_stride, s.scale_width, s.n_rows); + } + TIMER_STOP(fetch); + + TIMER_START(act_load); + // load activation block + { + dma_queue_pop(ctx->dma[0]); // wait for act DMA + transfer_activation_chunk_threaded(ctx, vtcm_activation, (float *) vtcm_scratch1, m_blk_sz, k_blk_sz, k_blk_sz); + } + TIMER_STOP(act_load); + + TIMER_START(wt_dequant); + // dequantize weight block + { + dma_queue_pop(ctx->dma[0]); + dma_queue_pop(ctx->dma[0]); + // vtcm_scratch0 is used to store the qweight chunk + // worker_pool_run_func already returned, so fetch is done + const size_t sub_row_stride = get_x4x2_row_stride(weight_type, k_blk_sz); + dequantize_x4x2_weight_chunk_to_fp16_tiles(ctx, vtcm_weight, vtcm_scratch0, + n_blk_sz, k_blk_sz, sub_row_stride, weight_type); + } + TIMER_STOP(wt_dequant); + + // core mma + TIMER_START(core); + { + core_mma_chunk_fp16(vtcm_output, vtcm_activation, vtcm_weight, vtcm_scales, vtcm_eye_tile, n_row_tiles, + n_col_tiles, k_blk_sz / HMX_FP16_TILE_N_COLS, kk == 0); + } + TIMER_STOP(core); + } + + // store output block + { + float *output_block = out + (mr * n + nc); + transfer_output_chunk_threaded(ctx, output_block, vtcm_output, 
m_blk_sz, n_blk_sz, n); + } + } + } + + HAP_compute_res_hmx_unlock(ctx->vtcm_rctx); + +#if defined(ENABLE_PROFILE_TIMERS) + FARF(HIGH, "fetch: %lld us, act_load: %lld us, wt_dequant: %lld us, core: %lld us", + TIMER_US(fetch), TIMER_US(act_load), TIMER_US(wt_dequant), TIMER_US(core)); +#endif + return 0; +} diff --git a/ggml/src/ggml-hexagon/htp/hmx-ops.h b/ggml/src/ggml-hexagon/htp/hmx-ops.h new file mode 100644 index 0000000000..b36c8d129b --- /dev/null +++ b/ggml/src/ggml-hexagon/htp/hmx-ops.h @@ -0,0 +1,72 @@ +// HMX operation entry-point declarations. +// Ported from htp-ops-lib/include/dsp/ops.h (renamed, benchmark kernels removed). (https://github.com/haozixu/htp-ops-lib) + +#ifndef HMX_OPS_H +#define HMX_OPS_H + +#include +#include + +#ifndef restrict +# define restrict __restrict +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +struct htp_context; // forward declaration + +typedef struct { + float *dst; + const float *activation; + const __fp16 *permuted_weight; + int m; + int k; + int n; + int act_stride; + int weight_stride; + int dst_stride; + int ne02; + int ne03; + int ne12; + int ne13; + size_t src0_nb2; + size_t src0_nb3; + size_t src1_nb2; + size_t src1_nb3; + size_t dst_nb2; + size_t dst_nb3; +} hmx_matmul_w16a32_batched_params_t; + +// HMX matrix multiplication — tile-permuted FP16 weights, FP32 activation/output +// act_stride: activation row stride in elements (= k for contiguous, or +// nb[1]/sizeof(float) for permuted tensors like attention Q). +// weight_stride: weight row stride in elements (= k for compact weights, or +// nb[1]/sizeof(__fp16) for permuted KV-cache views used by QK). +int hmx_mat_mul_permuted_w16a32(struct htp_context *ctx, + float *restrict dst, + const float *activation, + const __fp16 *permuted_weight, + int m, int k, int n, + int act_stride, + int weight_stride); + +// Batched F16 wrapper over hmx_mat_mul_permuted_w16a32. +// Batch semantics match ggml_mul_mat(): src0 broadcasts to src1 in dims 2/3. 
+int hmx_mat_mul_permuted_w16a32_batched(struct htp_context *ctx, + const hmx_matmul_w16a32_batched_params_t *params); + +// HMX matrix multiplication — tile-permuted quantised weights (Q4_0/Q8_0/IQ4_NL) +int hmx_mat_mul_permuted_qk_0_d16a32(struct htp_context *ctx, + float *restrict dst, + const float *activation, + const uint8_t *permuted_weight, + int m, int k, int n, + int weight_type); + +#ifdef __cplusplus +} +#endif + +#endif // HMX_OPS_H diff --git a/ggml/src/ggml-hexagon/htp/hmx-profile.h b/ggml/src/ggml-hexagon/htp/hmx-profile.h new file mode 100644 index 0000000000..01eece720c --- /dev/null +++ b/ggml/src/ggml-hexagon/htp/hmx-profile.h @@ -0,0 +1,34 @@ +// Conditional fine-grained profiling macros for HMX operations. +// +// Define ENABLE_PROFILE_TIMERS (via compiler flag or before including this +// header) to instrument sub-operation latencies with HAP qtimer. When the +// macro is not defined the TIMER_* helpers expand to nothing so there is zero +// overhead. +// +// Usage: +// TIMER_DEFINE(my_phase); // declare accumulator variable +// TIMER_START(my_phase); // snapshot start time +// ... work ... 
+// TIMER_STOP(my_phase); // accumulate elapsed ticks +// FARF(ALWAYS, "my_phase: %lld us", TIMER_US(my_phase)); + +#ifndef HMX_PROFILE_H +#define HMX_PROFILE_H + +#include + +// #define ENABLE_PROFILE_TIMERS + +#if defined(ENABLE_PROFILE_TIMERS) +# define TIMER_DEFINE(name) int64_t name##_ticks = 0 +# define TIMER_START(name) int64_t name##_t0 = HAP_perf_get_qtimer_count() +# define TIMER_STOP(name) name##_ticks += HAP_perf_get_qtimer_count() - name##_t0 +# define TIMER_US(name) HAP_perf_qtimer_count_to_us(name##_ticks) +#else +# define TIMER_DEFINE(name) +# define TIMER_START(name) +# define TIMER_STOP(name) +# define TIMER_US(name) 0LL +#endif + +#endif // HMX_PROFILE_H diff --git a/ggml/src/ggml-hexagon/htp/hmx-utils.h b/ggml/src/ggml-hexagon/htp/hmx-utils.h new file mode 100644 index 0000000000..aacfbcda28 --- /dev/null +++ b/ggml/src/ggml-hexagon/htp/hmx-utils.h @@ -0,0 +1,88 @@ +// HMX tile-level inline helpers (FP16 32x32 tile operations). +// Ported from htp-ops-lib/include/dsp/hmx_utils.h. (https://github.com/haozixu/htp-ops-lib) + +#ifndef HMX_UTILS_H +#define HMX_UTILS_H + +#include +#include + +#define HMX_FP16_TILE_N_ROWS 32 +#define HMX_FP16_TILE_N_COLS 32 +#define HMX_FP16_TILE_N_ELMS 1024 +#define HMX_FP16_TILE_SIZE 2048 + +#define HMX_INLINE_ALWAYS inline __attribute__((unused, always_inline)) + +static HMX_INLINE_ALWAYS void hmx_set_output_scales(const void *scales) { + asm volatile("bias = mxmem2(%0)" :: "r"(scales)); +} + +// Initialise aligned 256-byte area with scale vector + zero padding. +static HMX_INLINE_ALWAYS void hmx_init_column_scales(void *out_scales, HVX_Vector v_scale) { + HVX_Vector *pv = (HVX_Vector *)out_scales; + *pv++ = v_scale; + *pv = Q6_V_vzero(); +} + +// Load multiple contiguous tiles with :deep streaming. +// Rt = total region size - 1; the hardware streams through [Rs, Rs + Rt]. 
+// IMPORTANT: the tile region [Rs, Rs + Rt] must NOT cross a VTCM 4 MB bank +// boundary, otherwise the mxmem instruction will raise a precise bus error. +// Callers must ensure their VTCM layout satisfies this constraint. +static HMX_INLINE_ALWAYS void hmx_load_tiles_fp16(const __fp16 *row_tiles, + const __fp16 *col_tiles, + size_t n_tiles) { + size_t limit = n_tiles * HMX_FP16_TILE_SIZE - 1; + asm volatile( + "{ activation.hf = mxmem(%0, %1):deep\n" + "weight.hf = mxmem(%2, %3) }\n" + :: "r"(row_tiles), "r"(limit), "r"(col_tiles), "r"(limit) + : "memory"); +} + +// Load a single activation+weight tile pair (no :deep streaming). +// Rt defines the accessible region [Rs, Rs+Rt]. Following the reference formula +// (limit = n_tiles * HMX_FP16_TILE_SIZE - 1), for a single tile Rt = 2047. +// The original code used Rt=0x7FFF (32 KB region); when dynamic VTCM allocation +// places a tile near a 4 MB bank boundary, the oversized region crosses it and +// triggers a precise bus error (0x2601). Rt=2047 confines accesses to exactly +// one 2048-byte tile while covering all 16 HVX vectors (offsets 0..2047). +static HMX_INLINE_ALWAYS void hmx_load_tile_pair_fp16(const __fp16 *act_tile, + const __fp16 *wt_tile) { + asm volatile( + "{ activation.hf = mxmem(%0, %1)\n" + "weight.hf = mxmem(%2, %3) }\n" + :: "r"(act_tile), "r"(2047), + "r"(wt_tile), "r"(2047) + : "memory"); +} + +static HMX_INLINE_ALWAYS void hmx_consume_accumulator_fp16(__fp16 *out) { + // Use the combined convert-and-store instruction (matches the reference + // Q6_mxmem_AR_after_hf intrinsic). The previous two-instruction sequence + // "cvt.hf = acc(2); mxmem = cvt" used an undocumented Rs=2 parameter. + asm volatile( + "mxmem(%0, %1):after.hf = acc\n" + :: "r"(out), "r"(0) + : "memory"); +} + +// Compute inner product of two vectors of tiles and store result. 
+static HMX_INLINE_ALWAYS void hmx_dot_fp16(__fp16 *out, + const __fp16 *row_tiles, + const __fp16 *col_tiles, + size_t n_tiles) { + hmx_load_tiles_fp16(row_tiles, col_tiles, n_tiles); + hmx_consume_accumulator_fp16(out); +} + +// --- VTCM sequential allocator (from htp-ops-lib/include/dsp/vtcm_mgr.h) --- + +static inline uint8_t *vtcm_seq_alloc(uint8_t **vtcm_ptr, size_t size) { + uint8_t *p = *vtcm_ptr; + *vtcm_ptr += size; + return p; +} + +#endif // HMX_UTILS_H diff --git a/ggml/src/ggml-hexagon/htp/htp-ctx.h b/ggml/src/ggml-hexagon/htp/htp-ctx.h index a707d98239..a92acfa0a8 100644 --- a/ggml/src/ggml-hexagon/htp/htp-ctx.h +++ b/ggml/src/ggml-hexagon/htp/htp-ctx.h @@ -30,6 +30,12 @@ struct htp_context { atomic_bool vtcm_needs_release; uint32_t opmask; + + // HMX acceleration fields (v73+, enabled by compile-time HTP_HAS_HMX) +#ifdef HTP_HAS_HMX + int hmx_enabled; // Runtime flag: HMX initialisation succeeded + size_t vtcm_scratch_size; // Usable dynamic scratch (vtcm_size minus tail reservation) +#endif }; #endif /* HTP_CTX_H */ diff --git a/ggml/src/ggml-hexagon/htp/htp-msg.h b/ggml/src/ggml-hexagon/htp/htp-msg.h index 56bc5b622c..391148be0e 100644 --- a/ggml/src/ggml-hexagon/htp/htp-msg.h +++ b/ggml/src/ggml-hexagon/htp/htp-msg.h @@ -32,13 +32,14 @@ enum htp_status { // Duplicated here because we can't include full ggml.h in the htp build. // We have some static_asserts in the cpp code to ensure things are in sync. 
enum htp_data_type { - HTP_TYPE_F32 = 0, - HTP_TYPE_F16 = 1, - HTP_TYPE_Q4_0 = 2, - HTP_TYPE_Q8_0 = 8, - HTP_TYPE_I32 = 26, - HTP_TYPE_I64 = 27, - HTP_TYPE_MXFP4 = 39, + HTP_TYPE_F32 = 0, + HTP_TYPE_F16 = 1, + HTP_TYPE_Q4_0 = 2, + HTP_TYPE_Q8_0 = 8, + HTP_TYPE_IQ4_NL = 20, + HTP_TYPE_I32 = 26, + HTP_TYPE_I64 = 27, + HTP_TYPE_MXFP4 = 39, HTP_TYPE_COUNT }; @@ -87,6 +88,8 @@ static inline size_t htp_t_block_size(uint32_t t) { return QK4_0; case HTP_TYPE_Q8_0: return QK8_0; + case HTP_TYPE_IQ4_NL: + return QK4_NL; case HTP_TYPE_MXFP4: return QK_MXFP4; default: @@ -105,6 +108,8 @@ static inline size_t htp_type_nbytes(uint32_t t) { return sizeof(block_q4_0); case HTP_TYPE_Q8_0: return sizeof(block_q8_0); + case HTP_TYPE_IQ4_NL: + return sizeof(block_iq4_nl); case HTP_TYPE_MXFP4: return sizeof(block_mxfp4); default: diff --git a/ggml/src/ggml-hexagon/htp/htp_iface.idl b/ggml/src/ggml-hexagon/htp/htp_iface.idl index 9ebd937e46..2dc716cb44 100644 --- a/ggml/src/ggml-hexagon/htp/htp_iface.idl +++ b/ggml/src/ggml-hexagon/htp/htp_iface.idl @@ -7,7 +7,7 @@ #include "remote.idl" interface htp_iface : remote_handle64 { - AEEResult start(in uint32 sess_id, in uint64 dsp_queue_id, in uint32 n_hvx); + AEEResult start(in uint32 sess_id, in uint64 dsp_queue_id, in uint32 n_hvx, in uint32 use_hmx); AEEResult stop(); AEEResult enable_etm(); AEEResult disable_etm(); diff --git a/ggml/src/ggml-hexagon/htp/hvx-base.h b/ggml/src/ggml-hexagon/htp/hvx-base.h index 3e6a8579b1..db05ab40d2 100644 --- a/ggml/src/ggml-hexagon/htp/hvx-base.h +++ b/ggml/src/ggml-hexagon/htp/hvx-base.h @@ -9,6 +9,9 @@ #include "hex-utils.h" #include "hvx-types.h" +#define hvx_vmem(A) *((HVX_Vector *)(A)) +#define hvx_vmemu(A) *((HVX_UVector *)(A)) + static inline void hvx_vec_store_u(void * restrict dst, uint32_t n, HVX_Vector v) { // Rotate as needed. 
v = Q6_V_vlalign_VVR(v, v, (size_t) dst); @@ -112,11 +115,15 @@ static inline HVX_VectorPred hvx_vec_is_nan_f16(HVX_Vector v) { return Q6_Q_and_QQ(p_exp, p_frac); } -static inline HVX_Vector hvx_vec_f32_to_f16(HVX_Vector v0, HVX_Vector v1) { - const HVX_Vector zero = Q6_V_vsplat_R(0); +static inline HVX_Vector hvx_vec_f32_to_f16_shuff(HVX_Vector v0, HVX_Vector v1) { + const HVX_Vector zero = Q6_V_vzero(); HVX_Vector q0 = Q6_Vqf32_vadd_VsfVsf(v0, zero); HVX_Vector q1 = Q6_Vqf32_vadd_VsfVsf(v1, zero); - HVX_Vector v = Q6_Vh_vdeal_Vh(Q6_Vhf_equals_Wqf32(Q6_W_vcombine_VV(q1, q0))); + return Q6_Vhf_equals_Wqf32(Q6_W_vcombine_VV(q1, q0)); +} + +static inline HVX_Vector hvx_vec_f32_to_f16(HVX_Vector v0, HVX_Vector v1) { + HVX_Vector v = Q6_Vh_vdeal_Vh(hvx_vec_f32_to_f16_shuff(v0, v1)); #if __HVX_ARCH__ < 79 // replace NaNs with -INF, older arches produce NaNs for (-INF + 0.0) @@ -128,6 +135,30 @@ static inline HVX_Vector hvx_vec_f32_to_f16(HVX_Vector v0, HVX_Vector v1) { return v; } +#if __HVX_ARCH__ >= 79 +static inline HVX_VectorPair hvx_vec_f16_to_f32_shuff(HVX_Vector v) { + const HVX_Vector one = hvx_vec_splat_f16(1.0); + HVX_VectorPair p = Q6_Wsf_vmpy_VhfVhf(v, one); + return Q6_W_vcombine_VV(Q6_V_hi_W(p), Q6_V_lo_W(p)); +} +static inline HVX_VectorPair hvx_vec_f16_to_f32(HVX_Vector v) { + const HVX_Vector one = hvx_vec_splat_f16(1.0); + HVX_VectorPair p = Q6_Wsf_vmpy_VhfVhf(Q6_Vh_vshuff_Vh(v), one); + return Q6_W_vcombine_VV(Q6_V_hi_W(p), Q6_V_lo_W(p)); +} +#else +static inline HVX_VectorPair hvx_vec_f16_to_f32_shuff(HVX_Vector v) { + const HVX_Vector one = hvx_vec_splat_f16(1.0); + HVX_VectorPair p = Q6_Wqf32_vmpy_VhfVhf(v, one); + return Q6_W_vcombine_VV(Q6_Vsf_equals_Vqf32(Q6_V_hi_W(p)), Q6_Vsf_equals_Vqf32(Q6_V_lo_W(p))); +} +static inline HVX_VectorPair hvx_vec_f16_to_f32(HVX_Vector v) { + const HVX_Vector one = hvx_vec_splat_f16(1.0); + HVX_VectorPair p = Q6_Wqf32_vmpy_VhfVhf(Q6_Vh_vshuff_Vh(v), one); + return 
Q6_W_vcombine_VV(Q6_Vsf_equals_Vqf32(Q6_V_hi_W(p)), Q6_Vsf_equals_Vqf32(Q6_V_lo_W(p))); +} +#endif + /* Q6_Vsf_equals_Vw is only available on v73+.*/ #if __HVX_ARCH__ < 73 static inline HVX_Vector hvx_vec_i32_to_qf32(HVX_Vector const in) diff --git a/ggml/src/ggml-hexagon/htp/main.c b/ggml/src/ggml-hexagon/htp/main.c index 2a3f9e562b..ef9cba8ecc 100644 --- a/ggml/src/ggml-hexagon/htp/main.c +++ b/ggml/src/ggml-hexagon/htp/main.c @@ -25,6 +25,10 @@ #include "htp-ops.h" #include "worker-pool.h" +#ifdef HTP_HAS_HMX +#include "hmx-ops.h" +#endif // HTP_HAS_HMX + AEEResult htp_iface_open(const char * uri, remote_handle64 * handle) { struct htp_context * ctx; int err = 0; @@ -163,6 +167,9 @@ static int vtcm_acquire(struct htp_context * ctx) { } ctx->vtcm_inuse = true; + + + return 0; } @@ -246,7 +253,7 @@ static void vtcm_free(struct htp_context * ctx) { static void htp_packet_callback(dspqueue_t queue, int error, void * context); static void htp_error_callback(dspqueue_t queue, int error, void * context); -AEEResult htp_iface_start(remote_handle64 handle, uint32 sess_id, uint64 dsp_queue_id, uint32 n_hvx) { +AEEResult htp_iface_start(remote_handle64 handle, uint32 sess_id, uint64 dsp_queue_id, uint32 n_hvx, uint32 use_hmx) { struct htp_context * ctx = (struct htp_context *) handle; if (!ctx) { @@ -280,6 +287,21 @@ AEEResult htp_iface_start(remote_handle64 handle, uint32 sess_id, uint64 dsp_que return AEE_ENOMEMORY; } +#ifdef HTP_HAS_HMX + if (use_hmx) { + ctx->vtcm_scratch_size = ctx->vtcm_size; + ctx->hmx_enabled = 1; + + FARF(HIGH, "HMX enabled: vtcm-scratch %zu", ctx->vtcm_scratch_size); + } else { + // HMX disabled: skip HMX initialisation so the + // dispatch loop falls through to the HVX compute paths. 
+ ctx->hmx_enabled = 0; + ctx->vtcm_scratch_size = ctx->vtcm_size; + FARF(HIGH, "HMX disabled (use_hmx=0): vtcm-scratch %zu", ctx->vtcm_scratch_size); + } +#endif + qurt_sysenv_max_hthreads_t hw_threads; qurt_sysenv_get_max_hw_threads(&hw_threads); uint32_t hw_nhvx = (qurt_hvx_get_units() >> 8) & 0xFF; @@ -340,6 +362,12 @@ AEEResult htp_iface_stop(remote_handle64 handle) { for (int i = 0; i < ctx->n_threads; i++) { dma_queue_delete(ctx->dma[i]); } +#ifdef HTP_HAS_HMX + if (ctx->hmx_enabled) { + ctx->hmx_enabled = 0; + } +#endif + vtcm_free(ctx); @@ -375,8 +403,9 @@ static int send_htp_rsp(struct htp_context * c, struct dspqueue_buffer * bufs, size_t n_bufs, struct profile_data * prof) { - // Prep response struct + // Prep response struct (zero-init to clear cmp/unused union) struct htp_general_rsp rsp; + memset(&rsp, 0, sizeof(rsp)); rsp.op = op; rsp.status = status; rsp.prof_usecs = prof->usecs; @@ -1037,6 +1066,210 @@ static void proc_flash_attn_ext_req(struct htp_context * ctx, send_htp_rsp(ctx, req->op, rsp_status, &bufs[last_buf], 1, &prof); } +#ifdef HTP_HAS_HMX +// --------------------------------------------------------------------------- +// HMX operation wrappers — self-contained, bypass htp_ops_context / htp_spad. +// VTCM, DMA and thread dispatch are managed inside the HMX kernels. +// --------------------------------------------------------------------------- + +static void proc_hmx_matmul_req(struct htp_context * ctx, + struct htp_general_req * req, + struct dspqueue_buffer * bufs, + size_t n_bufs) { + // HMX weight tile requires N to be 32-aligned. + if (req->src0.ne[1] % 32 != 0) { + proc_matmul_req(ctx, req, bufs, n_bufs); + return; + } + + const bool is_batched = (req->src0.ne[2] * req->src0.ne[3] > 1 || + req->src1.ne[2] * req->src1.ne[3] > 1); + + // Quantised HMX kernels only handle flat 2D matmul (host already rejects + // batched quantised, but guard here too). F16 batched matmul is handled + // by the dedicated wrapper in hmx-matmul-ops.c. 
+ if (is_batched && + req->src0.type != HTP_TYPE_F16) { + proc_matmul_req(ctx, req, bufs, n_bufs); + return; + } + + // HMX assumes contiguous row-major layout. Fall back for permuted + // tensors where strides are non-monotonic (e.g. transposed KV cache). + if (req->src0.nb[0] > req->src0.nb[1] || + req->src1.nb[0] > req->src1.nb[1]) { + proc_matmul_req(ctx, req, bufs, n_bufs); + return; + } + + // M alignment: when M > 32 but not 32-aligned, we split into + // HMX (first m_hmx = M & ~31 rows) + HVX (remaining m_tail rows). + // When M <= 32 and not 32-aligned, fall back entirely to HVX. + const int m_total = (int) req->src1.ne[1]; + const int m_tail = m_total % 32; + const int m_hmx = m_total - m_tail; + + if (m_hmx == 0) { + proc_matmul_req(ctx, req, bufs, n_bufs); + return; + } + + // HMX only supports F16, Q4_0, Q8_0, IQ4_NL weights. + // Other types (e.g. MXFP4) fall back to HVX. + { + uint32_t wtype = req->src0.type; + if (wtype != HTP_TYPE_F16 && + wtype != HTP_TYPE_Q4_0 && + wtype != HTP_TYPE_Q8_0 && + wtype != HTP_TYPE_IQ4_NL) { + proc_matmul_req(ctx, req, bufs, n_bufs); + return; + } + // Quantised HMX path requires K aligned to 256 (x4x2 super-block). + // F16 HMX path requires K aligned to 32 (tile width). 
+ if (wtype != HTP_TYPE_F16 && req->src0.ne[0] % 256 != 0) { + proc_matmul_req(ctx, req, bufs, n_bufs); + return; + } + if (wtype == HTP_TYPE_F16 && req->src0.ne[0] % 32 != 0) { + proc_matmul_req(ctx, req, bufs, n_bufs); + return; + } + } + + (void) n_bufs; + + struct dspqueue_buffer rsp_bufs[1]; + rsp_bufs[0].fd = bufs[2].fd; + rsp_bufs[0].ptr = bufs[2].ptr; + rsp_bufs[0].size = bufs[2].size; + rsp_bufs[0].offset = bufs[2].offset; + rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | + DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); + + // src0 = weights, src1 = activation, dst = output + void * wgt = (void *) bufs[0].ptr; + float * act = (float *) bufs[1].ptr; + float * dst = (float *) bufs[2].ptr; + + int k = (int) req->src0.ne[0]; // inner dimension + int n = (int) req->src0.ne[1]; // weight columns + + + struct profile_data prof; + profile_start(&prof); + + uint32_t rsp_status = HTP_STATUS_INTERNAL_ERR; + + // --- Phase 1: HMX on the first m_hmx (32-aligned) rows --- + if (vtcm_acquire(ctx) == AEE_SUCCESS) { + int ret = -1; + + const int ne02 = (int) req->src0.ne[2]; + const int ne03 = (int) req->src0.ne[3]; + const int ne12 = (int) req->src1.ne[2]; + const int ne13 = (int) req->src1.ne[3]; + // Row strides in elements. For compact tensors these equal k; for + // permuted attention views they can be larger, so pass the real stride. 
+ const int act_stride = (int)(req->src1.nb[1] / sizeof(float)); + const int weight_stride = (int)(req->src0.nb[1] / sizeof(__fp16)); + + switch (req->src0.type) { + case HTP_TYPE_F16: + if (is_batched) { + hmx_matmul_w16a32_batched_params_t batch_params = { + .dst = dst, + .activation = act, + .permuted_weight = (const __fp16 *) wgt, + .m = m_hmx, + .k = k, + .n = n, + .act_stride = act_stride, + .weight_stride = weight_stride, + .dst_stride = (int)(req->dst.nb[1] / sizeof(float)), + .ne02 = ne02, + .ne03 = ne03, + .ne12 = ne12, + .ne13 = ne13, + .src0_nb2 = req->src0.nb[2], + .src0_nb3 = req->src0.nb[3], + .src1_nb2 = req->src1.nb[2], + .src1_nb3 = req->src1.nb[3], + .dst_nb2 = req->dst.nb[2], + .dst_nb3 = req->dst.nb[3], + }; + ret = hmx_mat_mul_permuted_w16a32_batched(ctx, &batch_params); + } else { + ret = hmx_mat_mul_permuted_w16a32(ctx, dst, act, + (const __fp16 *) wgt, + m_hmx, k, n, + act_stride, + weight_stride); + } + break; + default: + ret = hmx_mat_mul_permuted_qk_0_d16a32(ctx, dst, act, + (const uint8_t *) wgt, + m_hmx, k, n, (int) req->src0.type); + break; + } + + if (ret == 0) { + rsp_status = HTP_STATUS_OK; + } else { + FARF(HIGH, "HMX matmul failed (ret=%d), falling back to HVX", ret); + vtcm_release(ctx); + req->flags &= ~HTP_OPFLAGS_SKIP_QUANTIZE; + proc_matmul_req(ctx, req, bufs, n_bufs); + return; + } + vtcm_release(ctx); + } + + // --- Phase 2: HVX on the remaining m_tail rows --- + if (m_tail > 0 && rsp_status == HTP_STATUS_OK) { + struct htp_ops_context octx = { 0 }; + octx.ctx = ctx; + octx.src0 = req->src0; // weights: unchanged + octx.src1 = req->src1; + octx.src1.ne[1] = m_tail; // only tail rows + octx.dst = req->dst; + octx.dst.ne[1] = m_tail; // only tail rows + // Always re-quantize tail src1: HMX Phase 1 overwrites VTCM, + // so any previously cached quantized data (SKIP_QUANTIZE pipeline) + // is invalid. 
+ octx.flags = req->flags & ~HTP_OPFLAGS_SKIP_QUANTIZE; + octx.op = req->op; + octx.n_threads = ctx->n_threads; + + // Offset activation and dst pointers past the HMX-processed rows. + // Use nb[1] (row stride in bytes) to compute the byte offset. + octx.src0.data = (uint32_t) bufs[0].ptr; + octx.src1.data = (uint32_t)((uint8_t *) bufs[1].ptr + (size_t) m_hmx * req->src1.nb[1]); + octx.dst.data = (uint32_t)((uint8_t *) bufs[2].ptr + (size_t) m_hmx * req->dst.nb[1]); + + FARF(HIGH, "proc_hmx_matmul: HVX tail m_tail=%d act=%p dst=%p", + m_tail, (void *)(uintptr_t) octx.src1.data, (void *)(uintptr_t) octx.dst.data); + + if (vtcm_acquire(ctx) == AEE_SUCCESS) { + uint32_t hvx_ret = op_matmul(&octx); + vtcm_release(ctx); + if (hvx_ret != HTP_STATUS_OK) { + FARF(ERROR, "HVX tail matmul failed (ret=%u)", hvx_ret); + rsp_status = HTP_STATUS_INTERNAL_ERR; + } + } else { + rsp_status = HTP_STATUS_INTERNAL_ERR; + } + } + + profile_stop(&prof); + + send_htp_rsp(ctx, req->op, rsp_status, rsp_bufs, 1, &prof); +} + +#endif // HTP_HAS_HMX + static void htp_packet_callback(dspqueue_t queue, int error, void * context) { struct htp_context * ctx = (struct htp_context *) context; @@ -1089,7 +1322,14 @@ static void htp_packet_callback(dspqueue_t queue, int error, void * context) { FARF(ERROR, "Bad matmul-req buffer list"); continue; } - proc_matmul_req(ctx, &req, bufs, n_bufs); +#ifdef HTP_HAS_HMX + if (ctx->hmx_enabled) { + proc_hmx_matmul_req(ctx, &req, bufs, n_bufs); + } else +#endif + { + proc_matmul_req(ctx, &req, bufs, n_bufs); + } break; case HTP_OP_MUL_MAT_ID: diff --git a/scripts/snapdragon/adb/run-cli.sh b/scripts/snapdragon/adb/run-cli.sh index dfc051b28b..0178034b1f 100755 --- a/scripts/snapdragon/adb/run-cli.sh +++ b/scripts/snapdragon/adb/run-cli.sh @@ -39,6 +39,9 @@ opmask= nhvx= [ "$NHVX" != "" ] && nhvx="GGML_HEXAGON_NHVX=$NHVX" +hmx= +[ "$HMX" != "" ] && hmx="GGML_HEXAGON_USE_HMX=$HMX" + ndev= [ "$NDEV" != "" ] && ndev="GGML_HEXAGON_NDEV=$NDEV" @@ -51,7 +54,7 @@ adb 
$adbserial $adbhost shell " \ cd $basedir; ulimit -c unlimited; \ LD_LIBRARY_PATH=$basedir/$branch/lib \ ADSP_LIBRARY_PATH=$basedir/$branch/lib \ - $verbose $experimental $sched $opmask $profile $nhvx $ndev $hb \ + $verbose $experimental $sched $opmask $profile $nhvx $hmx $ndev $hb \ ./$branch/bin/llama-cli --no-mmap -m $basedir/../gguf/$model \ --poll 1000 -t 6 --cpu-mask 0xfc --cpu-strict 1 \ --ctx-size 8192 --ubatch-size 256 -fa on \ diff --git a/scripts/snapdragon/adb/run-completion.sh b/scripts/snapdragon/adb/run-completion.sh index d53b588739..67f58b156c 100755 --- a/scripts/snapdragon/adb/run-completion.sh +++ b/scripts/snapdragon/adb/run-completion.sh @@ -39,6 +39,9 @@ opmask= nhvx= [ "$NHVX" != "" ] && nhvx="GGML_HEXAGON_NHVX=$NHVX" +hmx= +[ "$HMX" != "" ] && hmx="GGML_HEXAGON_USE_HMX=$HMX" + ndev= [ "$NDEV" != "" ] && ndev="GGML_HEXAGON_NDEV=$NDEV" @@ -51,7 +54,7 @@ adb $adbserial $adbhost shell " \ cd $basedir; ulimit -c unlimited; \ LD_LIBRARY_PATH=$basedir/$branch/lib \ ADSP_LIBRARY_PATH=$basedir/$branch/lib \ - $verbose $experimental $sched $opmask $profile $nhvx $ndev $hb \ + $verbose $experimental $sched $opmask $profile $nhvx $hmx $ndev $hb \ ./$branch/bin/llama-completion --no-mmap -m $basedir/../gguf/$model \ --poll 1000 -t 6 --cpu-mask 0xfc --cpu-strict 1 \ --ctx-size 8192 --ubatch-size 256 -fa on \ diff --git a/scripts/snapdragon/adb/run-mtmd.sh b/scripts/snapdragon/adb/run-mtmd.sh index 41d7cd44f8..0c1cf89280 100755 --- a/scripts/snapdragon/adb/run-mtmd.sh +++ b/scripts/snapdragon/adb/run-mtmd.sh @@ -45,6 +45,9 @@ opmask= nhvx= [ "$NHVX" != "" ] && nhvx="GGML_HEXAGON_NHVX=$NHVX" +hmx= +[ "$HMX" != "" ] && hmx="GGML_HEXAGON_USE_HMX=$HMX" + ndev= [ "$NDEV" != "" ] && ndev="GGML_HEXAGON_NDEV=$NDEV" @@ -58,7 +61,7 @@ adb $adbserial $adbhost shell " \ cd $basedir; ulimit -c unlimited; \ LD_LIBRARY_PATH=$basedir/$branch/lib \ ADSP_LIBRARY_PATH=$basedir/$branch/lib \ - $verbose $experimental $sched $opmask $profile $nhvx $ndev $mtmd_backend \ + 
$verbose $experimental $sched $opmask $profile $hmx $nhvx $ndev $mtmd_backend \ ./$branch/bin/llama-mtmd-cli --no-mmap -m $basedir/../gguf/$model \ --mmproj $basedir/../gguf/$mmproj \ --image $basedir/../gguf/$image \ diff --git a/scripts/snapdragon/adb/run-tool.sh b/scripts/snapdragon/adb/run-tool.sh index 4647ede1f8..e53cacf320 100755 --- a/scripts/snapdragon/adb/run-tool.sh +++ b/scripts/snapdragon/adb/run-tool.sh @@ -36,6 +36,9 @@ opmask= nhvx= [ "$NHVX" != "" ] && nhvx="GGML_HEXAGON_NHVX=$NHVX" +hmx= +[ "$HMX" != "" ] && hmx="GGML_HEXAGON_USE_HMX=$HMX" + ndev= [ "$NDEV" != "" ] && ndev="GGML_HEXAGON_NDEV=$NDEV" @@ -50,5 +53,5 @@ adb $adbserial $adbhost shell " \ cd $basedir; ulimit -c unlimited; \ LD_LIBRARY_PATH=$basedir/$branch/lib \ ADSP_LIBRARY_PATH=$basedir/$branch/lib \ - $verbose $experimental $sched $opmask $profile $nhvx $ndev $hb ./$branch/bin/$tool $@ \ + $verbose $experimental $sched $opmask $profile $nhvx $hmx $ndev $hb ./$branch/bin/$tool $@ \ "