hexagon: Q4_0 and MXFP4 repack fixes (#20527)
* hexagon: fix tail corruption with rows sizes not multiple of 256 * hexagon: use different stride for repacking partial blocks * hex-mm: update repack and kernels to avoid shuffles for full 256-element blocks Previous commit changed the repacking to use even:odd (0:1,2:3,..) packing instead of the original (0:128,1:129,...) packing in order to fix tail corruption. Since the mm kernels already deal with partial tails we can use even:odd packing only for the last block. This avoid performance penalty of having to shuffle to zip the elements in the common case. * hex-mm: update rmpy x8 for better optimizations * hex-mm: tighten supported MUL_MAT checks to avoid spurios failures * hex-mm: use vzero to init accumulators * hex-mm: properly call partial rmpy_x8
This commit is contained in:
parent
9f774e45ee
commit
609ea50026
|
|
@ -402,6 +402,7 @@ static void pack_q4_0_quants(block_q4_0 * x, const uint8_t * qs, unsigned int bi
|
|||
static void repack_row_q4x4x2(uint8_t * y, const block_q4_0 * x, int64_t k) {
|
||||
static const int qk = QK_Q4_0x4x2;
|
||||
const int nb = (k + qk - 1) / qk; // number of blocks (padded)
|
||||
const int nloe = k % qk; // leftovers
|
||||
|
||||
const int dblk_size = 8 * 2; // 8x __fp16
|
||||
const int qblk_size = qk / 2; // int4
|
||||
|
|
@ -435,9 +436,11 @@ static void repack_row_q4x4x2(uint8_t * y, const block_q4_0 * x, int64_t k) {
|
|||
unpack_q4_0_quants(qs, &x[i * 8 + 6], 6);
|
||||
unpack_q4_0_quants(qs, &x[i * 8 + 7], 7);
|
||||
|
||||
bool partial = (nloe && i == nb-1);
|
||||
|
||||
uint8_t * q = y_q + (i * qblk_size);
|
||||
for (int j = 0; j < qk / 2; j++) {
|
||||
q[j] = (qs[j + 128] << 4) | qs[j];
|
||||
q[j] = partial ? (qs[j*2+1] << 4) | qs[j*2+0] : (qs[j+128] << 4) | qs[j+000];
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -467,6 +470,7 @@ static void repack_row_q4x4x2(uint8_t * y, const block_q4_0 * x, int64_t k) {
|
|||
static void unpack_row_q4x4x2(block_q4_0 * x, const uint8_t * y, int64_t k) {
|
||||
static const int qk = QK_Q4_0x4x2;
|
||||
const int nb = (k + qk - 1) / qk; // number of blocks (padded)
|
||||
const int nloe = k % qk; // leftovers
|
||||
|
||||
const int dblk_size = 8 * 2; // 8x __fp16
|
||||
const int qblk_size = qk / 2; // int4
|
||||
|
|
@ -485,10 +489,17 @@ static void unpack_row_q4x4x2(block_q4_0 * x, const uint8_t * y, int64_t k) {
|
|||
for (int i = 0; i < nb; i++) {
|
||||
uint8_t qs[QK_Q4_0x4x2]; // unpacked quants
|
||||
|
||||
bool partial = (nloe && i == nb-1);
|
||||
|
||||
const uint8_t * q = y_q + (i * qblk_size);
|
||||
for (int j = 0; j < qk / 2; j++) {
|
||||
qs[j] = q[j] & 0xf;
|
||||
qs[j + 128] = q[j] >> 4;
|
||||
if (partial) {
|
||||
qs[j*2+0] = q[j] & 0xf;
|
||||
qs[j*2+1] = q[j] >> 4;
|
||||
} else {
|
||||
qs[j+000] = q[j] & 0xf;
|
||||
qs[j+128] = q[j] >> 4;
|
||||
}
|
||||
}
|
||||
|
||||
pack_q4_0_quants(&x[i * 8 + 0], qs, 0);
|
||||
|
|
@ -1078,6 +1089,7 @@ static void pack_mxfp4_quants(block_mxfp4 * x, const uint8_t * qs, unsigned int
|
|||
static void repack_row_mxfp4x4x2(uint8_t * y, const block_mxfp4 * x, int64_t k) {
|
||||
static const int qk = QK_MXFP4x4x2;
|
||||
const int nb = (k + qk - 1) / qk; // number of blocks (padded)
|
||||
const int nloe = k % qk; // leftovers
|
||||
|
||||
const int eblk_size = 8 * 1; // 8x E8M0
|
||||
const int qblk_size = qk / 2; // int4
|
||||
|
|
@ -1112,9 +1124,11 @@ static void repack_row_mxfp4x4x2(uint8_t * y, const block_mxfp4 * x, int64_t k)
|
|||
unpack_mxfp4_quants(qs, &x[i * 8 + 6], 6);
|
||||
unpack_mxfp4_quants(qs, &x[i * 8 + 7], 7);
|
||||
|
||||
bool partial = (nloe && i == nb-1);
|
||||
|
||||
uint8_t * q = y_q + (i * qblk_size);
|
||||
for (int j = 0; j < qk / 2; j++) {
|
||||
q[j] = (qs[j + 128] << 4) | qs[j];
|
||||
q[j] = partial ? (qs[j*2+1] << 4) | qs[j*2+0] : (qs[j+128] << 4) | qs[j+000];
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -1144,6 +1158,7 @@ static void repack_row_mxfp4x4x2(uint8_t * y, const block_mxfp4 * x, int64_t k)
|
|||
static void unpack_row_mxfp4x4x2(block_mxfp4 * x, const uint8_t * y, int64_t k) {
|
||||
static const int qk = QK_MXFP4x4x2;
|
||||
const int nb = (k + qk - 1) / qk; // number of blocks (padded)
|
||||
const int nloe = k % qk; // leftovers
|
||||
|
||||
const int eblk_size = 8 * 1; // 8x E8M0
|
||||
const int qblk_size = qk / 2; // int4
|
||||
|
|
@ -1162,10 +1177,17 @@ static void unpack_row_mxfp4x4x2(block_mxfp4 * x, const uint8_t * y, int64_t k)
|
|||
for (int i = 0; i < nb; i++) {
|
||||
uint8_t qs[QK_MXFP4x4x2]; // unpacked quants
|
||||
|
||||
bool partial = (nloe && i == nb-1);
|
||||
|
||||
const uint8_t * q = y_q + (i * qblk_size);
|
||||
for (int j = 0; j < qk / 2; j++) {
|
||||
qs[j] = q[j] & 0xf;
|
||||
qs[j + 128] = q[j] >> 4;
|
||||
if (partial) {
|
||||
qs[j*2+0] = q[j] & 0xf;
|
||||
qs[j*2+1] = q[j] >> 4;
|
||||
} else {
|
||||
qs[j+000] = q[j] & 0xf;
|
||||
qs[j+128] = q[j] >> 4;
|
||||
}
|
||||
}
|
||||
|
||||
pack_mxfp4_quants(&x[i * 8 + 0], qs, 0);
|
||||
|
|
@ -1801,12 +1823,12 @@ static bool ggml_hexagon_supported_mul_mat(const struct ggml_hexagon_session * s
|
|||
return false;
|
||||
}
|
||||
|
||||
if (src0->ne[1] > 16 * 1024) {
|
||||
if (ggml_nrows(src0) > 16 * 1024) {
|
||||
return false; // typically the lm-head which would be too large for VTCM
|
||||
}
|
||||
|
||||
if ((src1->ne[2] != 1 || src1->ne[3] != 1)) {
|
||||
return false;
|
||||
if (ggml_nrows(src1) > 1024 || src1->ne[2] != 1 || src1->ne[3] != 1) {
|
||||
return false; // no huge batches or broadcasting (for now)
|
||||
}
|
||||
|
||||
// src0 (weights) must be repacked
|
||||
|
|
@ -1820,6 +1842,9 @@ static bool ggml_hexagon_supported_mul_mat(const struct ggml_hexagon_session * s
|
|||
GGML_LOG_DEBUG("ggml_hexagon_supported_mul_mat: permuted F16 src0 not supported\n");
|
||||
return false;
|
||||
}
|
||||
if (ggml_nrows(src1) > 1024) {
|
||||
return false; // no huge batches (for now)
|
||||
}
|
||||
break;
|
||||
|
||||
default:
|
||||
|
|
|
|||
|
|
@ -77,7 +77,7 @@ static inline size_t q8x4x2_row_size(uint32_t ne) {
|
|||
return hex_round_up(ne + nb * 8 * sizeof(__fp16), 128);
|
||||
}
|
||||
|
||||
static inline HVX_Vector_x8 hvx_vec_load_q4x4x8(const uint8_t * restrict ptr) {
|
||||
static inline HVX_Vector_x8 hvx_vec_load_q4x4x8_full(const uint8_t * restrict ptr) {
|
||||
const HVX_Vector * restrict vptr = (const HVX_Vector *) ptr;
|
||||
|
||||
HVX_Vector v0_1 = vptr[0]; // first 256 elements (128 bytes)
|
||||
|
|
@ -88,9 +88,9 @@ static inline HVX_Vector_x8 hvx_vec_load_q4x4x8(const uint8_t * restrict ptr) {
|
|||
const HVX_Vector mask_h4 = Q6_Vb_vsplat_R(0x0F);
|
||||
const HVX_Vector i8 = Q6_Vb_vsplat_R(8);
|
||||
|
||||
HVX_Vector v0 = Q6_V_vand_VV(v0_1, mask_h4); // & 0x0F
|
||||
HVX_Vector v1 = Q6_Vub_vlsr_VubR(v0_1, 4); // >> 4
|
||||
HVX_Vector v2 = Q6_V_vand_VV(v2_3, mask_h4); // & 0x0F
|
||||
HVX_Vector v0 = Q6_V_vand_VV(v0_1, mask_h4); // & 0x0F : first 128 elements
|
||||
HVX_Vector v1 = Q6_Vub_vlsr_VubR(v0_1, 4); // >> 4 : second 128 elements
|
||||
HVX_Vector v2 = Q6_V_vand_VV(v2_3, mask_h4); // & 0x0F ...
|
||||
HVX_Vector v3 = Q6_Vub_vlsr_VubR(v2_3, 4); // >> 4
|
||||
HVX_Vector v4 = Q6_V_vand_VV(v4_5, mask_h4); // & 0x0F
|
||||
HVX_Vector v5 = Q6_Vub_vlsr_VubR(v4_5, 4); // >> 4
|
||||
|
|
@ -111,7 +111,41 @@ static inline HVX_Vector_x8 hvx_vec_load_q4x4x8(const uint8_t * restrict ptr) {
|
|||
return r;
|
||||
}
|
||||
|
||||
static inline HVX_Vector_x8 hvx_vec_load_mxfp4x4x8(const uint8_t * restrict ptr) {
|
||||
static HVX_Vector_x8 hvx_vec_load_q4x4x8_partial(const uint8_t * restrict ptr, uint32_t n) {
|
||||
const HVX_Vector * restrict vptr = (const HVX_Vector *) ptr;
|
||||
|
||||
const uint32_t qk = QK_Q4_0x4x2; // 256
|
||||
const uint32_t nb = n / qk;
|
||||
const uint32_t nloe = n % qk;
|
||||
|
||||
const HVX_Vector mask_h4 = Q6_Vb_vsplat_R(0x0F);
|
||||
const HVX_Vector i8 = Q6_Vb_vsplat_R(8);
|
||||
|
||||
HVX_Vector_x8 r;
|
||||
uint32_t i = 0;
|
||||
|
||||
#pragma unroll(2)
|
||||
for (i=0; i < nb; i++) {
|
||||
HVX_Vector v = vptr[i]; // 256 elements (128 bytes)
|
||||
HVX_Vector v0 = Q6_V_vand_VV(v, mask_h4); // & 0x0F : first 128 elements
|
||||
HVX_Vector v1 = Q6_Vub_vlsr_VubR(v, 4); // >> 4 : second 128 elements
|
||||
r.v[i*2+0] = Q6_Vb_vsub_VbVb(v0, i8);
|
||||
r.v[i*2+1] = Q6_Vb_vsub_VbVb(v1, i8);
|
||||
}
|
||||
|
||||
if (nloe) {
|
||||
HVX_Vector v = vptr[i]; // 256 elements (128 bytes)
|
||||
HVX_Vector v0 = Q6_V_vand_VV(v, mask_h4); // & 0x0F : even 128 elements
|
||||
HVX_Vector v1 = Q6_Vub_vlsr_VubR(v, 4); // >> 4 : odd 128 elements
|
||||
HVX_VectorPair v0_1_p = Q6_W_vshuff_VVR(v1, v0, -1); // zip even:odd:...
|
||||
r.v[i*2+0] = Q6_Vb_vsub_VbVb(Q6_V_lo_W(v0_1_p), i8);
|
||||
r.v[i*2+1] = Q6_Vb_vsub_VbVb(Q6_V_hi_W(v0_1_p), i8);
|
||||
}
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
static inline HVX_Vector_x8 hvx_vec_load_mxfp4x4x8_full(const uint8_t * restrict ptr) {
|
||||
const HVX_Vector * restrict vptr = (const HVX_Vector *) ptr;
|
||||
|
||||
HVX_Vector v0_1 = vptr[0]; // first 256 elements (128 bytes)
|
||||
|
|
@ -144,7 +178,41 @@ static inline HVX_Vector_x8 hvx_vec_load_mxfp4x4x8(const uint8_t * restrict ptr)
|
|||
return r;
|
||||
}
|
||||
|
||||
static inline HVX_Vector_x8 hvx_vec_load_q8x4x8(const uint8_t * restrict ptr) {
|
||||
static inline HVX_Vector_x8 hvx_vec_load_mxfp4x4x8_partial(const uint8_t * restrict ptr, uint32_t n) {
|
||||
const HVX_Vector * restrict vptr = (const HVX_Vector *) ptr;
|
||||
|
||||
const uint32_t qk = QK_Q4_0x4x2; // 256
|
||||
const uint32_t nb = n / qk;
|
||||
const uint32_t nloe = n % qk;
|
||||
|
||||
const HVX_Vector mask_h4 = Q6_Vb_vsplat_R(0x0F);
|
||||
const HVX_Vector lut = *(const HVX_Vector *) kvalues_mxfp4_lut;
|
||||
|
||||
HVX_Vector_x8 r;
|
||||
uint32_t i = 0;
|
||||
|
||||
#pragma unroll(2)
|
||||
for (i=0; i < nb; i++) {
|
||||
HVX_Vector v = vptr[i]; // 256 elements (128 bytes)
|
||||
HVX_Vector v0 = Q6_V_vand_VV(v, mask_h4); // & 0x0F : first 128 elements
|
||||
HVX_Vector v1 = Q6_Vub_vlsr_VubR(v, 4); // >> 4 : second 128 elements
|
||||
r.v[i*2+0] = Q6_Vb_vlut32_VbVbI(v0, lut, 0);
|
||||
r.v[i*2+1] = Q6_Vb_vlut32_VbVbI(v1, lut, 0);
|
||||
}
|
||||
|
||||
if (nloe) {
|
||||
HVX_Vector v = vptr[i]; // 256 elements (128 bytes)
|
||||
HVX_Vector v0 = Q6_V_vand_VV(v, mask_h4); // & 0x0F : even 128 elements
|
||||
HVX_Vector v1 = Q6_Vub_vlsr_VubR(v, 4); // >> 4 : odd 128 elements
|
||||
HVX_VectorPair v0_1_p = Q6_W_vshuff_VVR(v1, v0, -1); // zip even:odd:...
|
||||
r.v[i*2+0] = Q6_Vb_vlut32_VbVbI(Q6_V_lo_W(v0_1_p), lut, 0);
|
||||
r.v[i*2+1] = Q6_Vb_vlut32_VbVbI(Q6_V_hi_W(v0_1_p), lut, 0);
|
||||
}
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
static inline HVX_Vector_x8 hvx_vec_load_q8x4x8_full(const uint8_t * restrict ptr) {
|
||||
const HVX_Vector * restrict vptr = (const HVX_Vector *) ptr;
|
||||
|
||||
HVX_Vector v0 = vptr[0]; // first 128 vals
|
||||
|
|
@ -160,6 +228,10 @@ static inline HVX_Vector_x8 hvx_vec_load_q8x4x8(const uint8_t * restrict ptr) {
|
|||
return r;
|
||||
}
|
||||
|
||||
static inline HVX_Vector_x8 hvx_vec_load_q8x4x8_partial(const uint8_t * restrict ptr, uint32_t nloe) {
|
||||
return hvx_vec_load_q8x4x8_full(ptr);
|
||||
}
|
||||
|
||||
// Reduce multiply 1024 x 1024 int8 elements (32x q4/8 blocks in 8x HVX vectors).
|
||||
// Accumulate each block into a single int32 value.
|
||||
// Return a single HVX vector with 32x int32 accumulators.
|
||||
|
|
@ -167,14 +239,14 @@ static inline HVX_Vector_x8 hvx_vec_load_q8x4x8(const uint8_t * restrict ptr) {
|
|||
// if() checks are optimized out at compile time -- make sure to pass N as a constexpr.
|
||||
|
||||
static inline HVX_Vector hvx_vec_rmpy_x8_n(HVX_Vector_x8 x, HVX_Vector_x8 y, unsigned int n) {
|
||||
HVX_Vector r0 = Q6_V_vsplat_R(0);
|
||||
HVX_Vector r1 = Q6_V_vsplat_R(0);
|
||||
HVX_Vector r2 = Q6_V_vsplat_R(0);
|
||||
HVX_Vector r3 = Q6_V_vsplat_R(0);
|
||||
HVX_Vector r4 = Q6_V_vsplat_R(0);
|
||||
HVX_Vector r5 = Q6_V_vsplat_R(0);
|
||||
HVX_Vector r6 = Q6_V_vsplat_R(0);
|
||||
HVX_Vector r7 = Q6_V_vsplat_R(0);
|
||||
HVX_Vector r0 = Q6_V_vzero();
|
||||
HVX_Vector r1 = Q6_V_vzero();
|
||||
HVX_Vector r2 = Q6_V_vzero();
|
||||
HVX_Vector r3 = Q6_V_vzero();
|
||||
HVX_Vector r4 = Q6_V_vzero();
|
||||
HVX_Vector r5 = Q6_V_vzero();
|
||||
HVX_Vector r6 = Q6_V_vzero();
|
||||
HVX_Vector r7 = Q6_V_vzero();
|
||||
|
||||
HVX_VectorPair p3;
|
||||
HVX_VectorPair p2;
|
||||
|
|
@ -213,15 +285,42 @@ static inline HVX_Vector hvx_vec_rmpy_x8_n(HVX_Vector_x8 x, HVX_Vector_x8 y, uns
|
|||
}
|
||||
|
||||
static inline HVX_Vector hvx_vec_rmpy_x8_full(HVX_Vector_x8 x, HVX_Vector_x8 y) {
|
||||
return hvx_vec_rmpy_x8_n(x, y, 1024);
|
||||
HVX_Vector r0 = Q6_Vw_vrmpy_VbVb(x.v[0], y.v[0]);
|
||||
HVX_Vector r1 = Q6_Vw_vrmpy_VbVb(x.v[1], y.v[1]);
|
||||
HVX_Vector r2 = Q6_Vw_vrmpy_VbVb(x.v[2], y.v[2]);
|
||||
HVX_Vector r3 = Q6_Vw_vrmpy_VbVb(x.v[3], y.v[3]);
|
||||
HVX_Vector r4 = Q6_Vw_vrmpy_VbVb(x.v[4], y.v[4]);
|
||||
HVX_Vector r5 = Q6_Vw_vrmpy_VbVb(x.v[5], y.v[5]);
|
||||
HVX_Vector r6 = Q6_Vw_vrmpy_VbVb(x.v[6], y.v[6]);
|
||||
HVX_Vector r7 = Q6_Vw_vrmpy_VbVb(x.v[7], y.v[7]);
|
||||
|
||||
HVX_VectorPair p0 = Q6_W_vdeal_VVR(r1, r0, -4);
|
||||
HVX_VectorPair p1 = Q6_W_vdeal_VVR(r3, r2, -4);
|
||||
HVX_VectorPair p2 = Q6_W_vdeal_VVR(r5, r4, -4);
|
||||
HVX_VectorPair p3 = Q6_W_vdeal_VVR(r7, r6, -4);
|
||||
|
||||
r0 = Q6_Vw_vadd_VwVw(Q6_V_lo_W(p0), Q6_V_hi_W(p0));
|
||||
r1 = Q6_Vw_vadd_VwVw(Q6_V_lo_W(p1), Q6_V_hi_W(p1));
|
||||
r2 = Q6_Vw_vadd_VwVw(Q6_V_lo_W(p2), Q6_V_hi_W(p2));
|
||||
r3 = Q6_Vw_vadd_VwVw(Q6_V_lo_W(p3), Q6_V_hi_W(p3));
|
||||
|
||||
p0 = Q6_W_vdeal_VVR(r1, r0, -4);
|
||||
p1 = Q6_W_vdeal_VVR(r3, r2, -4);
|
||||
|
||||
r0 = Q6_Vw_vadd_VwVw(Q6_V_lo_W(p0), Q6_V_hi_W(p0));
|
||||
r1 = Q6_Vw_vadd_VwVw(Q6_V_lo_W(p1), Q6_V_hi_W(p1));
|
||||
|
||||
p0 = Q6_W_vdeal_VVR(r1, r0, -4);
|
||||
r0 = Q6_Vw_vadd_VwVw(Q6_V_lo_W(p0), Q6_V_hi_W(p0));
|
||||
|
||||
return r0;
|
||||
}
|
||||
|
||||
// Handle most common cases of tensors not multiple of 1024.
|
||||
static inline HVX_Vector hvx_vec_rmpy_x8_nloe(HVX_Vector_x8 x, HVX_Vector_x8 y, unsigned int n) {
|
||||
if (n <= 256) { return hvx_vec_rmpy_x8_n(x, y, 256); };
|
||||
if (n <= 512) { return hvx_vec_rmpy_x8_n(x, y, 512); };
|
||||
if (n <= 768) { return hvx_vec_rmpy_x8_n(x, y, 768); };
|
||||
return hvx_vec_rmpy_x8_n(x, y, 1024);
|
||||
static inline HVX_Vector hvx_vec_rmpy_x8_partial(HVX_Vector_x8 x, HVX_Vector_x8 y, unsigned int n) {
|
||||
if (n >= 512)
|
||||
return hvx_vec_rmpy_x8_full(x, y);
|
||||
|
||||
return hvx_vec_rmpy_x8_partial(x, y, 512);
|
||||
}
|
||||
|
||||
static void vec_dot_q4x4x2_q8x4x2_1x1(const int n, float * restrict s0, const void * restrict vx0, const void * restrict vy0) {
|
||||
|
|
@ -246,7 +345,7 @@ static void vec_dot_q4x4x2_q8x4x2_1x1(const int n, float * restrict s0, const vo
|
|||
const uint8_t * restrict y_d = ((const uint8_t *) vy0 + y_qrow_size); // then scales
|
||||
|
||||
// Row sum (sf)
|
||||
HVX_Vector r0_sum = Q6_V_vsplat_R(0);
|
||||
HVX_Vector r0_sum = Q6_V_vzero();
|
||||
|
||||
// Multiply and accumulate into int32.
|
||||
// Compute combined scale (fp32).
|
||||
|
|
@ -257,12 +356,12 @@ static void vec_dot_q4x4x2_q8x4x2_1x1(const int n, float * restrict s0, const vo
|
|||
|
||||
uint32_t i = 0;
|
||||
for (; i < nb; i++) {
|
||||
HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8(y_q + i * y_qblk_size);
|
||||
HVX_Vector_x8 r0_q = hvx_vec_load_q4x4x8(r0_x_q + i * x_qblk_size);
|
||||
HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8_full(y_q + i * y_qblk_size);
|
||||
HVX_Vector_x8 r0_q = hvx_vec_load_q4x4x8_full(r0_x_q + i * x_qblk_size);
|
||||
|
||||
HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r0_q, vy_q));
|
||||
|
||||
HVX_Vector vy_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y_d + i * y_dblk_size));
|
||||
HVX_Vector vy_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y_d + i * y_dblk_size));
|
||||
HVX_Vector r0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r0_x_d + i * x_dblk_size));
|
||||
|
||||
HVX_Vector r0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy_d)));
|
||||
|
|
@ -272,19 +371,19 @@ static void vec_dot_q4x4x2_q8x4x2_1x1(const int n, float * restrict s0, const vo
|
|||
r0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_fa, r0_sum));
|
||||
}
|
||||
|
||||
// Process leftovers, we still load full 4x4x2 block but zero out unused scales/blocks
|
||||
// Process leftovers
|
||||
if (nloe) {
|
||||
HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8(y_q + i * y_qblk_size);
|
||||
HVX_Vector_x8 r0_q = hvx_vec_load_q4x4x8(r0_x_q + i * x_qblk_size);
|
||||
HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8_partial(y_q + i * y_qblk_size, nloe);
|
||||
HVX_Vector_x8 r0_q = hvx_vec_load_q4x4x8_partial(r0_x_q + i * x_qblk_size, nloe);
|
||||
|
||||
HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_nloe(r0_q, vy_q, nloe));
|
||||
HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r0_q, vy_q, nloe));
|
||||
|
||||
HVX_Vector vy_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y_d + i * y_dblk_size));
|
||||
HVX_Vector vy_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y_d + i * y_dblk_size));
|
||||
HVX_Vector r0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r0_x_d + i * x_dblk_size));
|
||||
|
||||
HVX_Vector r0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy_d)));
|
||||
|
||||
// Zero out unused scales
|
||||
// Zero out unused elements
|
||||
HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe / 8);
|
||||
r0_dd = Q6_V_vand_QV(bmask, r0_dd);
|
||||
r0_ia = Q6_V_vand_QV(bmask, r0_ia);
|
||||
|
|
@ -326,8 +425,8 @@ static void vec_dot_q4x4x2_q8x4x2_2x1(const int n, float * restrict s0,
|
|||
const uint8_t * restrict y_d = ((const uint8_t *) vy0 + y_qrow_size); // then scales
|
||||
|
||||
// Row sum (sf)
|
||||
HVX_Vector r0_sum = Q6_V_vsplat_R(0);
|
||||
HVX_Vector r1_sum = Q6_V_vsplat_R(0);
|
||||
HVX_Vector r0_sum = Q6_V_vzero();
|
||||
HVX_Vector r1_sum = Q6_V_vzero();
|
||||
|
||||
// Multiply and accumulate into int32.
|
||||
// Compute combined scale (fp32).
|
||||
|
|
@ -338,14 +437,14 @@ static void vec_dot_q4x4x2_q8x4x2_2x1(const int n, float * restrict s0,
|
|||
|
||||
uint32_t i = 0;
|
||||
for (; i < nb; i++) {
|
||||
HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8(y_q + i * y_qblk_size);
|
||||
HVX_Vector_x8 r0_q = hvx_vec_load_q4x4x8(r0_x_q + i * x_qblk_size);
|
||||
HVX_Vector_x8 r1_q = hvx_vec_load_q4x4x8(r1_x_q + i * x_qblk_size);
|
||||
HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8_full(y_q + i * y_qblk_size);
|
||||
HVX_Vector_x8 r0_q = hvx_vec_load_q4x4x8_full(r0_x_q + i * x_qblk_size);
|
||||
HVX_Vector_x8 r1_q = hvx_vec_load_q4x4x8_full(r1_x_q + i * x_qblk_size);
|
||||
|
||||
HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r0_q, vy_q));
|
||||
HVX_Vector r1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r1_q, vy_q));
|
||||
|
||||
HVX_Vector vy_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y_d + i * y_dblk_size));
|
||||
HVX_Vector vy_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y_d + i * y_dblk_size));
|
||||
HVX_Vector r0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r0_x_d + i * x_dblk_size));
|
||||
HVX_Vector r1_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r1_x_d + i * x_dblk_size));
|
||||
|
||||
|
|
@ -359,23 +458,23 @@ static void vec_dot_q4x4x2_q8x4x2_2x1(const int n, float * restrict s0,
|
|||
r1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_fa, r1_sum));
|
||||
}
|
||||
|
||||
// Process leftovers, we still load full 4x4x2 block but zero out unused scales/blocks
|
||||
// Process leftovers
|
||||
if (nloe) {
|
||||
HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8(y_q + i * y_qblk_size);
|
||||
HVX_Vector_x8 r0_q = hvx_vec_load_q4x4x8(r0_x_q + i * x_qblk_size);
|
||||
HVX_Vector_x8 r1_q = hvx_vec_load_q4x4x8(r1_x_q + i * x_qblk_size);
|
||||
HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8_partial(y_q + i * y_qblk_size, nloe);
|
||||
HVX_Vector_x8 r0_q = hvx_vec_load_q4x4x8_partial(r0_x_q + i * x_qblk_size, nloe);
|
||||
HVX_Vector_x8 r1_q = hvx_vec_load_q4x4x8_partial(r1_x_q + i * x_qblk_size, nloe);
|
||||
|
||||
HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_nloe(r0_q, vy_q, nloe));
|
||||
HVX_Vector r1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_nloe(r1_q, vy_q, nloe));
|
||||
HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r0_q, vy_q, nloe));
|
||||
HVX_Vector r1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r1_q, vy_q, nloe));
|
||||
|
||||
HVX_Vector vy_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y_d + i * y_dblk_size));
|
||||
HVX_Vector vy_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y_d + i * y_dblk_size));
|
||||
HVX_Vector r0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r0_x_d + i * x_dblk_size));
|
||||
HVX_Vector r1_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r1_x_d + i * x_dblk_size));
|
||||
|
||||
HVX_Vector r0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy_d)));
|
||||
HVX_Vector r1_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r1_d, vy_d)));
|
||||
|
||||
// Zero out unused scales
|
||||
// Zero out unused elements
|
||||
HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe / 8);
|
||||
r0_dd = Q6_V_vand_QV(bmask, r0_dd);
|
||||
r1_dd = Q6_V_vand_QV(bmask, r1_dd);
|
||||
|
|
@ -423,10 +522,10 @@ static void vec_dot_q4x4x2_q8x4x2_2x2(const int n, float * restrict s0, float *
|
|||
const uint8_t * restrict y1_d = ((const uint8_t *) vy1) + y_qrow_size; // then scales
|
||||
|
||||
// Row sums (sf) - 4 accumulators for 2×2 tile
|
||||
HVX_Vector r0_c0_sum = Q6_V_vsplat_R(0);
|
||||
HVX_Vector r0_c1_sum = Q6_V_vsplat_R(0);
|
||||
HVX_Vector r1_c0_sum = Q6_V_vsplat_R(0);
|
||||
HVX_Vector r1_c1_sum = Q6_V_vsplat_R(0);
|
||||
HVX_Vector r0_c0_sum = Q6_V_vzero();
|
||||
HVX_Vector r0_c1_sum = Q6_V_vzero();
|
||||
HVX_Vector r1_c0_sum = Q6_V_vzero();
|
||||
HVX_Vector r1_c1_sum = Q6_V_vzero();
|
||||
|
||||
const uint32_t nb = n / qk; // num full blocks
|
||||
const uint32_t nloe = n % qk; // num leftover elements
|
||||
|
|
@ -434,12 +533,12 @@ static void vec_dot_q4x4x2_q8x4x2_2x2(const int n, float * restrict s0, float *
|
|||
uint32_t i = 0;
|
||||
for (; i < nb; i++) {
|
||||
// Load src1 columns (reused across both src0 rows)
|
||||
HVX_Vector_x8 vy0_q = hvx_vec_load_q8x4x8(y0_q + i * y_qblk_size);
|
||||
HVX_Vector_x8 vy1_q = hvx_vec_load_q8x4x8(y1_q + i * y_qblk_size);
|
||||
HVX_Vector_x8 vy0_q = hvx_vec_load_q8x4x8_full(y0_q + i * y_qblk_size);
|
||||
HVX_Vector_x8 vy1_q = hvx_vec_load_q8x4x8_full(y1_q + i * y_qblk_size);
|
||||
|
||||
// Load src0 rows (reused across both src1 columns)
|
||||
HVX_Vector_x8 r0_q = hvx_vec_load_q4x4x8(r0_x_q + i * x_qblk_size);
|
||||
HVX_Vector_x8 r1_q = hvx_vec_load_q4x4x8(r1_x_q + i * x_qblk_size);
|
||||
HVX_Vector_x8 r0_q = hvx_vec_load_q4x4x8_full(r0_x_q + i * x_qblk_size);
|
||||
HVX_Vector_x8 r1_q = hvx_vec_load_q4x4x8_full(r1_x_q + i * x_qblk_size);
|
||||
|
||||
// Compute 4 dot products: r0×c0, r0×c1, r1×c0, r1×c1
|
||||
HVX_Vector r0_c0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r0_q, vy0_q));
|
||||
|
|
@ -448,8 +547,8 @@ static void vec_dot_q4x4x2_q8x4x2_2x2(const int n, float * restrict s0, float *
|
|||
HVX_Vector r1_c1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r1_q, vy1_q));
|
||||
|
||||
// Load scales
|
||||
HVX_Vector vy0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y0_d + i * y_dblk_size));
|
||||
HVX_Vector vy1_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y1_d + i * y_dblk_size));
|
||||
HVX_Vector vy0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y0_d + i * y_dblk_size));
|
||||
HVX_Vector vy1_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y1_d + i * y_dblk_size));
|
||||
HVX_Vector r0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r0_x_d + i * x_dblk_size));
|
||||
HVX_Vector r1_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r1_x_d + i * x_dblk_size));
|
||||
|
||||
|
|
@ -473,18 +572,18 @@ static void vec_dot_q4x4x2_q8x4x2_2x2(const int n, float * restrict s0, float *
|
|||
|
||||
// Process leftovers
|
||||
if (nloe) {
|
||||
HVX_Vector_x8 vy0_q = hvx_vec_load_q8x4x8(y0_q + i * y_qblk_size);
|
||||
HVX_Vector_x8 vy1_q = hvx_vec_load_q8x4x8(y1_q + i * y_qblk_size);
|
||||
HVX_Vector_x8 r0_q = hvx_vec_load_q4x4x8(r0_x_q + i * x_qblk_size);
|
||||
HVX_Vector_x8 r1_q = hvx_vec_load_q4x4x8(r1_x_q + i * x_qblk_size);
|
||||
HVX_Vector_x8 vy0_q = hvx_vec_load_q8x4x8_partial(y0_q + i * y_qblk_size, nloe);
|
||||
HVX_Vector_x8 vy1_q = hvx_vec_load_q8x4x8_partial(y1_q + i * y_qblk_size, nloe);
|
||||
HVX_Vector_x8 r0_q = hvx_vec_load_q4x4x8_partial(r0_x_q + i * x_qblk_size, nloe);
|
||||
HVX_Vector_x8 r1_q = hvx_vec_load_q4x4x8_partial(r1_x_q + i * x_qblk_size, nloe);
|
||||
|
||||
HVX_Vector r0_c0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_nloe(r0_q, vy0_q, nloe));
|
||||
HVX_Vector r0_c1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_nloe(r0_q, vy1_q, nloe));
|
||||
HVX_Vector r1_c0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_nloe(r1_q, vy0_q, nloe));
|
||||
HVX_Vector r1_c1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_nloe(r1_q, vy1_q, nloe));
|
||||
HVX_Vector r0_c0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r0_q, vy0_q, nloe));
|
||||
HVX_Vector r0_c1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r0_q, vy1_q, nloe));
|
||||
HVX_Vector r1_c0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r1_q, vy0_q, nloe));
|
||||
HVX_Vector r1_c1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r1_q, vy1_q, nloe));
|
||||
|
||||
HVX_Vector vy0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y0_d + i * y_dblk_size));
|
||||
HVX_Vector vy1_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y1_d + i * y_dblk_size));
|
||||
HVX_Vector vy0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y0_d + i * y_dblk_size));
|
||||
HVX_Vector vy1_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y1_d + i * y_dblk_size));
|
||||
HVX_Vector r0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r0_x_d + i * x_dblk_size));
|
||||
HVX_Vector r1_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r1_x_d + i * x_dblk_size));
|
||||
|
||||
|
|
@ -545,7 +644,7 @@ static void vec_dot_q8x4x2_q8x4x2_1x1(const int n, float * restrict s0, const vo
|
|||
const uint8_t * restrict y_d = ((const uint8_t *) vy0 + y_qrow_size); // then scales
|
||||
|
||||
// Row sum (sf)
|
||||
HVX_Vector r0_sum = Q6_V_vsplat_R(0);
|
||||
HVX_Vector r0_sum = Q6_V_vzero();
|
||||
|
||||
// Multiply and accumulate into int32.
|
||||
// Compute combined scale (fp32).
|
||||
|
|
@ -556,12 +655,12 @@ static void vec_dot_q8x4x2_q8x4x2_1x1(const int n, float * restrict s0, const vo
|
|||
|
||||
uint32_t i = 0;
|
||||
for (; i < nb; i++) {
|
||||
HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8(y_q + i * y_qblk_size);
|
||||
HVX_Vector_x8 r0_q = hvx_vec_load_q8x4x8(r0_x_q + i * x_qblk_size);
|
||||
HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8_full(y_q + i * y_qblk_size);
|
||||
HVX_Vector_x8 r0_q = hvx_vec_load_q8x4x8_full(r0_x_q + i * x_qblk_size);
|
||||
|
||||
HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r0_q, vy_q));
|
||||
|
||||
HVX_Vector vy_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y_d + i * y_dblk_size));
|
||||
HVX_Vector vy_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y_d + i * y_dblk_size));
|
||||
HVX_Vector r0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r0_x_d + i * x_dblk_size));
|
||||
|
||||
HVX_Vector r0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy_d)));
|
||||
|
|
@ -571,19 +670,19 @@ static void vec_dot_q8x4x2_q8x4x2_1x1(const int n, float * restrict s0, const vo
|
|||
r0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_fa, r0_sum));
|
||||
}
|
||||
|
||||
// Process leftovers, we still load full 4x4x2 block but zero out unused scales/blocks
|
||||
// Process leftovers
|
||||
if (nloe) {
|
||||
HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8(y_q + i * y_qblk_size);
|
||||
HVX_Vector_x8 r0_q = hvx_vec_load_q8x4x8(r0_x_q + i * x_qblk_size);
|
||||
HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8_partial(y_q + i * y_qblk_size, nloe);
|
||||
HVX_Vector_x8 r0_q = hvx_vec_load_q8x4x8_partial(r0_x_q + i * x_qblk_size, nloe);
|
||||
|
||||
HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_nloe(r0_q, vy_q, nloe));
|
||||
HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r0_q, vy_q, nloe));
|
||||
|
||||
HVX_Vector vy_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y_d + i * y_dblk_size));
|
||||
HVX_Vector vy_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y_d + i * y_dblk_size));
|
||||
HVX_Vector r0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r0_x_d + i * x_dblk_size));
|
||||
|
||||
HVX_Vector r0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy_d)));
|
||||
|
||||
// Zero out unused scales
|
||||
// Zero out unused elements
|
||||
HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe / 8);
|
||||
r0_dd = Q6_V_vand_QV(bmask, r0_dd);
|
||||
r0_ia = Q6_V_vand_QV(bmask, r0_ia);
|
||||
|
|
@ -625,8 +724,8 @@ static void vec_dot_q8x4x2_q8x4x2_2x1(const int n, float * restrict s0,
|
|||
const uint8_t * restrict y_d = ((const uint8_t *) vy0 + y_qrow_size); // then scales
|
||||
|
||||
// Row sum (qf32)
|
||||
HVX_Vector r0_sum = Q6_V_vsplat_R(0);
|
||||
HVX_Vector r1_sum = Q6_V_vsplat_R(0);
|
||||
HVX_Vector r0_sum = Q6_V_vzero();
|
||||
HVX_Vector r1_sum = Q6_V_vzero();
|
||||
|
||||
// Multiply and accumulate into int32.
|
||||
// Compute combined scale (fp32).
|
||||
|
|
@ -637,14 +736,14 @@ static void vec_dot_q8x4x2_q8x4x2_2x1(const int n, float * restrict s0,
|
|||
|
||||
uint32_t i = 0;
|
||||
for (; i < nb; i++) {
|
||||
HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8(y_q + i * y_qblk_size);
|
||||
HVX_Vector_x8 r0_q = hvx_vec_load_q8x4x8(r0_x_q + i * x_qblk_size);
|
||||
HVX_Vector_x8 r1_q = hvx_vec_load_q8x4x8(r1_x_q + i * x_qblk_size);
|
||||
HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8_full(y_q + i * y_qblk_size);
|
||||
HVX_Vector_x8 r0_q = hvx_vec_load_q8x4x8_full(r0_x_q + i * x_qblk_size);
|
||||
HVX_Vector_x8 r1_q = hvx_vec_load_q8x4x8_full(r1_x_q + i * x_qblk_size);
|
||||
|
||||
HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r0_q, vy_q));
|
||||
HVX_Vector r1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r1_q, vy_q));
|
||||
|
||||
HVX_Vector vy_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y_d + i * y_dblk_size));
|
||||
HVX_Vector vy_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y_d + i * y_dblk_size));
|
||||
HVX_Vector r0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r0_x_d + i * x_dblk_size));
|
||||
HVX_Vector r1_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r1_x_d + i * x_dblk_size));
|
||||
|
||||
|
|
@ -658,14 +757,14 @@ static void vec_dot_q8x4x2_q8x4x2_2x1(const int n, float * restrict s0,
|
|||
r1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_fa, r1_sum));
|
||||
}
|
||||
|
||||
// Process leftovers, we still load full 4x4x2 block but zero out unused scales/blocks
|
||||
// Process leftovers
|
||||
if (nloe) {
|
||||
HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8(y_q + i * y_qblk_size);
|
||||
HVX_Vector_x8 r0_q = hvx_vec_load_q8x4x8(r0_x_q + i * x_qblk_size);
|
||||
HVX_Vector_x8 r1_q = hvx_vec_load_q8x4x8(r1_x_q + i * x_qblk_size);
|
||||
HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8_partial(y_q + i * y_qblk_size, nloe);
|
||||
HVX_Vector_x8 r0_q = hvx_vec_load_q8x4x8_partial(r0_x_q + i * x_qblk_size, nloe);
|
||||
HVX_Vector_x8 r1_q = hvx_vec_load_q8x4x8_partial(r1_x_q + i * x_qblk_size, nloe);
|
||||
|
||||
HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_nloe(r0_q, vy_q, nloe));
|
||||
HVX_Vector r1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_nloe(r1_q, vy_q, nloe));
|
||||
HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r0_q, vy_q, nloe));
|
||||
HVX_Vector r1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r1_q, vy_q, nloe));
|
||||
|
||||
HVX_Vector vy_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y_d + i * y_dblk_size));
|
||||
HVX_Vector r0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r0_x_d + i * x_dblk_size));
|
||||
|
|
@ -674,7 +773,7 @@ static void vec_dot_q8x4x2_q8x4x2_2x1(const int n, float * restrict s0,
|
|||
HVX_Vector r0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy_d)));
|
||||
HVX_Vector r1_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r1_d, vy_d)));
|
||||
|
||||
// Zero out unused scales
|
||||
// Zero out unused elements
|
||||
HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe / 8);
|
||||
r0_dd = Q6_V_vand_QV(bmask, r0_dd);
|
||||
r1_dd = Q6_V_vand_QV(bmask, r1_dd);
|
||||
|
|
@ -722,10 +821,10 @@ static void vec_dot_q8x4x2_q8x4x2_2x2(const int n, float * restrict s0, float *
|
|||
const uint8_t * restrict y1_d = ((const uint8_t *) vy1) + y_qrow_size; // then scales
|
||||
|
||||
// Row sums (sf) - 4 accumulators for 2×2 tile
|
||||
HVX_Vector r0_c0_sum = Q6_V_vsplat_R(0);
|
||||
HVX_Vector r0_c1_sum = Q6_V_vsplat_R(0);
|
||||
HVX_Vector r1_c0_sum = Q6_V_vsplat_R(0);
|
||||
HVX_Vector r1_c1_sum = Q6_V_vsplat_R(0);
|
||||
HVX_Vector r0_c0_sum = Q6_V_vzero();
|
||||
HVX_Vector r0_c1_sum = Q6_V_vzero();
|
||||
HVX_Vector r1_c0_sum = Q6_V_vzero();
|
||||
HVX_Vector r1_c1_sum = Q6_V_vzero();
|
||||
|
||||
const uint32_t nb = n / qk; // num full blocks
|
||||
const uint32_t nloe = n % qk; // num leftover elements
|
||||
|
|
@ -733,12 +832,12 @@ static void vec_dot_q8x4x2_q8x4x2_2x2(const int n, float * restrict s0, float *
|
|||
uint32_t i = 0;
|
||||
for (; i < nb; i++) {
|
||||
// Load src1 columns (reused across both src0 rows)
|
||||
HVX_Vector_x8 vy0_q = hvx_vec_load_q8x4x8(y0_q + i * y_qblk_size);
|
||||
HVX_Vector_x8 vy1_q = hvx_vec_load_q8x4x8(y1_q + i * y_qblk_size);
|
||||
HVX_Vector_x8 vy0_q = hvx_vec_load_q8x4x8_full(y0_q + i * y_qblk_size);
|
||||
HVX_Vector_x8 vy1_q = hvx_vec_load_q8x4x8_full(y1_q + i * y_qblk_size);
|
||||
|
||||
// Load src0 rows (reused across both src1 columns)
|
||||
HVX_Vector_x8 r0_q = hvx_vec_load_q8x4x8(r0_x_q + i * x_qblk_size);
|
||||
HVX_Vector_x8 r1_q = hvx_vec_load_q8x4x8(r1_x_q + i * x_qblk_size);
|
||||
HVX_Vector_x8 r0_q = hvx_vec_load_q8x4x8_full(r0_x_q + i * x_qblk_size);
|
||||
HVX_Vector_x8 r1_q = hvx_vec_load_q8x4x8_full(r1_x_q + i * x_qblk_size);
|
||||
|
||||
// Compute 4 dot products: r0×c0, r0×c1, r1×c0, r1×c1
|
||||
HVX_Vector r0_c0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r0_q, vy0_q));
|
||||
|
|
@ -747,8 +846,8 @@ static void vec_dot_q8x4x2_q8x4x2_2x2(const int n, float * restrict s0, float *
|
|||
HVX_Vector r1_c1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r1_q, vy1_q));
|
||||
|
||||
// Load scales
|
||||
HVX_Vector vy0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y0_d + i * y_dblk_size));
|
||||
HVX_Vector vy1_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y1_d + i * y_dblk_size));
|
||||
HVX_Vector vy0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y0_d + i * y_dblk_size));
|
||||
HVX_Vector vy1_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y1_d + i * y_dblk_size));
|
||||
HVX_Vector r0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r0_x_d + i * x_dblk_size));
|
||||
HVX_Vector r1_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r1_x_d + i * x_dblk_size));
|
||||
|
||||
|
|
@ -772,18 +871,18 @@ static void vec_dot_q8x4x2_q8x4x2_2x2(const int n, float * restrict s0, float *
|
|||
|
||||
// Process leftovers
|
||||
if (nloe) {
|
||||
HVX_Vector_x8 vy0_q = hvx_vec_load_q8x4x8(y0_q + i * y_qblk_size);
|
||||
HVX_Vector_x8 vy1_q = hvx_vec_load_q8x4x8(y1_q + i * y_qblk_size);
|
||||
HVX_Vector_x8 r0_q = hvx_vec_load_q8x4x8(r0_x_q + i * x_qblk_size);
|
||||
HVX_Vector_x8 r1_q = hvx_vec_load_q8x4x8(r1_x_q + i * x_qblk_size);
|
||||
HVX_Vector_x8 vy0_q = hvx_vec_load_q8x4x8_partial(y0_q + i * y_qblk_size, nloe);
|
||||
HVX_Vector_x8 vy1_q = hvx_vec_load_q8x4x8_partial(y1_q + i * y_qblk_size, nloe);
|
||||
HVX_Vector_x8 r0_q = hvx_vec_load_q8x4x8_partial(r0_x_q + i * x_qblk_size, nloe);
|
||||
HVX_Vector_x8 r1_q = hvx_vec_load_q8x4x8_partial(r1_x_q + i * x_qblk_size, nloe);
|
||||
|
||||
HVX_Vector r0_c0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_nloe(r0_q, vy0_q, nloe));
|
||||
HVX_Vector r0_c1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_nloe(r0_q, vy1_q, nloe));
|
||||
HVX_Vector r1_c0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_nloe(r1_q, vy0_q, nloe));
|
||||
HVX_Vector r1_c1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_nloe(r1_q, vy1_q, nloe));
|
||||
HVX_Vector r0_c0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r0_q, vy0_q, nloe));
|
||||
HVX_Vector r0_c1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r0_q, vy1_q, nloe));
|
||||
HVX_Vector r1_c0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r1_q, vy0_q, nloe));
|
||||
HVX_Vector r1_c1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r1_q, vy1_q, nloe));
|
||||
|
||||
HVX_Vector vy0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y0_d + i * y_dblk_size));
|
||||
HVX_Vector vy1_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y1_d + i * y_dblk_size));
|
||||
HVX_Vector vy0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y0_d + i * y_dblk_size));
|
||||
HVX_Vector vy1_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y1_d + i * y_dblk_size));
|
||||
HVX_Vector r0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r0_x_d + i * x_dblk_size));
|
||||
HVX_Vector r1_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r1_x_d + i * x_dblk_size));
|
||||
|
||||
|
|
@ -792,7 +891,7 @@ static void vec_dot_q8x4x2_q8x4x2_2x2(const int n, float * restrict s0, float *
|
|||
HVX_Vector r1_c0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r1_d, vy0_d)));
|
||||
HVX_Vector r1_c1_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r1_d, vy1_d)));
|
||||
|
||||
// Zero out unused scales
|
||||
// Zero out unused elements
|
||||
HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe / 8);
|
||||
r0_c0_dd = Q6_V_vand_QV(bmask, r0_c0_dd);
|
||||
r0_c1_dd = Q6_V_vand_QV(bmask, r0_c1_dd);
|
||||
|
|
@ -844,7 +943,7 @@ static void vec_dot_mxfp4x4x2_q8x4x2_1x1(const int n, float * restrict s0, const
|
|||
const uint8_t * restrict y_d = ((const uint8_t *) vy0 + y_qrow_size); // then scales
|
||||
|
||||
// Row sum (sf)
|
||||
HVX_Vector r0_sum = Q6_V_vsplat_R(0);
|
||||
HVX_Vector r0_sum = Q6_V_vzero();
|
||||
|
||||
// Multiply and accumulate into int32.
|
||||
// Compute combined scale (fp32).
|
||||
|
|
@ -855,8 +954,8 @@ static void vec_dot_mxfp4x4x2_q8x4x2_1x1(const int n, float * restrict s0, const
|
|||
|
||||
uint32_t i = 0;
|
||||
for (; i < nb; i++) {
|
||||
HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8(y_q + i * y_qblk_size);
|
||||
HVX_Vector_x8 r0_q = hvx_vec_load_mxfp4x4x8(r0_x_q + i * x_qblk_size);
|
||||
HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8_full( y_q + i * y_qblk_size);
|
||||
HVX_Vector_x8 r0_q = hvx_vec_load_mxfp4x4x8_full(r0_x_q + i * x_qblk_size);
|
||||
|
||||
HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r0_q, vy_q));
|
||||
|
||||
|
|
@ -887,12 +986,12 @@ static void vec_dot_mxfp4x4x2_q8x4x2_1x1(const int n, float * restrict s0, const
|
|||
|
||||
// Process leftovers
|
||||
if (nloe) {
|
||||
HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8(y_q + i * y_qblk_size);
|
||||
HVX_Vector_x8 r0_q = hvx_vec_load_mxfp4x4x8(r0_x_q + i * x_qblk_size);
|
||||
HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8_partial( y_q + i * y_qblk_size, nloe);
|
||||
HVX_Vector_x8 r0_q = hvx_vec_load_mxfp4x4x8_partial(r0_x_q + i * x_qblk_size, nloe);
|
||||
|
||||
HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r0_q, vy_q));
|
||||
HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r0_q, vy_q, nloe));
|
||||
|
||||
HVX_Vector vy_d = *(const HVX_UVector *) (y_d + i * y_dblk_size);
|
||||
HVX_Vector vy_d = *(const HVX_UVector *) (y_d + i * y_dblk_size);
|
||||
HVX_Vector r0_d = *(const HVX_UVector *) (r0_x_d + i * x_dblk_size);
|
||||
|
||||
// Convert vy_d from fp16 to fp32 while applying 0.5 scaling which is used for e8m0 halving
|
||||
|
|
@ -954,8 +1053,8 @@ static void vec_dot_mxfp4x4x2_q8x4x2_2x1(const int n, float * restrict s0,
|
|||
const uint8_t * restrict y_d = ((const uint8_t *) vy0) + y_qrow_size; // then scales
|
||||
|
||||
// Row sum (sf)
|
||||
HVX_Vector r0_sum = Q6_V_vsplat_R(0);
|
||||
HVX_Vector r1_sum = Q6_V_vsplat_R(0);
|
||||
HVX_Vector r0_sum = Q6_V_vzero();
|
||||
HVX_Vector r1_sum = Q6_V_vzero();
|
||||
|
||||
// Multiply and accumulate into int32.
|
||||
// Compute combined scale (fp32).
|
||||
|
|
@ -966,9 +1065,9 @@ static void vec_dot_mxfp4x4x2_q8x4x2_2x1(const int n, float * restrict s0,
|
|||
|
||||
uint32_t i = 0;
|
||||
for (; i < nb; i++) {
|
||||
HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8(y_q + i * y_qblk_size);
|
||||
HVX_Vector_x8 r0_q = hvx_vec_load_mxfp4x4x8(r0_x_q + i * x_qblk_size);
|
||||
HVX_Vector_x8 r1_q = hvx_vec_load_mxfp4x4x8(r1_x_q + i * x_qblk_size);
|
||||
HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8_full( y_q + i * y_qblk_size);
|
||||
HVX_Vector_x8 r0_q = hvx_vec_load_mxfp4x4x8_full(r0_x_q + i * x_qblk_size);
|
||||
HVX_Vector_x8 r1_q = hvx_vec_load_mxfp4x4x8_full(r1_x_q + i * x_qblk_size);
|
||||
|
||||
HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r0_q, vy_q));
|
||||
HVX_Vector r1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r1_q, vy_q));
|
||||
|
|
@ -1007,14 +1106,14 @@ static void vec_dot_mxfp4x4x2_q8x4x2_2x1(const int n, float * restrict s0,
|
|||
|
||||
// Process leftovers
|
||||
if (nloe) {
|
||||
HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8(y_q + i * y_qblk_size);
|
||||
HVX_Vector_x8 r0_q = hvx_vec_load_mxfp4x4x8(r0_x_q + i * x_qblk_size);
|
||||
HVX_Vector_x8 r1_q = hvx_vec_load_mxfp4x4x8(r1_x_q + i * x_qblk_size);
|
||||
HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8_partial( y_q + i * y_qblk_size, nloe);
|
||||
HVX_Vector_x8 r0_q = hvx_vec_load_mxfp4x4x8_partial(r0_x_q + i * x_qblk_size, nloe);
|
||||
HVX_Vector_x8 r1_q = hvx_vec_load_mxfp4x4x8_partial(r1_x_q + i * x_qblk_size, nloe);
|
||||
|
||||
HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r0_q, vy_q));
|
||||
HVX_Vector r1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r1_q, vy_q));
|
||||
|
||||
HVX_Vector vy_d = *(const HVX_UVector *) (y_d + i * y_dblk_size);
|
||||
HVX_Vector vy_d = *(const HVX_UVector *) (y_d + i * y_dblk_size);
|
||||
HVX_Vector r0_d = *(const HVX_UVector *) (r0_x_d + i * x_dblk_size);
|
||||
HVX_Vector r1_d = *(const HVX_UVector *) (r1_x_d + i * x_dblk_size);
|
||||
|
||||
|
|
@ -1087,10 +1186,10 @@ static void vec_dot_mxfp4x4x2_q8x4x2_2x2(const int n, float * restrict s0, float
|
|||
const uint8_t * restrict y1_d = ((const uint8_t *) vy1) + y_qrow_size; // then scales
|
||||
|
||||
// Row sums (sf) - 4 accumulators for 2×2 tile
|
||||
HVX_Vector r0_c0_sum = Q6_V_vsplat_R(0);
|
||||
HVX_Vector r0_c1_sum = Q6_V_vsplat_R(0);
|
||||
HVX_Vector r1_c0_sum = Q6_V_vsplat_R(0);
|
||||
HVX_Vector r1_c1_sum = Q6_V_vsplat_R(0);
|
||||
HVX_Vector r0_c0_sum = Q6_V_vzero();
|
||||
HVX_Vector r0_c1_sum = Q6_V_vzero();
|
||||
HVX_Vector r1_c0_sum = Q6_V_vzero();
|
||||
HVX_Vector r1_c1_sum = Q6_V_vzero();
|
||||
|
||||
const uint32_t nb = n / qk; // num full blocks
|
||||
const uint32_t nloe = n % qk; // num leftover elements
|
||||
|
|
@ -1098,12 +1197,12 @@ static void vec_dot_mxfp4x4x2_q8x4x2_2x2(const int n, float * restrict s0, float
|
|||
uint32_t i = 0;
|
||||
for (; i < nb; i++) {
|
||||
// Load src1 columns (reused across both src0 rows)
|
||||
HVX_Vector_x8 vy0_q = hvx_vec_load_q8x4x8(y0_q + i * y_qblk_size);
|
||||
HVX_Vector_x8 vy1_q = hvx_vec_load_q8x4x8(y1_q + i * y_qblk_size);
|
||||
HVX_Vector_x8 vy0_q = hvx_vec_load_q8x4x8_full(y0_q + i * y_qblk_size);
|
||||
HVX_Vector_x8 vy1_q = hvx_vec_load_q8x4x8_full(y1_q + i * y_qblk_size);
|
||||
|
||||
// Load src0 rows (reused across both src1 columns)
|
||||
HVX_Vector_x8 r0_q = hvx_vec_load_mxfp4x4x8(r0_x_q + i * x_qblk_size);
|
||||
HVX_Vector_x8 r1_q = hvx_vec_load_mxfp4x4x8(r1_x_q + i * x_qblk_size);
|
||||
HVX_Vector_x8 r0_q = hvx_vec_load_mxfp4x4x8_full(r0_x_q + i * x_qblk_size);
|
||||
HVX_Vector_x8 r1_q = hvx_vec_load_mxfp4x4x8_full(r1_x_q + i * x_qblk_size);
|
||||
|
||||
// Compute 4 dot products: r0×c0, r0×c1, r1×c0, r1×c1
|
||||
HVX_Vector r0_c0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r0_q, vy0_q));
|
||||
|
|
@ -1157,15 +1256,15 @@ static void vec_dot_mxfp4x4x2_q8x4x2_2x2(const int n, float * restrict s0, float
|
|||
|
||||
// Process leftovers
|
||||
if (nloe) {
|
||||
HVX_Vector_x8 vy0_q = hvx_vec_load_q8x4x8(y0_q + i * y_qblk_size);
|
||||
HVX_Vector_x8 vy1_q = hvx_vec_load_q8x4x8(y1_q + i * y_qblk_size);
|
||||
HVX_Vector_x8 r0_q = hvx_vec_load_mxfp4x4x8(r0_x_q + i * x_qblk_size);
|
||||
HVX_Vector_x8 r1_q = hvx_vec_load_mxfp4x4x8(r1_x_q + i * x_qblk_size);
|
||||
HVX_Vector_x8 vy0_q = hvx_vec_load_q8x4x8_partial( y0_q + i * y_qblk_size, nloe);
|
||||
HVX_Vector_x8 vy1_q = hvx_vec_load_q8x4x8_partial( y1_q + i * y_qblk_size, nloe);
|
||||
HVX_Vector_x8 r0_q = hvx_vec_load_mxfp4x4x8_partial(r0_x_q + i * x_qblk_size, nloe);
|
||||
HVX_Vector_x8 r1_q = hvx_vec_load_mxfp4x4x8_partial(r1_x_q + i * x_qblk_size, nloe);
|
||||
|
||||
HVX_Vector r0_c0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_nloe(r0_q, vy0_q, nloe));
|
||||
HVX_Vector r0_c1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_nloe(r0_q, vy1_q, nloe));
|
||||
HVX_Vector r1_c0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_nloe(r1_q, vy0_q, nloe));
|
||||
HVX_Vector r1_c1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_nloe(r1_q, vy1_q, nloe));
|
||||
HVX_Vector r0_c0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r0_q, vy0_q, nloe));
|
||||
HVX_Vector r0_c1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r0_q, vy1_q, nloe));
|
||||
HVX_Vector r1_c0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r1_q, vy0_q, nloe));
|
||||
HVX_Vector r1_c1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r1_q, vy1_q, nloe));
|
||||
|
||||
HVX_Vector vy0_d = *(const HVX_UVector *) (y0_d + i * y_dblk_size);
|
||||
HVX_Vector vy1_d = *(const HVX_UVector *) (y1_d + i * y_dblk_size);
|
||||
|
|
@ -1234,7 +1333,7 @@ static void vec_dot_f16_f16_aa_1x1(const int n, float * restrict s, const void *
|
|||
uint32_t nvec = n / VLEN_FP16; // num full fp16 hvx vectors
|
||||
uint32_t nloe = n % VLEN_FP16; // leftover elements
|
||||
|
||||
HVX_VectorPair rsum_p = Q6_W_vcombine_VV(Q6_V_vsplat_R(0), Q6_V_vsplat_R(0));
|
||||
HVX_VectorPair rsum_p = Q6_W_vzero();
|
||||
|
||||
uint32_t i = 0;
|
||||
|
||||
|
|
@ -1264,8 +1363,8 @@ static void vec_dot_f16_f16_aa_2x1(const int n, float * restrict s0,
|
|||
uint32_t nvec = n / VLEN_FP16;
|
||||
uint32_t nloe = n % VLEN_FP16;
|
||||
|
||||
HVX_VectorPair rsum0_p = Q6_W_vcombine_VV(Q6_V_vsplat_R(0), Q6_V_vsplat_R(0));
|
||||
HVX_VectorPair rsum1_p = Q6_W_vcombine_VV(Q6_V_vsplat_R(0), Q6_V_vsplat_R(0));
|
||||
HVX_VectorPair rsum0_p = Q6_W_vzero();
|
||||
HVX_VectorPair rsum1_p = Q6_W_vzero();
|
||||
|
||||
uint32_t i = 0;
|
||||
|
||||
|
|
@ -1303,10 +1402,10 @@ static void vec_dot_f16_f16_aa_2x2(const int n, float * restrict s0, float * res
|
|||
uint32_t nloe = n % VLEN_FP16;
|
||||
|
||||
// Row sums (sf) - 4 accumulators for 2×2 tile
|
||||
HVX_VectorPair r0_c0_sum_p = Q6_W_vcombine_VV(Q6_V_vsplat_R(0), Q6_V_vsplat_R(0));
|
||||
HVX_VectorPair r0_c1_sum_p = Q6_W_vcombine_VV(Q6_V_vsplat_R(0), Q6_V_vsplat_R(0));
|
||||
HVX_VectorPair r1_c0_sum_p = Q6_W_vcombine_VV(Q6_V_vsplat_R(0), Q6_V_vsplat_R(0));
|
||||
HVX_VectorPair r1_c1_sum_p = Q6_W_vcombine_VV(Q6_V_vsplat_R(0), Q6_V_vsplat_R(0));
|
||||
HVX_VectorPair r0_c0_sum_p = Q6_W_vzero();
|
||||
HVX_VectorPair r0_c1_sum_p = Q6_W_vzero();
|
||||
HVX_VectorPair r1_c0_sum_p = Q6_W_vzero();
|
||||
HVX_VectorPair r1_c1_sum_p = Q6_W_vzero();
|
||||
|
||||
uint32_t i = 0;
|
||||
|
||||
|
|
@ -1358,7 +1457,7 @@ static void vec_dot_f16_f16_uu_1x1(const int n, float * restrict s, const void *
|
|||
uint32_t nvec = n / VLEN_FP16; // num full fp16 hvx vectors
|
||||
uint32_t nloe = n % VLEN_FP16; // leftover elements
|
||||
|
||||
HVX_Vector rsum = Q6_V_vsplat_R(0);
|
||||
HVX_Vector rsum = Q6_V_vzero();
|
||||
|
||||
uint32_t i = 0;
|
||||
|
||||
|
|
@ -1388,9 +1487,9 @@ static void vec_dot_f16_f32_uu_1x1(const int n, float * restrict s, const void *
|
|||
uint32_t nvec = n / VLEN_FP16; // num full fp16 hvx vectors
|
||||
uint32_t nloe = n % VLEN_FP16; // leftover elements
|
||||
|
||||
const HVX_Vector zero = Q6_V_vsplat_R(0);
|
||||
const HVX_Vector zero = Q6_V_vzero();
|
||||
|
||||
HVX_Vector rsum = Q6_V_vsplat_R(0);
|
||||
HVX_Vector rsum = Q6_V_vzero();
|
||||
|
||||
uint32_t i = 0;
|
||||
|
||||
|
|
@ -1973,7 +2072,7 @@ static inline void quantize_block_f32_q8x1(float * restrict x, uint8_t * restric
|
|||
assert((unsigned long) y_q % 128 == 0);
|
||||
|
||||
HVX_Vector * vx = (HVX_Vector *) x;
|
||||
HVX_Vector zero = Q6_V_vsplat_R(0);
|
||||
HVX_Vector zero = Q6_V_vzero();
|
||||
|
||||
// Use reduce max fp32 to find max(abs(e)) first
|
||||
HVX_Vector vmax0_sf = hvx_vec_reduce_max_f32(hvx_vec_abs_f32(vx[0]));
|
||||
|
|
@ -2034,7 +2133,7 @@ static inline void quantize_block_f32_q8x2(float * restrict x, uint8_t * restric
|
|||
HVX_Vector * vx = (HVX_Vector *) x;
|
||||
|
||||
// Load and convert into QF32
|
||||
HVX_Vector zero = Q6_V_vsplat_R(0);
|
||||
HVX_Vector zero = Q6_V_vzero();
|
||||
HVX_Vector vx0_qf = Q6_Vqf32_vsub_VsfVsf(vx[0], zero); // 32 elements
|
||||
HVX_Vector vx1_qf = Q6_Vqf32_vsub_VsfVsf(vx[1], zero); // 32 elements
|
||||
HVX_Vector vx2_qf = Q6_Vqf32_vsub_VsfVsf(vx[2], zero); // 32 elements
|
||||
|
|
@ -2077,7 +2176,7 @@ static inline void quantize_block_f32_q8x4(float * restrict x, uint8_t * restric
|
|||
HVX_Vector * vx = (HVX_Vector *) x;
|
||||
|
||||
// Load and convert into QF32
|
||||
HVX_Vector zero = Q6_V_vsplat_R(0);
|
||||
HVX_Vector zero = Q6_V_vzero();
|
||||
HVX_Vector vx0_qf = Q6_Vqf32_vsub_VsfVsf(vx[0], zero); // 32 elements
|
||||
HVX_Vector vx1_qf = Q6_Vqf32_vsub_VsfVsf(vx[1], zero); // 32 elements
|
||||
HVX_Vector vx2_qf = Q6_Vqf32_vsub_VsfVsf(vx[2], zero); // 32 elements
|
||||
|
|
|
|||
Loading…
Reference in New Issue