Align RVV implementation plan with xxHash upstream community

ixgbe 2026-01-06 14:50:53 +08:00
parent 818e7d2cf7
commit 74dbb2eef2
1 changed file with 112 additions and 96 deletions


@@ -5644,114 +5644,130 @@ XXH_mult32to64_add64(xxh_u64 lhs, xxh_u64 rhs, xxh_u64 acc)
#endif
#if (XXH_VECTOR == XXH_RVV)
#define XXH_CONCAT2(X, Y) X ## Y
#define XXH_CONCAT(X, Y) XXH_CONCAT2(X, Y)
#if ((defined(__GNUC__) && !defined(__clang__) && __GNUC__ < 13) || \
(defined(__clang__) && __clang_major__ < 16))
#define XXH_RVOP(op) op
#define XXH_RVCAST(op) XXH_CONCAT(vreinterpret_v_, op)
#else
#define XXH_RVOP(op) XXH_CONCAT(__riscv_, op)
#define XXH_RVCAST(op) XXH_CONCAT(__riscv_vreinterpret_v_, op)
#endif
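/*
 * Illustrative expansion of the dispatch macros above (a sketch, not part of
 * the patch). With a toolchain shipping the ratified v1.0 intrinsics
 * (GCC >= 13 or Clang >= 16):
 *
 *   XXH_RVOP(vle64_v_u64m2)(ptr, vl)  ->  __riscv_vle64_v_u64m2(ptr, vl)
 *   XXH_RVCAST(u8m2_u64m2)(vec)       ->  __riscv_vreinterpret_v_u8m2_u64m2(vec)
 *
 * On older toolchains the same calls expand to the unprefixed
 * pre-ratification names vle64_v_u64m2 and vreinterpret_v_u8m2_u64m2.
 */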
XXH_FORCE_INLINE void
XXH3_accumulate_512_rvv(void* XXH_RESTRICT acc,
const void* XXH_RESTRICT input,
const void* XXH_RESTRICT secret)
XXH3_accumulate_512_rvv( void* XXH_RESTRICT acc,
const void* XXH_RESTRICT input,
const void* XXH_RESTRICT secret)
{
XXH_ALIGN(XXH_ACC_ALIGN) uint64_t aligned_input[XXH_ACC_NB];
XXH_ALIGN(XXH_ACC_ALIGN) uint64_t aligned_secret[XXH_ACC_NB];
XXH_ALIGN(XXH_ACC_ALIGN) uint64_t aligned_acc[XXH_ACC_NB];
XXH_ASSERT((((size_t)acc) & 63) == 0);
{
// Try to set the vector length to 512 bits (8 x 64-bit lanes).
// If this length is unavailable, the maximum available length is used instead.
size_t vl = XXH_RVOP(vsetvl_e64m2)(8);
memcpy(aligned_input, input, XXH_STRIPE_LEN);
memcpy(aligned_secret, secret, XXH_STRIPE_LEN);
memcpy(aligned_acc, acc, XXH_STRIPE_LEN);
uint64_t* xacc = (uint64_t*) acc;
const uint64_t* xinput = (const uint64_t*) input;
const uint64_t* xsecret = (const uint64_t*) secret;
static const uint64_t swap_mask[16] = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14};
vuint64m2_t xswap_mask = XXH_RVOP(vle64_v_u64m2)(swap_mask, vl);
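/* swap_mask = {1,0,3,2,...}: gathering with these indices swaps each
 * adjacent pair of 64-bit lanes, i.e. data_swap[i] = data_vec[i ^ 1]. */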
size_t vl = __riscv_vsetvl_e64m8(8);
vuint64m8_t data_vec = __riscv_vle64_v_u64m8(aligned_input, vl);
vuint64m8_t key_vec = __riscv_vle64_v_u64m8(aligned_secret, vl);
vuint64m8_t acc_vec = __riscv_vle64_v_u64m8(aligned_acc, vl);
vuint64m8_t data_key = __riscv_vxor_vv_u64m8(data_vec, key_vec, vl);
vuint64m8_t data_key_lo = __riscv_vand_vx_u64m8(data_key, 0xFFFFFFFFULL, vl);
vuint64m8_t data_key_hi = __riscv_vsrl_vx_u64m8(data_key, 32, vl);
vuint64m8_t idx = __riscv_vxor_vx_u64m8(__riscv_vid_v_u64m8(vl), 1, vl);
vuint64m8_t data_swap = __riscv_vrgather_vv_u64m8(data_vec, idx, vl);
acc_vec = __riscv_vadd_vv_u64m8(acc_vec, data_swap, vl);
acc_vec = __riscv_vmacc_vv_u64m8(acc_vec, data_key_lo, data_key_hi, vl);
__riscv_vse64_v_u64m8(aligned_acc, acc_vec, vl);
memcpy(acc, aligned_acc, XXH_STRIPE_LEN);
size_t i;
for (i = 0; i < XXH_STRIPE_LEN/8; i += vl) {
/* data_vec = xinput[i]; */
vuint64m2_t data_vec = XXH_RVCAST(u8m2_u64m2)(XXH_RVOP(vle8_v_u8m2)((const uint8_t*)(xinput + i), vl * 8));
/* key_vec = xsecret[i]; */
vuint64m2_t key_vec = XXH_RVCAST(u8m2_u64m2)(XXH_RVOP(vle8_v_u8m2)((const uint8_t*)(xsecret + i), vl * 8));
/* acc_vec = xacc[i]; */
vuint64m2_t acc_vec = XXH_RVOP(vle64_v_u64m2)(xacc + i, vl);
/* data_key = data_vec ^ key_vec; */
vuint64m2_t data_key = XXH_RVOP(vxor_vv_u64m2)(data_vec, key_vec, vl);
/* data_key_hi = data_key >> 32; */
vuint64m2_t data_key_hi = XXH_RVOP(vsrl_vx_u64m2)(data_key, 32, vl);
/* data_key_lo = data_key & 0xffffffff; */
vuint64m2_t data_key_lo = XXH_RVOP(vand_vx_u64m2)(data_key, 0xffffffff, vl);
/* swap high and low halves */
vuint64m2_t data_swap = XXH_RVOP(vrgather_vv_u64m2)(data_vec, xswap_mask, vl);
/* acc_vec += data_key_lo * data_key_hi; */
acc_vec = XXH_RVOP(vmacc_vv_u64m2)(acc_vec, data_key_lo, data_key_hi, vl);
/* acc_vec += data_swap; */
acc_vec = XXH_RVOP(vadd_vv_u64m2)(acc_vec, data_swap, vl);
/* xacc[i] = acc_vec; */
XXH_RVOP(vse64_v_u64m2)(xacc + i, acc_vec, vl);
}
}
}
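/*
 * Scalar sketch of what one XXH3_accumulate_512_rvv call computes per
 * 64-bit lane (illustrative only, mirroring the vectorized loop above):
 *
 *   for (i = 0; i < XXH_STRIPE_LEN/8; i++) {
 *       uint64_t data_key = xinput[i] ^ xsecret[i];
 *       xacc[i] += xinput[i ^ 1];                               // swapped add
 *       xacc[i] += (data_key & 0xFFFFFFFF) * (data_key >> 32);  // 32x32->64 multiply
 *   }
 */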
XXH_FORCE_INLINE void
XXH3_accumulate_rvv(xxh_u64* XXH_RESTRICT acc,
const xxh_u8* XXH_RESTRICT input,
const xxh_u8* XXH_RESTRICT secret,
size_t nbStripes)
{
if (nbStripes == 0) return;
XXH_ALIGN(XXH_ACC_ALIGN) uint64_t aligned_acc[XXH_ACC_NB];
memcpy(aligned_acc, acc, XXH_STRIPE_LEN);
const uint64_t* xinput = (const uint64_t*) (const void*) input;
const uint64_t* xsecret = (const uint64_t*) (const void*) secret;
XXH_ALIGN(XXH_ACC_ALIGN) uint64_t aligned_input[XXH_ACC_NB];
XXH_ALIGN(XXH_ACC_ALIGN) uint64_t aligned_secret[XXH_ACC_NB];
size_t vl = __riscv_vsetvl_e64m8(8);
vuint64m8_t vacc = __riscv_vle64_v_u64m8(aligned_acc, vl);
vuint64m8_t idx = __riscv_vxor_vx_u64m8(__riscv_vid_v_u64m8(vl), 1, vl);
do {
XXH_PREFETCH((const xxh_u8*)xinput + XXH_PREFETCH_DIST);
memcpy(aligned_input, xinput, XXH_STRIPE_LEN);
memcpy(aligned_secret, xsecret, XXH_STRIPE_LEN);
vuint64m8_t data_vec = __riscv_vle64_v_u64m8(aligned_input, vl);
vuint64m8_t key_vec = __riscv_vle64_v_u64m8(aligned_secret, vl);
vuint64m8_t data_key = __riscv_vxor_vv_u64m8(data_vec, key_vec, vl);
vuint64m8_t data_key_lo = __riscv_vand_vx_u64m8(data_key, 0xFFFFFFFFULL, vl);
vuint64m8_t data_key_hi = __riscv_vsrl_vx_u64m8(data_key, 32, vl);
vuint64m8_t data_swap = __riscv_vrgather_vv_u64m8(data_vec, idx, vl);
vacc = __riscv_vadd_vv_u64m8(vacc, data_swap, vl);
vacc = __riscv_vmacc_vv_u64m8(vacc, data_key_lo, data_key_hi, vl);
xinput += 8;
xsecret += 1;
nbStripes--;
} while (nbStripes > 0);
__riscv_vse64_v_u64m8(aligned_acc, vacc, vl);
memcpy(acc, aligned_acc, XXH_STRIPE_LEN);
}
XXH_FORCE_INLINE XXH3_ACCUMULATE_TEMPLATE(rvv)
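/*
 * Sketch of what XXH3_ACCUMULATE_TEMPLATE(rvv) generates in place of the
 * hand-rolled loop above (paraphrased from the shared template defined
 * earlier in this file):
 *
 *   XXH3_accumulate_rvv(acc, input, secret, nbStripes)
 *   {
 *       size_t n;
 *       for (n = 0; n < nbStripes; n++) {
 *           const xxh_u8* const in = input + n * XXH_STRIPE_LEN;
 *           XXH_PREFETCH(in + XXH_PREFETCH_DIST);
 *           XXH3_accumulate_512_rvv(acc, in,
 *                                   secret + n * XXH_SECRET_CONSUME_RATE);
 *       }
 *   }
 */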
XXH_FORCE_INLINE void
XXH3_scrambleAcc_rvv(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
{
XXH_ALIGN(XXH_ACC_ALIGN) uint64_t aligned_acc[XXH_ACC_NB];
XXH_ALIGN(XXH_ACC_ALIGN) uint64_t aligned_secret[XXH_ACC_NB];
memcpy(aligned_acc, acc, XXH_STRIPE_LEN);
memcpy(aligned_secret, secret, XXH_STRIPE_LEN);
size_t vl = __riscv_vsetvl_e64m8(8);
vuint64m8_t acc_vec = __riscv_vle64_v_u64m8(aligned_acc, vl);
vuint64m8_t key_vec = __riscv_vle64_v_u64m8(aligned_secret, vl);
vuint64m8_t shifted = __riscv_vsrl_vx_u64m8(acc_vec, 47, vl);
vuint64m8_t data_vec = __riscv_vxor_vv_u64m8(acc_vec, shifted, vl);
vuint64m8_t data_key = __riscv_vxor_vv_u64m8(data_vec, key_vec, vl);
acc_vec = __riscv_vmul_vx_u64m8(data_key, XXH_PRIME32_1, vl);
__riscv_vse64_v_u64m8(aligned_acc, acc_vec, vl);
memcpy(acc, aligned_acc, XXH_STRIPE_LEN);
XXH_ASSERT((((size_t)acc) & 15) == 0);
{
size_t count = XXH_STRIPE_LEN/8;
uint64_t* xacc = (uint64_t*)acc;
const uint8_t* xsecret = (const uint8_t *)secret;
size_t vl;
for (; count > 0; count -= vl, xacc += vl, xsecret += vl*8) {
vl = XXH_RVOP(vsetvl_e64m2)(count);
{
/* key_vec = xsecret[i]; */
vuint64m2_t key_vec = XXH_RVCAST(u8m2_u64m2)(XXH_RVOP(vle8_v_u8m2)(xsecret, vl*8));
/* acc_vec = xacc[i]; */
vuint64m2_t acc_vec = XXH_RVOP(vle64_v_u64m2)(xacc, vl);
/* acc_vec ^= acc_vec >> 47; */
vuint64m2_t vsrl = XXH_RVOP(vsrl_vx_u64m2)(acc_vec, 47, vl);
acc_vec = XXH_RVOP(vxor_vv_u64m2)(acc_vec, vsrl, vl);
/* acc_vec ^= key_vec; */
acc_vec = XXH_RVOP(vxor_vv_u64m2)(acc_vec, key_vec, vl);
/* acc_vec *= XXH_PRIME32_1; */
acc_vec = XXH_RVOP(vmul_vx_u64m2)(acc_vec, XXH_PRIME32_1, vl);
/* xacc[i] = acc_vec; */
XXH_RVOP(vse64_v_u64m2)(xacc, acc_vec, vl);
}
}
}
}
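/*
 * Per-lane scalar equivalent of the scramble above (illustrative only):
 *
 *   xacc[i] = (xacc[i] ^ (xacc[i] >> 47) ^ secret64[i]) * XXH_PRIME32_1;
 *
 * where secret64[i] is the i-th little-endian 64-bit word of the secret.
 */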
XXH_FORCE_INLINE void
XXH3_initCustomSecret_rvv(void* XXH_RESTRICT customSecret, xxh_u64 seed64)
{
XXH_STATIC_ASSERT(XXH_SEC_ALIGN >= 8);
XXH_ASSERT(((size_t)customSecret & 7) == 0);
(void)(&XXH_writeLE64);
{
size_t count = XXH_SECRET_DEFAULT_SIZE/8;
size_t vl;
size_t VLMAX = XXH_RVOP(vsetvlmax_e64m2)();
int64_t* cSecret = (int64_t*)customSecret;
const int64_t* kSecret = (const int64_t*)(const void*)XXH3_kSecret;
#if __riscv_v_intrinsic >= 1000000
// ratified v1.0 intrinsics version
vbool32_t mneg = XXH_RVCAST(u8m1_b32)(
XXH_RVOP(vmv_v_x_u8m1)(0xaa, XXH_RVOP(vsetvlmax_e8m1)()));
#else
// support pre-ratification intrinsics, which lack mask-to-vector casts
size_t vlmax = XXH_RVOP(vsetvlmax_e8mf4)();
vbool32_t mneg = XXH_RVOP(vmseq_vx_u8mf4_b32)(
XXH_RVOP(vand_vx_u8mf4)(
XXH_RVOP(vid_v_u8mf4)(vlmax), 1, vlmax), 1, vlmax);
#endif
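/* Either way, mneg has bit i set exactly for the odd lane indices
 * (0xaa = 0b10101010 reinterpreted as mask bits, or vid & 1 == 1),
 * so the masked vneg below negates the seed in odd lanes only. */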
vint64m2_t seed = XXH_RVOP(vmv_v_x_i64m2)((int64_t)seed64, VLMAX);
seed = XXH_RVOP(vneg_v_i64m2_mu)(mneg, seed, seed, VLMAX);
for (; count > 0; count -= vl, cSecret += vl, kSecret += vl) {
/* make sure vl=VLMAX until last iteration */
vl = XXH_RVOP(vsetvl_e64m2)(count < VLMAX ? count : VLMAX);
{
vint64m2_t src = XXH_RVOP(vle64_v_i64m2)(kSecret, vl);
vint64m2_t res = XXH_RVOP(vadd_vv_i64m2)(src, seed, vl);
XXH_RVOP(vse64_v_i64m2)(cSecret, res, vl);
}
}
}
}
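/*
 * Scalar sketch of the seeding above (illustrative only): even 64-bit
 * words add the seed, odd words subtract it, matching
 * XXH3_initCustomSecret_scalar:
 *
 *   for (i = 0; i < XXH_SECRET_DEFAULT_SIZE/8; i++) {
 *       cSecret[i] = kSecret[i] + ((i & 1) ? -(int64_t)seed64 : (int64_t)seed64);
 *   }
 */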
#endif // XXH_VECTOR == XXH_RVV
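For local testing, this code path is selected when the compiler advertises the
RVV intrinsics (the exact detection macros live earlier in xxhash.h); one
plausible invocation for a cross toolchain is:

    riscv64-linux-gnu-gcc -O3 -march=rv64gcv -c xxhash.c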
/*!
@@ -5955,7 +5971,7 @@ typedef void (*XXH3_f_initCustomSecret)(void* XXH_RESTRICT, xxh_u64);
#define XXH3_accumulate_512 XXH3_accumulate_512_rvv
#define XXH3_accumulate XXH3_accumulate_rvv
#define XXH3_scrambleAcc XXH3_scrambleAcc_rvv
#define XXH3_initCustomSecret XXH3_initCustomSecret_scalar
#define XXH3_initCustomSecret XXH3_initCustomSecret_rvv
#else /* scalar */
#define XXH3_accumulate_512 XXH3_accumulate_512_scalar