From 32a347340acb98ce346a01198a86e3cb7d0e6245 Mon Sep 17 00:00:00 2001
From: ixgbe <1113177880@qq.com>
Date: Sun, 4 Jan 2026 10:04:04 +0800
Subject: [PATCH 1/3] gguf-hash: add RVV tensor hashing using xxh3

Signed-off-by: Wang Yang
---
 examples/gguf-hash/CMakeLists.txt       |   8 ++
 examples/gguf-hash/README.md            |   1 +
 examples/gguf-hash/deps/xxhash/xxhash.h | 135 +++++++++++++++++++++++-
 examples/gguf-hash/gguf-hash.cpp        | 111 +++++++++++++++++--
 4 files changed, 248 insertions(+), 7 deletions(-)
 mode change 100644 => 100755 examples/gguf-hash/deps/xxhash/xxhash.h
 mode change 100644 => 100755 examples/gguf-hash/gguf-hash.cpp

diff --git a/examples/gguf-hash/CMakeLists.txt b/examples/gguf-hash/CMakeLists.txt
index 15c5c68c6f..58f4fb833e 100644
--- a/examples/gguf-hash/CMakeLists.txt
+++ b/examples/gguf-hash/CMakeLists.txt
@@ -20,3 +20,11 @@ target_link_libraries(${TARGET} PRIVATE sha256)
 
 target_link_libraries(${TARGET} PRIVATE ggml ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_17)
+
+if (CMAKE_SYSTEM_PROCESSOR STREQUAL "riscv64")
+    if (GGML_RVV)
+        message(STATUS "gguf-hash: RISC-V Vector support enabled")
+        target_compile_options(xxhash PRIVATE -march=rv64gcv -mabi=lp64d)
+        target_compile_options(${TARGET} PRIVATE -march=rv64gcv -mabi=lp64d)
+    endif()
+endif()

diff --git a/examples/gguf-hash/README.md b/examples/gguf-hash/README.md
index 9871651e38..3c3db0f2ec 100644
--- a/examples/gguf-hash/README.md
+++ b/examples/gguf-hash/README.md
@@ -8,6 +8,7 @@ CLI to hash GGUF files to detect difference on a per model and per tensor level.
 - `--help`: display help message
 - `--xxh64`: use xhash 64bit hash mode (default)
 - `--sha1`: use sha1
+- `--xxh3`: use xxh3
 - `--uuid`: use uuid
 - `--sha256`: use sha256
 - `--all`: use all hash

diff --git a/examples/gguf-hash/deps/xxhash/xxhash.h b/examples/gguf-hash/deps/xxhash/xxhash.h
old mode 100644
new mode 100755
index c0fafe20d5..41f4e35b40
--- a/examples/gguf-hash/deps/xxhash/xxhash.h
+++ b/examples/gguf-hash/deps/xxhash/xxhash.h
@@ -1092,6 +1092,7 @@ XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH64_hashFromCanonical(XXH_NOESCAPE const
  * - WebAssembly SIMD128
  * - POWER8 VSX
  * - s390x ZVector
+ * - RVV
  * This can be controlled via the @ref XXH_VECTOR macro, but it automatically
  * selects the best version according to predefined macros. For the x86 family, an
  * automatic runtime dispatcher is included separately in @ref xxh_x86dispatch.c.
@@ -3751,6 +3752,8 @@ XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(XXH_NOESCAPE const XXH64_can
 #  include <immintrin.h>
 # elif defined(__SSE2__)
 #  include <emmintrin.h>
+# elif defined(__riscv_v)
+#  include <riscv_vector.h>
 # endif
 #endif
 
@@ -3873,6 +3876,7 @@ enum XXH_VECTOR_TYPE /* fake enum */ {
      */
     XXH_VSX = 5, /*!< VSX and ZVector for POWER8/z13 (64-bit) */
     XXH_SVE = 6, /*!< SVE for some ARMv8-A and ARMv9-A */
+    XXH_RVV = 7, 
 };
 /*!
  * @ingroup tuning
@@ -3895,6 +3899,7 @@ enum XXH_VECTOR_TYPE /* fake enum */ {
 # define XXH_NEON 4
 # define XXH_VSX 5
 # define XXH_SVE 6
+# define XXH_RVV 7
 #endif
 
 #ifndef XXH_VECTOR /* can be defined on command line */
@@ -3919,6 +3924,8 @@ enum XXH_VECTOR_TYPE /* fake enum */ {
      || (defined(__s390x__) && defined(__VEC__)) \
      && defined(__GNUC__) /* TODO: IBM XL */
 #   define XXH_VECTOR XXH_VSX
+# elif defined(__riscv_v)
+#   define XXH_VECTOR XXH_RVV
 # else
 #   define XXH_VECTOR XXH_SCALAR
 # endif
@@ -3935,6 +3942,12 @@ enum XXH_VECTOR_TYPE /* fake enum */ {
 # define XXH_VECTOR XXH_SCALAR
 #endif
 
+#if (XXH_VECTOR == XXH_RVV) && !defined(__riscv_v)
+# warning "__riscv_v isn't supported. Use SCALAR instead."
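+/* RVV was requested but the compiler does not define __riscv_v, so fall */
+/* back to the portable scalar path, mirroring the SVE fallback above.   */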
+# undef XXH_VECTOR
+# define XXH_VECTOR XXH_SCALAR
+#endif
+
 /*
  * Controls the alignment of the accumulator,
  * for compatibility with aligned vector loads, which are usually faster.
  */
@@ -3956,6 +3969,8 @@ enum XXH_VECTOR_TYPE /* fake enum */ {
 # define XXH_ACC_ALIGN 64
 # elif XXH_VECTOR == XXH_SVE /* sve */
 # define XXH_ACC_ALIGN 64
+# elif XXH_VECTOR == XXH_RVV /* rvv */
+# define XXH_ACC_ALIGN 64
 # endif
 #endif
 
@@ -3964,6 +3979,8 @@ enum XXH_VECTOR_TYPE /* fake enum */ {
 # define XXH_SEC_ALIGN XXH_ACC_ALIGN
 #elif XXH_VECTOR == XXH_SVE
 # define XXH_SEC_ALIGN XXH_ACC_ALIGN
+#elif XXH_VECTOR == XXH_RVV
+# define XXH_SEC_ALIGN XXH_ACC_ALIGN
 #else
 # define XXH_SEC_ALIGN 8
 #endif
@@ -5626,6 +5643,117 @@ XXH_mult32to64_add64(xxh_u64 lhs, xxh_u64 rhs, xxh_u64 acc)
 }
 #endif
 
+#if (XXH_VECTOR == XXH_RVV)
+
+XXH_FORCE_INLINE void
+XXH3_accumulate_512_rvv(void* XXH_RESTRICT acc,
+                        const void* XXH_RESTRICT input,
+                        const void* XXH_RESTRICT secret)
+{
+    XXH_ALIGN(XXH_ACC_ALIGN) uint64_t aligned_input[XXH_ACC_NB];
+    XXH_ALIGN(XXH_ACC_ALIGN) uint64_t aligned_secret[XXH_ACC_NB];
+    XXH_ALIGN(XXH_ACC_ALIGN) uint64_t aligned_acc[XXH_ACC_NB];
+
+    memcpy(aligned_input, input, XXH_STRIPE_LEN);
+    memcpy(aligned_secret, secret, XXH_STRIPE_LEN);
+    memcpy(aligned_acc, acc, XXH_STRIPE_LEN);
+
+    size_t vl = __riscv_vsetvl_e64m8(8);
+
+    vuint64m8_t data_vec = __riscv_vle64_v_u64m8(aligned_input, vl);
+    vuint64m8_t key_vec = __riscv_vle64_v_u64m8(aligned_secret, vl);
+    vuint64m8_t acc_vec = __riscv_vle64_v_u64m8(aligned_acc, vl);
+
+    vuint64m8_t data_key = __riscv_vxor_vv_u64m8(data_vec, key_vec, vl);
+
+    vuint64m8_t data_key_lo = __riscv_vand_vx_u64m8(data_key, 0xFFFFFFFFULL, vl);
+    vuint64m8_t data_key_hi = __riscv_vsrl_vx_u64m8(data_key, 32, vl);
+
+    vuint64m8_t idx = __riscv_vxor_vx_u64m8(__riscv_vid_v_u64m8(vl), 1, vl);
+    vuint64m8_t data_swap = __riscv_vrgather_vv_u64m8(data_vec, idx, vl);
+
+    acc_vec = __riscv_vadd_vv_u64m8(acc_vec, data_swap, vl);
+    acc_vec = __riscv_vmacc_vv_u64m8(acc_vec, data_key_lo, data_key_hi, vl);
+
+    __riscv_vse64_v_u64m8(aligned_acc, acc_vec, vl);
+    memcpy(acc, aligned_acc, XXH_STRIPE_LEN);
+}
+
+XXH_FORCE_INLINE void
+XXH3_accumulate_rvv(xxh_u64* XXH_RESTRICT acc,
+                    const xxh_u8* XXH_RESTRICT input,
+                    const xxh_u8* XXH_RESTRICT secret,
+                    size_t nbStripes)
+{
+    if (nbStripes == 0) return;
+
+    XXH_ALIGN(XXH_ACC_ALIGN) uint64_t aligned_acc[XXH_ACC_NB];
+    memcpy(aligned_acc, acc, XXH_STRIPE_LEN);
+
+    const uint64_t* xinput = (const uint64_t*) (const void*) input;
+    const uint64_t* xsecret = (const uint64_t*) (const void*) secret;
+
+    XXH_ALIGN(XXH_ACC_ALIGN) uint64_t aligned_input[XXH_ACC_NB];
+    XXH_ALIGN(XXH_ACC_ALIGN) uint64_t aligned_secret[XXH_ACC_NB];
+
+    size_t vl = __riscv_vsetvl_e64m8(8);
+    vuint64m8_t vacc = __riscv_vle64_v_u64m8(aligned_acc, vl);
+    vuint64m8_t idx = __riscv_vxor_vx_u64m8(__riscv_vid_v_u64m8(vl), 1, vl);
+
+    do {
+        XXH_PREFETCH((const xxh_u8*)xinput + XXH_PREFETCH_DIST);
+
+        memcpy(aligned_input, xinput, XXH_STRIPE_LEN);
+        memcpy(aligned_secret, xsecret, XXH_STRIPE_LEN);
+
+        vuint64m8_t data_vec = __riscv_vle64_v_u64m8(aligned_input, vl);
+        vuint64m8_t key_vec = __riscv_vle64_v_u64m8(aligned_secret, vl);
+
+        vuint64m8_t data_key = __riscv_vxor_vv_u64m8(data_vec, key_vec, vl);
+        vuint64m8_t data_key_lo = __riscv_vand_vx_u64m8(data_key, 0xFFFFFFFFULL, vl);
+        vuint64m8_t data_key_hi = __riscv_vsrl_vx_u64m8(data_key, 32, vl);
+
+        vuint64m8_t data_swap = __riscv_vrgather_vv_u64m8(data_vec, idx, vl);
+
+        vacc = __riscv_vadd_vv_u64m8(vacc, data_swap, vl);
+        vacc = __riscv_vmacc_vv_u64m8(vacc, data_key_lo, data_key_hi, vl);
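+
+        /* advance one 64-byte stripe: 8x u64 of input, 1x u64 of secret */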
+        xinput += 8;
+        xsecret += 1;
+        nbStripes--;
+    } while (nbStripes > 0);
+
+    __riscv_vse64_v_u64m8(aligned_acc, vacc, vl);
+    memcpy(acc, aligned_acc, XXH_STRIPE_LEN);
+}
+
+XXH_FORCE_INLINE void
+XXH3_scrambleAcc_rvv(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
+{
+    XXH_ALIGN(XXH_ACC_ALIGN) uint64_t aligned_acc[XXH_ACC_NB];
+    XXH_ALIGN(XXH_ACC_ALIGN) uint64_t aligned_secret[XXH_ACC_NB];
+
+    memcpy(aligned_acc, acc, XXH_STRIPE_LEN);
+    memcpy(aligned_secret, secret, XXH_STRIPE_LEN);
+
+    size_t vl = __riscv_vsetvl_e64m8(8);
+
+    vuint64m8_t acc_vec = __riscv_vle64_v_u64m8(aligned_acc, vl);
+    vuint64m8_t key_vec = __riscv_vle64_v_u64m8(aligned_secret, vl);
+
+    vuint64m8_t shifted = __riscv_vsrl_vx_u64m8(acc_vec, 47, vl);
+    vuint64m8_t data_vec = __riscv_vxor_vv_u64m8(acc_vec, shifted, vl);
+
+    vuint64m8_t data_key = __riscv_vxor_vv_u64m8(data_vec, key_vec, vl);
+
+    acc_vec = __riscv_vmul_vx_u64m8(data_key, XXH_PRIME32_1, vl);
+
+    __riscv_vse64_v_u64m8(aligned_acc, acc_vec, vl);
+    memcpy(acc, aligned_acc, XXH_STRIPE_LEN);
+}
+
+#endif // XXH_VECTOR == XXH_RVV
+
 /*!
  * @internal
  * @brief Scalar round for @ref XXH3_accumulate_512_scalar().
@@ -5823,8 +5951,13 @@ typedef void (*XXH3_f_initCustomSecret)(void* XXH_RESTRICT, xxh_u64);
 #define XXH3_scrambleAcc XXH3_scrambleAcc_scalar
 #define XXH3_initCustomSecret XXH3_initCustomSecret_scalar
 
-#else /* scalar */
+#elif (XXH_VECTOR == XXH_RVV)
+#define XXH3_accumulate_512 XXH3_accumulate_512_rvv
+#define XXH3_accumulate XXH3_accumulate_rvv
+#define XXH3_scrambleAcc XXH3_scrambleAcc_rvv
+#define XXH3_initCustomSecret XXH3_initCustomSecret_scalar
+
+#else /* scalar */
 #define XXH3_accumulate_512 XXH3_accumulate_512_scalar
 #define XXH3_accumulate XXH3_accumulate_scalar
 #define XXH3_scrambleAcc XXH3_scrambleAcc_scalar

diff --git a/examples/gguf-hash/gguf-hash.cpp b/examples/gguf-hash/gguf-hash.cpp
old mode 100644
new mode 100755
index 9523ec122f..0d63836d22
--- a/examples/gguf-hash/gguf-hash.cpp
+++ b/examples/gguf-hash/gguf-hash.cpp
@@ -33,6 +33,7 @@ extern "C" {
 
 #define HASH_TYPE_SHA256_STR "sha256"
 #define HASH_TYPE_SHA1_STR "sha1"
 #define HASH_TYPE_XXH64_STR "xxh64"
+#define HASH_TYPE_XXH3_STR "xxh3"
 #define HASH_TYPE_UUID_STR "uuid"
 
@@ -55,10 +56,11 @@ typedef enum {
 struct hash_params {
     std::string input;
 
-    bool xxh64 = false;
-    bool sha1 = false;
+    bool xxh64  = false;
+    bool xxh3   = false;
+    bool sha1   = false;
     bool sha256 = false;
-    bool uuid = false;
+    bool uuid   = false;
 
     bool no_layer = false;
 
@@ -68,6 +70,7 @@ struct hash_params {
 
 struct manifest_check_params {
     bool xxh64 = false;
+    bool xxh3 = false;
     bool sha1 = false;
     bool sha256 = false;
     bool uuid = false;
@@ -104,6 +107,7 @@ static void hash_print_usage(const char * executable) {
     printf("options:\n");
     printf("  -h, --help              show this help message and exit\n");
     printf("  --xxh64                 use xxh64 hash\n");
+    printf("  --xxh3                  use xxh3 hash\n");
     printf("  --sha1                  use sha1 hash\n");
     printf("  --sha256                use sha256 hash\n");
     printf("  --all                   use all hash\n");
@@ -136,6 +140,11 @@ static void hash_params_parse_ex(int argc, const char ** argv, hash_params & par
             params.xxh64 = true;
         }
 
+        if (arg == "--xxh3") {
+            arg_found = true;
+            params.xxh3 = true;
+        }
+
         if (arg == "--sha1") {
             arg_found = true;
             params.sha1 = true;
@@ -156,6 +165,7 @@ static void hash_params_parse_ex(int argc, const char ** argv, hash_params & par
             params.sha256 = true;
             params.sha1 = true;
             params.xxh64 = true;
+            params.xxh3 = true;
         }
 
         if (arg == "--no-layer") {
@@ -224,6 +234,8 @@ static bool manifest_type(const std::string & manifest_file, manifest_check_para
         manifest_check.sha1 = true;
     } else if (file_hash_type == HASH_TYPE_XXH64_STR) {
         manifest_check.xxh64 = true;
+    } else if (file_hash_type == HASH_TYPE_XXH3_STR) {
+        manifest_check.xxh3 = true;
     } else if (file_hash_type == HASH_TYPE_UUID_STR) {
         manifest_check.uuid = true;
     }
@@ -306,6 +318,19 @@ static hash_exit_code_t gguf_hash(const hash_params & hash_params) {
         }
     }
 
+    // xxh3 init
+    XXH3_state_t* xxh3_model_hash_state = NULL;
+    if (hash_params.xxh3) {
+        xxh3_model_hash_state = XXH3_createState();
+        if (xxh3_model_hash_state == NULL) {
+            abort();
+        }
+
+        if (XXH3_64bits_reset(xxh3_model_hash_state) == XXH_ERROR) {
+            abort();
+        }
+    }
+
     // sha1 init
     SHA1_CTX sha1_model_hash_ctx;
     if (hash_params.sha1) {
@@ -376,6 +401,44 @@ static hash_exit_code_t gguf_hash(const hash_params & hash_params) {
             if (XXH64_update(xxh64_model_hash_state, raw_data, n_bytes) == XXH_ERROR) abort();
         }
 
+        if (hash_params.xxh3) {
+
+            if (!hash_params.no_layer) {
+                // Per Layer Hash
+                XXH64_hash_t hash = XXH3_64bits(raw_data, n_bytes);
+
+                char hex_result[17];
+                for (int offset = 0; offset < 8; offset++) {
+                    unsigned int shift_bits_by = (8 * (8 - offset - 1));
+                    snprintf( ( hex_result + (2*offset)), sizeof(hex_result) - (2*offset), "%02x", (unsigned char) (hash >> shift_bits_by)&0xff);
+                }
+
+                if (hash_params.manifest_is_usable) {
+                    hash_manifest_result_t verify_result = manifest_verify(hash_params.manifest_file, HASH_TYPE_XXH3_STR, hex_result, tensor_layer_name);
+
+                    switch (verify_result) {
+                        case HASH_MANIFEST_NOT_FOUND:
+                            break;
+                        case HASH_MANIFEST_MISMATCH:
+                            tensor_layer_in_manifest = true;
+                            tensor_layer_has_mismatch = true;
+                            break;
+                        case HASH_MANIFEST_OK:
+                            tensor_layer_in_manifest = true;
+                            break;
+                    }
+
+                    printf("%-8s %-s %s - %s\n", HASH_TYPE_XXH3_STR, hex_result, tensor_layer_name.c_str(), hash_manifest_result_to_str(verify_result));
+                } else {
+                    printf("%-8s %-s %s\n", HASH_TYPE_XXH3_STR, hex_result, tensor_layer_name.c_str());
+                }
+            }
+
+            // Overall Model Hash
+            if (XXH3_64bits_update(xxh3_model_hash_state, raw_data, n_bytes) == XXH_ERROR) abort();
+        }
+
         if (hash_params.sha1) {
 
             if (!hash_params.no_layer) {
@@ -485,6 +548,36 @@ static hash_exit_code_t gguf_hash(const hash_params & hash_params) {
         }
     }
 
+    if (hash_params.xxh3) {
+        XXH64_hash_t const hash = XXH3_64bits_digest(xxh3_model_hash_state);
+
+        char hex_result[17];
+        for (int offset = 0; offset < 8; offset++) {
+            unsigned int shift_bits_by = (8 * (8 - offset - 1));
+            snprintf( ( hex_result + (2*offset)), sizeof(hex_result) - (2*offset), "%02x", (unsigned char) (hash >> shift_bits_by)&0xff);
+        }
+
+        if (hash_params.manifest_is_usable) {
+            hash_manifest_result_t verify_result = manifest_verify(hash_params.manifest_file, HASH_TYPE_XXH3_STR, hex_result, fname);
+
+            switch (verify_result) {
+                case HASH_MANIFEST_NOT_FOUND:
+                    break;
+                case HASH_MANIFEST_MISMATCH:
+                    model_in_manifest = true;
+                    model_has_mismatch = true;
+                    break;
+                case HASH_MANIFEST_OK:
+                    model_in_manifest = true;
+                    break;
+            }
+
+            printf("%-8s %-s %s - %s\n", HASH_TYPE_XXH3_STR, hex_result, fname.c_str(), hash_manifest_result_to_str(verify_result));
+        } else {
+            printf("%-8s %-s %s\n", HASH_TYPE_XXH3_STR, hex_result, fname.c_str());
+        }
+    }
+
     if (hash_params.sha1) {
         unsigned char result[21];
         SHA1Final(result, &sha1_model_hash_ctx);
@@ -636,7 +729,7 @@ int main(int argc, const char ** argv) {
         return HASH_EXIT_MANIFEST_FILE_ERROR;
     }
 
-    if (!manifest_check.sha256 && !manifest_check.sha1 && !manifest_check.xxh64 && !manifest_check.uuid) {
+    if (!manifest_check.sha256 && !manifest_check.sha1 && !manifest_check.xxh64 && !manifest_check.xxh3 && !manifest_check.uuid) {
         printf("ERROR manifest does not have any known hash format in %s", params.manifest_file.c_str());
         return HASH_EXIT_MANIFEST_UNKNOWN_HASH;
     }
@@ -655,6 +748,10 @@ int main(int argc, const char ** argv) {
         printf(" xxh64");
     }
 
+    if (manifest_check.xxh3) {
+        printf(" xxh3");
+    }
+
     if (manifest_check.uuid) {
         printf(" uuid");
     }
@@ -663,7 +760,7 @@ int main(int argc, const char ** argv) {
 
     // Autoselect the highest security hash if manifest is provided but
     // the user has not specifically defined the hash they care about
-    if (!params.xxh64 && !params.sha1 && !params.uuid && !params.sha256) {
+    if (!params.xxh64 && !params.xxh3 && !params.sha1 && !params.uuid && !params.sha256) {
         // User has not selected a specific value, pick most secure hash
         if (manifest_check.sha256) {
             params.sha256 = true;
@@ -671,6 +768,8 @@ int main(int argc, const char ** argv) {
             params.sha1 = true;
         } else if (manifest_check.xxh64) {
             params.xxh64 = true;
+        } else if (manifest_check.xxh3) {
+            params.xxh3 = true;
         } else if (manifest_check.uuid) {
             params.uuid = true;
         }
@@ -680,7 +779,7 @@ int main(int argc, const char ** argv) {
     }
 
     // By default if no swich argument provided, assume xxh64
-    if (!params.xxh64 && !params.sha1 && !params.uuid && !params.sha256) {
+    if (!params.xxh64 && !params.xxh3 && !params.sha1 && !params.uuid && !params.sha256) {
         params.xxh64 = true;
     }

From 3ea543ffb1fc331da86ab7e212f02d2711f04c23 Mon Sep 17 00:00:00 2001
From: ixgbe <1113177880@qq.com>
Date: Sun, 4 Jan 2026 10:15:57 +0800
Subject: [PATCH 2/3] fix trailing whitespace

---
 examples/gguf-hash/deps/xxhash/xxhash.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/gguf-hash/deps/xxhash/xxhash.h b/examples/gguf-hash/deps/xxhash/xxhash.h
index 41f4e35b40..071c37b130 100755
--- a/examples/gguf-hash/deps/xxhash/xxhash.h
+++ b/examples/gguf-hash/deps/xxhash/xxhash.h
@@ -3876,7 +3876,7 @@ enum XXH_VECTOR_TYPE /* fake enum */ {
      */
     XXH_VSX = 5, /*!< VSX and ZVector for POWER8/z13 (64-bit) */
     XXH_SVE = 6, /*!< SVE for some ARMv8-A and ARMv9-A */
-    XXH_RVV = 7, 
+    XXH_RVV = 7,
 };
 /*!
  * @ingroup tuning

From 74dbb2eef2a2b000099e73ce7405a0a491e23967 Mon Sep 17 00:00:00 2001
From: ixgbe <1113177880@qq.com>
Date: Tue, 6 Jan 2026 14:50:53 +0800
Subject: [PATCH 3/3] Align RVV implementation with upstream xxHash

---
 examples/gguf-hash/deps/xxhash/xxhash.h | 208 +++++++++++++-----------
 1 file changed, 112 insertions(+), 96 deletions(-)

diff --git a/examples/gguf-hash/deps/xxhash/xxhash.h b/examples/gguf-hash/deps/xxhash/xxhash.h
index 071c37b130..7dba263ee4 100755
--- a/examples/gguf-hash/deps/xxhash/xxhash.h
+++ b/examples/gguf-hash/deps/xxhash/xxhash.h
@@ -5644,114 +5644,130 @@ XXH_mult32to64_add64(xxh_u64 lhs, xxh_u64 rhs, xxh_u64 acc)
 #endif
 
 #if (XXH_VECTOR == XXH_RVV)
+    #define XXH_CONCAT2(X, Y) X ## Y
+    #define XXH_CONCAT(X, Y) XXH_CONCAT2(X, Y)
+    #if ((defined(__GNUC__) && !defined(__clang__) && __GNUC__ < 13) || \
+         (defined(__clang__) && __clang_major__ < 16))
+        #define XXH_RVOP(op) op
+        #define XXH_RVCAST(op) XXH_CONCAT(vreinterpret_v_, op)
+    #else
+        #define XXH_RVOP(op) XXH_CONCAT(__riscv_, op)
+        #define XXH_RVCAST(op) XXH_CONCAT(__riscv_vreinterpret_v_, op)
+    #endif
 
 XXH_FORCE_INLINE void
-XXH3_accumulate_512_rvv(void* XXH_RESTRICT acc,
-                        const void* XXH_RESTRICT input,
-                        const void* XXH_RESTRICT secret)
+XXH3_accumulate_512_rvv( void* XXH_RESTRICT acc,
+                   const void* XXH_RESTRICT input,
+                   const void* XXH_RESTRICT secret)
 {
-    XXH_ALIGN(XXH_ACC_ALIGN) uint64_t aligned_input[XXH_ACC_NB];
-    XXH_ALIGN(XXH_ACC_ALIGN) uint64_t aligned_secret[XXH_ACC_NB];
-    XXH_ALIGN(XXH_ACC_ALIGN) uint64_t aligned_acc[XXH_ACC_NB];
+    XXH_ASSERT((((size_t)acc) & 63) == 0);
+    {
+        // Try to set vector length to 512 bits.
+        // If this length is unavailable, the maximum available length is used.
+        size_t vl = XXH_RVOP(vsetvl_e64m2)(8);
-
-    memcpy(aligned_input, input, XXH_STRIPE_LEN);
-    memcpy(aligned_secret, secret, XXH_STRIPE_LEN);
-    memcpy(aligned_acc, acc, XXH_STRIPE_LEN);
+
+        uint64_t* xacc = (uint64_t*) acc;
+        const uint64_t* xinput = (const uint64_t*) input;
+        const uint64_t* xsecret = (const uint64_t*) secret;
+        static const uint64_t swap_mask[16] = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14};
+        vuint64m2_t xswap_mask = XXH_RVOP(vle64_v_u64m2)(swap_mask, vl);
-
-    size_t vl = __riscv_vsetvl_e64m8(8);
-
-    vuint64m8_t data_vec = __riscv_vle64_v_u64m8(aligned_input, vl);
-    vuint64m8_t key_vec = __riscv_vle64_v_u64m8(aligned_secret, vl);
-    vuint64m8_t acc_vec = __riscv_vle64_v_u64m8(aligned_acc, vl);
-
-    vuint64m8_t data_key = __riscv_vxor_vv_u64m8(data_vec, key_vec, vl);
-
-    vuint64m8_t data_key_lo = __riscv_vand_vx_u64m8(data_key, 0xFFFFFFFFULL, vl);
-    vuint64m8_t data_key_hi = __riscv_vsrl_vx_u64m8(data_key, 32, vl);
-
-    vuint64m8_t idx = __riscv_vxor_vx_u64m8(__riscv_vid_v_u64m8(vl), 1, vl);
-    vuint64m8_t data_swap = __riscv_vrgather_vv_u64m8(data_vec, idx, vl);
-
-    acc_vec = __riscv_vadd_vv_u64m8(acc_vec, data_swap, vl);
-    acc_vec = __riscv_vmacc_vv_u64m8(acc_vec, data_key_lo, data_key_hi, vl);
-
-    __riscv_vse64_v_u64m8(aligned_acc, acc_vec, vl);
-    memcpy(acc, aligned_acc, XXH_STRIPE_LEN);
+
+        size_t i;
+        for (i = 0; i < XXH_STRIPE_LEN/8; i += vl) {
+            /* data_vec = xinput[i]; */
+            vuint64m2_t data_vec = XXH_RVCAST(u8m2_u64m2)(XXH_RVOP(vle8_v_u8m2)((const uint8_t*)(xinput + i), vl * 8));
+            /* key_vec = xsecret[i]; */
+            vuint64m2_t key_vec = XXH_RVCAST(u8m2_u64m2)(XXH_RVOP(vle8_v_u8m2)((const uint8_t*)(xsecret + i), vl * 8));
+            /* acc_vec = xacc[i]; */
+            vuint64m2_t acc_vec = XXH_RVOP(vle64_v_u64m2)(xacc + i, vl);
+            /* data_key = data_vec ^ key_vec; */
+            vuint64m2_t data_key = XXH_RVOP(vxor_vv_u64m2)(data_vec, key_vec, vl);
+            /* data_key_hi = data_key >> 32; */
+            vuint64m2_t data_key_hi = XXH_RVOP(vsrl_vx_u64m2)(data_key, 32, vl);
+            /* data_key_lo = data_key & 0xffffffff; */
+            vuint64m2_t data_key_lo = XXH_RVOP(vand_vx_u64m2)(data_key, 0xffffffff, vl);
+            /* swap high and low halves */
+            vuint64m2_t data_swap = XXH_RVOP(vrgather_vv_u64m2)(data_vec, xswap_mask, vl);
+            /* acc_vec += data_key_lo * data_key_hi; */
+            acc_vec = XXH_RVOP(vmacc_vv_u64m2)(acc_vec, data_key_lo, data_key_hi, vl);
+            /* acc_vec += data_swap; */
+            acc_vec = XXH_RVOP(vadd_vv_u64m2)(acc_vec, data_swap, vl);
+            /* xacc[i] = acc_vec; */
+            XXH_RVOP(vse64_v_u64m2)(xacc + i, acc_vec, vl);
+        }
+    }
 }
 
-XXH_FORCE_INLINE void
-XXH3_accumulate_rvv(xxh_u64* XXH_RESTRICT acc,
-                    const xxh_u8* XXH_RESTRICT input,
-                    const xxh_u8* XXH_RESTRICT secret,
-                    size_t nbStripes)
-{
-    if (nbStripes == 0) return;
-
-    XXH_ALIGN(XXH_ACC_ALIGN) uint64_t aligned_acc[XXH_ACC_NB];
-    memcpy(aligned_acc, acc, XXH_STRIPE_LEN);
-
-    const uint64_t* xinput = (const uint64_t*) (const void*) input;
-    const uint64_t* xsecret = (const uint64_t*) (const void*) secret;
-
-    XXH_ALIGN(XXH_ACC_ALIGN) uint64_t aligned_input[XXH_ACC_NB];
-    XXH_ALIGN(XXH_ACC_ALIGN) uint64_t aligned_secret[XXH_ACC_NB];
-
-    size_t vl = __riscv_vsetvl_e64m8(8);
-    vuint64m8_t vacc = __riscv_vle64_v_u64m8(aligned_acc, vl);
-    vuint64m8_t idx = __riscv_vxor_vx_u64m8(__riscv_vid_v_u64m8(vl), 1, vl);
-
-    do {
-        XXH_PREFETCH((const xxh_u8*)xinput + XXH_PREFETCH_DIST);
-
-        memcpy(aligned_input, xinput, XXH_STRIPE_LEN);
-        memcpy(aligned_secret, xsecret, XXH_STRIPE_LEN);
-
-        vuint64m8_t data_vec = __riscv_vle64_v_u64m8(aligned_input, vl);
-        vuint64m8_t key_vec = __riscv_vle64_v_u64m8(aligned_secret, vl);
-
-        vuint64m8_t data_key = __riscv_vxor_vv_u64m8(data_vec, key_vec, vl);
-        vuint64m8_t data_key_lo = __riscv_vand_vx_u64m8(data_key, 0xFFFFFFFFULL, vl);
-        vuint64m8_t data_key_hi = __riscv_vsrl_vx_u64m8(data_key, 32, vl);
-
-        vuint64m8_t data_swap = __riscv_vrgather_vv_u64m8(data_vec, idx, vl);
-
-        vacc = __riscv_vadd_vv_u64m8(vacc, data_swap, vl);
-        vacc = __riscv_vmacc_vv_u64m8(vacc, data_key_lo, data_key_hi, vl);
-
-        /* advance one 64-byte stripe: 8x u64 of input, 1x u64 of secret */
-        xinput += 8;
-        xsecret += 1;
-        nbStripes--;
-    } while (nbStripes > 0);
-
-    __riscv_vse64_v_u64m8(aligned_acc, vacc, vl);
-    memcpy(acc, aligned_acc, XXH_STRIPE_LEN);
-}
+XXH_FORCE_INLINE XXH3_ACCUMULATE_TEMPLATE(rvv)
 
 XXH_FORCE_INLINE void
 XXH3_scrambleAcc_rvv(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
 {
-    XXH_ALIGN(XXH_ACC_ALIGN) uint64_t aligned_acc[XXH_ACC_NB];
-    XXH_ALIGN(XXH_ACC_ALIGN) uint64_t aligned_secret[XXH_ACC_NB];
-
-    memcpy(aligned_acc, acc, XXH_STRIPE_LEN);
-    memcpy(aligned_secret, secret, XXH_STRIPE_LEN);
-
-    size_t vl = __riscv_vsetvl_e64m8(8);
-
-    vuint64m8_t acc_vec = __riscv_vle64_v_u64m8(aligned_acc, vl);
-    vuint64m8_t key_vec = __riscv_vle64_v_u64m8(aligned_secret, vl);
-
-    vuint64m8_t shifted = __riscv_vsrl_vx_u64m8(acc_vec, 47, vl);
-    vuint64m8_t data_vec = __riscv_vxor_vv_u64m8(acc_vec, shifted, vl);
-
-    vuint64m8_t data_key = __riscv_vxor_vv_u64m8(data_vec, key_vec, vl);
-
-    acc_vec = __riscv_vmul_vx_u64m8(data_key, XXH_PRIME32_1, vl);
-
-    __riscv_vse64_v_u64m8(aligned_acc, acc_vec, vl);
-    memcpy(acc, aligned_acc, XXH_STRIPE_LEN);
+    XXH_ASSERT((((size_t)acc) & 15) == 0);
+    {
+        size_t count = XXH_STRIPE_LEN/8;
+        uint64_t* xacc = (uint64_t*)acc;
+        const uint8_t* xsecret = (const uint8_t *)secret;
+        size_t vl;
+        for (; count > 0; count -= vl, xacc += vl, xsecret += vl*8) {
+            vl = XXH_RVOP(vsetvl_e64m2)(count);
+            {
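+                /* strip-mining: vsetvl clamps vl to the lanes that remain,
+                   so the final pass covers the tail without scalar cleanup */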
+                /* key_vec = xsecret[i]; */
+                vuint64m2_t key_vec = XXH_RVCAST(u8m2_u64m2)(XXH_RVOP(vle8_v_u8m2)(xsecret, vl*8));
+                /* acc_vec = xacc[i]; */
+                vuint64m2_t acc_vec = XXH_RVOP(vle64_v_u64m2)(xacc, vl);
+                /* acc_vec ^= acc_vec >> 47; */
+                vuint64m2_t vsrl = XXH_RVOP(vsrl_vx_u64m2)(acc_vec, 47, vl);
+                acc_vec = XXH_RVOP(vxor_vv_u64m2)(acc_vec, vsrl, vl);
+                /* acc_vec ^= key_vec; */
+                acc_vec = XXH_RVOP(vxor_vv_u64m2)(acc_vec, key_vec, vl);
+                /* acc_vec *= XXH_PRIME32_1; */
+                acc_vec = XXH_RVOP(vmul_vx_u64m2)(acc_vec, XXH_PRIME32_1, vl);
+                /* xacc[i] = acc_vec; */
+                XXH_RVOP(vse64_v_u64m2)(xacc, acc_vec, vl);
+            }
+        }
+    }
 }
 
+XXH_FORCE_INLINE void
+XXH3_initCustomSecret_rvv(void* XXH_RESTRICT customSecret, xxh_u64 seed64)
+{
+    XXH_STATIC_ASSERT(XXH_SEC_ALIGN >= 8);
+    XXH_ASSERT(((size_t)customSecret & 7) == 0);
+    (void)(&XXH_writeLE64);
+    {
+        size_t count = XXH_SECRET_DEFAULT_SIZE/8;
+        size_t vl;
+        size_t VLMAX = XXH_RVOP(vsetvlmax_e64m2)();
+        int64_t* cSecret = (int64_t*)customSecret;
+        const int64_t* kSecret = (const int64_t*)(const void*)XXH3_kSecret;
+
+#if __riscv_v_intrinsic >= 1000000
+        // ratified v1.0 intrinsics version
+        vbool32_t mneg = XXH_RVCAST(u8m1_b32)(
+            XXH_RVOP(vmv_v_x_u8m1)(0xaa, XXH_RVOP(vsetvlmax_e8m1)()));
+#else
+        // support pre-ratification intrinsics, which lack mask-to-vector casts
+        size_t vlmax = XXH_RVOP(vsetvlmax_e8m1)();
+        vbool32_t mneg = XXH_RVOP(vmseq_vx_u8mf4_b32)(
+            XXH_RVOP(vand_vx_u8mf4)(
+                XXH_RVOP(vid_v_u8mf4)(vlmax), 1, vlmax), 1, vlmax);
+#endif
+        vint64m2_t seed = XXH_RVOP(vmv_v_x_i64m2)((int64_t)seed64, VLMAX);
+        seed = XXH_RVOP(vneg_v_i64m2_mu)(mneg, seed, seed, VLMAX);
+
+        for (; count > 0; count -= vl, cSecret += vl, kSecret += vl) {
+            /* make sure vl=VLMAX until last iteration */
+            vl = XXH_RVOP(vsetvl_e64m2)(count < VLMAX ? count : VLMAX);
+            {
+                vint64m2_t src = XXH_RVOP(vle64_v_i64m2)(kSecret, vl);
+                vint64m2_t res = XXH_RVOP(vadd_vv_i64m2)(src, seed, vl);
+                XXH_RVOP(vse64_v_i64m2)(cSecret, res, vl);
+            }
+        }
+    }
+}
 #endif // XXH_VECTOR == XXH_RVV
 
 /*!
@@ -5955,7 +5971,7 @@ typedef void (*XXH3_f_initCustomSecret)(void* XXH_RESTRICT, xxh_u64);
 #define XXH3_accumulate_512 XXH3_accumulate_512_rvv
 #define XXH3_accumulate XXH3_accumulate_rvv
 #define XXH3_scrambleAcc XXH3_scrambleAcc_rvv
-#define XXH3_initCustomSecret XXH3_initCustomSecret_scalar
+#define XXH3_initCustomSecret XXH3_initCustomSecret_rvv
 
 #else /* scalar */
 #define XXH3_accumulate_512 XXH3_accumulate_512_scalar