From 32a347340acb98ce346a01198a86e3cb7d0e6245 Mon Sep 17 00:00:00 2001
From: ixgbe <1113177880@qq.com>
Date: Sun, 4 Jan 2026 10:04:04 +0800
Subject: [PATCH 1/3] gguf-hash: add RVV tensor hashing using xxh3

Signed-off-by: Wang Yang
---
 examples/gguf-hash/CMakeLists.txt       |   8 ++
 examples/gguf-hash/README.md            |   1 +
 examples/gguf-hash/deps/xxhash/xxhash.h | 135 +++++++++++++++++++++++-
 examples/gguf-hash/gguf-hash.cpp        | 111 +++++++++++++++++--
 4 files changed, 248 insertions(+), 7 deletions(-)
 mode change 100644 => 100755 examples/gguf-hash/deps/xxhash/xxhash.h
 mode change 100644 => 100755 examples/gguf-hash/gguf-hash.cpp

diff --git a/examples/gguf-hash/CMakeLists.txt b/examples/gguf-hash/CMakeLists.txt
index 15c5c68c6f..58f4fb833e 100644
--- a/examples/gguf-hash/CMakeLists.txt
+++ b/examples/gguf-hash/CMakeLists.txt
@@ -20,3 +20,11 @@ target_link_libraries(${TARGET} PRIVATE sha256)
 
 target_link_libraries(${TARGET} PRIVATE ggml ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_17)
+
+if (CMAKE_SYSTEM_PROCESSOR STREQUAL "riscv64")
+    if (GGML_RVV)
+        message(STATUS "gguf-hash: RISC-V Vector support enabled")
+        target_compile_options(xxhash PRIVATE -march=rv64gcv -mabi=lp64d)
+        target_compile_options(${TARGET} PRIVATE -march=rv64gcv -mabi=lp64d)
+    endif()
+endif()

diff --git a/examples/gguf-hash/README.md b/examples/gguf-hash/README.md
index 9871651e38..3c3db0f2ec 100644
--- a/examples/gguf-hash/README.md
+++ b/examples/gguf-hash/README.md
@@ -8,6 +8,7 @@ CLI to hash GGUF files to detect difference on a per model and per tensor level.
 - `--help`: display help message
 - `--xxh64`: use xhash 64bit hash mode (default)
 - `--sha1`: use sha1
+- `--xxh3`: use xxh3
 - `--uuid`: use uuid
 - `--sha256`: use sha256
 - `--all`: use all hash

diff --git a/examples/gguf-hash/deps/xxhash/xxhash.h b/examples/gguf-hash/deps/xxhash/xxhash.h
old mode 100644
new mode 100755
index c0fafe20d5..41f4e35b40
--- a/examples/gguf-hash/deps/xxhash/xxhash.h
+++ b/examples/gguf-hash/deps/xxhash/xxhash.h
@@ -1092,6 +1092,7 @@ XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH64_hashFromCanonical(XXH_NOESCAPE const
  * - WebAssembly SIMD128
  * - POWER8 VSX
  * - s390x ZVector
+ * - RVV
  * This can be controlled via the @ref XXH_VECTOR macro, but it automatically
  * selects the best version according to predefined macros. For the x86 family, an
  * automatic runtime dispatcher is included separately in @ref xxh_x86dispatch.c.
@@ -3751,6 +3752,8 @@ XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(XXH_NOESCAPE const XXH64_can
 #  include <immintrin.h>
 # elif defined(__SSE2__)
 #  include <emmintrin.h>
+# elif defined(__riscv_v)
+#  include <riscv_vector.h>
 # endif
 #endif
 
@@ -3873,6 +3876,7 @@ enum XXH_VECTOR_TYPE /* fake enum */ {
      */
     XXH_VSX = 5, /*!< VSX and ZVector for POWER8/z13 (64-bit) */
     XXH_SVE = 6, /*!< SVE for some ARMv8-A and ARMv9-A */
+    XXH_RVV = 7, 
 };
 /*!
  * @ingroup tuning
@@ -3895,6 +3899,7 @@ enum XXH_VECTOR_TYPE /* fake enum */ {
 # define XXH_NEON 4
 # define XXH_VSX 5
 # define XXH_SVE 6
+# define XXH_RVV 7
 #endif
 
 #ifndef XXH_VECTOR /* can be defined on command line */
@@ -3919,6 +3924,8 @@ enum XXH_VECTOR_TYPE /* fake enum */ {
      || (defined(__s390x__) && defined(__VEC__)) \
      && defined(__GNUC__) /* TODO: IBM XL */
 #   define XXH_VECTOR XXH_VSX
+# elif defined(__riscv_v)
+#   define XXH_VECTOR XXH_RVV
 # else
 #   define XXH_VECTOR XXH_SCALAR
 # endif
@@ -3935,6 +3942,12 @@ enum XXH_VECTOR_TYPE /* fake enum */ {
 # define XXH_VECTOR XXH_SCALAR
 #endif
 
+#if (XXH_VECTOR == XXH_RVV) && !defined(__riscv_v)
+# warning "__riscv_v isn't supported. Use SCALAR instead."
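+/* RVV was requested but the compiler does not define __riscv_v, so fall */
+/* back to the portable scalar path, mirroring the SVE fallback above.   */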
+# undef XXH_VECTOR
+# define XXH_VECTOR XXH_SCALAR
+#endif
+
 /*
  * Controls the alignment of the accumulator,
  * for compatibility with aligned vector loads, which are usually faster.
  */
@@ -3956,6 +3969,8 @@ enum XXH_VECTOR_TYPE /* fake enum */ {
 # define XXH_ACC_ALIGN 64
 # elif XXH_VECTOR == XXH_SVE /* sve */
 # define XXH_ACC_ALIGN 64
+# elif XXH_VECTOR == XXH_RVV /* rvv */
+# define XXH_ACC_ALIGN 64
 # endif
 #endif
 
@@ -3964,6 +3979,8 @@ enum XXH_VECTOR_TYPE /* fake enum */ {
 # define XXH_SEC_ALIGN XXH_ACC_ALIGN
 #elif XXH_VECTOR == XXH_SVE
 # define XXH_SEC_ALIGN XXH_ACC_ALIGN
+#elif XXH_VECTOR == XXH_RVV
+# define XXH_SEC_ALIGN XXH_ACC_ALIGN
 #else
 # define XXH_SEC_ALIGN 8
 #endif
@@ -5626,6 +5643,117 @@ XXH_mult32to64_add64(xxh_u64 lhs, xxh_u64 rhs, xxh_u64 acc)
 }
 #endif
 
+#if (XXH_VECTOR == XXH_RVV)
+
+XXH_FORCE_INLINE void
+XXH3_accumulate_512_rvv(void* XXH_RESTRICT acc,
+                        const void* XXH_RESTRICT input,
+                        const void* XXH_RESTRICT secret)
+{
+    XXH_ALIGN(XXH_ACC_ALIGN) uint64_t aligned_input[XXH_ACC_NB];
+    XXH_ALIGN(XXH_ACC_ALIGN) uint64_t aligned_secret[XXH_ACC_NB];
+    XXH_ALIGN(XXH_ACC_ALIGN) uint64_t aligned_acc[XXH_ACC_NB];
+
+    memcpy(aligned_input, input, XXH_STRIPE_LEN);
+    memcpy(aligned_secret, secret, XXH_STRIPE_LEN);
+    memcpy(aligned_acc, acc, XXH_STRIPE_LEN);
+
+    size_t vl = __riscv_vsetvl_e64m8(8);
+
+    vuint64m8_t data_vec = __riscv_vle64_v_u64m8(aligned_input, vl);
+    vuint64m8_t key_vec = __riscv_vle64_v_u64m8(aligned_secret, vl);
+    vuint64m8_t acc_vec = __riscv_vle64_v_u64m8(aligned_acc, vl);
+
+    vuint64m8_t data_key = __riscv_vxor_vv_u64m8(data_vec, key_vec, vl);
+
+    vuint64m8_t data_key_lo = __riscv_vand_vx_u64m8(data_key, 0xFFFFFFFFULL, vl);
+    vuint64m8_t data_key_hi = __riscv_vsrl_vx_u64m8(data_key, 32, vl);
+
+    vuint64m8_t idx = __riscv_vxor_vx_u64m8(__riscv_vid_v_u64m8(vl), 1, vl);
+    vuint64m8_t data_swap = __riscv_vrgather_vv_u64m8(data_vec, idx, vl);
+
+    acc_vec = __riscv_vadd_vv_u64m8(acc_vec, data_swap, vl);
+    acc_vec = __riscv_vmacc_vv_u64m8(acc_vec, data_key_lo, data_key_hi, vl);
+
+    __riscv_vse64_v_u64m8(aligned_acc, acc_vec, vl);
+    memcpy(acc, aligned_acc, XXH_STRIPE_LEN);
+}
+
+XXH_FORCE_INLINE void
+XXH3_accumulate_rvv(xxh_u64* XXH_RESTRICT acc,
+                    const xxh_u8* XXH_RESTRICT input,
+                    const xxh_u8* XXH_RESTRICT secret,
+                    size_t nbStripes)
+{
+    if (nbStripes == 0) return;
+
+    XXH_ALIGN(XXH_ACC_ALIGN) uint64_t aligned_acc[XXH_ACC_NB];
+    memcpy(aligned_acc, acc, XXH_STRIPE_LEN);
+
+    const uint64_t* xinput = (const uint64_t*) (const void*) input;
+    const uint64_t* xsecret = (const uint64_t*) (const void*) secret;
+
+    XXH_ALIGN(XXH_ACC_ALIGN) uint64_t aligned_input[XXH_ACC_NB];
+    XXH_ALIGN(XXH_ACC_ALIGN) uint64_t aligned_secret[XXH_ACC_NB];
+
+    size_t vl = __riscv_vsetvl_e64m8(8);
+    vuint64m8_t vacc = __riscv_vle64_v_u64m8(aligned_acc, vl);
+    vuint64m8_t idx = __riscv_vxor_vx_u64m8(__riscv_vid_v_u64m8(vl), 1, vl);
+
+    do {
+        XXH_PREFETCH((const xxh_u8*)xinput + XXH_PREFETCH_DIST);
+
+        memcpy(aligned_input, xinput, XXH_STRIPE_LEN);
+        memcpy(aligned_secret, xsecret, XXH_STRIPE_LEN);
+
+        vuint64m8_t data_vec = __riscv_vle64_v_u64m8(aligned_input, vl);
+        vuint64m8_t key_vec = __riscv_vle64_v_u64m8(aligned_secret, vl);
+
+        vuint64m8_t data_key = __riscv_vxor_vv_u64m8(data_vec, key_vec, vl);
+        vuint64m8_t data_key_lo = __riscv_vand_vx_u64m8(data_key, 0xFFFFFFFFULL, vl);
+        vuint64m8_t data_key_hi = __riscv_vsrl_vx_u64m8(data_key, 32, vl);
+
+        vuint64m8_t data_swap = __riscv_vrgather_vv_u64m8(data_vec, idx, vl);
+
+        vacc = __riscv_vadd_vv_u64m8(vacc, data_swap, vl);
+        vacc = __riscv_vmacc_vv_u64m8(vacc, data_key_lo, data_key_hi, vl);
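+
+        /* advance one 64-byte stripe: 8x u64 of input, 1x u64 of secret */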
+        xinput += 8;
+        xsecret += 1;
+        nbStripes--;
+    } while (nbStripes > 0);
+
+    __riscv_vse64_v_u64m8(aligned_acc, vacc, vl);
+    memcpy(acc, aligned_acc, XXH_STRIPE_LEN);
+}
+
+XXH_FORCE_INLINE void
+XXH3_scrambleAcc_rvv(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
+{
+    XXH_ALIGN(XXH_ACC_ALIGN) uint64_t aligned_acc[XXH_ACC_NB];
+    XXH_ALIGN(XXH_ACC_ALIGN) uint64_t aligned_secret[XXH_ACC_NB];
+
+    memcpy(aligned_acc, acc, XXH_STRIPE_LEN);
+    memcpy(aligned_secret, secret, XXH_STRIPE_LEN);
+
+    size_t vl = __riscv_vsetvl_e64m8(8);
+
+    vuint64m8_t acc_vec = __riscv_vle64_v_u64m8(aligned_acc, vl);
+    vuint64m8_t key_vec = __riscv_vle64_v_u64m8(aligned_secret, vl);
+
+    vuint64m8_t shifted = __riscv_vsrl_vx_u64m8(acc_vec, 47, vl);
+    vuint64m8_t data_vec = __riscv_vxor_vv_u64m8(acc_vec, shifted, vl);
+
+    vuint64m8_t data_key = __riscv_vxor_vv_u64m8(data_vec, key_vec, vl);
+
+    acc_vec = __riscv_vmul_vx_u64m8(data_key, XXH_PRIME32_1, vl);
+
+    __riscv_vse64_v_u64m8(aligned_acc, acc_vec, vl);
+    memcpy(acc, aligned_acc, XXH_STRIPE_LEN);
+}
+
+#endif // XXH_VECTOR == XXH_RVV
+
 /*!
  * @internal
  * @brief Scalar round for @ref XXH3_accumulate_512_scalar().
@@ -5823,8 +5951,13 @@ typedef void (*XXH3_f_initCustomSecret)(void* XXH_RESTRICT, xxh_u64);
 #define XXH3_scrambleAcc XXH3_scrambleAcc_scalar
 #define XXH3_initCustomSecret XXH3_initCustomSecret_scalar
 
-#else /* scalar */
+#elif (XXH_VECTOR == XXH_RVV)
+#define XXH3_accumulate_512 XXH3_accumulate_512_rvv
+#define XXH3_accumulate XXH3_accumulate_rvv
+#define XXH3_scrambleAcc XXH3_scrambleAcc_rvv
+#define XXH3_initCustomSecret XXH3_initCustomSecret_scalar
+
+#else /* scalar */
 #define XXH3_accumulate_512 XXH3_accumulate_512_scalar
 #define XXH3_accumulate XXH3_accumulate_scalar
 #define XXH3_scrambleAcc XXH3_scrambleAcc_scalar

diff --git a/examples/gguf-hash/gguf-hash.cpp b/examples/gguf-hash/gguf-hash.cpp
old mode 100644
new mode 100755
index 9523ec122f..0d63836d22
--- a/examples/gguf-hash/gguf-hash.cpp
+++ b/examples/gguf-hash/gguf-hash.cpp
@@ -33,6 +33,7 @@ extern "C" {
 
 #define HASH_TYPE_SHA256_STR "sha256"
 #define HASH_TYPE_SHA1_STR "sha1"
 #define HASH_TYPE_XXH64_STR "xxh64"
+#define HASH_TYPE_XXH3_STR "xxh3"
 #define HASH_TYPE_UUID_STR "uuid"
 
@@ -55,10 +56,11 @@ typedef enum {
 struct hash_params {
     std::string input;
 
-    bool xxh64 = false;
-    bool sha1 = false;
+    bool xxh64  = false;
+    bool xxh3   = false;
+    bool sha1   = false;
     bool sha256 = false;
-    bool uuid = false;
+    bool uuid   = false;
 
     bool no_layer = false;
 
@@ -68,6 +70,7 @@ struct hash_params {
 
 struct manifest_check_params {
     bool xxh64 = false;
+    bool xxh3 = false;
     bool sha1 = false;
     bool sha256 = false;
     bool uuid = false;
@@ -104,6 +107,7 @@ static void hash_print_usage(const char * executable) {
     printf("options:\n");
     printf("  -h, --help              show this help message and exit\n");
     printf("  --xxh64                 use xxh64 hash\n");
+    printf("  --xxh3                  use xxh3 hash\n");
     printf("  --sha1                  use sha1 hash\n");
     printf("  --sha256                use sha256 hash\n");
     printf("  --all                   use all hash\n");
@@ -136,6 +140,11 @@ static void hash_params_parse_ex(int argc, const char ** argv, hash_params & par
             params.xxh64 = true;
         }
 
+        if (arg == "--xxh3") {
+            arg_found = true;
+            params.xxh3 = true;
+        }
+
         if (arg == "--sha1") {
             arg_found = true;
             params.sha1 = true;
@@ -156,6 +165,7 @@ static void hash_params_parse_ex(int argc, const char ** argv, hash_params & par
             params.sha256 = true;
             params.sha1 = true;
             params.xxh64 = true;
+            params.xxh3 = true;
         }
 
         if (arg == "--no-layer") {
@@ -224,6 +234,8 @@ static bool manifest_type(const std::string & manifest_file, manifest_check_para
         manifest_check.sha1 = true;
     } else if (file_hash_type == HASH_TYPE_XXH64_STR) {
         manifest_check.xxh64 = true;
+    } else if (file_hash_type == HASH_TYPE_XXH3_STR) {
+        manifest_check.xxh3 = true;
     } else if (file_hash_type == HASH_TYPE_UUID_STR) {
         manifest_check.uuid = true;
     }
@@ -306,6 +318,19 @@ static hash_exit_code_t gguf_hash(const hash_params & hash_params) {
         }
     }
 
+    // xxh3 init
+    XXH3_state_t* xxh3_model_hash_state = NULL;
+    if (hash_params.xxh3) {
+        xxh3_model_hash_state = XXH3_createState();
+        if (xxh3_model_hash_state == NULL) {
+            abort();
+        }
+
+        if (XXH3_64bits_reset(xxh3_model_hash_state) == XXH_ERROR) {
+            abort();
+        }
+    }
+
     // sha1 init
     SHA1_CTX sha1_model_hash_ctx;
     if (hash_params.sha1) {
@@ -376,6 +401,44 @@ static hash_exit_code_t gguf_hash(const hash_params & hash_params) {
             if (XXH64_update(xxh64_model_hash_state, raw_data, n_bytes) == XXH_ERROR) abort();
         }
 
+        if (hash_params.xxh3) {
+
+            if (!hash_params.no_layer) {
+                // Per Layer Hash
+                XXH64_hash_t hash = XXH3_64bits(raw_data, n_bytes);
+
+                char hex_result[17];
+                for (int offset = 0; offset < 8; offset++) {
+                    unsigned int shift_bits_by = (8 * (8 - offset - 1));
+                    snprintf( ( hex_result + (2*offset)), sizeof(hex_result) - (2*offset), "%02x", (unsigned char) (hash >> shift_bits_by)&0xff);
+                }
+
+                if (hash_params.manifest_is_usable) {
+                    hash_manifest_result_t verify_result = manifest_verify(hash_params.manifest_file, HASH_TYPE_XXH3_STR, hex_result, tensor_layer_name);
+
+                    switch (verify_result) {
+                        case HASH_MANIFEST_NOT_FOUND:
+                            break;
+                        case HASH_MANIFEST_MISMATCH:
+                            tensor_layer_in_manifest = true;
+                            tensor_layer_has_mismatch = true;
+                            break;
+                        case HASH_MANIFEST_OK:
+                            tensor_layer_in_manifest = true;
+                            break;
+                    }
+
+                    printf("%-8s %-s %s - %s\n", HASH_TYPE_XXH3_STR, hex_result, tensor_layer_name.c_str(), hash_manifest_result_to_str(verify_result));
+                } else {
+                    printf("%-8s %-s %s\n", HASH_TYPE_XXH3_STR, hex_result, tensor_layer_name.c_str());
+                }
+            }
+
+            // Overall Model Hash
+            if (XXH3_64bits_update(xxh3_model_hash_state, raw_data, n_bytes) == XXH_ERROR) abort();
+        }
+
         if (hash_params.sha1) {
 
             if (!hash_params.no_layer) {
@@ -485,6 +548,36 @@ static hash_exit_code_t gguf_hash(const hash_params & hash_params) {
         }
     }
 
+    if (hash_params.xxh3) {
+        XXH64_hash_t const hash = XXH3_64bits_digest(xxh3_model_hash_state);
+
+        char hex_result[17];
+        for (int offset = 0; offset < 8; offset++) {
+            unsigned int shift_bits_by = (8 * (8 - offset - 1));
+            snprintf( ( hex_result + (2*offset)), sizeof(hex_result) - (2*offset), "%02x", (unsigned char) (hash >> shift_bits_by)&0xff);
+        }
+
+        if (hash_params.manifest_is_usable) {
+            hash_manifest_result_t verify_result = manifest_verify(hash_params.manifest_file, HASH_TYPE_XXH3_STR, hex_result, fname);
+
+            switch (verify_result) {
+                case HASH_MANIFEST_NOT_FOUND:
+                    break;
+                case HASH_MANIFEST_MISMATCH:
+                    model_in_manifest = true;
+                    model_has_mismatch = true;
+                    break;
+                case HASH_MANIFEST_OK:
+                    model_in_manifest = true;
+                    break;
+            }
+
+            printf("%-8s %-s %s - %s\n", HASH_TYPE_XXH3_STR, hex_result, fname.c_str(), hash_manifest_result_to_str(verify_result));
+        } else {
+            printf("%-8s %-s %s\n", HASH_TYPE_XXH3_STR, hex_result, fname.c_str());
+        }
+    }
+
     if (hash_params.sha1) {
         unsigned char result[21];
         SHA1Final(result, &sha1_model_hash_ctx);
@@ -636,7 +729,7 @@ int main(int argc, const char ** argv) {
         return HASH_EXIT_MANIFEST_FILE_ERROR;
     }
 
-    if (!manifest_check.sha256 && !manifest_check.sha1 && !manifest_check.xxh64 && !manifest_check.uuid) {
+    if (!manifest_check.sha256 && !manifest_check.sha1 && !manifest_check.xxh64 && !manifest_check.xxh3 && !manifest_check.uuid) {
         printf("ERROR manifest does not have any known hash format in %s", params.manifest_file.c_str());
         return HASH_EXIT_MANIFEST_UNKNOWN_HASH;
     }
@@ -655,6 +748,10 @@ int main(int argc, const char ** argv) {
         printf(" xxh64");
     }
 
+    if (manifest_check.xxh3) {
+        printf(" xxh3");
+    }
+
     if (manifest_check.uuid) {
         printf(" uuid");
     }
@@ -663,7 +760,7 @@ int main(int argc, const char ** argv) {
 
     // Autoselect the highest security hash if manifest is provided but
     // the user has not specifically defined the hash they care about
-    if (!params.xxh64 && !params.sha1 && !params.uuid && !params.sha256) {
+    if (!params.xxh64 && !params.xxh3 && !params.sha1 && !params.uuid && !params.sha256) {
         // User has not selected a specific value, pick most secure hash
         if (manifest_check.sha256) {
             params.sha256 = true;
@@ -671,6 +768,8 @@ int main(int argc, const char ** argv) {
             params.sha1 = true;
         } else if (manifest_check.xxh64) {
             params.xxh64 = true;
+        } else if (manifest_check.xxh3) {
+            params.xxh3 = true;
         } else if (manifest_check.uuid) {
             params.uuid = true;
         }
@@ -680,7 +779,7 @@ int main(int argc, const char ** argv) {
     }
 
     // By default if no swich argument provided, assume xxh64
-    if (!params.xxh64 && !params.sha1 && !params.uuid && !params.sha256) {
+    if (!params.xxh64 && !params.xxh3 && !params.sha1 && !params.uuid && !params.sha256) {
         params.xxh64 = true;
     }

From 3ea543ffb1fc331da86ab7e212f02d2711f04c23 Mon Sep 17 00:00:00 2001
From: ixgbe <1113177880@qq.com>
Date: Sun, 4 Jan 2026 10:15:57 +0800
Subject: [PATCH 2/3] fix trailing whitespace

---
 examples/gguf-hash/deps/xxhash/xxhash.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/gguf-hash/deps/xxhash/xxhash.h b/examples/gguf-hash/deps/xxhash/xxhash.h
index 41f4e35b40..071c37b130 100755
--- a/examples/gguf-hash/deps/xxhash/xxhash.h
+++ b/examples/gguf-hash/deps/xxhash/xxhash.h
@@ -3876,7 +3876,7 @@ enum XXH_VECTOR_TYPE /* fake enum */ {
      */
     XXH_VSX = 5, /*!< VSX and ZVector for POWER8/z13 (64-bit) */
     XXH_SVE = 6, /*!< SVE for some ARMv8-A and ARMv9-A */
-    XXH_RVV = 7, 
+    XXH_RVV = 7,
 };
 /*!
  * @ingroup tuning

From 74dbb2eef2a2b000099e73ce7405a0a491e23967 Mon Sep 17 00:00:00 2001
From: ixgbe <1113177880@qq.com>
Date: Tue, 6 Jan 2026 14:50:53 +0800
Subject: [PATCH 3/3] Align RVV implementation with upstream xxHash

---
 examples/gguf-hash/deps/xxhash/xxhash.h | 208 +++++++++++++-----------
 1 file changed, 112 insertions(+), 96 deletions(-)

diff --git a/examples/gguf-hash/deps/xxhash/xxhash.h b/examples/gguf-hash/deps/xxhash/xxhash.h
index 071c37b130..7dba263ee4 100755
--- a/examples/gguf-hash/deps/xxhash/xxhash.h
+++ b/examples/gguf-hash/deps/xxhash/xxhash.h
@@ -5644,114 +5644,130 @@ XXH_mult32to64_add64(xxh_u64 lhs, xxh_u64 rhs, xxh_u64 acc)
 #endif
 
 #if (XXH_VECTOR == XXH_RVV)
+    #define XXH_CONCAT2(X, Y) X ## Y
+    #define XXH_CONCAT(X, Y) XXH_CONCAT2(X, Y)
+    #if ((defined(__GNUC__) && !defined(__clang__) && __GNUC__ < 13) || \
+         (defined(__clang__) && __clang_major__ < 16))
+        #define XXH_RVOP(op) op
+        #define XXH_RVCAST(op) XXH_CONCAT(vreinterpret_v_, op)
+    #else
+        #define XXH_RVOP(op) XXH_CONCAT(__riscv_, op)
+        #define XXH_RVCAST(op) XXH_CONCAT(__riscv_vreinterpret_v_, op)
+    #endif
 
 XXH_FORCE_INLINE void
-XXH3_accumulate_512_rvv(void* XXH_RESTRICT acc,
-                        const void* XXH_RESTRICT input,
-                        const void* XXH_RESTRICT secret)
+XXH3_accumulate_512_rvv( void* XXH_RESTRICT acc,
+                   const void* XXH_RESTRICT input,
+                   const void* XXH_RESTRICT secret)
 {
-    XXH_ALIGN(XXH_ACC_ALIGN) uint64_t aligned_input[XXH_ACC_NB];
-    XXH_ALIGN(XXH_ACC_ALIGN) uint64_t aligned_secret[XXH_ACC_NB];
-    XXH_ALIGN(XXH_ACC_ALIGN) uint64_t aligned_acc[XXH_ACC_NB];
+    XXH_ASSERT((((size_t)acc) & 63) == 0);
+    {
+        // Try to set vector length to 512 bits.
+        // If this length is unavailable, the maximum available length is used.
+        size_t vl = XXH_RVOP(vsetvl_e64m2)(8);
-
-    memcpy(aligned_input, input, XXH_STRIPE_LEN);
-    memcpy(aligned_secret, secret, XXH_STRIPE_LEN);
-    memcpy(aligned_acc, acc, XXH_STRIPE_LEN);
+
+        uint64_t* xacc = (uint64_t*) acc;
+        const uint64_t* xinput = (const uint64_t*) input;
+        const uint64_t* xsecret = (const uint64_t*) secret;
+        static const uint64_t swap_mask[16] = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14};
+        vuint64m2_t xswap_mask = XXH_RVOP(vle64_v_u64m2)(swap_mask, vl);
-
-    size_t vl = __riscv_vsetvl_e64m8(8);
-
-    vuint64m8_t data_vec = __riscv_vle64_v_u64m8(aligned_input, vl);
-    vuint64m8_t key_vec = __riscv_vle64_v_u64m8(aligned_secret, vl);
-    vuint64m8_t acc_vec = __riscv_vle64_v_u64m8(aligned_acc, vl);
-
-    vuint64m8_t data_key = __riscv_vxor_vv_u64m8(data_vec, key_vec, vl);
-
-    vuint64m8_t data_key_lo = __riscv_vand_vx_u64m8(data_key, 0xFFFFFFFFULL, vl);
-    vuint64m8_t data_key_hi = __riscv_vsrl_vx_u64m8(data_key, 32, vl);
-
-    vuint64m8_t idx = __riscv_vxor_vx_u64m8(__riscv_vid_v_u64m8(vl), 1, vl);
-    vuint64m8_t data_swap = __riscv_vrgather_vv_u64m8(data_vec, idx, vl);
-
-    acc_vec = __riscv_vadd_vv_u64m8(acc_vec, data_swap, vl);
-    acc_vec = __riscv_vmacc_vv_u64m8(acc_vec, data_key_lo, data_key_hi, vl);
-
-    __riscv_vse64_v_u64m8(aligned_acc, acc_vec, vl);
-    memcpy(acc, aligned_acc, XXH_STRIPE_LEN);
+
+        size_t i;
+        for (i = 0; i < XXH_STRIPE_LEN/8; i += vl) {
+            /* data_vec = xinput[i]; */
+            vuint64m2_t data_vec = XXH_RVCAST(u8m2_u64m2)(XXH_RVOP(vle8_v_u8m2)((const uint8_t*)(xinput + i), vl * 8));
+            /* key_vec = xsecret[i]; */
+            vuint64m2_t key_vec = XXH_RVCAST(u8m2_u64m2)(XXH_RVOP(vle8_v_u8m2)((const uint8_t*)(xsecret + i), vl * 8));
+            /* acc_vec = xacc[i]; */
+            vuint64m2_t acc_vec = XXH_RVOP(vle64_v_u64m2)(xacc + i, vl);
+            /* data_key = data_vec ^ key_vec; */
+            vuint64m2_t data_key = XXH_RVOP(vxor_vv_u64m2)(data_vec, key_vec, vl);
+            /* data_key_hi = data_key >> 32; */
+            vuint64m2_t data_key_hi = XXH_RVOP(vsrl_vx_u64m2)(data_key, 32, vl);
+            /* data_key_lo = data_key & 0xffffffff; */
+            vuint64m2_t data_key_lo = XXH_RVOP(vand_vx_u64m2)(data_key, 0xffffffff, vl);
+            /* swap high and low halves */
+            vuint64m2_t data_swap = XXH_RVOP(vrgather_vv_u64m2)(data_vec, xswap_mask, vl);
+            /* acc_vec += data_key_lo * data_key_hi; */
+            acc_vec = XXH_RVOP(vmacc_vv_u64m2)(acc_vec, data_key_lo, data_key_hi, vl);
+            /* acc_vec += data_swap; */
+            acc_vec = XXH_RVOP(vadd_vv_u64m2)(acc_vec, data_swap, vl);
+            /* xacc[i] = acc_vec; */
+            XXH_RVOP(vse64_v_u64m2)(xacc + i, acc_vec, vl);
+        }
+    }
 }
 
-XXH_FORCE_INLINE void
-XXH3_accumulate_rvv(xxh_u64* XXH_RESTRICT acc,
-                    const xxh_u8* XXH_RESTRICT input,
-                    const xxh_u8* XXH_RESTRICT secret,
-                    size_t nbStripes)
-{
-    if (nbStripes == 0) return;
-
-    XXH_ALIGN(XXH_ACC_ALIGN) uint64_t aligned_acc[XXH_ACC_NB];
-    memcpy(aligned_acc, acc, XXH_STRIPE_LEN);
-
-    const uint64_t* xinput = (const uint64_t*) (const void*) input;
-    const uint64_t* xsecret = (const uint64_t*) (const void*) secret;
-
-    XXH_ALIGN(XXH_ACC_ALIGN) uint64_t aligned_input[XXH_ACC_NB];
-    XXH_ALIGN(XXH_ACC_ALIGN) uint64_t aligned_secret[XXH_ACC_NB];
-
-    size_t vl = __riscv_vsetvl_e64m8(8);
-    vuint64m8_t vacc = __riscv_vle64_v_u64m8(aligned_acc, vl);
-    vuint64m8_t idx = __riscv_vxor_vx_u64m8(__riscv_vid_v_u64m8(vl), 1, vl);
-
-    do {
-        XXH_PREFETCH((const xxh_u8*)xinput + XXH_PREFETCH_DIST);
-
-        memcpy(aligned_input, xinput, XXH_STRIPE_LEN);
-        memcpy(aligned_secret, xsecret, XXH_STRIPE_LEN);
-
-        vuint64m8_t data_vec = __riscv_vle64_v_u64m8(aligned_input, vl);
-        vuint64m8_t key_vec = __riscv_vle64_v_u64m8(aligned_secret, vl);
-
-        vuint64m8_t data_key = __riscv_vxor_vv_u64m8(data_vec, key_vec, vl);
-        vuint64m8_t data_key_lo = __riscv_vand_vx_u64m8(data_key, 0xFFFFFFFFULL, vl);
-        vuint64m8_t data_key_hi = __riscv_vsrl_vx_u64m8(data_key, 32, vl);
-
-        vuint64m8_t data_swap = __riscv_vrgather_vv_u64m8(data_vec, idx, vl);
-
-        vacc = __riscv_vadd_vv_u64m8(vacc, data_swap, vl);
-        vacc = __riscv_vmacc_vv_u64m8(vacc, data_key_lo, data_key_hi, vl);
-
-        /* advance one 64-byte stripe: 8x u64 of input, 1x u64 of secret */
-        xinput += 8;
-        xsecret += 1;
-        nbStripes--;
-    } while (nbStripes > 0);
-
-    __riscv_vse64_v_u64m8(aligned_acc, vacc, vl);
-    memcpy(acc, aligned_acc, XXH_STRIPE_LEN);
-}
+XXH_FORCE_INLINE XXH3_ACCUMULATE_TEMPLATE(rvv)
 
 XXH_FORCE_INLINE void
 XXH3_scrambleAcc_rvv(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
 {
-    XXH_ALIGN(XXH_ACC_ALIGN) uint64_t aligned_acc[XXH_ACC_NB];
-    XXH_ALIGN(XXH_ACC_ALIGN) uint64_t aligned_secret[XXH_ACC_NB];
-
-    memcpy(aligned_acc, acc, XXH_STRIPE_LEN);
-    memcpy(aligned_secret, secret, XXH_STRIPE_LEN);
-
-    size_t vl = __riscv_vsetvl_e64m8(8);
-
-    vuint64m8_t acc_vec = __riscv_vle64_v_u64m8(aligned_acc, vl);
-    vuint64m8_t key_vec = __riscv_vle64_v_u64m8(aligned_secret, vl);
-
-    vuint64m8_t shifted = __riscv_vsrl_vx_u64m8(acc_vec, 47, vl);
-    vuint64m8_t data_vec = __riscv_vxor_vv_u64m8(acc_vec, shifted, vl);
-
-    vuint64m8_t data_key = __riscv_vxor_vv_u64m8(data_vec, key_vec, vl);
-
-    acc_vec = __riscv_vmul_vx_u64m8(data_key, XXH_PRIME32_1, vl);
-
-    __riscv_vse64_v_u64m8(aligned_acc, acc_vec, vl);
-    memcpy(acc, aligned_acc, XXH_STRIPE_LEN);
+    XXH_ASSERT((((size_t)acc) & 15) == 0);
+    {
+        size_t count = XXH_STRIPE_LEN/8;
+        uint64_t* xacc = (uint64_t*)acc;
+        const uint8_t* xsecret = (const uint8_t *)secret;
+        size_t vl;
+        for (; count > 0; count -= vl, xacc += vl, xsecret += vl*8) {
+            vl = XXH_RVOP(vsetvl_e64m2)(count);
+            {
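+                /* strip-mining: vsetvl clamps vl to the lanes that remain,
+                   so the final pass covers the tail without scalar cleanup */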
+                /* key_vec = xsecret[i]; */
+                vuint64m2_t key_vec = XXH_RVCAST(u8m2_u64m2)(XXH_RVOP(vle8_v_u8m2)(xsecret, vl*8));
+                /* acc_vec = xacc[i]; */
+                vuint64m2_t acc_vec = XXH_RVOP(vle64_v_u64m2)(xacc, vl);
+                /* acc_vec ^= acc_vec >> 47; */
+                vuint64m2_t vsrl = XXH_RVOP(vsrl_vx_u64m2)(acc_vec, 47, vl);
+                acc_vec = XXH_RVOP(vxor_vv_u64m2)(acc_vec, vsrl, vl);
+                /* acc_vec ^= key_vec; */
+                acc_vec = XXH_RVOP(vxor_vv_u64m2)(acc_vec, key_vec, vl);
+                /* acc_vec *= XXH_PRIME32_1; */
+                acc_vec = XXH_RVOP(vmul_vx_u64m2)(acc_vec, XXH_PRIME32_1, vl);
+                /* xacc[i] = acc_vec; */
+                XXH_RVOP(vse64_v_u64m2)(xacc, acc_vec, vl);
+            }
+        }
+    }
 }
 
+XXH_FORCE_INLINE void
+XXH3_initCustomSecret_rvv(void* XXH_RESTRICT customSecret, xxh_u64 seed64)
+{
+    XXH_STATIC_ASSERT(XXH_SEC_ALIGN >= 8);
+    XXH_ASSERT(((size_t)customSecret & 7) == 0);
+    (void)(&XXH_writeLE64);
+    {
+        size_t count = XXH_SECRET_DEFAULT_SIZE/8;
+        size_t vl;
+        size_t VLMAX = XXH_RVOP(vsetvlmax_e64m2)();
+        int64_t* cSecret = (int64_t*)customSecret;
+        const int64_t* kSecret = (const int64_t*)(const void*)XXH3_kSecret;
+
+#if __riscv_v_intrinsic >= 1000000
+        // ratified v1.0 intrinsics version
+        vbool32_t mneg = XXH_RVCAST(u8m1_b32)(
+            XXH_RVOP(vmv_v_x_u8m1)(0xaa, XXH_RVOP(vsetvlmax_e8m1)()));
+#else
+        // support pre-ratification intrinsics, which lack mask-to-vector casts
+        size_t vlmax = XXH_RVOP(vsetvlmax_e8m1)();
+        vbool32_t mneg = XXH_RVOP(vmseq_vx_u8mf4_b32)(
+            XXH_RVOP(vand_vx_u8mf4)(
+                XXH_RVOP(vid_v_u8mf4)(vlmax), 1, vlmax), 1, vlmax);
+#endif
+        vint64m2_t seed = XXH_RVOP(vmv_v_x_i64m2)((int64_t)seed64, VLMAX);
+        seed = XXH_RVOP(vneg_v_i64m2_mu)(mneg, seed, seed, VLMAX);
+
+        for (; count > 0; count -= vl, cSecret += vl, kSecret += vl) {
+            /* make sure vl=VLMAX until last iteration */
+            vl = XXH_RVOP(vsetvl_e64m2)(count < VLMAX ? count : VLMAX);
+            {
+                vint64m2_t src = XXH_RVOP(vle64_v_i64m2)(kSecret, vl);
+                vint64m2_t res = XXH_RVOP(vadd_vv_i64m2)(src, seed, vl);
+                XXH_RVOP(vse64_v_i64m2)(cSecret, res, vl);
+            }
+        }
+    }
+}
 #endif // XXH_VECTOR == XXH_RVV
 
 /*!
@@ -5955,7 +5971,7 @@ typedef void (*XXH3_f_initCustomSecret)(void* XXH_RESTRICT, xxh_u64);
 #define XXH3_accumulate_512 XXH3_accumulate_512_rvv
 #define XXH3_accumulate XXH3_accumulate_rvv
 #define XXH3_scrambleAcc XXH3_scrambleAcc_rvv
-#define XXH3_initCustomSecret XXH3_initCustomSecret_scalar
+#define XXH3_initCustomSecret XXH3_initCustomSecret_rvv
 
 #else /* scalar */
 #define XXH3_accumulate_512 XXH3_accumulate_512_scalar