From 41bee681d97b683da09f0c223964bd2c6e4c1a72 Mon Sep 17 00:00:00 2001 From: taimur-10x Date: Fri, 5 Dec 2025 16:21:06 +0500 Subject: [PATCH 1/2] ggml-cpu: add repack GEMM and GEMV for floating-point --- ggml/src/ggml-cpu/arch-fallback.h | 30 ++ ggml/src/ggml-cpu/arch/riscv/repack.cpp | 274 ++++++++++++ ggml/src/ggml-cpu/repack.cpp | 568 +++++++++++++++++++++--- ggml/src/ggml-cpu/repack.h | 61 +++ 4 files changed, 883 insertions(+), 50 deletions(-) diff --git a/ggml/src/ggml-cpu/arch-fallback.h b/ggml/src/ggml-cpu/arch-fallback.h index 41da829315..019fd056c8 100644 --- a/ggml/src/ggml-cpu/arch-fallback.h +++ b/ggml/src/ggml-cpu/arch-fallback.h @@ -37,6 +37,8 @@ #define ggml_quantize_mat_q8_0_4x8_generic ggml_quantize_mat_q8_0_4x8 #define ggml_quantize_mat_q8_K_4x4_generic ggml_quantize_mat_q8_K_4x4 #define ggml_quantize_mat_q8_K_4x8_generic ggml_quantize_mat_q8_K_4x8 +#define ggml_repack_mat_f16_7x1_generic ggml_repack_mat_f16_7x1 +#define ggml_repack_mat_f32_7x1_generic ggml_repack_mat_f32_7x1 #define ggml_gemv_q4_0_4x4_q8_0_generic ggml_gemv_q4_0_4x4_q8_0 #define ggml_gemv_q4_0_4x8_q8_0_generic ggml_gemv_q4_0_4x8_q8_0 #define ggml_gemv_q4_0_8x8_q8_0_generic ggml_gemv_q4_0_8x8_q8_0 @@ -76,15 +78,33 @@ #define ggml_gemv_iq4_nl_8x8_q8_0_generic ggml_gemv_iq4_nl_8x8_q8_0 #define ggml_gemv_mxfp4_8x8_q8_0_generic ggml_gemv_mxfp4_8x8_q8_0 #define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K +#define ggml_gemv_f16_1x16_f16_generic ggml_gemv_f16_1x16_f16 +#define ggml_gemv_f16_1x32_f16_generic ggml_gemv_f16_1x32_f16 +#define ggml_gemv_f16_1x64_f16_generic ggml_gemv_f16_1x64_f16 +#define ggml_gemv_f16_1x128_f16_generic ggml_gemv_f16_1x128_f16 +#define ggml_gemv_f32_1x16_f32_generic ggml_gemv_f32_1x16_f32 +#define ggml_gemv_f32_1x32_f32_generic ggml_gemv_f32_1x32_f32 +#define ggml_gemv_f32_1x64_f32_generic ggml_gemv_f32_1x64_f32 +#define ggml_gemv_f32_1x128_f32_generic ggml_gemv_f32_1x128_f32 #define ggml_gemm_iq4_nl_8x8_q8_0_generic ggml_gemm_iq4_nl_8x8_q8_0 #define ggml_gemm_mxfp4_8x8_q8_0_generic ggml_gemm_mxfp4_8x8_q8_0 #define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K +#define ggml_gemm_f16_7x1x16_f16_generic ggml_gemm_f16_7x1x16_f16 +#define ggml_gemm_f16_7x1x32_f16_generic ggml_gemm_f16_7x1x32_f16 +#define ggml_gemm_f16_7x1x64_f16_generic ggml_gemm_f16_7x1x64_f16 +#define ggml_gemm_f16_7x1x128_f16_generic ggml_gemm_f16_7x1x128_f16 +#define ggml_gemm_f32_7x1x16_f32_generic ggml_gemm_f32_7x1x16_f32 +#define ggml_gemm_f32_7x1x32_f32_generic ggml_gemm_f32_7x1x32_f32 +#define ggml_gemm_f32_7x1x64_f32_generic ggml_gemm_f32_7x1x64_f32 +#define ggml_gemm_f32_7x1x128_f32_generic ggml_gemm_f32_7x1x128_f32 #elif defined(__x86_64__) || defined(__i386__) || defined(_M_IX86) || defined(_M_X64) // quants.c #define ggml_vec_dot_nvfp4_q8_0_generic ggml_vec_dot_nvfp4_q8_0 // repack.cpp #define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4 #define ggml_quantize_mat_q8_K_4x4_generic ggml_quantize_mat_q8_K_4x4 +#define ggml_repack_mat_f16_7x1_generic ggml_repack_mat_f16_7x1 +#define ggml_repack_mat_f32_7x1_generic ggml_repack_mat_f32_7x1 #define ggml_gemv_q4_0_4x4_q8_0_generic ggml_gemv_q4_0_4x4_q8_0 #define ggml_gemv_q4_0_4x8_q8_0_generic ggml_gemv_q4_0_4x8_q8_0 #define ggml_gemv_q4_K_8x4_q8_K_generic ggml_gemv_q4_K_8x4_q8_K @@ -120,6 +140,8 @@ #define ggml_quantize_mat_q8_0_4x8_generic ggml_quantize_mat_q8_0_4x8 #define ggml_quantize_mat_q8_K_4x4_generic ggml_quantize_mat_q8_K_4x4 #define ggml_quantize_mat_q8_K_4x8_generic ggml_quantize_mat_q8_K_4x8 +#define ggml_repack_mat_f16_7x1_generic ggml_repack_mat_f16_7x1 +#define ggml_repack_mat_f32_7x1_generic ggml_repack_mat_f32_7x1 #define ggml_gemv_q4_0_4x4_q8_0_generic ggml_gemv_q4_0_4x4_q8_0 #define ggml_gemv_q4_0_4x8_q8_0_generic ggml_gemv_q4_0_4x8_q8_0 #define ggml_gemv_q4_0_8x8_q8_0_generic ggml_gemv_q4_0_8x8_q8_0 @@ -165,6 +187,8 @@ #define ggml_quantize_mat_q8_0_4x8_generic ggml_quantize_mat_q8_0_4x8 #define ggml_quantize_mat_q8_K_4x4_generic ggml_quantize_mat_q8_K_4x4 #define ggml_quantize_mat_q8_K_4x8_generic ggml_quantize_mat_q8_K_4x8 +#define ggml_repack_mat_f16_7x1_generic ggml_repack_mat_f16_7x1 +#define ggml_repack_mat_f32_7x1_generic ggml_repack_mat_f32_7x1 #define ggml_gemv_q4_0_4x4_q8_0_generic ggml_gemv_q4_0_4x4_q8_0 #define ggml_gemv_q4_0_4x8_q8_0_generic ggml_gemv_q4_0_4x8_q8_0 #define ggml_gemv_q4_0_8x8_q8_0_generic ggml_gemv_q4_0_8x8_q8_0 @@ -206,6 +230,8 @@ #define ggml_quantize_mat_q8_K_4x1_generic ggml_quantize_mat_q8_K_4x1 #define ggml_quantize_mat_q8_K_4x4_generic ggml_quantize_mat_q8_K_4x4 #define ggml_quantize_mat_q8_K_4x8_generic ggml_quantize_mat_q8_K_4x8 +#define ggml_repack_mat_f16_7x1_generic ggml_repack_mat_f16_7x1 +#define ggml_repack_mat_f32_7x1_generic ggml_repack_mat_f32_7x1 #define ggml_gemv_q4_0_4x4_q8_0_generic ggml_gemv_q4_0_4x4_q8_0 #define ggml_gemv_q4_0_4x8_q8_0_generic ggml_gemv_q4_0_4x8_q8_0 #define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K @@ -255,6 +281,8 @@ #define ggml_quantize_mat_q8_0_4x8_generic ggml_quantize_mat_q8_0_4x8 #define ggml_quantize_mat_q8_K_4x4_generic ggml_quantize_mat_q8_K_4x4 #define ggml_quantize_mat_q8_K_4x8_generic ggml_quantize_mat_q8_K_4x8 +#define ggml_repack_mat_f16_7x1_generic ggml_repack_mat_f16_7x1 +#define ggml_repack_mat_f32_7x1_generic ggml_repack_mat_f32_7x1 #define ggml_gemv_q4_0_4x4_q8_0_generic ggml_gemv_q4_0_4x4_q8_0 #define ggml_gemv_q4_0_4x8_q8_0_generic ggml_gemv_q4_0_4x8_q8_0 #define ggml_gemv_q4_0_8x8_q8_0_generic ggml_gemv_q4_0_8x8_q8_0 @@ -308,6 +336,8 @@ #define ggml_quantize_mat_q8_0_4x8_generic ggml_quantize_mat_q8_0_4x8 #define ggml_quantize_mat_q8_K_4x4_generic ggml_quantize_mat_q8_K_4x4 #define ggml_quantize_mat_q8_K_4x8_generic ggml_quantize_mat_q8_K_4x8 +#define ggml_repack_mat_f16_7x1_generic ggml_repack_mat_f16_7x1 +#define ggml_repack_mat_f32_7x1_generic ggml_repack_mat_f32_7x1 #define ggml_gemv_q4_0_4x4_q8_0_generic ggml_gemv_q4_0_4x4_q8_0 #define ggml_gemv_q4_0_4x8_q8_0_generic ggml_gemv_q4_0_4x8_q8_0 #define ggml_gemv_q4_0_8x8_q8_0_generic ggml_gemv_q4_0_8x8_q8_0 diff --git a/ggml/src/ggml-cpu/arch/riscv/repack.cpp b/ggml/src/ggml-cpu/arch/riscv/repack.cpp index c37488cae5..6058c9b416 100644 --- a/ggml/src/ggml-cpu/arch/riscv/repack.cpp +++ b/ggml/src/ggml-cpu/arch/riscv/repack.cpp @@ -1701,3 +1701,277 @@ void ggml_gemm_q2_K_16x1_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v } } #endif + +template +static inline void ggml_gemv_f16_1xM_f16(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { + const int nb = n / 1; + + assert (nr == 1); + assert(n % 1 == 0); + assert(nc % ncols_interleaved == 0); + + const _Float16 * a_ptr = (const _Float16 *) vy; + for (int x = 0; x < nc / ncols_interleaved; x++) { + const block_f16 * b_ptr = (const block_f16 *) vx + (x * nb); + + // Accumulators + vfloat32m4_t sumf_0 = __riscv_vfmv_v_f_f32m4(0.0f, ncols_interleaved); + + for (int l = 0; l < nb; l++) { + vfloat16m2_t b_0 = __riscv_vle16_v_f16m2((const _Float16 *)&b_ptr[l].d[0], ncols_interleaved); + + sumf_0 = __riscv_vfwmacc_vf_f32m4(sumf_0, *(const _Float16*)(&a_ptr[l]), b_0, ncols_interleaved); + } + + __riscv_vse32_v_f32m4(&s[x * ncols_interleaved], sumf_0, ncols_interleaved); + } + + return; +} + +void ggml_gemv_f16_1x16_f16(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { +#if defined __riscv_v_intrinsic + ggml_gemv_f16_1xM_f16<16>(n, s, bs, vx, vy, nr, nc); + return; +#endif + ggml_gemv_f16_1x16_f16_generic(n, s, bs, vx, vy, nr, nc); +} + +void ggml_gemv_f16_1x32_f16(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { +#if defined __riscv_v_intrinsic + ggml_gemv_f16_1xM_f16<32>(n, s, bs, vx, vy, nr, nc); + return; +#endif + ggml_gemv_f16_1x32_f16_generic(n, s, bs, vx, vy, nr, nc); +} + +void ggml_gemv_f16_1x64_f16(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { +#if defined __riscv_v_intrinsic + ggml_gemv_f16_1xM_f16<64>(n, s, bs, vx, vy, nr, nc); + return; +#endif + ggml_gemv_f16_1x64_f16_generic(n, s, bs, vx, vy, nr, nc); +} + +void ggml_gemv_f16_1x128_f16(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { +#if defined __riscv_v_intrinsic + ggml_gemv_f16_1xM_f16<128>(n, s, bs, vx, vy, nr, nc); + return; +#endif + ggml_gemv_f16_1x128_f16_generic(n, s, bs, vx, vy, nr, nc); +} + +template +static inline void ggml_gemv_f32_1xM_f32(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { + const int nb = n / 1; + + assert (nr == 1); + assert(n % 1 == 0); + assert(nc % ncols_interleaved == 0); + + const float * a_ptr = (const float *) vy; + for (int x = 0; x < nc / ncols_interleaved; x++) { + const block_f32 * b_ptr = (const block_f32 *) vx + (x * nb); + + // Accumulators + vfloat32m4_t sumf_0 = __riscv_vfmv_v_f_f32m4(0.0f, ncols_interleaved); + + for (int l = 0; l < nb; l++) { + vfloat32m4_t b_0 = __riscv_vle32_v_f32m4((const float *)&b_ptr[l].d[0], ncols_interleaved); + + sumf_0 = __riscv_vfmacc_vf_f32m4(sumf_0, *(const float*)(&a_ptr[l]), b_0, ncols_interleaved); + } + + __riscv_vse32_v_f32m4(&s[x * ncols_interleaved], sumf_0, ncols_interleaved); + } + + return; +} + +void ggml_gemv_f32_1x16_f32(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { +#if defined __riscv_v_intrinsic + ggml_gemv_f32_1xM_f32<16>(n, s, bs, vx, vy, nr, nc); + return; +#endif + ggml_gemv_f32_1x16_f32_generic(n, s, bs, vx, vy, nr, nc); +} + +void ggml_gemv_f32_1x32_f32(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { +#if defined __riscv_v_intrinsic + ggml_gemv_f32_1xM_f32<32>(n, s, bs, vx, vy, nr, nc); + return; +#endif + ggml_gemv_f32_1x32_f32_generic(n, s, bs, vx, vy, nr, nc); +} + +void ggml_gemv_f32_1x64_f32(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { +#if defined __riscv_v_intrinsic + ggml_gemv_f32_1xM_f32<64>(n, s, bs, vx, vy, nr, nc); + return; +#endif + ggml_gemv_f32_1x64_f32_generic(n, s, bs, vx, vy, nr, nc); +} + +void ggml_gemv_f32_1x128_f32(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { +#if defined __riscv_v_intrinsic + ggml_gemv_f32_1xM_f32<128>(n, s, bs, vx, vy, nr, nc); + return; +#endif + ggml_gemv_f32_1x128_f32_generic(n, s, bs, vx, vy, nr, nc); +} + +template +static inline void ggml_gemm_f16_7x1xM_f16(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { + const int nb = n / 1; + + assert (nr % 7 == 0); + assert(n % 1 == 0); + assert(nc % ncols_interleaved == 0); + + for (int y = 0; y < nr / 7; y++) { + const block_f16_7x1 * a_ptr = (const block_f16_7x1*) vy + (y * nb); + for (int x = 0; x < nc / ncols_interleaved; x++) { + const block_f16 * b_ptr = (const block_f16 *) vx + (x * nb); + + // Accumulators + vfloat32m4_t sumf_0 = __riscv_vfmv_v_f_f32m4(0.0f, ncols_interleaved); + vfloat32m4_t sumf_1 = __riscv_vfmv_v_f_f32m4(0.0f, ncols_interleaved); + vfloat32m4_t sumf_2 = __riscv_vfmv_v_f_f32m4(0.0f, ncols_interleaved); + vfloat32m4_t sumf_3 = __riscv_vfmv_v_f_f32m4(0.0f, ncols_interleaved); + vfloat32m4_t sumf_4 = __riscv_vfmv_v_f_f32m4(0.0f, ncols_interleaved); + vfloat32m4_t sumf_5 = __riscv_vfmv_v_f_f32m4(0.0f, ncols_interleaved); + vfloat32m4_t sumf_6 = __riscv_vfmv_v_f_f32m4(0.0f, ncols_interleaved); + + for (int l = 0; l < nb; l++) { + vfloat16m2_t b_0 = __riscv_vle16_v_f16m2((const _Float16 *)&b_ptr[l].d[0], ncols_interleaved); + + sumf_0 = __riscv_vfwmacc_vf_f32m4(sumf_0, *(const _Float16*)&a_ptr[l].d[0], b_0, ncols_interleaved); + sumf_1 = __riscv_vfwmacc_vf_f32m4(sumf_1, *(const _Float16*)&a_ptr[l].d[1], b_0, ncols_interleaved); + sumf_2 = __riscv_vfwmacc_vf_f32m4(sumf_2, *(const _Float16*)&a_ptr[l].d[2], b_0, ncols_interleaved); + sumf_3 = __riscv_vfwmacc_vf_f32m4(sumf_3, *(const _Float16*)&a_ptr[l].d[3], b_0, ncols_interleaved); + sumf_4 = __riscv_vfwmacc_vf_f32m4(sumf_4, *(const _Float16*)&a_ptr[l].d[4], b_0, ncols_interleaved); + sumf_5 = __riscv_vfwmacc_vf_f32m4(sumf_5, *(const _Float16*)&a_ptr[l].d[5], b_0, ncols_interleaved); + sumf_6 = __riscv_vfwmacc_vf_f32m4(sumf_6, *(const _Float16*)&a_ptr[l].d[6], b_0, ncols_interleaved); + } + + __riscv_vse32_v_f32m4(&s[(y * 7 + 0) * bs + x * ncols_interleaved], sumf_0, ncols_interleaved); + __riscv_vse32_v_f32m4(&s[(y * 7 + 1) * bs + x * ncols_interleaved], sumf_1, ncols_interleaved); + __riscv_vse32_v_f32m4(&s[(y * 7 + 2) * bs + x * ncols_interleaved], sumf_2, ncols_interleaved); + __riscv_vse32_v_f32m4(&s[(y * 7 + 3) * bs + x * ncols_interleaved], sumf_3, ncols_interleaved); + __riscv_vse32_v_f32m4(&s[(y * 7 + 4) * bs + x * ncols_interleaved], sumf_4, ncols_interleaved); + __riscv_vse32_v_f32m4(&s[(y * 7 + 5) * bs + x * ncols_interleaved], sumf_5, ncols_interleaved); + __riscv_vse32_v_f32m4(&s[(y * 7 + 6) * bs + x * ncols_interleaved], sumf_6, ncols_interleaved); + } + } + return; +} + +void ggml_gemm_f16_7x1x16_f16(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { +#if defined __riscv_v_intrinsic + ggml_gemm_f16_7x1xM_f16<16>(n, s, bs, vx, vy, nr, nc); + return; +#endif + ggml_gemm_f16_7x1x16_f16_generic(n, s, bs, vx, vy, nr, nc); +} + +void ggml_gemm_f16_7x1x32_f16(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { +#if defined __riscv_v_intrinsic + ggml_gemm_f16_7x1xM_f16<32>(n, s, bs, vx, vy, nr, nc); + return; +#endif + ggml_gemm_f16_7x1x32_f16_generic(n, s, bs, vx, vy, nr, nc); +} + +void ggml_gemm_f16_7x1x64_f16(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { +#if defined __riscv_v_intrinsic + ggml_gemm_f16_7x1xM_f16<64>(n, s, bs, vx, vy, nr, nc); + return; +#endif + ggml_gemm_f16_7x1x64_f16_generic(n, s, bs, vx, vy, nr, nc); +} + +void ggml_gemm_f16_7x1x128_f16(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { +#if defined __riscv_v_intrinsic + ggml_gemm_f16_7x1xM_f16<128>(n, s, bs, vx, vy, nr, nc); + return; +#endif + ggml_gemm_f16_7x1x128_f16_generic(n, s, bs, vx, vy, nr, nc); +} + +template +static inline void ggml_gemm_f32_7x1xM_f32(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { + const int nb = n / 1; + + assert (nr % 7 == 0); + assert(n % 1 == 0); + assert(nc % ncols_interleaved == 0); + + for (int y = 0; y < nr / 7; y++) { + const block_f32_7x1 * a_ptr = (const block_f32_7x1*) vy + (y * nb); + for (int x = 0; x < nc / ncols_interleaved; x++) { + const block_f32 * b_ptr = (const block_f32 *) vx + (x * nb); + + // Accumulators + vfloat32m4_t sumf_0 = __riscv_vfmv_v_f_f32m4(0.0f, ncols_interleaved); + vfloat32m4_t sumf_1 = __riscv_vfmv_v_f_f32m4(0.0f, ncols_interleaved); + vfloat32m4_t sumf_2 = __riscv_vfmv_v_f_f32m4(0.0f, ncols_interleaved); + vfloat32m4_t sumf_3 = __riscv_vfmv_v_f_f32m4(0.0f, ncols_interleaved); + vfloat32m4_t sumf_4 = __riscv_vfmv_v_f_f32m4(0.0f, ncols_interleaved); + vfloat32m4_t sumf_5 = __riscv_vfmv_v_f_f32m4(0.0f, ncols_interleaved); + vfloat32m4_t sumf_6 = __riscv_vfmv_v_f_f32m4(0.0f, ncols_interleaved); + + for (int l = 0; l < nb; l++) { + vfloat32m4_t b_0 = __riscv_vle32_v_f32m4((const float*)&b_ptr[l].d[0], ncols_interleaved); + + sumf_0 = __riscv_vfmacc_vf_f32m4(sumf_0, *(const float*)&a_ptr[l].d[0], b_0, ncols_interleaved); + sumf_1 = __riscv_vfmacc_vf_f32m4(sumf_1, *(const float*)&a_ptr[l].d[1], b_0, ncols_interleaved); + sumf_2 = __riscv_vfmacc_vf_f32m4(sumf_2, *(const float*)&a_ptr[l].d[2], b_0, ncols_interleaved); + sumf_3 = __riscv_vfmacc_vf_f32m4(sumf_3, *(const float*)&a_ptr[l].d[3], b_0, ncols_interleaved); + sumf_4 = __riscv_vfmacc_vf_f32m4(sumf_4, *(const float*)&a_ptr[l].d[4], b_0, ncols_interleaved); + sumf_5 = __riscv_vfmacc_vf_f32m4(sumf_5, *(const float*)&a_ptr[l].d[5], b_0, ncols_interleaved); + sumf_6 = __riscv_vfmacc_vf_f32m4(sumf_6, *(const float*)&a_ptr[l].d[6], b_0, ncols_interleaved); + } + + __riscv_vse32_v_f32m4(&s[(y * 7 + 0) * bs + x * ncols_interleaved], sumf_0, ncols_interleaved); + __riscv_vse32_v_f32m4(&s[(y * 7 + 1) * bs + x * ncols_interleaved], sumf_1, ncols_interleaved); + __riscv_vse32_v_f32m4(&s[(y * 7 + 2) * bs + x * ncols_interleaved], sumf_2, ncols_interleaved); + __riscv_vse32_v_f32m4(&s[(y * 7 + 3) * bs + x * ncols_interleaved], sumf_3, ncols_interleaved); + __riscv_vse32_v_f32m4(&s[(y * 7 + 4) * bs + x * ncols_interleaved], sumf_4, ncols_interleaved); + __riscv_vse32_v_f32m4(&s[(y * 7 + 5) * bs + x * ncols_interleaved], sumf_5, ncols_interleaved); + __riscv_vse32_v_f32m4(&s[(y * 7 + 6) * bs + x * ncols_interleaved], sumf_6, ncols_interleaved); + } + } + return; +} + +void ggml_gemm_f32_7x1x16_f32(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { +#if defined __riscv_v_intrinsic + ggml_gemm_f32_7x1xM_f32<16>(n, s, bs, vx, vy, nr, nc); + return; +#endif + ggml_gemm_f32_7x1x16_f32_generic(n, s, bs, vx, vy, nr, nc); +} + +void ggml_gemm_f32_7x1x32_f32(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { +#if defined __riscv_v_intrinsic + ggml_gemm_f32_7x1xM_f32<32>(n, s, bs, vx, vy, nr, nc); + return; +#endif + ggml_gemm_f32_7x1x32_f32_generic(n, s, bs, vx, vy, nr, nc); +} + +void ggml_gemm_f32_7x1x64_f32(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { +#if defined __riscv_v_intrinsic + ggml_gemm_f32_7x1xM_f32<64>(n, s, bs, vx, vy, nr, nc); + return; +#endif + ggml_gemm_f32_7x1x64_f32_generic(n, s, bs, vx, vy, nr, nc); +} + +void ggml_gemm_f32_7x1x128_f32(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { +#if defined __riscv_v_intrinsic + ggml_gemm_f32_7x1xM_f32<128>(n, s, bs, vx, vy, nr, nc); + return; +#endif + ggml_gemm_f32_7x1x128_f32_generic(n, s, bs, vx, vy, nr, nc); +} diff --git a/ggml/src/ggml-cpu/repack.cpp b/ggml/src/ggml-cpu/repack.cpp index f18758f16b..3f720a82ef 100644 --- a/ggml/src/ggml-cpu/repack.cpp +++ b/ggml/src/ggml-cpu/repack.cpp @@ -31,6 +31,40 @@ static inline int nearest_int(float fval) { return (i & 0x007fffff) - 0x00400000; } +// Helper functions for `fp16` and `fp32`. +// +template +static inline void ggml_repack_mat_f16_NxK_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) { + assert(k % interleave_size == 0); + const int nb = k / interleave_size; + + block_f16 * GGML_RESTRICT y = (block_f16 *) vy; + + for (int i = 0; i < nb; i++) { + for (int j = 0; j < nrows_interleaved; j++) { + for (int l = 0; l < interleave_size; l++) { + y[i].d[j * interleave_size + l] = GGML_CPU_FP32_TO_FP16(x[j * k + i * interleave_size + l]); + } + } + } +} + +template +static inline void ggml_repack_mat_f32_NxK_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) { + assert(k % interleave_size == 0); + const int nb = k / interleave_size; + + block_f32 * GGML_RESTRICT y = (block_f32 *) vy; + + for (int i = 0; i < nb; i++) { + for (int j = 0; j < nrows_interleaved; j++) { + for (int l = 0; l < interleave_size; l++) { + y[i].d[j * interleave_size + l] = x[j * k + i * interleave_size + l]; + } + } + } +} + // Functions to create the interleaved data layout formats // interleave 4 block_q4_0s in blocks of blck_size_interleave @@ -310,30 +344,40 @@ void ggml_quantize_mat_q8_K_4x8_generic(const float * GGML_RESTRICT x, void * GG } } +#if defined __riscv_zvfh +void ggml_repack_mat_f16_7x1_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) { + ggml_repack_mat_f16_NxK_generic<7, 1>(x, vy, k); +} + +void ggml_repack_mat_f32_7x1_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) { + ggml_repack_mat_f32_NxK_generic<7, 1>(x, vy, k); +} +#endif + } // extern "C" -template -void ggml_quantize_mat_t(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t nrow, int64_t n_per_row); +template +void ggml_repack_mat_t(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t nrow, int64_t n_per_row); -template <> void ggml_quantize_mat_t<4, GGML_TYPE_Q8_0>(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t nrow, int64_t n_per_row) { +template <> void ggml_repack_mat_t<4, 4, GGML_TYPE_Q8_0>(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t nrow, int64_t n_per_row) { assert(nrow == 4); UNUSED(nrow); ggml_quantize_mat_q8_0_4x4(x, vy, n_per_row); } -template <> void ggml_quantize_mat_t<8, GGML_TYPE_Q8_0>(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t nrow, int64_t n_per_row) { +template <> void ggml_repack_mat_t<4, 8, GGML_TYPE_Q8_0>(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t nrow, int64_t n_per_row) { assert(nrow == 4); UNUSED(nrow); ggml_quantize_mat_q8_0_4x8(x, vy, n_per_row); } -template <> void ggml_quantize_mat_t<4, GGML_TYPE_Q8_K>(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t nrow, int64_t n_per_row) { +template <> void ggml_repack_mat_t<4, 4, GGML_TYPE_Q8_K>(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t nrow, int64_t n_per_row) { assert(nrow == 4); UNUSED(nrow); ggml_quantize_mat_q8_K_4x4(x, vy, n_per_row); } -template <> void ggml_quantize_mat_t<8, GGML_TYPE_Q8_K>(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t nrow, int64_t n_per_row) { +template <> void ggml_repack_mat_t<4, 8, GGML_TYPE_Q8_K>(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t nrow, int64_t n_per_row) { assert(nrow == 4); UNUSED(nrow); ggml_quantize_mat_q8_K_4x8(x, vy, n_per_row); @@ -353,6 +397,154 @@ template <> void ggml_quantize_mat_t<1, GGML_TYPE_Q8_K>(const float * GGML_RESTR } #endif +#if defined __riscv_zvfh +template <> void ggml_repack_mat_t<7, 1, GGML_TYPE_F16>(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t nrow, int64_t n_per_row) { + assert(nrow == 7); + UNUSED(nrow); + ggml_repack_mat_f16_7x1(x, vy, n_per_row); +} + +template <> void ggml_repack_mat_t<7, 1, GGML_TYPE_F32>(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t nrow, int64_t n_per_row) { + assert(nrow == 7); + UNUSED(nrow); + ggml_repack_mat_f32_7x1(x, vy, n_per_row); +} +#endif + +template +static inline void ggml_gemv_f16_KxM_f16_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { + const int nb = n / interleave_size; + + assert(nr == 1); + assert(n % interleave_size == 0); + assert(nc % ncols_interleaved == 0); + + UNUSED(bs); + UNUSED(nr); + + float sumf[ncols_interleaved]; + + const ggml_half * a_ptr = (const ggml_half *) vy; + for (int x = 0; x < nc / ncols_interleaved; x++) { + const block_f16 * b_ptr = + (const block_f16 *) vx + (x * nb); + + for (int j = 0; j < ncols_interleaved; j++) { sumf[j] = 0.0f; } + for (int l = 0; l < nb; l++) { + for (int j = 0; j < ncols_interleaved; j++) { + for (int k = 0; k < interleave_size; k++) { + sumf[j] += GGML_FP16_TO_FP32(b_ptr[l].d[j * interleave_size + k]) * GGML_FP16_TO_FP32(a_ptr[l * interleave_size + k]); + } + } + } + for (int j = 0; j < ncols_interleaved; j++) { s[x * ncols_interleaved + j] = sumf[j]; } + } +} + +template +static inline void ggml_gemv_f32_KxM_f32_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { + const int nb = n / interleave_size; + + assert(nr == 1); + assert(n % interleave_size == 0); + assert(nc % ncols_interleaved == 0); + + UNUSED(bs); + UNUSED(nr); + + float sumf[ncols_interleaved]; + + const float * a_ptr = (const float *) vy; + for (int x = 0; x < nc / ncols_interleaved; x++) { + const block_f32 * b_ptr = + (const block_f32 *) vx + (x * nb); + + for (int j = 0; j < ncols_interleaved; j++) { sumf[j] = 0.0f; } + for (int l = 0; l < nb; l++) { + for (int j = 0; j < ncols_interleaved; j++) { + for (int k = 0; k < interleave_size; k++) { + sumf[j] += b_ptr[l].d[j * interleave_size + k] * a_ptr[l * interleave_size + k]; + } + } + } + for (int j = 0; j < ncols_interleaved; j++) { s[x * ncols_interleaved + j] = sumf[j]; } + } +} + +template +static inline void ggml_gemm_f16_NxKxM_f16_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { + const int nb = n / interleave_size; + + assert(nr % nrows == 0); + assert(n % interleave_size == 0); + assert(nc % ncols_interleaved == 0); + + float sumf[nrows][ncols_interleaved]; + + for (int y = 0; y < nr / nrows; y++) { + const block_f16 * a_ptr = + (const block_f16 *) vy + (y * nb); + for (int x = 0; x < nc / ncols_interleaved; x++) { + const block_f16 * b_ptr = + (const block_f16 *) vx + (x * nb); + + for (int m = 0; m < nrows; m++) { + for (int j = 0; j < ncols_interleaved; j++) { sumf[m][j] = 0.0f; } + } + for (int l = 0; l < nb; l++) { + for (int m = 0; m < nrows; m++) { + for (int j = 0; j < ncols_interleaved; j++) { + for (int k = 0; k < interleave_size; k++) { + sumf[m][j] += GGML_FP16_TO_FP32(b_ptr[l].d[j * interleave_size + k]) * GGML_FP16_TO_FP32(a_ptr[l].d[m * interleave_size + k]); + } + } + } + } + for (int m = 0; m < nrows; m++) { + for (int j = 0; j < ncols_interleaved; j++) + { s[(y * nrows + m) * bs + x * ncols_interleaved + j] = sumf[m][j]; } + } + } + } +} + +template +static inline void ggml_gemm_f32_NxKxM_f32_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { + const int nb = n / interleave_size; + + assert(nr % nrows == 0); + assert(n % interleave_size == 0); + assert(nc % ncols_interleaved == 0); + + float sumf[nrows][ncols_interleaved]; + + for (int y = 0; y < nr / nrows; y++) { + const block_f32 * a_ptr = + (const block_f32 *) vy + (y * nb); + for (int x = 0; x < nc / ncols_interleaved; x++) { + const block_f32 * b_ptr = + (const block_f32 *) vx + (x * nb); + + for (int m = 0; m < nrows; m++) { + for (int j = 0; j < ncols_interleaved; j++) { sumf[m][j] = 0.0f; } + } + for (int l = 0; l < nb; l++) { + for (int m = 0; m < nrows; m++) { + for (int j = 0; j < ncols_interleaved; j++) { + for (int k = 0; k < interleave_size; k++) { + sumf[m][j] += b_ptr[l].d[j * interleave_size + k] * a_ptr[l].d[m * interleave_size + k]; + } + } + } + } + for (int m = 0; m < nrows; m++) { + for (int j = 0; j < ncols_interleaved; j++) + { s[(y * nrows + m) * bs + x * ncols_interleaved + j] = sumf[m][j]; } + } + } + } +} + template static void ggml_gemv_q6_K_NxM_q8_K_generic_impl(int n, float * GGML_RESTRICT s, @@ -1655,6 +1847,40 @@ void ggml_gemv_q2_K_16x1_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, } #endif +#if defined __riscv_zvfh +void ggml_gemv_f16_1x16_f16_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { + ggml_gemv_f16_KxM_f16_generic<1, 16>(n, s, bs, vx, vy, nr, nc); +} + +void ggml_gemv_f16_1x32_f16_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { + ggml_gemv_f16_KxM_f16_generic<1, 32>(n, s, bs, vx, vy, nr, nc); +} + +void ggml_gemv_f16_1x64_f16_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { + ggml_gemv_f16_KxM_f16_generic<1, 64>(n, s, bs, vx, vy, nr, nc); +} + +void ggml_gemv_f16_1x128_f16_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { + ggml_gemv_f16_KxM_f16_generic<1, 128>(n, s, bs, vx, vy, nr, nc); +} + +void ggml_gemv_f32_1x16_f32_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { + ggml_gemv_f32_KxM_f32_generic<1, 16>(n, s, bs, vx, vy, nr, nc); +} + +void ggml_gemv_f32_1x32_f32_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { + ggml_gemv_f32_KxM_f32_generic<1, 32>(n, s, bs, vx, vy, nr, nc); +} + +void ggml_gemv_f32_1x64_f32_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { + ggml_gemv_f32_KxM_f32_generic<1, 64>(n, s, bs, vx, vy, nr, nc); +} + +void ggml_gemv_f32_1x128_f32_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { + ggml_gemv_f32_KxM_f32_generic<1, 128>(n, s, bs, vx, vy, nr, nc); +} +#endif + void ggml_gemm_q4_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { const int qk = QK8_0; const int nb = n / qk; @@ -2720,6 +2946,40 @@ void ggml_gemm_q2_K_16x1_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, } #endif +#if defined __riscv_zvfh +void ggml_gemm_f16_7x1x16_f16_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { + ggml_gemm_f16_NxKxM_f16_generic<7, 1, 16>(n, s, bs, vx, vy, nr, nc); +} + +void ggml_gemm_f16_7x1x32_f16_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { + ggml_gemm_f16_NxKxM_f16_generic<7, 1, 32>(n, s, bs, vx, vy, nr, nc); +} + +void ggml_gemm_f16_7x1x64_f16_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { + ggml_gemm_f16_NxKxM_f16_generic<7, 1, 64>(n, s, bs, vx, vy, nr, nc); +} + +void ggml_gemm_f16_7x1x128_f16_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { + ggml_gemm_f16_NxKxM_f16_generic<7, 1, 128>(n, s, bs, vx, vy, nr, nc); +} + +void ggml_gemm_f32_7x1x16_f32_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { + ggml_gemm_f32_NxKxM_f32_generic<7, 1, 16>(n, s, bs, vx, vy, nr, nc); +} + +void ggml_gemm_f32_7x1x32_f32_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { + ggml_gemm_f32_NxKxM_f32_generic<7, 1, 32>(n, s, bs, vx, vy, nr, nc); +} + +void ggml_gemm_f32_7x1x64_f32_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { + ggml_gemm_f32_NxKxM_f32_generic<7, 1, 64>(n, s, bs, vx, vy, nr, nc); +} + +void ggml_gemm_f32_7x1x128_f32_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { + ggml_gemm_f32_NxKxM_f32_generic<7, 1, 128>(n, s, bs, vx, vy, nr, nc); +} +#endif + } // extern "C" static block_q8_0x4 make_block_q8_0x4(block_q8_0 * in, unsigned int blck_size_interleave) { @@ -3859,6 +4119,78 @@ static int repack_mxfp4_to_mxfp4_8_bl(struct ggml_tensor * t, int interleave_blo GGML_UNUSED(data_size); } +template +static int repack_f16_to_f16_MxK_bl(struct ggml_tensor * t, const void * GGML_RESTRICT data, size_t data_size) { + GGML_ASSERT(t->type == GGML_TYPE_F16); + + const ggml_half * src = (const ggml_half *)data; + block_f16 * dst = ( block_f16 *)t->data; + + ggml_half dst_tmp[ncols_interleaved * interleave_size]; + + int nrow = ggml_nrows(t); + int row_size = t->ne[0]; + int nblocks = row_size / interleave_size; + + GGML_ASSERT(data_size == nrow * nblocks * interleave_size * sizeof(ggml_half)); + + if (t->ne[1] % ncols_interleaved != 0 || t->ne[0] % interleave_size != 0) { + return -1; + } + + for (int b = 0; b < nrow; b += ncols_interleaved) { + for (int i = 0; i < nblocks; i++) { + for (int j = 0; j < ncols_interleaved; j++) { + for (int k = 0; k < interleave_size; k++) { + dst_tmp[j * interleave_size + k] = src[(j + b) * row_size + i * interleave_size + k]; + } + } + block_f16 out; + memcpy(&out.d, dst_tmp, sizeof(ggml_half) * ncols_interleaved * interleave_size); + *dst = out; + dst++; + } + } + + return 0; +} + +template +static int repack_f32_to_f32_MxK_bl(struct ggml_tensor * t, const void * GGML_RESTRICT data, size_t data_size) { + GGML_ASSERT(t->type == GGML_TYPE_F32); + + const float * src = (const float *)data; + block_f32 * dst = ( block_f32 *)t->data; + + float dst_tmp[ncols_interleaved * interleave_size]; + + int nrow = ggml_nrows(t); + int row_size = t->ne[0]; + int nblocks = row_size / interleave_size; + + GGML_ASSERT(data_size == nrow * nblocks * interleave_size * sizeof(float)); + + if (t->ne[1] % ncols_interleaved != 0 || t->ne[0] % interleave_size != 0) { + return -1; + } + + for (int b = 0; b < nrow; b += ncols_interleaved) { + for (int i = 0; i < nblocks; i++) { + for (int j = 0; j < ncols_interleaved; j++) { + for (int k = 0; k < interleave_size; k++) { + dst_tmp[j * interleave_size + k] = src[(j + b) * row_size + i * interleave_size + k]; + } + } + block_f32 out; + memcpy(&out.d, dst_tmp, sizeof(float) * ncols_interleaved * interleave_size); + *dst = out; + dst++; + } + } + + return 0; +} + namespace ggml::cpu::repack { // repack template @@ -3956,6 +4288,34 @@ template <> int repack(struct ggml_tensor * t, const void * d } #endif +#if defined __riscv_zvfh +template <> int repack(struct ggml_tensor * t, const void * data, size_t data_size) { + return repack_f16_to_f16_MxK_bl<16, 1>(t, data, data_size); +} +template <> int repack(struct ggml_tensor * t, const void * data, size_t data_size) { + return repack_f16_to_f16_MxK_bl<32, 1>(t, data, data_size); +} +template <> int repack(struct ggml_tensor * t, const void * data, size_t data_size) { + return repack_f16_to_f16_MxK_bl<64, 1>(t, data, data_size); +} +template <> int repack(struct ggml_tensor * t, const void * data, size_t data_size) { + return repack_f16_to_f16_MxK_bl<128, 1>(t, data, data_size); +} + +template <> int repack(struct ggml_tensor * t, const void * data, size_t data_size) { + return repack_f32_to_f32_MxK_bl<16, 1>(t, data, data_size); +} +template <> int repack(struct ggml_tensor * t, const void * data, size_t data_size) { + return repack_f32_to_f32_MxK_bl<32, 1>(t, data, data_size); +} +template <> int repack(struct ggml_tensor * t, const void * data, size_t data_size) { + return repack_f32_to_f32_MxK_bl<64, 1>(t, data, data_size); +} +template <> int repack(struct ggml_tensor * t, const void * data, size_t data_size) { + return repack_f32_to_f32_MxK_bl<128, 1>(t, data, data_size); +} +#endif + // gemv template void gemv(int, float *, size_t, const void *, const void *, int, int); @@ -4053,20 +4413,54 @@ template <> void gemv(int n, float * s, size_ } #endif +#if defined __riscv_zvfh +template <> void gemv(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) { + ggml_gemv_f16_1x16_f16(n, s, bs, vx, vy, nr, nc); +} + +template <> void gemv(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) { + ggml_gemv_f16_1x32_f16(n, s, bs, vx, vy, nr, nc); +} + +template <> void gemv(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) { + ggml_gemv_f16_1x64_f16(n, s, bs, vx, vy, nr, nc); +} + +template <> void gemv(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) { + ggml_gemv_f16_1x128_f16(n, s, bs, vx, vy, nr, nc); +} + +template <> void gemv(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) { + ggml_gemv_f32_1x16_f32(n, s, bs, vx, vy, nr, nc); +} + +template <> void gemv(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) { + ggml_gemv_f32_1x32_f32(n, s, bs, vx, vy, nr, nc); +} + +template <> void gemv(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) { + ggml_gemv_f32_1x64_f32(n, s, bs, vx, vy, nr, nc); +} + +template <> void gemv(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) { + ggml_gemv_f32_1x128_f32(n, s, bs, vx, vy, nr, nc); +} +#endif + // gemm -template +template void gemm(int, float *, size_t, const void *, const void *, int, int); -template <> void gemm(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) { +template <> void gemm(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) { ggml_gemm_q4_0_4x4_q8_0(n, s, bs, vx, vy, nr, nc); } -template <> void gemm(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) { +template <> void gemm(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) { ggml_gemm_q4_0_4x8_q8_0(n, s, bs, vx, vy, nr, nc); } template <> -void gemm(int n, +void gemm(int n, float * s, size_t bs, const void * vx, @@ -4076,55 +4470,55 @@ void gemm(int n, ggml_gemm_q4_0_8x8_q8_0(n, s, bs, vx, vy, nr, nc); } -template <> void gemm(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) { +template <> void gemm(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) { ggml_gemm_q2_K_8x8_q8_K(n, s, bs, vx, vy, nr, nc); } -template <> void gemm(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) { +template <> void gemm(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) { ggml_gemm_q4_K_8x4_q8_K(n, s, bs, vx, vy, nr, nc); } -template <> void gemm(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) { +template <> void gemm(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) { ggml_gemm_q4_K_8x8_q8_K(n, s, bs, vx, vy, nr, nc); } -template <> void gemm(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) { +template <> void gemm(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) { ggml_gemm_q5_K_8x4_q8_K(n, s, bs, vx, vy, nr, nc); } -template <> void gemm(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) { +template <> void gemm(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) { ggml_gemm_q5_K_8x8_q8_K(n, s, bs, vx, vy, nr, nc); } -template <> void gemm(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) { +template <> void gemm(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) { ggml_gemm_q6_K_8x4_q8_K(n, s, bs, vx, vy, nr, nc); } -template <> void gemm(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) { +template <> void gemm(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) { ggml_gemm_q6_K_8x8_q8_K(n, s, bs, vx, vy, nr, nc); } -template <> void gemm(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) { +template <> void gemm(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) { ggml_gemm_iq4_nl_4x4_q8_0(n, s, bs, vx, vy, nr, nc); } -template <> void gemm(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) { +template <> void gemm(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) { ggml_gemm_iq4_nl_8x8_q8_0(n, s, bs, vx, vy, nr, nc); } -template <> void gemm(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) { +template <> void gemm(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) { ggml_gemm_mxfp4_4x4_q8_0(n, s, bs, vx, vy, nr, nc); } -template <> void gemm(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) { +template <> void gemm(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) { ggml_gemm_mxfp4_8x8_q8_0(n, s, bs, vx, vy, nr, nc); } -template <> void gemm(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) { +template <> void gemm(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) { ggml_gemm_q8_0_4x4_q8_0(n, s, bs, vx, vy, nr, nc); } -template <> void gemm(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) { +template <> void gemm(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) { ggml_gemm_q8_0_4x8_q8_0(n, s, bs, vx, vy, nr, nc); } @@ -4150,12 +4544,46 @@ template <> void gemm(int n, float * s, size_ } #endif +#if defined __riscv_zvfh +template <> void gemm(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) { + ggml_gemm_f16_7x1x16_f16(n, s, bs, vx, vy, nr, nc); +} + +template <> void gemm(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) { + ggml_gemm_f16_7x1x32_f16(n, s, bs, vx, vy, nr, nc); +} + +template <> void gemm(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) { + ggml_gemm_f16_7x1x64_f16(n, s, bs, vx, vy, nr, nc); +} + +template <> void gemm(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) { + ggml_gemm_f16_7x1x128_f16(n, s, bs, vx, vy, nr, nc); +} + +template <> void gemm(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) { + ggml_gemm_f32_7x1x16_f32(n, s, bs, vx, vy, nr, nc); +} + +template <> void gemm(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) { + ggml_gemm_f32_7x1x32_f32(n, s, bs, vx, vy, nr, nc); +} + +template <> void gemm(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) { + ggml_gemm_f32_7x1x64_f32(n, s, bs, vx, vy, nr, nc); +} + +template <> void gemm(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) { + ggml_gemm_f32_7x1x128_f32(n, s, bs, vx, vy, nr, nc); +} +#endif + class tensor_traits_base : public ggml::cpu::tensor_traits { public: virtual int repack(struct ggml_tensor * t, const void * data, size_t data_size) = 0; }; -template class tensor_traits : public tensor_traits_base { +template class tensor_traits : public tensor_traits_base { bool work_size(int /* n_threads */, const struct ggml_tensor * op, size_t & size) override { // not realy a GGML_TYPE_Q8_0 but same size. @@ -4237,13 +4665,13 @@ template wdata + params->wsize); - // If there are more than three rows in src1, use gemm; otherwise, use gemv. - if (nrows > 3) { - gemm(ne00, (float *) (dst_ptr) + src0_start, nb1 / nb0, + // If there are more than `NB_ROWS` rows in src1, use gemm; otherwise, use gemv. + if (nrows > (NB_ROWS - 1)) { + gemm(ne00, (float *) (dst_ptr) + src0_start, nb1 / nb0, src0_ptr + src0_start * nb01, src1_ptr, - nrows - (nrows % 4), ncols); + nrows - (nrows % NB_ROWS), ncols); } - for (int iter = nrows - (nrows % 4); iter < nrows; iter++) { + for (int iter = nrows - (nrows % NB_ROWS); iter < nrows; iter++) { gemv(ne00, (float *) (dst_ptr + (iter * nb1)) + src0_start, ne01, src0_ptr + src0_start * nb01, src1_ptr + (src1_col_stride * iter), 1 /* nrows */, ncols); @@ -4297,12 +4725,12 @@ template data + i12 * nb12; char * wdata_ptr = wdata + i12 * nbw2; - for (int64_t i11 = ith * 4; i11 < ne11 - ne11 % 4; i11 += nth * 4) { - ggml_quantize_mat_t((float *) (data_ptr + i11 * nb11), - (void *) (wdata_ptr + i11 * nbw1), 4, ne10); + for (int64_t i11 = ith * NB_ROWS; i11 < ne11 - ne11 % NB_ROWS; i11 += nth * NB_ROWS) { + ggml_repack_mat_t((float *) (data_ptr + i11 * nb11), + (void *) (wdata_ptr + i11 * nbw1), NB_ROWS, ne10); } - const int64_t i11_processed = ne11 - ne11 % 4; + const int64_t i11_processed = ne11 - ne11 % NB_ROWS; for (int64_t i11 = i11_processed + ith; i11 < ne11; i11 += nth) { from_float((float *) (data_ptr + i11 * nb11), (void *) (wdata_ptr + i11 * nbw1), ne10); } @@ -4314,7 +4742,7 @@ template src[0]); - int nth_scaled = nth * 4; + int nth_scaled = nth * NB_ROWS; int64_t chunk_size0 = (nr0 + nth_scaled - 1) / nth_scaled; int64_t nchunk0 = (nr0 + chunk_size0 - 1) / chunk_size0; @@ -4527,36 +4955,36 @@ template q4_0_4x4_q8_0; - static const ggml::cpu::repack::tensor_traits q4_0_4x8_q8_0; - static const ggml::cpu::repack::tensor_traits q4_0_8x8_q8_0; + static const ggml::cpu::repack::tensor_traits q4_0_4x4_q8_0; + static const ggml::cpu::repack::tensor_traits q4_0_4x8_q8_0; + static const ggml::cpu::repack::tensor_traits q4_0_8x8_q8_0; // instance for Q4_K - static const ggml::cpu::repack::tensor_traits q4_K_8x4_q8_K; - static const ggml::cpu::repack::tensor_traits q4_K_8x8_q8_K; + static const ggml::cpu::repack::tensor_traits q4_K_8x4_q8_K; + static const ggml::cpu::repack::tensor_traits q4_K_8x8_q8_K; // instance for Q5_K - static const ggml::cpu::repack::tensor_traits q5_K_8x4_q8_K; - static const ggml::cpu::repack::tensor_traits q5_K_8x8_q8_K; + static const ggml::cpu::repack::tensor_traits q5_K_8x4_q8_K; + static const ggml::cpu::repack::tensor_traits q5_K_8x8_q8_K; // instance for Q6_K - static const ggml::cpu::repack::tensor_traits q6_K_8x4_q8_K; - static const ggml::cpu::repack::tensor_traits q6_K_8x8_q8_K; + static const ggml::cpu::repack::tensor_traits q6_K_8x4_q8_K; + static const ggml::cpu::repack::tensor_traits q6_K_8x8_q8_K; // instance for Q2 - static const ggml::cpu::repack::tensor_traits q2_K_8x8_q8_K; + static const ggml::cpu::repack::tensor_traits q2_K_8x8_q8_K; // instance for IQ4 - static const ggml::cpu::repack::tensor_traits iq4_nl_4x4_q8_0; - static const ggml::cpu::repack::tensor_traits iq4_nl_8x8_q8_0; + static const ggml::cpu::repack::tensor_traits iq4_nl_4x4_q8_0; + static const ggml::cpu::repack::tensor_traits iq4_nl_8x8_q8_0; // instance for MXFP4 - static const ggml::cpu::repack::tensor_traits mxfp4_4x4_q8_0; - static const ggml::cpu::repack::tensor_traits mxfp4_8x8_q8_0; + static const ggml::cpu::repack::tensor_traits mxfp4_4x4_q8_0; + static const ggml::cpu::repack::tensor_traits mxfp4_8x8_q8_0; // instance for Q8_0 - static const ggml::cpu::repack::tensor_traits q8_0_4x4_q8_0; - static const ggml::cpu::repack::tensor_traits q8_0_4x8_q8_0; + static const ggml::cpu::repack::tensor_traits q8_0_4x4_q8_0; + static const ggml::cpu::repack::tensor_traits q8_0_4x8_q8_0; // instances for RISC-V // @@ -4570,6 +4998,22 @@ static const ggml::cpu::tensor_traits * ggml_repack_get_optimal_repack_type(cons static const ggml::cpu::repack::tensor_traits q2_K_16x1_q8_K; #endif + // instance for F16 +#if defined __riscv_zvfh + static const ggml::cpu::repack::tensor_traits f16_7x16x1_f16; + static const ggml::cpu::repack::tensor_traits f16_7x32x1_f16; + static const ggml::cpu::repack::tensor_traits f16_7x64x1_f16; + static const ggml::cpu::repack::tensor_traits f16_7x128x1_f16; +#endif + + // instance for F32 +#if defined __riscv_zvfh + static const ggml::cpu::repack::tensor_traits f32_7x16x1_f32; + static const ggml::cpu::repack::tensor_traits f32_7x32x1_f32; + static const ggml::cpu::repack::tensor_traits f32_7x64x1_f32; + static const ggml::cpu::repack::tensor_traits f32_7x128x1_f32; +#endif + if (cur->type == GGML_TYPE_Q4_0) { if (ggml_cpu_has_avx2() || (ggml_cpu_has_sve() && ggml_cpu_has_matmul_int8() && ggml_cpu_get_sve_cnt() == QK8_0)) { if (cur->ne[1] % 8 == 0) { @@ -4718,6 +5162,30 @@ static const ggml::cpu::tensor_traits * ggml_repack_get_optimal_repack_type(cons } #endif } + } else if (cur->type == GGML_TYPE_F16) { + if (ggml_cpu_has_riscv_v()) { + #if defined __riscv_zvfh + switch (__riscv_vlenb() * 8) { + case 128: { if (cur->ne[1] % 16 == 0) { return &f16_7x16x1_f16; } break; } + case 256: { if (cur->ne[1] % 32 == 0) { return &f16_7x32x1_f16; } break; } + case 512: { if (cur->ne[1] % 64 == 0) { return &f16_7x64x1_f16; } break; } + case 1024: { if (cur->ne[1] % 128 == 0) { return &f16_7x128x1_f16; } break; } + default: return nullptr; + } + #endif + } + } else if (cur->type == GGML_TYPE_F32) { + if (ggml_cpu_has_riscv_v()) { + #if defined __riscv_zvfh + switch (__riscv_vlenb() * 8) { + case 128: { if (cur->ne[1] % 16 == 0) { return &f32_7x16x1_f32; } break; } + case 256: { if (cur->ne[1] % 32 == 0) { return &f32_7x32x1_f32; } break; } + case 512: { if (cur->ne[1] % 64 == 0) { return &f32_7x64x1_f32; } break; } + case 1024: { if (cur->ne[1] % 128 == 0) { return &f32_7x128x1_f32; } break; } + default: return nullptr; + } + #endif + } } return nullptr; diff --git a/ggml/src/ggml-cpu/repack.h b/ggml/src/ggml-cpu/repack.h index cb21edf623..16d5a2f313 100644 --- a/ggml/src/ggml-cpu/repack.h +++ b/ggml/src/ggml-cpu/repack.h @@ -133,6 +133,23 @@ struct block_mxfp4x8 { }; static_assert(sizeof(block_mxfp4x8) == 8 + QK_MXFP4 * 4, "wrong mxfp4x8 block size/padding"); +template +struct block_f16 { + ggml_half d[N * K]; +}; + +using block_f16_32x1 = block_f16<32, 1>; +using block_f16_7x1 = block_f16<7, 1>; +using block_f16_4x1 = block_f16<4, 1>; + +template +struct block_f32 { + float d[N * K]; +}; + +using block_f32_32x1 = block_f32<32, 1>; +using block_f32_7x1 = block_f32<7, 1>; + #if defined(__cplusplus) extern "C" { #endif @@ -240,6 +257,50 @@ void ggml_gemm_q2_K_16x1_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, void ggml_gemm_iq4_nl_16x1_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); #endif +// FP16 +void ggml_repack_mat_f16_4x1_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k); +void ggml_repack_mat_f16_7x1_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k); +void ggml_gemv_f16_1x16_f16_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); +void ggml_gemv_f16_1x32_f16_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); +void ggml_gemv_f16_1x64_f16_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); +void ggml_gemv_f16_1x128_f16_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); +void ggml_gemm_f16_4x1x32_f16_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); +void ggml_gemm_f16_7x1x16_f16_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); +void ggml_gemm_f16_7x1x32_f16_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); +void ggml_gemm_f16_7x1x64_f16_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); +void ggml_gemm_f16_7x1x128_f16_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); +void ggml_repack_mat_f16_4x1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k); +void ggml_repack_mat_f16_7x1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k); +void ggml_gemv_f16_1x16_f16(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); +void ggml_gemv_f16_1x32_f16(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); +void ggml_gemv_f16_1x64_f16(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); +void ggml_gemv_f16_1x128_f16(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); +void ggml_gemm_f16_4x1x32_f16(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); +void ggml_gemm_f16_7x1x16_f16(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); +void ggml_gemm_f16_7x1x32_f16(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); +void ggml_gemm_f16_7x1x64_f16(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); +void ggml_gemm_f16_7x1x128_f16(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); + +// FP32 +void ggml_repack_mat_f32_7x1_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k); +void ggml_gemv_f32_1x16_f32_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); +void ggml_gemv_f32_1x32_f32_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); +void ggml_gemv_f32_1x64_f32_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); +void ggml_gemv_f32_1x128_f32_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); +void ggml_gemm_f32_7x1x16_f32_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); +void ggml_gemm_f32_7x1x32_f32_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); +void ggml_gemm_f32_7x1x64_f32_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); +void ggml_gemm_f32_7x1x128_f32_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); +void ggml_repack_mat_f32_7x1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k); +void ggml_gemv_f32_1x16_f32(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); +void ggml_gemv_f32_1x32_f32(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); +void ggml_gemv_f32_1x64_f32(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); +void ggml_gemv_f32_1x128_f32(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); +void ggml_gemm_f32_7x1x16_f32(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); +void ggml_gemm_f32_7x1x32_f32(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); +void ggml_gemm_f32_7x1x64_f32(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); +void ggml_gemm_f32_7x1x128_f32(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); + #if defined(__cplusplus) } // extern "C" #endif From fd94e4cdca0af941015fadba28d12ebbf712772e Mon Sep 17 00:00:00 2001 From: Taimur Ahmad Date: Tue, 23 Dec 2025 15:13:09 +0500 Subject: [PATCH 2/2] ggml-cpu: add repack GEMM and GEMV for floating-point (#4) --- ggml/src/ggml-cpu/arch-fallback.h | 28 --- ggml/src/ggml-cpu/arch/riscv/repack.cpp | 242 +++++++++--------------- ggml/src/ggml-cpu/repack.cpp | 58 ++---- ggml/src/ggml-cpu/repack.h | 51 ++--- 4 files changed, 140 insertions(+), 239 deletions(-) diff --git a/ggml/src/ggml-cpu/arch-fallback.h b/ggml/src/ggml-cpu/arch-fallback.h index 019fd056c8..6b36db2a36 100644 --- a/ggml/src/ggml-cpu/arch-fallback.h +++ b/ggml/src/ggml-cpu/arch-fallback.h @@ -37,8 +37,6 @@ #define ggml_quantize_mat_q8_0_4x8_generic ggml_quantize_mat_q8_0_4x8 #define ggml_quantize_mat_q8_K_4x4_generic ggml_quantize_mat_q8_K_4x4 #define ggml_quantize_mat_q8_K_4x8_generic ggml_quantize_mat_q8_K_4x8 -#define ggml_repack_mat_f16_7x1_generic ggml_repack_mat_f16_7x1 -#define ggml_repack_mat_f32_7x1_generic ggml_repack_mat_f32_7x1 #define ggml_gemv_q4_0_4x4_q8_0_generic ggml_gemv_q4_0_4x4_q8_0 #define ggml_gemv_q4_0_4x8_q8_0_generic ggml_gemv_q4_0_4x8_q8_0 #define ggml_gemv_q4_0_8x8_q8_0_generic ggml_gemv_q4_0_8x8_q8_0 @@ -78,33 +76,15 @@ #define ggml_gemv_iq4_nl_8x8_q8_0_generic ggml_gemv_iq4_nl_8x8_q8_0 #define ggml_gemv_mxfp4_8x8_q8_0_generic ggml_gemv_mxfp4_8x8_q8_0 #define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K -#define ggml_gemv_f16_1x16_f16_generic ggml_gemv_f16_1x16_f16 -#define ggml_gemv_f16_1x32_f16_generic ggml_gemv_f16_1x32_f16 -#define ggml_gemv_f16_1x64_f16_generic ggml_gemv_f16_1x64_f16 -#define ggml_gemv_f16_1x128_f16_generic ggml_gemv_f16_1x128_f16 -#define ggml_gemv_f32_1x16_f32_generic ggml_gemv_f32_1x16_f32 -#define ggml_gemv_f32_1x32_f32_generic ggml_gemv_f32_1x32_f32 -#define ggml_gemv_f32_1x64_f32_generic ggml_gemv_f32_1x64_f32 -#define ggml_gemv_f32_1x128_f32_generic ggml_gemv_f32_1x128_f32 #define ggml_gemm_iq4_nl_8x8_q8_0_generic ggml_gemm_iq4_nl_8x8_q8_0 #define ggml_gemm_mxfp4_8x8_q8_0_generic ggml_gemm_mxfp4_8x8_q8_0 #define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K -#define ggml_gemm_f16_7x1x16_f16_generic ggml_gemm_f16_7x1x16_f16 -#define ggml_gemm_f16_7x1x32_f16_generic ggml_gemm_f16_7x1x32_f16 -#define ggml_gemm_f16_7x1x64_f16_generic ggml_gemm_f16_7x1x64_f16 -#define ggml_gemm_f16_7x1x128_f16_generic ggml_gemm_f16_7x1x128_f16 -#define ggml_gemm_f32_7x1x16_f32_generic ggml_gemm_f32_7x1x16_f32 -#define ggml_gemm_f32_7x1x32_f32_generic ggml_gemm_f32_7x1x32_f32 -#define ggml_gemm_f32_7x1x64_f32_generic ggml_gemm_f32_7x1x64_f32 -#define ggml_gemm_f32_7x1x128_f32_generic ggml_gemm_f32_7x1x128_f32 #elif defined(__x86_64__) || defined(__i386__) || defined(_M_IX86) || defined(_M_X64) // quants.c #define ggml_vec_dot_nvfp4_q8_0_generic ggml_vec_dot_nvfp4_q8_0 // repack.cpp #define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4 #define ggml_quantize_mat_q8_K_4x4_generic ggml_quantize_mat_q8_K_4x4 -#define ggml_repack_mat_f16_7x1_generic ggml_repack_mat_f16_7x1 -#define ggml_repack_mat_f32_7x1_generic ggml_repack_mat_f32_7x1 #define ggml_gemv_q4_0_4x4_q8_0_generic ggml_gemv_q4_0_4x4_q8_0 #define ggml_gemv_q4_0_4x8_q8_0_generic ggml_gemv_q4_0_4x8_q8_0 #define ggml_gemv_q4_K_8x4_q8_K_generic ggml_gemv_q4_K_8x4_q8_K @@ -140,8 +120,6 @@ #define ggml_quantize_mat_q8_0_4x8_generic ggml_quantize_mat_q8_0_4x8 #define ggml_quantize_mat_q8_K_4x4_generic ggml_quantize_mat_q8_K_4x4 #define ggml_quantize_mat_q8_K_4x8_generic ggml_quantize_mat_q8_K_4x8 -#define ggml_repack_mat_f16_7x1_generic ggml_repack_mat_f16_7x1 -#define ggml_repack_mat_f32_7x1_generic ggml_repack_mat_f32_7x1 #define ggml_gemv_q4_0_4x4_q8_0_generic ggml_gemv_q4_0_4x4_q8_0 #define ggml_gemv_q4_0_4x8_q8_0_generic ggml_gemv_q4_0_4x8_q8_0 #define ggml_gemv_q4_0_8x8_q8_0_generic ggml_gemv_q4_0_8x8_q8_0 @@ -187,8 +165,6 @@ #define ggml_quantize_mat_q8_0_4x8_generic ggml_quantize_mat_q8_0_4x8 #define ggml_quantize_mat_q8_K_4x4_generic ggml_quantize_mat_q8_K_4x4 #define ggml_quantize_mat_q8_K_4x8_generic ggml_quantize_mat_q8_K_4x8 -#define ggml_repack_mat_f16_7x1_generic ggml_repack_mat_f16_7x1 -#define ggml_repack_mat_f32_7x1_generic ggml_repack_mat_f32_7x1 #define ggml_gemv_q4_0_4x4_q8_0_generic ggml_gemv_q4_0_4x4_q8_0 #define ggml_gemv_q4_0_4x8_q8_0_generic ggml_gemv_q4_0_4x8_q8_0 #define ggml_gemv_q4_0_8x8_q8_0_generic ggml_gemv_q4_0_8x8_q8_0 @@ -281,8 +257,6 @@ #define ggml_quantize_mat_q8_0_4x8_generic ggml_quantize_mat_q8_0_4x8 #define ggml_quantize_mat_q8_K_4x4_generic ggml_quantize_mat_q8_K_4x4 #define ggml_quantize_mat_q8_K_4x8_generic ggml_quantize_mat_q8_K_4x8 -#define ggml_repack_mat_f16_7x1_generic ggml_repack_mat_f16_7x1 -#define ggml_repack_mat_f32_7x1_generic ggml_repack_mat_f32_7x1 #define ggml_gemv_q4_0_4x4_q8_0_generic ggml_gemv_q4_0_4x4_q8_0 #define ggml_gemv_q4_0_4x8_q8_0_generic ggml_gemv_q4_0_4x8_q8_0 #define ggml_gemv_q4_0_8x8_q8_0_generic ggml_gemv_q4_0_8x8_q8_0 @@ -336,8 +310,6 @@ #define ggml_quantize_mat_q8_0_4x8_generic ggml_quantize_mat_q8_0_4x8 #define ggml_quantize_mat_q8_K_4x4_generic ggml_quantize_mat_q8_K_4x4 #define ggml_quantize_mat_q8_K_4x8_generic ggml_quantize_mat_q8_K_4x8 -#define ggml_repack_mat_f16_7x1_generic ggml_repack_mat_f16_7x1 -#define ggml_repack_mat_f32_7x1_generic ggml_repack_mat_f32_7x1 #define ggml_gemv_q4_0_4x4_q8_0_generic ggml_gemv_q4_0_4x4_q8_0 #define ggml_gemv_q4_0_4x8_q8_0_generic ggml_gemv_q4_0_4x8_q8_0 #define ggml_gemv_q4_0_8x8_q8_0_generic ggml_gemv_q4_0_8x8_q8_0 diff --git a/ggml/src/ggml-cpu/arch/riscv/repack.cpp b/ggml/src/ggml-cpu/arch/riscv/repack.cpp index 6058c9b416..388076f9c0 100644 --- a/ggml/src/ggml-cpu/arch/riscv/repack.cpp +++ b/ggml/src/ggml-cpu/arch/riscv/repack.cpp @@ -665,6 +665,97 @@ void ggml_gemv_q2_K_16x1_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v __riscv_vse32_v_f32m2(s + col_tile, v_sumf, vl); } } + + +template +static inline void ggml_gemv_f16_1xM_f16(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { + GGML_UNUSED(bs); + + const int nb = n / 1; + + assert (nr == 1); + assert(n % 1 == 0); + assert(nc % ncols_interleaved == 0); + + const _Float16 * a_ptr = (const _Float16 *) vy; + for (int x = 0; x < nc / ncols_interleaved; x++) { + const block_f16 * b_ptr = (const block_f16 *) vx + (x * nb); + + // Accumulators + vfloat32m4_t sumf_0 = __riscv_vfmv_v_f_f32m4(0.0f, ncols_interleaved); + + for (int l = 0; l < nb; l++) { + vfloat16m2_t b_0 = __riscv_vle16_v_f16m2((const _Float16 *)&b_ptr[l].d[0], ncols_interleaved); + + sumf_0 = __riscv_vfwmacc_vf_f32m4(sumf_0, *(const _Float16*)(&a_ptr[l]), b_0, ncols_interleaved); + } + + __riscv_vse32_v_f32m4(&s[x * ncols_interleaved], sumf_0, ncols_interleaved); + } + + return; +} + +void ggml_gemv_f16_1x16_f16(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { + ggml_gemv_f16_1xM_f16<16>(n, s, bs, vx, vy, nr, nc); +} + +void ggml_gemv_f16_1x32_f16(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { + ggml_gemv_f16_1xM_f16<32>(n, s, bs, vx, vy, nr, nc); +} + +void ggml_gemv_f16_1x64_f16(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { + ggml_gemv_f16_1xM_f16<64>(n, s, bs, vx, vy, nr, nc); +} + +void ggml_gemv_f16_1x128_f16(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { + ggml_gemv_f16_1xM_f16<128>(n, s, bs, vx, vy, nr, nc); +} + +template +static inline void ggml_gemv_f32_1xM_f32(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { + GGML_UNUSED(bs); + + const int nb = n / 1; + + assert (nr == 1); + assert(n % 1 == 0); + assert(nc % ncols_interleaved == 0); + + const float * a_ptr = (const float *) vy; + for (int x = 0; x < nc / ncols_interleaved; x++) { + const block_f32 * b_ptr = (const block_f32 *) vx + (x * nb); + + // Accumulators + vfloat32m4_t sumf_0 = __riscv_vfmv_v_f_f32m4(0.0f, ncols_interleaved); + + for (int l = 0; l < nb; l++) { + vfloat32m4_t b_0 = __riscv_vle32_v_f32m4((const float *)&b_ptr[l].d[0], ncols_interleaved); + + sumf_0 = __riscv_vfmacc_vf_f32m4(sumf_0, *(const float*)(&a_ptr[l]), b_0, ncols_interleaved); + } + + __riscv_vse32_v_f32m4(&s[x * ncols_interleaved], sumf_0, ncols_interleaved); + } + + return; +} + +void ggml_gemv_f32_1x16_f32(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { + ggml_gemv_f32_1xM_f32<16>(n, s, bs, vx, vy, nr, nc); +} + +void ggml_gemv_f32_1x32_f32(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { + ggml_gemv_f32_1xM_f32<32>(n, s, bs, vx, vy, nr, nc); +} + +void ggml_gemv_f32_1x64_f32(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { + ggml_gemv_f32_1xM_f32<64>(n, s, bs, vx, vy, nr, nc); +} + +void ggml_gemv_f32_1x128_f32(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { + ggml_gemv_f32_1xM_f32<128>(n, s, bs, vx, vy, nr, nc); +} #endif void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { @@ -1700,125 +1791,7 @@ void ggml_gemm_q2_K_16x1_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v } } } -#endif -template -static inline void ggml_gemv_f16_1xM_f16(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { - const int nb = n / 1; - - assert (nr == 1); - assert(n % 1 == 0); - assert(nc % ncols_interleaved == 0); - - const _Float16 * a_ptr = (const _Float16 *) vy; - for (int x = 0; x < nc / ncols_interleaved; x++) { - const block_f16 * b_ptr = (const block_f16 *) vx + (x * nb); - - // Accumulators - vfloat32m4_t sumf_0 = __riscv_vfmv_v_f_f32m4(0.0f, ncols_interleaved); - - for (int l = 0; l < nb; l++) { - vfloat16m2_t b_0 = __riscv_vle16_v_f16m2((const _Float16 *)&b_ptr[l].d[0], ncols_interleaved); - - sumf_0 = __riscv_vfwmacc_vf_f32m4(sumf_0, *(const _Float16*)(&a_ptr[l]), b_0, ncols_interleaved); - } - - __riscv_vse32_v_f32m4(&s[x * ncols_interleaved], sumf_0, ncols_interleaved); - } - - return; -} - -void ggml_gemv_f16_1x16_f16(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { -#if defined __riscv_v_intrinsic - ggml_gemv_f16_1xM_f16<16>(n, s, bs, vx, vy, nr, nc); - return; -#endif - ggml_gemv_f16_1x16_f16_generic(n, s, bs, vx, vy, nr, nc); -} - -void ggml_gemv_f16_1x32_f16(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { -#if defined __riscv_v_intrinsic - ggml_gemv_f16_1xM_f16<32>(n, s, bs, vx, vy, nr, nc); - return; -#endif - ggml_gemv_f16_1x32_f16_generic(n, s, bs, vx, vy, nr, nc); -} - -void ggml_gemv_f16_1x64_f16(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { -#if defined __riscv_v_intrinsic - ggml_gemv_f16_1xM_f16<64>(n, s, bs, vx, vy, nr, nc); - return; -#endif - ggml_gemv_f16_1x64_f16_generic(n, s, bs, vx, vy, nr, nc); -} - -void ggml_gemv_f16_1x128_f16(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { -#if defined __riscv_v_intrinsic - ggml_gemv_f16_1xM_f16<128>(n, s, bs, vx, vy, nr, nc); - return; -#endif - ggml_gemv_f16_1x128_f16_generic(n, s, bs, vx, vy, nr, nc); -} - -template -static inline void ggml_gemv_f32_1xM_f32(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { - const int nb = n / 1; - - assert (nr == 1); - assert(n % 1 == 0); - assert(nc % ncols_interleaved == 0); - - const float * a_ptr = (const float *) vy; - for (int x = 0; x < nc / ncols_interleaved; x++) { - const block_f32 * b_ptr = (const block_f32 *) vx + (x * nb); - - // Accumulators - vfloat32m4_t sumf_0 = __riscv_vfmv_v_f_f32m4(0.0f, ncols_interleaved); - - for (int l = 0; l < nb; l++) { - vfloat32m4_t b_0 = __riscv_vle32_v_f32m4((const float *)&b_ptr[l].d[0], ncols_interleaved); - - sumf_0 = __riscv_vfmacc_vf_f32m4(sumf_0, *(const float*)(&a_ptr[l]), b_0, ncols_interleaved); - } - - __riscv_vse32_v_f32m4(&s[x * ncols_interleaved], sumf_0, ncols_interleaved); - } - - return; -} - -void ggml_gemv_f32_1x16_f32(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { -#if defined __riscv_v_intrinsic - ggml_gemv_f32_1xM_f32<16>(n, s, bs, vx, vy, nr, nc); - return; -#endif - ggml_gemv_f32_1x16_f32_generic(n, s, bs, vx, vy, nr, nc); -} - -void ggml_gemv_f32_1x32_f32(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { -#if defined __riscv_v_intrinsic - ggml_gemv_f32_1xM_f32<32>(n, s, bs, vx, vy, nr, nc); - return; -#endif - ggml_gemv_f32_1x32_f32_generic(n, s, bs, vx, vy, nr, nc); -} - -void ggml_gemv_f32_1x64_f32(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { -#if defined __riscv_v_intrinsic - ggml_gemv_f32_1xM_f32<64>(n, s, bs, vx, vy, nr, nc); - return; -#endif - ggml_gemv_f32_1x64_f32_generic(n, s, bs, vx, vy, nr, nc); -} - -void ggml_gemv_f32_1x128_f32(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { -#if defined __riscv_v_intrinsic - ggml_gemv_f32_1xM_f32<128>(n, s, bs, vx, vy, nr, nc); - return; -#endif - ggml_gemv_f32_1x128_f32_generic(n, s, bs, vx, vy, nr, nc); -} template static inline void ggml_gemm_f16_7x1xM_f16(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { @@ -1867,35 +1840,19 @@ static inline void ggml_gemm_f16_7x1xM_f16(int n, float * GGML_RESTRICT s, size_ } void ggml_gemm_f16_7x1x16_f16(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { -#if defined __riscv_v_intrinsic ggml_gemm_f16_7x1xM_f16<16>(n, s, bs, vx, vy, nr, nc); - return; -#endif - ggml_gemm_f16_7x1x16_f16_generic(n, s, bs, vx, vy, nr, nc); } void ggml_gemm_f16_7x1x32_f16(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { -#if defined __riscv_v_intrinsic ggml_gemm_f16_7x1xM_f16<32>(n, s, bs, vx, vy, nr, nc); - return; -#endif - ggml_gemm_f16_7x1x32_f16_generic(n, s, bs, vx, vy, nr, nc); } void ggml_gemm_f16_7x1x64_f16(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { -#if defined __riscv_v_intrinsic ggml_gemm_f16_7x1xM_f16<64>(n, s, bs, vx, vy, nr, nc); - return; -#endif - ggml_gemm_f16_7x1x64_f16_generic(n, s, bs, vx, vy, nr, nc); } void ggml_gemm_f16_7x1x128_f16(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { -#if defined __riscv_v_intrinsic ggml_gemm_f16_7x1xM_f16<128>(n, s, bs, vx, vy, nr, nc); - return; -#endif - ggml_gemm_f16_7x1x128_f16_generic(n, s, bs, vx, vy, nr, nc); } template @@ -1945,33 +1902,18 @@ static inline void ggml_gemm_f32_7x1xM_f32(int n, float * GGML_RESTRICT s, size_ } void ggml_gemm_f32_7x1x16_f32(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { -#if defined __riscv_v_intrinsic ggml_gemm_f32_7x1xM_f32<16>(n, s, bs, vx, vy, nr, nc); - return; -#endif - ggml_gemm_f32_7x1x16_f32_generic(n, s, bs, vx, vy, nr, nc); } void ggml_gemm_f32_7x1x32_f32(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { -#if defined __riscv_v_intrinsic ggml_gemm_f32_7x1xM_f32<32>(n, s, bs, vx, vy, nr, nc); - return; -#endif - ggml_gemm_f32_7x1x32_f32_generic(n, s, bs, vx, vy, nr, nc); } void ggml_gemm_f32_7x1x64_f32(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { -#if defined __riscv_v_intrinsic ggml_gemm_f32_7x1xM_f32<64>(n, s, bs, vx, vy, nr, nc); - return; -#endif - ggml_gemm_f32_7x1x64_f32_generic(n, s, bs, vx, vy, nr, nc); } void ggml_gemm_f32_7x1x128_f32(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { -#if defined __riscv_v_intrinsic ggml_gemm_f32_7x1xM_f32<128>(n, s, bs, vx, vy, nr, nc); - return; -#endif - ggml_gemm_f32_7x1x128_f32_generic(n, s, bs, vx, vy, nr, nc); } +#endif diff --git a/ggml/src/ggml-cpu/repack.cpp b/ggml/src/ggml-cpu/repack.cpp index 3f720a82ef..9a17c5cd65 100644 --- a/ggml/src/ggml-cpu/repack.cpp +++ b/ggml/src/ggml-cpu/repack.cpp @@ -164,6 +164,14 @@ void ggml_quantize_mat_q8_K_4x1_generic(const float * GGML_RESTRICT x, void * GG } } } + +void ggml_repack_mat_f16_7x1_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) { + ggml_repack_mat_f16_NxK_generic<7, 1>(x, vy, k); +} + +void ggml_repack_mat_f32_7x1_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) { + ggml_repack_mat_f32_NxK_generic<7, 1>(x, vy, k); +} #endif void ggml_quantize_mat_q8_0_4x4_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) { @@ -344,16 +352,6 @@ void ggml_quantize_mat_q8_K_4x8_generic(const float * GGML_RESTRICT x, void * GG } } -#if defined __riscv_zvfh -void ggml_repack_mat_f16_7x1_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) { - ggml_repack_mat_f16_NxK_generic<7, 1>(x, vy, k); -} - -void ggml_repack_mat_f32_7x1_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) { - ggml_repack_mat_f32_NxK_generic<7, 1>(x, vy, k); -} -#endif - } // extern "C" template @@ -384,20 +382,18 @@ template <> void ggml_repack_mat_t<4, 8, GGML_TYPE_Q8_K>(const float * GGML_REST } #if defined __riscv_zvfh -template <> void ggml_quantize_mat_t<1, GGML_TYPE_Q8_0>(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t nrow, int64_t n_per_row) { +template <> void ggml_repack_mat_t<4, 1, GGML_TYPE_Q8_0>(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t nrow, int64_t n_per_row) { assert(nrow == 4); UNUSED(nrow); ggml_quantize_mat_q8_0_4x1(x, vy, n_per_row); } -template <> void ggml_quantize_mat_t<1, GGML_TYPE_Q8_K>(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t nrow, int64_t n_per_row) { +template <> void ggml_repack_mat_t<4, 1, GGML_TYPE_Q8_K>(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t nrow, int64_t n_per_row) { assert(nrow == 4); UNUSED(nrow); ggml_quantize_mat_q8_K_4x1(x, vy, n_per_row); } -#endif -#if defined __riscv_zvfh template <> void ggml_repack_mat_t<7, 1, GGML_TYPE_F16>(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t nrow, int64_t n_per_row) { assert(nrow == 7); UNUSED(nrow); @@ -1845,9 +1841,7 @@ void ggml_gemv_q2_K_16x1_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, } } } -#endif -#if defined __riscv_zvfh void ggml_gemv_f16_1x16_f16_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { ggml_gemv_f16_KxM_f16_generic<1, 16>(n, s, bs, vx, vy, nr, nc); } @@ -2944,9 +2938,7 @@ void ggml_gemm_q2_K_16x1_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, } } } -#endif -#if defined __riscv_zvfh void ggml_gemm_f16_7x1x16_f16_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { ggml_gemm_f16_NxKxM_f16_generic<7, 1, 16>(n, s, bs, vx, vy, nr, nc); } @@ -4286,9 +4278,7 @@ template <> int repack(struct ggml_tensor * t, const void * d template <> int repack(struct ggml_tensor * t, const void * data, size_t data_size) { return repack_q2_K_to_q2_K_16_bl(t, 1, data, data_size); } -#endif -#if defined __riscv_zvfh template <> int repack(struct ggml_tensor * t, const void * data, size_t data_size) { return repack_f16_to_f16_MxK_bl<16, 1>(t, data, data_size); } @@ -4411,9 +4401,7 @@ template <> void gemv(int n, float * s, size_ template <> void gemv(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) { ggml_gemv_q2_K_16x1_q8_K(n, s, bs, vx, vy, nr, nc); } -#endif -#if defined __riscv_zvfh template <> void gemv(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) { ggml_gemv_f16_1x16_f16(n, s, bs, vx, vy, nr, nc); } @@ -4523,28 +4511,26 @@ template <> void gemm(int n, float * s, siz } #if defined __riscv_zvfh -template <> void gemm(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) { +template <> void gemm(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) { ggml_gemm_q4_0_16x1_q8_0(n, s, bs, vx, vy, nr, nc); } -template <> void gemm(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) { +template <> void gemm(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) { ggml_gemm_q4_K_16x1_q8_K(n, s, bs, vx, vy, nr, nc); } -template <> void gemm(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) { +template <> void gemm(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) { ggml_gemm_iq4_nl_16x1_q8_0(n, s, bs, vx, vy, nr, nc); } -template <> void gemm(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) { +template <> void gemm(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) { ggml_gemm_q8_0_16x1_q8_0(n, s, bs, vx, vy, nr, nc); } -template <> void gemm(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) { +template <> void gemm(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) { ggml_gemm_q2_K_16x1_q8_K(n, s, bs, vx, vy, nr, nc); } -#endif -#if defined __riscv_zvfh template <> void gemm(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) { ggml_gemm_f16_7x1x16_f16(n, s, bs, vx, vy, nr, nc); } @@ -4991,23 +4977,19 @@ static const ggml::cpu::tensor_traits * ggml_repack_get_optimal_repack_type(cons // These implement outer-product style matrix multiplication kernels with // an interleave of 1. #if defined __riscv_zvfh - static const ggml::cpu::repack::tensor_traits q4_0_16x1_q8_0; - static const ggml::cpu::repack::tensor_traits q4_K_16x1_q8_K; - static const ggml::cpu::repack::tensor_traits iq4_nl_16x1_q8_0; - static const ggml::cpu::repack::tensor_traits q8_0_16x1_q8_0; - static const ggml::cpu::repack::tensor_traits q2_K_16x1_q8_K; -#endif + static const ggml::cpu::repack::tensor_traits q4_0_16x1_q8_0; + static const ggml::cpu::repack::tensor_traits q4_K_16x1_q8_K; + static const ggml::cpu::repack::tensor_traits iq4_nl_16x1_q8_0; + static const ggml::cpu::repack::tensor_traits q8_0_16x1_q8_0; + static const ggml::cpu::repack::tensor_traits q2_K_16x1_q8_K; // instance for F16 -#if defined __riscv_zvfh static const ggml::cpu::repack::tensor_traits f16_7x16x1_f16; static const ggml::cpu::repack::tensor_traits f16_7x32x1_f16; static const ggml::cpu::repack::tensor_traits f16_7x64x1_f16; static const ggml::cpu::repack::tensor_traits f16_7x128x1_f16; -#endif // instance for F32 -#if defined __riscv_zvfh static const ggml::cpu::repack::tensor_traits f32_7x16x1_f32; static const ggml::cpu::repack::tensor_traits f32_7x32x1_f32; static const ggml::cpu::repack::tensor_traits f32_7x64x1_f32; diff --git a/ggml/src/ggml-cpu/repack.h b/ggml/src/ggml-cpu/repack.h index 16d5a2f313..fcaba7b63e 100644 --- a/ggml/src/ggml-cpu/repack.h +++ b/ggml/src/ggml-cpu/repack.h @@ -149,6 +149,7 @@ struct block_f32 { using block_f32_32x1 = block_f32<32, 1>; using block_f32_7x1 = block_f32<7, 1>; +using block_f32_4x1 = block_f32<4, 1>; #if defined(__cplusplus) extern "C" { @@ -190,6 +191,8 @@ void ggml_gemm_mxfp4_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const v void ggml_gemm_mxfp4_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); void ggml_gemm_q8_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); void ggml_gemm_q8_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); + +// RISC-V #if defined __riscv_zvfh void ggml_quantize_mat_q8_0_4x1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k); void ggml_quantize_mat_q8_K_4x1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k); @@ -203,6 +206,28 @@ void ggml_gemm_q4_K_16x1_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v void ggml_gemm_iq4_nl_16x1_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); void ggml_gemm_q8_0_16x1_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); void ggml_gemm_q2_K_16x1_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); + +// FP16 +void ggml_repack_mat_f16_7x1_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k); +void ggml_gemv_f16_1x16_f16_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); +void ggml_gemv_f16_1x32_f16_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); +void ggml_gemv_f16_1x64_f16_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); +void ggml_gemv_f16_1x128_f16_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); +void ggml_gemm_f16_7x1x16_f16_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); +void ggml_gemm_f16_7x1x32_f16_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); +void ggml_gemm_f16_7x1x64_f16_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); +void ggml_gemm_f16_7x1x128_f16_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); + +// FP32 +void ggml_repack_mat_f32_7x1_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k); +void ggml_gemv_f32_1x16_f32_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); +void ggml_gemv_f32_1x32_f32_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); +void ggml_gemv_f32_1x64_f32_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); +void ggml_gemv_f32_1x128_f32_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); +void ggml_gemm_f32_7x1x16_f32_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); +void ggml_gemm_f32_7x1x32_f32_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); +void ggml_gemm_f32_7x1x64_f32_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); +void ggml_gemm_f32_7x1x128_f32_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); #endif // Native implementations @@ -242,6 +267,8 @@ void ggml_gemm_mxfp4_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, void ggml_gemm_mxfp4_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); void ggml_gemm_q8_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); void ggml_gemm_q8_0_4x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); + +// RISC-V #if defined __riscv_zvfh void ggml_quantize_mat_q8_0_4x1_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k); void ggml_quantize_mat_q8_K_4x1_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k); @@ -255,42 +282,19 @@ void ggml_gemm_q4_K_16x1_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, void ggml_gemm_q8_0_16x1_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); void ggml_gemm_q2_K_16x1_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); void ggml_gemm_iq4_nl_16x1_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); -#endif // FP16 -void ggml_repack_mat_f16_4x1_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k); -void ggml_repack_mat_f16_7x1_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k); -void ggml_gemv_f16_1x16_f16_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); -void ggml_gemv_f16_1x32_f16_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); -void ggml_gemv_f16_1x64_f16_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); -void ggml_gemv_f16_1x128_f16_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); -void ggml_gemm_f16_4x1x32_f16_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); -void ggml_gemm_f16_7x1x16_f16_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); -void ggml_gemm_f16_7x1x32_f16_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); -void ggml_gemm_f16_7x1x64_f16_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); -void ggml_gemm_f16_7x1x128_f16_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); -void ggml_repack_mat_f16_4x1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k); void ggml_repack_mat_f16_7x1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k); void ggml_gemv_f16_1x16_f16(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); void ggml_gemv_f16_1x32_f16(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); void ggml_gemv_f16_1x64_f16(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); void ggml_gemv_f16_1x128_f16(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); -void ggml_gemm_f16_4x1x32_f16(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); void ggml_gemm_f16_7x1x16_f16(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); void ggml_gemm_f16_7x1x32_f16(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); void ggml_gemm_f16_7x1x64_f16(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); void ggml_gemm_f16_7x1x128_f16(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); // FP32 -void ggml_repack_mat_f32_7x1_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k); -void ggml_gemv_f32_1x16_f32_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); -void ggml_gemv_f32_1x32_f32_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); -void ggml_gemv_f32_1x64_f32_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); -void ggml_gemv_f32_1x128_f32_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); -void ggml_gemm_f32_7x1x16_f32_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); -void ggml_gemm_f32_7x1x32_f32_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); -void ggml_gemm_f32_7x1x64_f32_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); -void ggml_gemm_f32_7x1x128_f32_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); void ggml_repack_mat_f32_7x1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k); void ggml_gemv_f32_1x16_f32(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); void ggml_gemv_f32_1x32_f32(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); @@ -300,6 +304,7 @@ void ggml_gemm_f32_7x1x16_f32(int n, float * GGML_RESTRICT s, size_t bs, const v void ggml_gemm_f32_7x1x32_f32(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); void ggml_gemm_f32_7x1x64_f32(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); void ggml_gemm_f32_7x1x128_f32(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); +#endif #if defined(__cplusplus) } // extern "C"