diff --git a/ggml/src/ggml-cpu/arch/riscv/repack.cpp b/ggml/src/ggml-cpu/arch/riscv/repack.cpp
index 358e43f8d2..c1541d1c03 100644
--- a/ggml/src/ggml-cpu/arch/riscv/repack.cpp
+++ b/ggml/src/ggml-cpu/arch/riscv/repack.cpp
@@ -343,6 +343,8 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
 
 template <int interleave_size>
 static inline void ggml_gemv_f16_1xM_f16(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
+    GGML_UNUSED(bs);
+
     const int nb = n / 1;
 
     assert (nr == 1);
@@ -369,39 +371,41 @@ static inline void ggml_gemv_f16_1xM_f16(int n, float * GGML_RESTRICT s, size_t
 }
 
 void ggml_gemv_f16_1x16_f16(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
-#if defined __riscv_v_intrinsic
+#if defined __riscv_zvfh
     ggml_gemv_f16_1xM_f16<16>(n, s, bs, vx, vy, nr, nc);
-    return;
-#endif
+#else
     ggml_gemv_f16_1x16_f16_generic(n, s, bs, vx, vy, nr, nc);
+#endif
 }
 
 void ggml_gemv_f16_1x32_f16(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
-#if defined __riscv_v_intrinsic
+#if defined __riscv_zvfh
     ggml_gemv_f16_1xM_f16<32>(n, s, bs, vx, vy, nr, nc);
-    return;
-#endif
+#else
     ggml_gemv_f16_1x32_f16_generic(n, s, bs, vx, vy, nr, nc);
+#endif
 }
 
 void ggml_gemv_f16_1x64_f16(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
-#if defined __riscv_v_intrinsic
+#if defined __riscv_zvfh
     ggml_gemv_f16_1xM_f16<64>(n, s, bs, vx, vy, nr, nc);
-    return;
-#endif
+#else
     ggml_gemv_f16_1x64_f16_generic(n, s, bs, vx, vy, nr, nc);
+#endif
 }
 
 void ggml_gemv_f16_1x128_f16(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
-#if defined __riscv_v_intrinsic
+#if defined __riscv_zvfh
     ggml_gemv_f16_1xM_f16<128>(n, s, bs, vx, vy, nr, nc);
-    return;
-#endif
+#else
     ggml_gemv_f16_1x128_f16_generic(n, s, bs, vx, vy, nr, nc);
+#endif
 }
 
 template <int interleave_size>
 static inline void ggml_gemv_f32_1xM_f32(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
+    GGML_UNUSED(bs);
+
     const int nb = n / 1;
 
     assert (nr == 1);
@@ -428,35 +432,35 @@ static inline void ggml_gemv_f32_1xM_f32(int n, float * GGML_RESTRICT s, size_t
 }
 
 void ggml_gemv_f32_1x16_f32(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
-#if defined __riscv_v_intrinsic
+#if defined __riscv_zvfh
     ggml_gemv_f32_1xM_f32<16>(n, s, bs, vx, vy, nr, nc);
-    return;
-#endif
+#else
     ggml_gemv_f32_1x16_f32_generic(n, s, bs, vx, vy, nr, nc);
+#endif
 }
 
 void ggml_gemv_f32_1x32_f32(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
-#if defined __riscv_v_intrinsic
+#if defined __riscv_zvfh
     ggml_gemv_f32_1xM_f32<32>(n, s, bs, vx, vy, nr, nc);
-    return;
-#endif
+#else
     ggml_gemv_f32_1x32_f32_generic(n, s, bs, vx, vy, nr, nc);
+#endif
 }
 
 void ggml_gemv_f32_1x64_f32(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
-#if defined __riscv_v_intrinsic
+#if defined __riscv_zvfh
     ggml_gemv_f32_1xM_f32<64>(n, s, bs, vx, vy, nr, nc);
-    return;
-#endif
+#else
     ggml_gemv_f32_1x64_f32_generic(n, s, bs, vx, vy, nr, nc);
+#endif
 }
 
 void ggml_gemv_f32_1x128_f32(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
-#if defined __riscv_v_intrinsic
+#if defined __riscv_zvfh
     ggml_gemv_f32_1xM_f32<128>(n, s, bs, vx, vy, nr, nc);
-    return;
-#endif
+#else
     ggml_gemv_f32_1x128_f32_generic(n, s, bs, vx, vy, nr, nc);
+#endif
 }
 
 template <int interleave_size>
@@ -506,35 +510,35 @@ static inline void ggml_gemm_f16_7x1xM_f16(int n, float * GGML_RESTRICT s, size_
 }
 
 void ggml_gemm_f16_7x1x16_f16(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
-#if defined __riscv_v_intrinsic
+#if defined __riscv_zvfh
     ggml_gemm_f16_7x1xM_f16<16>(n, s, bs, vx, vy, nr, nc);
-    return;
-#endif
+#else
     ggml_gemm_f16_7x1x16_f16_generic(n, s, bs, vx, vy, nr, nc);
+#endif
 }
 
 void ggml_gemm_f16_7x1x32_f16(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
-#if defined __riscv_v_intrinsic
+#if defined __riscv_zvfh
     ggml_gemm_f16_7x1xM_f16<32>(n, s, bs, vx, vy, nr, nc);
-    return;
-#endif
+#else
     ggml_gemm_f16_7x1x32_f16_generic(n, s, bs, vx, vy, nr, nc);
+#endif
 }
 
 void ggml_gemm_f16_7x1x64_f16(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
-#if defined __riscv_v_intrinsic
+#if defined __riscv_zvfh
     ggml_gemm_f16_7x1xM_f16<64>(n, s, bs, vx, vy, nr, nc);
-    return;
-#endif
+#else
     ggml_gemm_f16_7x1x64_f16_generic(n, s, bs, vx, vy, nr, nc);
+#endif
 }
 
 void ggml_gemm_f16_7x1x128_f16(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
-#if defined __riscv_v_intrinsic
+#if defined __riscv_zvfh
     ggml_gemm_f16_7x1xM_f16<128>(n, s, bs, vx, vy, nr, nc);
-    return;
-#endif
+#else
     ggml_gemm_f16_7x1x128_f16_generic(n, s, bs, vx, vy, nr, nc);
+#endif
 }
 
 template <int interleave_size>
@@ -584,33 +588,33 @@ static inline void ggml_gemm_f32_7x1xM_f32(int n, float * GGML_RESTRICT s, size_
 }
 
 void ggml_gemm_f32_7x1x16_f32(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
-#if defined __riscv_v_intrinsic
+#if defined __riscv_zvfh
     ggml_gemm_f32_7x1xM_f32<16>(n, s, bs, vx, vy, nr, nc);
-    return;
-#endif
+#else
     ggml_gemm_f32_7x1x16_f32_generic(n, s, bs, vx, vy, nr, nc);
+#endif
 }
 
 void ggml_gemm_f32_7x1x32_f32(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
-#if defined __riscv_v_intrinsic
+#if defined __riscv_zvfh
     ggml_gemm_f32_7x1xM_f32<32>(n, s, bs, vx, vy, nr, nc);
-    return;
-#endif
+#else
     ggml_gemm_f32_7x1x32_f32_generic(n, s, bs, vx, vy, nr, nc);
+#endif
 }
 
 void ggml_gemm_f32_7x1x64_f32(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
-#if defined __riscv_v_intrinsic
+#if defined __riscv_zvfh
     ggml_gemm_f32_7x1xM_f32<64>(n, s, bs, vx, vy, nr, nc);
-    return;
-#endif
+#else
     ggml_gemm_f32_7x1x64_f32_generic(n, s, bs, vx, vy, nr, nc);
+#endif
 }
 
 void ggml_gemm_f32_7x1x128_f32(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
-#if defined __riscv_v_intrinsic
+#if defined __riscv_zvfh
     ggml_gemm_f32_7x1xM_f32<128>(n, s, bs, vx, vy, nr, nc);
-    return;
-#endif
+#else
     ggml_gemm_f32_7x1x128_f32_generic(n, s, bs, vx, vy, nr, nc);
+#endif
 }
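The eight f16 and eight f32 wrapper families above all follow the same shape, so a minimal, self-contained sketch of the dispatch pattern may help; `my_kernel*` are hypothetical placeholders, not ggml entry points. The old form called the RVV kernel and `return`ed before `#endif`, which left the generic call in the translation unit as unreachable code on vector builds; the new `#if`/`#else`/`#endif` compiles exactly one path. The guard also tightens from `__riscv_v_intrinsic` to `__riscv_zvfh`, since these kernels rely on vector FP16 support rather than base RVV alone.

```cpp
// Hypothetical sketch of the wrapper pattern above; not ggml code.
#include <cstdio>

static void my_kernel_generic(const float * x, float * y, int n) {
    // portable scalar fallback
    for (int i = 0; i < n; i++) {
        y[i] = 2.0f * x[i];
    }
}

#if defined __riscv_zvfh
static void my_kernel_rvv(const float * x, float * y, int n) {
    // stand-in body: the real kernels use RVV Zvfh intrinsics here
    for (int i = 0; i < n; i++) {
        y[i] = 2.0f * x[i];
    }
}
#endif

static void my_kernel(const float * x, float * y, int n) {
#if defined __riscv_zvfh
    my_kernel_rvv(x, y, n);      // the only path compiled when Zvfh is available
#else
    my_kernel_generic(x, y, n);  // the only path compiled otherwise
#endif
}

int main() {
    float x[4] = {1.0f, 2.0f, 3.0f, 4.0f};
    float y[4];
    my_kernel(x, y, 4);
    printf("%g %g %g %g\n", y[0], y[1], y[2], y[3]);
    return 0;
}
```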
diff --git a/ggml/src/ggml-cpu/repack.cpp b/ggml/src/ggml-cpu/repack.cpp
index cee953d23b..a40f2ce801 100644
--- a/ggml/src/ggml-cpu/repack.cpp
+++ b/ggml/src/ggml-cpu/repack.cpp
@@ -31,7 +31,7 @@ static inline int nearest_int(float fval) {
     return (i & 0x007fffff) - 0x00400000;
 }
 
-// Helper template functions for `fp16` and `fp32`.
+// Helper functions for `fp16` and `fp32`.
 
 template <int N, int K>
 static inline void ggml_repack_mat_f16_NxK_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
@@ -262,6 +262,7 @@ void ggml_quantize_mat_q8_K_4x8_generic(const float * GGML_RESTRICT x, void * GG
     }
 }
 
+#if defined __riscv_zvfh
 void ggml_repack_mat_f16_7x1_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
     ggml_repack_mat_f16_NxK_generic<7, 1>(x, vy, k);
 }
@@ -269,6 +270,7 @@ void ggml_repack_mat_f16_7x1_generic(const float * GGML_RESTRICT x, void * GGML_
 void ggml_repack_mat_f32_7x1_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
     ggml_repack_mat_f32_NxK_generic<7, 1>(x, vy, k);
 }
+#endif
 
 } // extern "C"
@@ -299,6 +301,7 @@ template <> void ggml_repack_mat_t<4, 8, GGML_TYPE_Q8_K>(const float * GGML_REST
     ggml_quantize_mat_q8_K_4x8(x, vy, n_per_row);
 }
 
+#if defined __riscv_zvfh
 template <> void ggml_repack_mat_t<7, 1, GGML_TYPE_F16>(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t nrow, int64_t n_per_row) {
     assert(nrow == 7);
     UNUSED(nrow);
@@ -310,6 +313,7 @@ template <> void ggml_repack_mat_t<7, 1, GGML_TYPE_F32>(const float * GGML_RESTR
     UNUSED(nrow);
     ggml_repack_mat_f32_7x1(x, vy, n_per_row);
 }
+#endif
 
 template <int ncols_interleaved, int interleave_size>
 static inline void ggml_gemv_f16_KxM_f16_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
@@ -333,7 +337,7 @@ static inline void ggml_gemv_f16_KxM_f16_generic(int n, float * GGML_RESTRICT s,
     for (int l = 0; l < nb; l++) {
         for (int j = 0; j < ncols_interleaved; j++) {
             for (int k = 0; k < interleave_size; k++) {
-                sumf[j] += GGML_FP16_TO_FP32(b_ptr[l].d[j * interleave_size + k]) * GGML_FP16_TO_FP32(a_ptr[l + k]);
+                sumf[j] += GGML_FP16_TO_FP32(b_ptr[l].d[j * interleave_size + k]) * GGML_FP16_TO_FP32(a_ptr[l * interleave_size + k]);
             }
         }
     }
@@ -363,7 +367,7 @@ static inline void ggml_gemv_f32_KxM_f32_generic(int n, float * GGML_RESTRICT s,
     for (int l = 0; l < nb; l++) {
         for (int j = 0; j < ncols_interleaved; j++) {
             for (int k = 0; k < interleave_size; k++) {
-                sumf[j] += b_ptr[l].d[j * interleave_size + k] * a_ptr[l + k];
+                sumf[j] += b_ptr[l].d[j * interleave_size + k] * a_ptr[l * interleave_size + k];
             }
         }
     }
@@ -375,7 +379,7 @@
 template <int nrows, int ncols_interleaved, int interleave_size>
 static inline void ggml_gemm_f16_NxKxM_f16_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
     const int nb = n / interleave_size;
 
-    assert (nr % nrows == 0);
+    assert(nr % nrows == 0);
     assert(n % interleave_size == 0);
     assert(nc % ncols_interleaved == 0);
@@ -395,7 +399,7 @@ static inline void ggml_gemm_f16_NxKxM_f16_generic(int n, float * GGML_RESTRICT
             for (int m = 0; m < nrows; m++) {
                 for (int j = 0; j < ncols_interleaved; j++) {
                     for (int k = 0; k < interleave_size; k++) {
-                        sumf[m][j] += b_ptr[l].d[j * interleave_size + k] * a_ptr[l].d[m * interleave_size + k];
+                        sumf[m][j] += GGML_FP16_TO_FP32(b_ptr[l].d[j * interleave_size + k]) * GGML_FP16_TO_FP32(a_ptr[l].d[m * interleave_size + k]);
                     }
                 }
             }
@@ -412,7 +416,7 @@
 template <int nrows, int ncols_interleaved, int interleave_size>
 static inline void ggml_gemm_f32_NxKxM_f32_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
     const int nb = n / interleave_size;
 
-    assert (nr % nrows == 0);
+    assert(nr % nrows == 0);
     assert(n % interleave_size == 0);
     assert(nc % ncols_interleaved == 0);
@@ -1182,53 +1186,23 @@ void ggml_gemv_q8_0_4x4_q8_0_generic(int n,
     }
 }
 
 void ggml_gemv_q8_0_4x8_q8_0_generic(int n,
                                      float * GGML_RESTRICT s,
                                      size_t bs,
                                      const void * GGML_RESTRICT vx,
                                      const void * GGML_RESTRICT vy,
                                      int nr,
                                      int nc) {
     const int qk = QK8_0;
     const int nb = n / qk;
     const int ncols_interleaved = 4;
     const int blocklen = 8;
 
     assert(nr == 1);
     assert(n % qk == 0);
     assert(nc % ncols_interleaved == 0);
 
     UNUSED(bs);
     UNUSED(nr);
 
     float sumf[4];
     int sumi;
 
     const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
     for (int x = 0; x < nc / ncols_interleaved; x++) {
         const block_q8_0x4 * b_ptr = (const block_q8_0x4 *) vx + (x * nb);
 
         for (int j = 0; j < ncols_interleaved; j++) {
             sumf[j] = 0.0;
         }
         for (int l = 0; l < nb; l++) {
             for (int k = 0; k < (qk / blocklen); k++) {
                 for (int j = 0; j < ncols_interleaved; j++) {
                     sumi = 0;
                     for (int i = 0; i < blocklen; ++i) {
                         const int v0 = b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i];
                         sumi += v0 * a_ptr[l].qs[k * blocklen + i];
                     }
                     sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
                 }
             }
         }
         for (int j = 0; j < ncols_interleaved; j++) {
             s[x * ncols_interleaved + j] = sumf[j];
         }
     }
 }
 
+#if defined __riscv_zvfh
+void ggml_gemv_f32_1x16_f32_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
+    ggml_gemv_f32_KxM_f32_generic<1, 16>(n, s, bs, vx, vy, nr, nc);
+}
+
+void ggml_gemv_f32_1x32_f32_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
+    ggml_gemv_f32_KxM_f32_generic<1, 32>(n, s, bs, vx, vy, nr, nc);
+}
+
+void ggml_gemv_f32_1x64_f32_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
+    ggml_gemv_f32_KxM_f32_generic<1, 64>(n, s, bs, vx, vy, nr, nc);
+}
+
+void ggml_gemv_f32_1x128_f32_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
+    ggml_gemv_f32_KxM_f32_generic<1, 128>(n, s, bs, vx, vy, nr, nc);
+}
+#endif
+
 void ggml_gemm_q4_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
     const int qk = QK8_0;
     const int nb = n / qk;
@@ -1955,6 +1929,7 @@ void ggml_gemm_iq4_nl_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs
     }
 }
 
+#if defined __riscv_zvfh
 void ggml_gemm_f16_7x1x16_f16_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
     ggml_gemm_f16_NxKxM_f16_generic<7, 1, 16>(n, s, bs, vx, vy, nr, nc);
 }
@@ -1986,6 +1961,7 @@ void ggml_gemm_f32_7x1x64_f32_generic(int n, float * GGML_RESTRICT s, size_t bs,
 void ggml_gemm_f32_7x1x128_f32_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
     ggml_gemm_f32_NxKxM_f32_generic<7, 1, 128>(n, s, bs, vx, vy, nr, nc);
 }
+#endif
 
 } // extern "C"
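The main functional fix in the hunks above is the activation index in the generic gemv kernels: weight block `l` is matched by `interleave_size` consecutive activations, so the load must be `a_ptr[l * interleave_size + k]`; the old `a_ptr[l + k]` re-read overlapping activations. A toy check of the corrected loop, with simplified types (plain `float` blocks instead of the ggml block structs; all names here are illustrative):

```cpp
// Toy model of the corrected generic gemv indexing; simplified, not ggml code.
#include <cassert>
#include <cstdio>

constexpr int ncols_interleaved = 1;
constexpr int interleave_size   = 4;

struct block_f32x { float d[ncols_interleaved * interleave_size]; };

static void gemv_generic(int n, float * s, const block_f32x * b, const float * a, int nc) {
    const int nb = n / interleave_size;
    for (int x = 0; x < nc / ncols_interleaved; x++) {
        const block_f32x * b_ptr = b + x * nb;
        float sumf[ncols_interleaved] = {0.0f};
        for (int l = 0; l < nb; l++) {
            for (int j = 0; j < ncols_interleaved; j++) {
                for (int k = 0; k < interleave_size; k++) {
                    // block l pairs with activations [l*interleave_size, (l+1)*interleave_size)
                    sumf[j] += b_ptr[l].d[j * interleave_size + k] * a[l * interleave_size + k];
                }
            }
        }
        for (int j = 0; j < ncols_interleaved; j++) {
            s[x * ncols_interleaved + j] = sumf[j];
        }
    }
}

int main() {
    const int n = 8, nc = 2;  // two output rows, eight weights each
    float a[n], w[nc][n], ref[nc] = {0.0f, 0.0f}, s[nc];
    block_f32x b[nc * (n / interleave_size)];
    for (int i = 0; i < n; i++) {
        a[i] = 0.5f * i;
    }
    for (int r = 0; r < nc; r++) {
        for (int i = 0; i < n; i++) {
            w[r][i] = r + 0.1f * i;
            ref[r] += w[r][i] * a[i];  // naive dot product as reference
            b[r * (n / interleave_size) + i / interleave_size].d[i % interleave_size] = w[r][i];
        }
    }
    gemv_generic(n, s, b, a, nc);
    for (int r = 0; r < nc; r++) {
        assert(s[r] == ref[r]);  // same operations in the same order
    }
    printf("ok: %g %g\n", s[0], s[1]);
    return 0;
}
```

With the old `a[l + k]` index the second block would have consumed activations 1..4 instead of 4..7, an error that only shows up once a row spans more than one block.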
@@ -2670,14 +2646,14 @@ static int repack_iq4_nl_to_iq4_nl_8_bl(struct ggml_tensor * t, int interleave_b
     GGML_UNUSED(data_size);
 }
 
-template <int interleave_size, int nrows_interleaved>
-static int repack_f16_to_f16_N_bl(struct ggml_tensor * t, const void * GGML_RESTRICT data, size_t data_size) {
+template <int interleave_size, int ncols_interleaved>
+static int repack_f16_to_f16_MxK_bl(struct ggml_tensor * t, const void * GGML_RESTRICT data, size_t data_size) {
     GGML_ASSERT(t->type == GGML_TYPE_F16);
 
     const ggml_half * src = (const ggml_half *)data;
-    block_f16<nrows_interleaved * interleave_size> * dst = ( block_f16<nrows_interleaved * interleave_size> *)t->data;
+    block_f16<ncols_interleaved * interleave_size> * dst = ( block_f16<ncols_interleaved * interleave_size> *)t->data;
 
-    ggml_half dst_tmp[nrows_interleaved * interleave_size];
+    ggml_half dst_tmp[ncols_interleaved * interleave_size];
 
     int nrow = ggml_nrows(t);
     int row_size = t->ne[0];
@@ -2685,19 +2661,19 @@ static int repack_f16_to_f16_N_bl(struct ggml_tensor * t, const void * GGML_REST
     GGML_ASSERT(data_size == nrow * nblocks * interleave_size * sizeof(ggml_half));
 
-    if (t->ne[1] % nrows_interleaved != 0 || t->ne[0] % interleave_size != 0) {
+    if (t->ne[1] % ncols_interleaved != 0 || t->ne[0] % interleave_size != 0) {
         return -1;
     }
 
-    for (int b = 0; b < nrow; b += nrows_interleaved) {
+    for (int b = 0; b < nrow; b += ncols_interleaved) {
         for (int i = 0; i < nblocks; i++) {
-            for (int j = 0; j < nrows_interleaved; j++) {
+            for (int j = 0; j < ncols_interleaved; j++) {
                 for (int k = 0; k < interleave_size; k++) {
                     dst_tmp[j * interleave_size + k] = src[(j + b) * row_size + i * interleave_size + k];
                 }
             }
-            block_f16<nrows_interleaved * interleave_size> out;
-            memcpy(&out.d, dst_tmp, sizeof(ggml_half) * nrows_interleaved * interleave_size);
+            block_f16<ncols_interleaved * interleave_size> out;
+            memcpy(&out.d, dst_tmp, sizeof(ggml_half) * ncols_interleaved * interleave_size);
             *dst = out;
             dst++;
         }
@@ -2706,14 +2682,14 @@ static int repack_f16_to_f16_N_bl(struct ggml_tensor * t, const void * GGML_REST
     return 0;
 }
 
-template <int interleave_size, int nrows_interleaved>
-static int repack_f32_to_f32_N_bl(struct ggml_tensor * t, const void * GGML_RESTRICT data, size_t data_size) {
+template <int interleave_size, int ncols_interleaved>
+static int repack_f32_to_f32_MxK_bl(struct ggml_tensor * t, const void * GGML_RESTRICT data, size_t data_size) {
     GGML_ASSERT(t->type == GGML_TYPE_F32);
 
     const float * src = (const float *)data;
-    block_f32<nrows_interleaved * interleave_size> * dst = ( block_f32<nrows_interleaved * interleave_size> *)t->data;
+    block_f32<ncols_interleaved * interleave_size> * dst = ( block_f32<ncols_interleaved * interleave_size> *)t->data;
 
-    float dst_tmp[nrows_interleaved * interleave_size];
+    float dst_tmp[ncols_interleaved * interleave_size];
 
     int nrow = ggml_nrows(t);
     int row_size = t->ne[0];
@@ -2721,19 +2697,19 @@ static int repack_f32_to_f32_N_bl(struct ggml_tensor * t, const void * GGML_REST
     GGML_ASSERT(data_size == nrow * nblocks * interleave_size * sizeof(float));
 
-    if (t->ne[1] % nrows_interleaved != 0 || t->ne[0] % interleave_size != 0) {
+    if (t->ne[1] % ncols_interleaved != 0 || t->ne[0] % interleave_size != 0) {
         return -1;
     }
 
-    for (int b = 0; b < nrow; b += nrows_interleaved) {
+    for (int b = 0; b < nrow; b += ncols_interleaved) {
         for (int i = 0; i < nblocks; i++) {
-            for (int j = 0; j < nrows_interleaved; j++) {
+            for (int j = 0; j < ncols_interleaved; j++) {
                 for (int k = 0; k < interleave_size; k++) {
                     dst_tmp[j * interleave_size + k] = src[(j + b) * row_size + i * interleave_size + k];
                 }
             }
-            block_f32<nrows_interleaved * interleave_size> out;
-            memcpy(&out.d, dst_tmp, sizeof(float) * nrows_interleaved * interleave_size);
+            block_f32<ncols_interleaved * interleave_size> out;
+            memcpy(&out.d, dst_tmp, sizeof(float) * ncols_interleaved * interleave_size);
             *dst = out;
             dst++;
         }
@@ -2793,31 +2769,33 @@ template <> int repack<block_iq4_nlx8, 8, 8>(struct ggml_tensor * t, const void
     return repack_iq4_nl_to_iq4_nl_8_bl(t, 8, data, data_size);
 }
 
+#if defined __riscv_zvfh
 template <> int repack<block_f16<16>, 16, 1>(struct ggml_tensor * t, const void * data, size_t data_size) {
-    return repack_f16_to_f16_N_bl<16, 1>(t, data, data_size);
+    return repack_f16_to_f16_MxK_bl<16, 1>(t, data, data_size);
 }
 
 template <> int repack<block_f16<32>, 32, 1>(struct ggml_tensor * t, const void * data, size_t data_size) {
-    return repack_f16_to_f16_N_bl<32, 1>(t, data, data_size);
+    return repack_f16_to_f16_MxK_bl<32, 1>(t, data, data_size);
 }
 
 template <> int repack<block_f16<64>, 64, 1>(struct ggml_tensor * t, const void * data, size_t data_size) {
-    return repack_f16_to_f16_N_bl<64, 1>(t, data, data_size);
+    return repack_f16_to_f16_MxK_bl<64, 1>(t, data, data_size);
 }
 
 template <> int repack<block_f16<128>, 128, 1>(struct ggml_tensor * t, const void * data, size_t data_size) {
-    return repack_f16_to_f16_N_bl<128, 1>(t, data, data_size);
+    return repack_f16_to_f16_MxK_bl<128, 1>(t, data, data_size);
 }
 
 template <> int repack<block_f32<16>, 16, 1>(struct ggml_tensor * t, const void * data, size_t data_size) {
-    return repack_f32_to_f32_N_bl<16, 1>(t, data, data_size);
+    return repack_f32_to_f32_MxK_bl<16, 1>(t, data, data_size);
 }
 
 template <> int repack<block_f32<32>, 32, 1>(struct ggml_tensor * t, const void * data, size_t data_size) {
-    return repack_f32_to_f32_N_bl<32, 1>(t, data, data_size);
+    return repack_f32_to_f32_MxK_bl<32, 1>(t, data, data_size);
 }
 
 template <> int repack<block_f32<64>, 64, 1>(struct ggml_tensor * t, const void * data, size_t data_size) {
-    return repack_f32_to_f32_N_bl<64, 1>(t, data, data_size);
+    return repack_f32_to_f32_MxK_bl<64, 1>(t, data, data_size);
 }
 
 template <> int repack<block_f32<128>, 128, 1>(struct ggml_tensor * t, const void * data, size_t data_size) {
-    return repack_f32_to_f32_N_bl<128, 1>(t, data, data_size);
+    return repack_f32_to_f32_MxK_bl<128, 1>(t, data, data_size);
 }
+#endif
 
 // gemv
 template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PARAM_TYPE>
@@ -2870,6 +2848,7 @@ template <> void gemv<block_iq4_nlx8, 8, 8, GGML_TYPE_Q8_0>(int n, float * s, size
     ggml_gemv_iq4_nl_8x8_q8_0(n, s, bs, vx, vy, nr, nc);
 }
 
+#if defined __riscv_zvfh
 template <> void gemv<block_f16<16>, 16, 1, GGML_TYPE_F16>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
     ggml_gemv_f16_1x16_f16(n, s, bs, vx, vy, nr, nc);
 }
@@ -2901,6 +2880,7 @@ template <> void gemv<block_f32<64>, 64, 1, GGML_TYPE_F32>(int n, float * s, size_t bs,
 template <> void gemv<block_f32<128>, 128, 1, GGML_TYPE_F32>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
     ggml_gemv_f32_1x128_f32(n, s, bs, vx, vy, nr, nc);
 }
+#endif
 
 // gemm
 template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PARAM_TYPE>
@@ -2953,6 +2933,7 @@ template <> void gemm<block_iq4_nlx8, 8, 8, GGML_TYPE_Q8_0>(int n, float * s, s
     ggml_gemm_iq4_nl_8x8_q8_0(n, s, bs, vx, vy, nr, nc);
 }
 
+#if defined __riscv_zvfh
 template <> void gemm<block_f16<16>, 16, 1, GGML_TYPE_F16>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
     ggml_gemm_f16_7x1x16_f16(n, s, bs, vx, vy, nr, nc);
 }
@@ -2984,6 +2965,7 @@ template <> void gemm<block_f32<64>, 64, 1, GGML_TYPE_F32>(int n, float * s, size_t b
 template <> void gemm<block_f32<128>, 128, 1, GGML_TYPE_F32>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
     ggml_gemm_f32_7x1x128_f32(n, s, bs, vx, vy, nr, nc);
 }
+#endif
 
 class tensor_traits_base : public ggml::cpu::tensor_traits {
   public:
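For reference, a toy model of what the renamed `repack_*_MxK_bl` loop does, using a simplified f32 block type. The reading of the parameters (each block packs `ncols_interleaved` rows in `interleave_size`-wide runs) is inferred from the loop bounds in the hunk above, not from upstream documentation:

```cpp
// Toy model of the repack loop above; simplified block type, not ggml code.
#include <cassert>
#include <cstdio>

template <int interleave_size, int ncols_interleaved>
struct block_f32x { float d[ncols_interleaved * interleave_size]; };

template <int interleave_size, int ncols_interleaved>
static int repack_f32_MxK(const float * src, block_f32x<interleave_size, ncols_interleaved> * dst,
                          int nrow, int row_size) {
    // same divisibility checks as the patch
    if (nrow % ncols_interleaved != 0 || row_size % interleave_size != 0) {
        return -1;
    }
    const int nblocks = row_size / interleave_size;
    for (int b = 0; b < nrow; b += ncols_interleaved) {
        for (int i = 0; i < nblocks; i++) {
            for (int j = 0; j < ncols_interleaved; j++) {
                for (int k = 0; k < interleave_size; k++) {
                    dst->d[j * interleave_size + k] = src[(j + b) * row_size + i * interleave_size + k];
                }
            }
            dst++;
        }
    }
    return 0;
}

int main() {
    float src[2 * 8];
    for (int i = 0; i < 16; i++) {
        src[i] = (float) i;
    }
    block_f32x<4, 1> dst[4];
    assert((repack_f32_MxK<4, 1>(src, dst, 2, 8)) == 0);
    // With ncols_interleaved == 1 each block is a contiguous 4-wide run of one row,
    // i.e. the repack retiles rows into fixed-size blocks for aligned vector loads.
    assert(dst[1].d[0] == 4.0f && dst[2].d[0] == 8.0f);
    printf("block 1 starts at %g, block 2 starts at %g\n", dst[1].d[0], dst[2].d[0]);
    return 0;
}
```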
@@ -3072,7 +3054,7 @@ template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PA
 
-        // If there are more than three rows in src1, use gemm; otherwise, use gemv.
+        // If there are at least `NB_ROWS` rows in src1, use gemm; otherwise, use gemv.
         if (nrows > (NB_ROWS - 1)) {
             gemm<BLOC_TYPE, INTER_SIZE, NB_COLS, PARAM_TYPE>(ne00, (float *) (dst_ptr) + src0_start, nb1 / nb0,
                 src0_ptr + src0_start * nb01, src1_ptr,
diff --git a/ggml/src/ggml-cpu/repack.h b/ggml/src/ggml-cpu/repack.h
index b2044ff7b5..1badee46a4 100644
--- a/ggml/src/ggml-cpu/repack.h
+++ b/ggml/src/ggml-cpu/repack.h
@@ -177,25 +177,22 @@ void ggml_gemv_q8_0_4x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs,
 void ggml_gemm_q8_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemm_q8_0_4x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 
+#ifdef __riscv_zvfh
 // FP16
-void ggml_repack_mat_f16_4x1_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k);
 void ggml_repack_mat_f16_7x1_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k);
 void ggml_gemv_f16_1x16_f16_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemv_f16_1x32_f16_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemv_f16_1x64_f16_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemv_f16_1x128_f16_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
-void ggml_gemm_f16_4x1x32_f16_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemm_f16_7x1x16_f16_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemm_f16_7x1x32_f16_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemm_f16_7x1x64_f16_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemm_f16_7x1x128_f16_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 
-void ggml_repack_mat_f16_4x1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k);
 void ggml_repack_mat_f16_7x1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k);
 void ggml_gemv_f16_1x16_f16(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemv_f16_1x32_f16(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemv_f16_1x64_f16(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemv_f16_1x128_f16(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
-void ggml_gemm_f16_4x1x32_f16(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemm_f16_7x1x16_f16(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemm_f16_7x1x32_f16(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemm_f16_7x1x64_f16(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
@@ -220,6 +217,7 @@ void ggml_gemm_f32_7x1x16_f32(int n, float * GGML_RESTRICT s, size_t bs, const v
 void ggml_gemm_f32_7x1x32_f32(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemm_f32_7x1x64_f32(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemm_f32_7x1x128_f32(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
+#endif
 
 #if defined(__cplusplus)
 }  // extern "C"
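One consequence of the reworded comment in the mul_mat dispatch: `nrows > (NB_ROWS - 1)` is an at-least test, so batches with `NB_ROWS` or more src1 rows take the gemm path and anything smaller falls back to per-row gemv. A hedged sketch of that split, assuming `NB_ROWS == 7` to match the `7x1xM` kernel names above (the function below is a placeholder, not the ggml dispatcher):

```cpp
// Sketch of the gemm/gemv batch-size split; NB_ROWS == 7 is an assumption here.
#include <cstdio>
#include <initializer_list>

constexpr int NB_ROWS = 7;

static const char * pick_path(int nrows) {
    if (nrows > (NB_ROWS - 1)) {  // i.e. nrows >= NB_ROWS
        return "gemm (multi-row tiles)";
    }
    return "gemv (one row at a time)";
}

int main() {
    for (int nrows : {1, 6, 7, 8, 56}) {
        printf("%2d src1 rows -> %s\n", nrows, pick_path(nrows));
    }
    return 0;
}
```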