diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
index c7348cc26c..97be06ef4b 100644
--- a/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
@@ -3296,12 +3296,30 @@ void ggml_cpu_fp16_to_fp32(const ggml_fp16_t * x, float * y, int64_t n) {
         __m128 y_vec = _mm_cvtph_ps(x_vec);
         _mm_storeu_ps(y + i, y_vec);
     }
-#elif defined(__riscv_zvfh)
-    for (int vl; i < n; i += vl) {
-        vl = __riscv_vsetvl_e16m1(n - i);
-        vfloat16m1_t vx = __riscv_vle16_v_f16m1((_Float16 *)&x[i], vl);
-        vfloat32m2_t vy = __riscv_vfwcvt_f_f_v_f32m2(vx, vl);
-        __riscv_vse32_v_f32m2(&y[i], vy, vl);
+#elif defined(__riscv_v_intrinsic) && defined(__riscv_zvfhmin)
+    // calculate step size
+    const int epr = __riscv_vsetvlmax_e16m4();
+    const int step = epr * 2;
+    const int np = (n & ~(step - 1));
+
+    // unroll by 2
+    for (; i < np; i += step) {
+        vfloat16m4_t ax0 = __riscv_vle16_v_f16m4((const _Float16*)x + i, epr);
+        vfloat32m8_t ay0 = __riscv_vfwcvt_f_f_v_f32m8(ax0, epr);
+        __riscv_vse32_v_f32m8(y + i, ay0, epr);
+
+        vfloat16m4_t ax1 = __riscv_vle16_v_f16m4((const _Float16*)x + i + epr, epr);
+        vfloat32m8_t ay1 = __riscv_vfwcvt_f_f_v_f32m8(ax1, epr);
+        __riscv_vse32_v_f32m8(y + i + epr, ay1, epr);
+    }
+
+    // leftovers
+    int vl;
+    for (i = np; i < n; i += vl) {
+        vl = __riscv_vsetvl_e16m4(n - i);
+        vfloat16m4_t ax0 = __riscv_vle16_v_f16m4((const _Float16*)x + i, vl);
+        vfloat32m8_t ay0 = __riscv_vfwcvt_f_f_v_f32m8(ax0, vl);
+        __riscv_vse32_v_f32m8(y + i, ay0, vl);
     }
 #endif
 
@@ -3347,6 +3365,31 @@ void ggml_cpu_bf16_to_fp32(const ggml_bf16_t * x, float * y, int64_t n) {
                                         (const __m128i *)(x + i))),
                                 16)));
     }
+#elif defined(__riscv_v_intrinsic) && defined(__riscv_zvfbfmin)
+    // calculate step size
+    const int epr = __riscv_vsetvlmax_e16m4();
+    const int step = epr * 2;
+    const int np = (n & ~(step - 1));
+
+    // unroll by 2
+    for (; i < np; i += step) {
+        vbfloat16m4_t ax0 = __riscv_vle16_v_bf16m4((const __bf16*)x + i, epr);
+        vfloat32m8_t ay0 = __riscv_vfwcvtbf16_f_f_v_f32m8(ax0, epr);
+        __riscv_vse32_v_f32m8(y + i, ay0, epr);
+
+        vbfloat16m4_t ax1 = __riscv_vle16_v_bf16m4((const __bf16*)x + i + epr, epr);
+        vfloat32m8_t ay1 = __riscv_vfwcvtbf16_f_f_v_f32m8(ax1, epr);
+        __riscv_vse32_v_f32m8(y + i + epr, ay1, epr);
+    }
+
+    // leftovers
+    int vl;
+    for (i = np; i < n; i += vl) {
+        vl = __riscv_vsetvl_e16m4(n - i);
+        vbfloat16m4_t ax0 = __riscv_vle16_v_bf16m4((const __bf16*)x + i, vl);
+        vfloat32m8_t ay0 = __riscv_vfwcvtbf16_f_f_v_f32m8(ax0, vl);
+        __riscv_vse32_v_f32m8(y + i, ay0, vl);
+    }
 #endif
     for (; i < n; i++) {
         y[i] = GGML_BF16_TO_FP32(x[i]);
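
Both new code paths follow the same structure: a main loop that runs at the maximum vector length for e16/LMUL=4 (__riscv_vsetvlmax_e16m4), unrolled by two so that two widening converts (m4 halves to m8 singles) are in flight per iteration, followed by a tail loop that lets __riscv_vsetvl_e16m4 shrink the vector length to cover the remainder without a scalar cleanup loop. Below is a minimal standalone sketch of the fp16 path, assuming a compiler with RVV 1.0 intrinsics and Zvfhmin (e.g. -march=rv64gcv_zvfhmin); the function name fp16_to_fp32_rvv is illustrative and not part of the patch.

#include <riscv_vector.h>
#include <stdint.h>

// Sketch only: mirrors the pattern added above for ggml_cpu_fp16_to_fp32.
void fp16_to_fp32_rvv(const _Float16 * x, float * y, int64_t n) {
    int64_t i = 0;

    // fixed vector length for the main loop; VLMAX is a power of two,
    // so the mask below rounds n down to a multiple of the unrolled step
    const int epr  = __riscv_vsetvlmax_e16m4();
    const int step = epr * 2;
    const int64_t np = n & ~((int64_t) step - 1);

    // main loop, unrolled by 2: two widening converts (f16 m4 -> f32 m8) per iteration
    for (; i < np; i += step) {
        vfloat16m4_t ax0 = __riscv_vle16_v_f16m4(x + i, epr);
        __riscv_vse32_v_f32m8(y + i, __riscv_vfwcvt_f_f_v_f32m8(ax0, epr), epr);

        vfloat16m4_t ax1 = __riscv_vle16_v_f16m4(x + i + epr, epr);
        __riscv_vse32_v_f32m8(y + i + epr, __riscv_vfwcvt_f_f_v_f32m8(ax1, epr), epr);
    }

    // tail: vsetvl picks the vector length for what is left, so no scalar loop is needed
    for (int vl; i < n; i += vl) {
        vl = __riscv_vsetvl_e16m4(n - i);
        vfloat16m4_t ax = __riscv_vle16_v_f16m4(x + i, vl);
        __riscv_vse32_v_f32m8(y + i, __riscv_vfwcvt_f_f_v_f32m8(ax, vl), vl);
    }
}

Compared with the removed Zvfh path (LMUL=1, one conversion per vsetvl), this keeps more elements per iteration in flight, and gating on Zvfhmin/Zvfbfmin is sufficient because those extensions provide the widening f16/bf16 -> f32 converts without requiring full half-precision arithmetic.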