ggml-cpu: add rvv ggml_quantize_mat_4x8 for q8_0
Co-authored-by: Rehan Qasim <rehan.qasim@10xengineers.ai>
This commit is contained in:
parent
c0204a0893
commit
4febe1b725
|
|
@ -24,6 +24,94 @@
|
|||
|
||||
#define UNUSED GGML_UNUSED
|
||||
|
||||
// Quantize four rows of floats to Q8_0 and store them interleaved in
// 8-byte chunks (the "4x8" layout) as an array of block_q8_0x4.
//
//   x  - source floats; row r of the group starts at x + r * k
//   vy - destination, written as block_q8_0x4[k / QK8_0]
//   k  - number of floats per row; must be a multiple of QK8_0 (32)
//
// For every 32-float block of every row the scale is d = max|x| / 127 and the
// quants are x / d rounded to nearest with ties away from zero.
void ggml_quantize_mat_q8_0_4x8(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
    assert(QK8_0 == 32);
    assert(k % QK8_0 == 0);
    const int nb = k / QK8_0;

#if defined(__riscv_v_intrinsic)
    block_q8_0x4 * GGML_RESTRICT y = (block_q8_0x4 *) vy;

    // vl_calc: one whole 32-float block in a single e32/m8 register group.
    // vl_save: the 32 int8 quants of one row viewed as 4 x int64.
    const size_t vl_calc = __riscv_vsetvl_e32m8(QK8_0);
    const size_t vl_save = __riscv_vsetvl_e64m2(4);

    // Seed for the max-reduction: |x| >= 0, so 0 is the neutral start value.
    vfloat32m1_t v_scalar_zero = __riscv_vfmv_s_f_f32m1(0.0f, __riscv_vsetvl_e32m1(1));

    for (int i = 0; i < nb; i++) {
        const float * x_block_base = x + i * QK8_0;
        vint8m2_t q_r0, q_r1, q_r2, q_r3;

        // Row 0
        {
            vfloat32m8_t v_src = __riscv_vle32_v_f32m8(x_block_base + 0 * k, vl_calc);
            vfloat32m8_t v_abs = __riscv_vfabs_v_f32m8(v_src, vl_calc);
            vfloat32m1_t v_max = __riscv_vfredmax_vs_f32m8_f32m1(v_abs, v_scalar_zero, vl_calc);
            float amax = __riscv_vfmv_f_s_f32m1_f32(v_max);

            float d = amax / 127.0f;
            y[i].d[0] = GGML_CPU_FP32_TO_FP16(d);

            // An all-zero block has d == 0; use id = 0 so every quant becomes 0.
            float id = d ? 1.0f / d : 0.0f;

            vfloat32m8_t v_scaled = __riscv_vfmul_vf_f32m8(v_src, id, vl_calc);
            // f32 -> i16 with RMM (round to nearest, ties to max magnitude), then i16 -> i8.
            vint16m4_t v_i16 = __riscv_vfncvt_x_f_w_i16m4_rm(v_scaled, __RISCV_FRM_RMM, vl_calc);
            q_r0 = __riscv_vncvt_x_x_w_i8m2(v_i16, vl_calc);
        }
        // Compiler-only barrier (emits no instructions). Presumably keeps the
        // compiler from overlapping the m8-wide row computations - TODO confirm.
        asm volatile ("" ::: "memory");

        // Row 1
        {
            vfloat32m8_t v_src = __riscv_vle32_v_f32m8(x_block_base + 1 * k, vl_calc);
            vfloat32m8_t v_abs = __riscv_vfabs_v_f32m8(v_src, vl_calc);
            vfloat32m1_t v_max = __riscv_vfredmax_vs_f32m8_f32m1(v_abs, v_scalar_zero, vl_calc);
            float amax = __riscv_vfmv_f_s_f32m1_f32(v_max);

            float d = amax / 127.0f;
            y[i].d[1] = GGML_CPU_FP32_TO_FP16(d);
            float id = d ? 1.0f / d : 0.0f;

            vfloat32m8_t v_scaled = __riscv_vfmul_vf_f32m8(v_src, id, vl_calc);
            vint16m4_t v_i16 = __riscv_vfncvt_x_f_w_i16m4_rm(v_scaled, __RISCV_FRM_RMM, vl_calc);
            q_r1 = __riscv_vncvt_x_x_w_i8m2(v_i16, vl_calc);
        }
        asm volatile ("" ::: "memory");

        // Row 2
        {
            vfloat32m8_t v_src = __riscv_vle32_v_f32m8(x_block_base + 2 * k, vl_calc);
            vfloat32m8_t v_abs = __riscv_vfabs_v_f32m8(v_src, vl_calc);
            vfloat32m1_t v_max = __riscv_vfredmax_vs_f32m8_f32m1(v_abs, v_scalar_zero, vl_calc);
            float amax = __riscv_vfmv_f_s_f32m1_f32(v_max);

            float d = amax / 127.0f;
            y[i].d[2] = GGML_CPU_FP32_TO_FP16(d);
            float id = d ? 1.0f / d : 0.0f;

            vfloat32m8_t v_scaled = __riscv_vfmul_vf_f32m8(v_src, id, vl_calc);
            vint16m4_t v_i16 = __riscv_vfncvt_x_f_w_i16m4_rm(v_scaled, __RISCV_FRM_RMM, vl_calc);
            q_r2 = __riscv_vncvt_x_x_w_i8m2(v_i16, vl_calc);
        }
        asm volatile ("" ::: "memory");

        // Row 3
        {
            vfloat32m8_t v_src = __riscv_vle32_v_f32m8(x_block_base + 3 * k, vl_calc);
            vfloat32m8_t v_abs = __riscv_vfabs_v_f32m8(v_src, vl_calc);
            vfloat32m1_t v_max = __riscv_vfredmax_vs_f32m8_f32m1(v_abs, v_scalar_zero, vl_calc);
            float amax = __riscv_vfmv_f_s_f32m1_f32(v_max);

            float d = amax / 127.0f;
            y[i].d[3] = GGML_CPU_FP32_TO_FP16(d);
            float id = d ? 1.0f / d : 0.0f;

            vfloat32m8_t v_scaled = __riscv_vfmul_vf_f32m8(v_src, id, vl_calc);
            vint16m4_t v_i16 = __riscv_vfncvt_x_f_w_i16m4_rm(v_scaled, __RISCV_FRM_RMM, vl_calc);
            q_r3 = __riscv_vncvt_x_x_w_i8m2(v_i16, vl_calc);
        }

        // View each row's 32 quants as 4 x int64 so a 4-field segment store
        // writes r0[0..7], r1[0..7], r2[0..7], r3[0..7], r0[8..15], ... into qs.
        vint64m2_t v_q64_r0 = __riscv_vreinterpret_v_i8m2_i64m2(q_r0);
        vint64m2_t v_q64_r1 = __riscv_vreinterpret_v_i8m2_i64m2(q_r1);
        vint64m2_t v_q64_r2 = __riscv_vreinterpret_v_i8m2_i64m2(q_r2);
        vint64m2_t v_q64_r3 = __riscv_vreinterpret_v_i8m2_i64m2(q_r3);
        vint64m2x4_t v_quant_tuple = __riscv_vcreate_v_i64m2x4(v_q64_r0, v_q64_r1, v_q64_r2, v_q64_r3);
        __riscv_vsseg4e64_v_i64m2x4((int64_t *) y[i].qs, v_quant_tuple, vl_save);
    }
#else
    // No RVV intrinsics: defer to the portable implementation of the same
    // 4x8 layout. `y` only exists in the RVV branch, so it is not referenced here.
    UNUSED(nb);
    ggml_quantize_mat_q8_0_4x8_generic(x, vy, k);
#endif
}
|
||||
|
||||
void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
|
||||
const int qk = QK8_0;
|
||||
const int nb = n / qk;
|
||||
|
|
|
|||
Loading…
Reference in New Issue