ggml-cpu: add rvv ggml_quantize_mat_4x8 for q8_0

Co-authored-by: Rehan Qasim <rehan.qasim@10xengineers.ai>
This commit is contained in:
taimur-10x 2025-12-15 19:27:20 +05:00
parent c0204a0893
commit 4febe1b725
1 changed file with 88 additions and 0 deletions

View File

@ -24,6 +24,94 @@
#define UNUSED GGML_UNUSED
#if defined(__riscv_v_intrinsic)
// Quantize one row of QK8_0 floats to int8 with a per-row scale.
// Computes amax over the row, derives d = amax / 127, writes d through d_out,
// and returns the row multiplied by 1/d, narrowed f32 -> i16 -> i8.
// NOTE(review): assumes VLEN >= 128 so e32m8 covers all QK8_0 (= 32) lanes in
// a single vector (vl == QK8_0) — confirm against the target's minimum VLEN.
static inline vint8m2_t ggml_quantize_row_q8_0_rvv(const float * src, size_t vl,
                                                   vfloat32m1_t v_zero, float * d_out) {
    vfloat32m8_t v_src = __riscv_vle32_v_f32m8(src, vl);
    vfloat32m8_t v_abs = __riscv_vfabs_v_f32m8(v_src, vl);
    vfloat32m1_t v_max = __riscv_vfredmax_vs_f32m8_f32m1(v_abs, v_zero, vl);

    const float amax = __riscv_vfmv_f_s_f32m1_f32(v_max);
    const float d    = amax / 127.0f;
    const float id   = d ? 1.0f / d : 0.0f;   // guard against all-zero rows

    vfloat32m8_t v_scaled = __riscv_vfmul_vf_f32m8(v_src, id, vl);
    // rounding mode 4 == __RISCV_FRM_RMM: round to nearest, ties to max magnitude
    vint16m4_t v_i16 = __riscv_vfncvt_x_f_w_i16m4_rm(v_scaled, 4, vl);

    *d_out = d;
    return __riscv_vncvt_x_x_w_i8m2(v_i16, vl);
}
#endif

// Quantize a 4-row tile of floats (row stride k) into block_q8_0x4 blocks:
// for each group of QK8_0 columns, four per-row fp16 scales followed by the
// four int8 rows interleaved 8 bytes at a time.
void ggml_quantize_mat_q8_0_4x8(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
    assert(QK8_0 == 32);
    assert(k % QK8_0 == 0);
    const int nb = k / QK8_0;
#if defined(__riscv_v_intrinsic)
    block_q8_0x4 * GGML_RESTRICT y = (block_q8_0x4 *) vy;

    const size_t vl_calc = __riscv_vsetvl_e32m8(QK8_0); // 32 f32 lanes per row
    const size_t vl_save = __riscv_vsetvl_e64m2(4);     // 4 x 8-byte fields per segment store
    const vfloat32m1_t v_scalar_zero = __riscv_vfmv_s_f_f32m1(0.0f, __riscv_vsetvl_e32m1(1));

    for (int i = 0; i < nb; i++) {
        const float * x_block_base = x + i * QK8_0;
        float d0, d1, d2, d3;

        // The compiler barriers between rows keep the four m8 working sets from
        // being scheduled as simultaneously live (vector register pressure).
        vint8m2_t q_r0 = ggml_quantize_row_q8_0_rvv(x_block_base + 0 * k, vl_calc, v_scalar_zero, &d0);
        y[i].d[0] = GGML_CPU_FP32_TO_FP16(d0);
        asm volatile ("" ::: "memory");

        vint8m2_t q_r1 = ggml_quantize_row_q8_0_rvv(x_block_base + 1 * k, vl_calc, v_scalar_zero, &d1);
        y[i].d[1] = GGML_CPU_FP32_TO_FP16(d1);
        asm volatile ("" ::: "memory");

        vint8m2_t q_r2 = ggml_quantize_row_q8_0_rvv(x_block_base + 2 * k, vl_calc, v_scalar_zero, &d2);
        y[i].d[2] = GGML_CPU_FP32_TO_FP16(d2);
        asm volatile ("" ::: "memory");

        vint8m2_t q_r3 = ggml_quantize_row_q8_0_rvv(x_block_base + 3 * k, vl_calc, v_scalar_zero, &d3);
        y[i].d[3] = GGML_CPU_FP32_TO_FP16(d3);

        // Interleave the four quantized rows into qs, 8 bytes per row per
        // segment: r0[0:8] r1[0:8] r2[0:8] r3[0:8] r0[8:16] ...
        vint64m2_t v_q64_r0 = __riscv_vreinterpret_v_i8m2_i64m2(q_r0);
        vint64m2_t v_q64_r1 = __riscv_vreinterpret_v_i8m2_i64m2(q_r1);
        vint64m2_t v_q64_r2 = __riscv_vreinterpret_v_i8m2_i64m2(q_r2);
        vint64m2_t v_q64_r3 = __riscv_vreinterpret_v_i8m2_i64m2(q_r3);
        vint64m2x4_t v_quant_tuple = __riscv_vcreate_v_i64m2x4(v_q64_r0, v_q64_r1, v_q64_r2, v_q64_r3);
        __riscv_vsseg4e64_v_i64m2x4((int64_t *) y[i].qs, v_quant_tuple, vl_save);
    }
#else
    UNUSED(nb);
    // FIX: fall back to the matching 4x8 generic. The previous code called the
    // 4x4 generic, which emits a different interleave layout, and referenced
    // UNUSED(y) although y is only declared in the RVV branch (compile error).
    ggml_quantize_mat_q8_0_4x8_generic(x, vy, k);
#endif
}
void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
const int qk = QK8_0;
const int nb = n / qk;