Fix bug in ggml_gemv_q4_K_4x8_q8_K_generic

This commit is contained in:
yuanjia 2025-12-02 15:07:29 +08:00 committed by hongyang
parent 8a4e25d796
commit 86be98c9d1
2 changed files with 21 additions and 22 deletions

View File

@ -597,21 +597,21 @@ void ggml_gemv_q4_K_4x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
const int8_t * GGML_RESTRICT q8_ptr = q8->qs;
int32x4_t prod = vdupq_n_s32(0);
const int16x8_t q8_sums = vpaddq_s16(vld1q_s16(q8->bsums), vld1q_s16(q8->bsums + 8));
// when using vgetq_lane_s16, its index must be a constant, which cannot be used in a loop, so use vst1q_s16 instead.
// When using vgetq_lane_s16, its index must be a constant, which cannot be used in a loop, so use vst1q_s16 instead.
int16_t tmp_arry[8];
vst1q_s16(tmp_arry, q8_sums);
for (int j = 0; j < QK_K / 32; ++j) {
int32x4_t sum0 = vdupq_n_s32(0);
int32x4_t sum1 = vdupq_n_s32(0);
// each block: scales0 scales1 scales2 scales3 mins0 mins1 mins2 mins3
int16x8_t scales_mins = vmovl_s8(vld1_s8((const int8_t *)q4->scales + 8 * j)) ;
// Each block: scales0 scales1 scales2 scales3 mins0 mins1 mins2 mins3
int16x8_t scales_mins = vmovl_s8(vld1_s8((const int8_t *)q4->scales + 8 * j));
prod = vmlal_s16(prod, vdup_n_s16(tmp_arry[j]), vget_high_s16(scales_mins));
uint8x16_t q4_0 = vld1q_u8((const uint8_t *) q4_ptr);
uint8x16_t q4_1 = vld1q_u8((const uint8_t *) q4_ptr + 16);
uint8x16_t q4_2 = vld1q_u8((const uint8_t *) q4_ptr + 32);
uint8x16_t q4_3 = vld1q_u8((const uint8_t *) q4_ptr + 48);
q4_ptr += 64;
int8x16_t q8_0 = (int8x16_t) vld1q_dup_s64((const int64_t *) q8_ptr); // 8 个 8-bit
int8x16_t q8_0 = (int8x16_t) vld1q_dup_s64((const int64_t *) q8_ptr);
int8x16_t q8_1 = (int8x16_t) vld1q_dup_s64((const int64_t *) q8_ptr + 1);
int8x16_t q8_2 = (int8x16_t) vld1q_dup_s64((const int64_t *) q8_ptr + 2);
int8x16_t q8_3 = (int8x16_t) vld1q_dup_s64((const int64_t *) q8_ptr + 3);

View File

@ -408,8 +408,8 @@ void ggml_gemv_q4_K_4x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
UNUSED(ncols_interleaved);
UNUSED(blocklen);
uint8_t scales[4][8]; // scales for 8 subblocks of 4 q4_k unit (4 cols)
uint8_t mins[4][8]; // mins for 8 subblocks of 4 q4_k unit (4 cols)
int8_t scales[4][8]; // scales for 8 subblocks of 4 q4_k unit (4 cols)
int8_t mins[4][8]; // mins for 8 subblocks of 4 q4_k unit (4 cols)
float sumf[4]; // 1x4 unit: final result
float sum_minf[4]; // 1x4 unit: final minus result
int sumi1;
@ -434,23 +434,22 @@ void ggml_gemv_q4_K_4x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
for (int j = 0; j < ncols_interleaved; j++) {
for (int i = 0; i < 8; i++) {
scales[j][i] = b_ptr[n].scales[i * 8 + j];
mins[j][i] = b_ptr[n].scales[i * 8 + j + ncols_interleaved];
mins[j][i] = b_ptr[n].scales[i * 8 + j + ncols_interleaved];
}
}
// core loop: each iteration works on an interleaved unit (four 8-byte segments from 4 cols)
for (int k = 0; k < (qk / (2 * blocklen); k++)) {
for (int k = 0; k < qk / (2 * blocklen); k++) {
for (int j = 0; j < ncols_interleaved; j++) {
sumi1 = 0;
sumi2 = 0;
sumi = 0;
int8_t scale = scales[j][k / 2];
for (int i = 0; i < blocklen; i++) {
const int v0 = (int8_t)(b_ptr[n].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xf);
const int v1 = (int8_t)(b_ptr[n].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & >> 4);
sumi1 = (v0 * a_ptr[n].qs((k / 2) * 32 + (k % 2) * blocklen + i));
sumi2 = (v0 * a_ptr[n].qs((k / 2) * 32 + (k % 2) * blocklen + i + 16));
uint8_t scale = scales[j][k / 2];
sumi += sumi1 * scale + sumi2 * scale;
const int v1 = (int8_t)(b_ptr[n].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4);
sumi1 = v0 * a_ptr[n].qs[(k / 2) * 32 + (k % 2) * blocklen + i];
sumi2 = v1 * a_ptr[n].qs[(k / 2) * 32 + (k % 2) * blocklen + i + 16];
sumi += scale * (sumi1 + sumi2);
}
sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[n].d[j]) * a_ptr[n].d;
}
@ -1045,8 +1044,8 @@ void ggml_gemm_q4_K_4x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
UNUSED(ncols_interleaved);
UNUSED(blocklen);
uint8_t scales[4][8]; // scales for 8 subblocks of 4 q4_k unit (4 cols)
uint8_t mins[4][8]; // mins for 8 subblocks of 4 q4_k unit (4 cols)
int8_t scales[4][8]; // scales for 8 subblocks of 4 q4_k unit (4 cols)
int8_t mins[4][8]; // mins for 8 subblocks of 4 q4_k unit (4 cols)
float sumf[4][4]; // 4x4 unit: final result
float sum_minf[4][4]; // 4x4 unit: final minus result
int sumi1;
@ -1080,19 +1079,19 @@ void ggml_gemm_q4_K_4x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
}
// core loop: each iteration works on an interleaved unit (four 8-byte segments from 4 cols)
for (int k = 0; k < (qk / (2 * blocklen); k++)) {
for (int k = 0; k < qk / (2 * blocklen); k++) {
for (int m = 0; m < 4; m++) {
for (int j = 0; j < ncols_interleaved; j++) {
sumi1 = 0;
sumi2 = 0;
sumi = 0;
int8_t scale = scales[j][k / 2];
for (int i = 0; i < blocklen; i++) {
const int v0 = (int8_t)(b_ptr[n].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xf);
const int v1 = (int8_t)(b_ptr[n].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & >> 4);
sumi1 = (v0 * a_ptr[n].qs((k / 2) * 128 + (k % 2) * 4 * blocklen + i));
sumi2 = (v0 * a_ptr[n].qs((k / 2) * 128 + (k % 2) * 4 * blocklen + i + 64));
uint8_t scale = scales[j][k / 2];
sumi += sumi1 * scale + sumi2 * scale;
const int v1 = (int8_t)(b_ptr[n].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4);
sumi1 = v0 * a_ptr[n].qs[(k / 2) * 128 + (k % 2) * 4 * blocklen + i];
sumi2 = v1 * a_ptr[n].qs[(k / 2) * 128 + (k % 2) * 4 * blocklen + i + 64];
sumi += scale * (sumi1 + sumi2);
}
sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[n].d[j]) * a_ptr[n].d[m];
}