fix bug in ggml_gemv_q4_K_4x8_q8_K_generic
This commit is contained in:
parent
8a4e25d796
commit
86be98c9d1
@@ -597,21 +597,21 @@ void ggml_gemv_q4_K_4x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
     const int8_t * GGML_RESTRICT q8_ptr = q8->qs;
     int32x4_t prod = vdupq_n_s32(0);
     const int16x8_t q8_sums = vpaddq_s16(vld1q_s16(q8->bsums), vld1q_s16(q8->bsums + 8));
-    // when using vgetq_lane_s16, its index must be a constant, which cannot be used in a loop, so use vst1q_s16 instead.
+    // When using vgetq_lane_s16, its index must be a constant, which cannot be used in a loop, so use vst1q_s16 instead.
     int16_t tmp_arry[8];
     vst1q_s16(tmp_arry, q8_sums);
     for (int j = 0; j < QK_K / 32; ++j) {
         int32x4_t sum0 = vdupq_n_s32(0);
         int32x4_t sum1 = vdupq_n_s32(0);
-        // each block: scales0 scales1 scales2 scales3 mins0 mins1 mins2 mins3
-        int16x8_t scales_mins = vmovl_s8(vld1_s8((const int8_t *)q4->scales + 8 * j)) ;
+        // Each block: scales0 scales1 scales2 scales3 mins0 mins1 mins2 mins3
+        int16x8_t scales_mins = vmovl_s8(vld1_s8((const int8_t *)q4->scales + 8 * j));
         prod = vmlal_s16(prod, vdup_n_s16(tmp_arry[j]), vget_high_s16(scales_mins));
         uint8x16_t q4_0 = vld1q_u8((const uint8_t *) q4_ptr);
         uint8x16_t q4_1 = vld1q_u8((const uint8_t *) q4_ptr + 16);
         uint8x16_t q4_2 = vld1q_u8((const uint8_t *) q4_ptr + 32);
         uint8x16_t q4_3 = vld1q_u8((const uint8_t *) q4_ptr + 48);
         q4_ptr += 64;
-        int8x16_t q8_0 = (int8x16_t) vld1q_dup_s64((const int64_t *) q8_ptr); // 8 x 8-bit values
+        int8x16_t q8_0 = (int8x16_t) vld1q_dup_s64((const int64_t *) q8_ptr);
         int8x16_t q8_1 = (int8x16_t) vld1q_dup_s64((const int64_t *) q8_ptr + 1);
         int8x16_t q8_2 = (int8x16_t) vld1q_dup_s64((const int64_t *) q8_ptr + 2);
         int8x16_t q8_3 = (int8x16_t) vld1q_dup_s64((const int64_t *) q8_ptr + 3);
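
Side note on the NEON comment kept above: vgetq_lane_s16 requires a compile-time-constant lane index, so it cannot be driven by a loop counter; storing the vector once with vst1q_s16 and indexing the scalar array is the usual workaround. A minimal sketch of that pattern, assuming an AArch64 NEON target (the helper name sum_lanes_via_store is illustrative, not part of this commit):

#include <arm_neon.h>
#include <stdint.h>

// vgetq_lane_s16(v, i) only accepts a constant i, so inside a loop we
// spill the vector to memory once and index the array with the counter.
static int32_t sum_lanes_via_store(int16x8_t v) {
    int16_t tmp[8];
    vst1q_s16(tmp, v);        // one store replaces eight constant-lane extracts
    int32_t acc = 0;
    for (int i = 0; i < 8; ++i) {
        acc += tmp[i];        // plain array indexing, no constant-index constraint
    }
    return acc;
}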
@@ -408,8 +408,8 @@ void ggml_gemv_q4_K_4x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
     UNUSED(ncols_interleaved);
     UNUSED(blocklen);

-    uint8_t scales[4][8]; // scales for 8 subblocks of 4 q4_k unit (4 cols)
-    uint8_t mins[4][8]; // mins for 8 subblocks of 4 q4_k unit (4 cols)
+    int8_t scales[4][8]; // scales for 8 subblocks of 4 q4_k unit (4 cols)
+    int8_t mins[4][8]; // mins for 8 subblocks of 4 q4_k unit (4 cols)
     float sumf[4]; // 1x4 unit: final result
     float sum_minf[4]; // 1x4 unit: final minus result
     int sumi1;
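
The uint8_t to int8_t change above presumably brings the scalar path in line with the NEON kernel, which widens the packed scales/mins with vmovl_s8, i.e. reads them as signed bytes. A standalone illustration of where the two casts diverge (not taken from the patch):

#include <stdint.h>
#include <stdio.h>

int main(void) {
    uint8_t raw = 0xF4;                // any packed byte >= 0x80
    int as_unsigned = (uint8_t)raw;    // 244
    int as_signed   = (int8_t)raw;     // -12, what a signed (vmovl_s8-style) widen yields
    printf("unsigned: %d, signed: %d\n", as_unsigned, as_signed);
    return 0;
}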
@@ -434,23 +434,22 @@ void ggml_gemv_q4_K_4x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
     for (int j = 0; j < ncols_interleaved; j++) {
         for (int i = 0; i < 8; i++) {
             scales[j][i] = b_ptr[n].scales[i * 8 + j];
             mins[j][i] = b_ptr[n].scales[i * 8 + j + ncols_interleaved];
         }
     }

     // core loop: each iteration works on an interleaved unit (four 8-byte segments from 4 cols)
-    for (int k = 0; k < (qk / (2 * blocklen); k++)) {
+    for (int k = 0; k < qk / (2 * blocklen); k++) {
         for (int j = 0; j < ncols_interleaved; j++) {
             sumi1 = 0;
             sumi2 = 0;
             sumi = 0;
+            int8_t scale = scales[j][k / 2];
             for (int i = 0; i < blocklen; i++) {
                 const int v0 = (int8_t)(b_ptr[n].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xf);
-                const int v1 = (int8_t)(b_ptr[n].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & >> 4);
-                sumi1 = (v0 * a_ptr[n].qs((k / 2) * 32 + (k % 2) * blocklen + i));
-                sumi2 = (v0 * a_ptr[n].qs((k / 2) * 32 + (k % 2) * blocklen + i + 16));
-                uint8_t scale = scales[j][k / 2];
-                sumi += sumi1 * scale + sumi2 * scale;
+                const int v1 = (int8_t)(b_ptr[n].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4);
+                sumi1 = v0 * a_ptr[n].qs[(k / 2) * 32 + (k % 2) * blocklen + i];
+                sumi2 = v1 * a_ptr[n].qs[(k / 2) * 32 + (k % 2) * blocklen + i + 16];
+                sumi += scale * (sumi1 + sumi2);
             }
             sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[n].d[j]) * a_ptr[n].d;
         }
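
To spell out the corrected arithmetic: each packed byte holds two 4-bit weights, the low nibble pairing with the first 16 int8 activations of the subblock and the high nibble with the next 16. The old code reused v0 for the high-nibble product, wrote "& >> 4", and indexed a_ptr[n].qs with function-call parentheses. A stripped-down sketch of the per-subblock dot product with the interleaving removed (unit_dot and the flat buffers are illustrative, not the kernel's actual layout):

#include <stdint.h>

// Dot product of one 32-value subblock: 16 packed bytes -> 32 weights.
// Low nibbles pair with q8[0..15], high nibbles with q8[16..31].
static int unit_dot(const uint8_t *q4, const int8_t *q8, int8_t scale) {
    int sumi = 0;
    for (int i = 0; i < 16; ++i) {
        const int v0 = (int8_t)(q4[i] & 0xf);  // low nibble
        const int v1 = (int8_t)(q4[i] >> 4);   // high nibble (the old code reused v0 here)
        sumi += scale * (v0 * q8[i] + v1 * q8[i + 16]);
    }
    return sumi;
}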
@@ -1045,8 +1044,8 @@ void ggml_gemm_q4_K_4x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
     UNUSED(ncols_interleaved);
     UNUSED(blocklen);

-    uint8_t scales[4][8]; // scales for 8 subblocks of 4 q4_k unit (4 cols)
-    uint8_t mins[4][8]; // mins for 8 subblocks of 4 q4_k unit (4 cols)
+    int8_t scales[4][8]; // scales for 8 subblocks of 4 q4_k unit (4 cols)
+    int8_t mins[4][8]; // mins for 8 subblocks of 4 q4_k unit (4 cols)
     float sumf[4][4]; // 4x4 unit: final result
     float sum_minf[4][4]; // 4x4 unit: final minus result
     int sumi1;
@@ -1080,19 +1079,19 @@ void ggml_gemm_q4_K_4x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
         }

         // core loop: each iteration works on an interleaved unit (four 8-byte segments from 4 cols)
-        for (int k = 0; k < (qk / (2 * blocklen); k++)) {
+        for (int k = 0; k < qk / (2 * blocklen); k++) {
             for (int m = 0; m < 4; m++) {
                 for (int j = 0; j < ncols_interleaved; j++) {
                     sumi1 = 0;
                     sumi2 = 0;
                     sumi = 0;
+                    int8_t scale = scales[j][k / 2];
                     for (int i = 0; i < blocklen; i++) {
                         const int v0 = (int8_t)(b_ptr[n].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xf);
-                        const int v1 = (int8_t)(b_ptr[n].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & >> 4);
-                        sumi1 = (v0 * a_ptr[n].qs((k / 2) * 128 + (k % 2) * 4 * blocklen + i));
-                        sumi2 = (v0 * a_ptr[n].qs((k / 2) * 128 + (k % 2) * 4 * blocklen + i + 64));
-                        uint8_t scale = scales[j][k / 2];
-                        sumi += sumi1 * scale + sumi2 * scale;
+                        const int v1 = (int8_t)(b_ptr[n].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4);
+                        sumi1 = v0 * a_ptr[n].qs[(k / 2) * 128 + (k % 2) * 4 * blocklen + i];
+                        sumi2 = v1 * a_ptr[n].qs[(k / 2) * 128 + (k % 2) * 4 * blocklen + i + 64];
+                        sumi += scale * (sumi1 + sumi2);
                     }
                     sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[n].d[j]) * a_ptr[n].d[m];
                 }
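
The gemm hunk applies the same v1/indexing fix per output row m; only the activation offsets change, because four rows of q8 data are interleaved: a subblock pair spans 4 * 32 = 128 values and the high-nibble half sits 4 * 16 = 64 values further on, versus 32 and 16 in the single-row gemv path above. A small helper restating the index computation from the corrected lines (q8_index is an illustrative name; blocklen is 8 for these 4x8 kernels):

#include <stdbool.h>

// Index of the q8 activation that pairs with nibble i of interleaved
// segment k, mirroring the corrected gemm indexing above.
static inline int q8_index(int k, int i, int blocklen, bool high_nibble) {
    return (k / 2) * 128              // subblock-pair base: 4 rows * 32 values
         + (k % 2) * 4 * blocklen     // second 4-row segment of the pair
         + i
         + (high_nibble ? 64 : 0);    // high nibbles pair 4 * 16 values later
}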