diff --git a/ggml/src/ggml-cpu/arch/arm/repack.cpp b/ggml/src/ggml-cpu/arch/arm/repack.cpp
index 1793409820..6115800361 100644
--- a/ggml/src/ggml-cpu/arch/arm/repack.cpp
+++ b/ggml/src/ggml-cpu/arch/arm/repack.cpp
@@ -597,21 +597,21 @@ void ggml_gemv_q4_K_4x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
         const int8_t * GGML_RESTRICT q8_ptr = q8->qs;
         int32x4_t prod = vdupq_n_s32(0);
         const int16x8_t q8_sums = vpaddq_s16(vld1q_s16(q8->bsums), vld1q_s16(q8->bsums + 8));
-        // when using vgetq_lane_s16, its index must be a constant, which cannot be used in a loop, so use vst1q_s16 instead.
+        // When using vgetq_lane_s16, its index must be a constant, which cannot be used in a loop, so use vst1q_s16 instead.
         int16_t tmp_arry[8];
         vst1q_s16(tmp_arry, q8_sums);
         for (int j = 0; j < QK_K / 32; ++j) {
             int32x4_t sum0 = vdupq_n_s32(0);
             int32x4_t sum1 = vdupq_n_s32(0);
-            // each block: scales0 scales1 scales2 scales3 mins0 mins1 mins2 mins3
-            int16x8_t scales_mins = vmovl_s8(vld1_s8((const int8_t *)q4->scales + 8 * j)) ;
+            // Each block: scales0 scales1 scales2 scales3 mins0 mins1 mins2 mins3
+            int16x8_t scales_mins = vmovl_s8(vld1_s8((const int8_t *)q4->scales + 8 * j));
             prod = vmlal_s16(prod, vdup_n_s16(tmp_arry[j]), vget_high_s16(scales_mins));
             uint8x16_t q4_0 = vld1q_u8((const uint8_t *) q4_ptr);
             uint8x16_t q4_1 = vld1q_u8((const uint8_t *) q4_ptr + 16);
             uint8x16_t q4_2 = vld1q_u8((const uint8_t *) q4_ptr + 32);
             uint8x16_t q4_3 = vld1q_u8((const uint8_t *) q4_ptr + 48);
             q4_ptr += 64;
-            int8x16_t q8_0 = (int8x16_t) vld1q_dup_s64((const int64_t *) q8_ptr); // 8 个 8-bit
+            int8x16_t q8_0 = (int8x16_t) vld1q_dup_s64((const int64_t *) q8_ptr);
             int8x16_t q8_1 = (int8x16_t) vld1q_dup_s64((const int64_t *) q8_ptr + 1);
             int8x16_t q8_2 = (int8x16_t) vld1q_dup_s64((const int64_t *) q8_ptr + 2);
             int8x16_t q8_3 = (int8x16_t) vld1q_dup_s64((const int64_t *) q8_ptr + 3);
diff --git a/ggml/src/ggml-cpu/repack.cpp b/ggml/src/ggml-cpu/repack.cpp
index eed54473ea..696a7113a6 100644
--- a/ggml/src/ggml-cpu/repack.cpp
+++ b/ggml/src/ggml-cpu/repack.cpp
@@ -408,8 +408,8 @@ void ggml_gemv_q4_K_4x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
     UNUSED(ncols_interleaved);
     UNUSED(blocklen);
 
-    uint8_t scales[4][8];  // scales for 8 subblocks of 4 q4_k unit (4 cols)
-    uint8_t mins[4][8];    // mins for 8 subblocks of 4 q4_k unit (4 cols)
+    int8_t scales[4][8];   // scales for 8 subblocks of 4 q4_k unit (4 cols)
+    int8_t mins[4][8];     // mins for 8 subblocks of 4 q4_k unit (4 cols)
     float sumf[4];         // 1x4 unit: final result
     float sum_minf[4];     // 1x4 unit: final minus result
     int sumi1;
@@ -434,23 +434,22 @@ void ggml_gemv_q4_K_4x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
         for (int j = 0; j < ncols_interleaved; j++) {
             for (int i = 0; i < 8; i++) {
                 scales[j][i] = b_ptr[n].scales[i * 8 + j];
-                mins[j][i]   = b_ptr[n].scales[i * 8 + j + ncols_interleaved];
+                mins[j][i]  = b_ptr[n].scales[i * 8 + j + ncols_interleaved];
             }
         }
 
-        // core loop: each iteration works on an interleaved unit (four 8-byte segments from 4 cols)
-        for (int k = 0; k < (qk / (2 * blocklen); k++)) {
+        for (int k = 0; k < qk / (2 * blocklen); k++) {
             for (int j = 0; j < ncols_interleaved; j++) {
                 sumi1 = 0;
                 sumi2 = 0;
                 sumi  = 0;
+                int8_t scale = scales[j][k / 2];
                 for (int i = 0; i < blocklen; i++) {
                     const int v0 = (int8_t)(b_ptr[n].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xf);
-                    const int v1 = (int8_t)(b_ptr[n].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & >> 4);
-                    sumi1 = (v0 * a_ptr[n].qs((k / 2) * 32 + (k % 2) * blocklen + i));
-                    sumi2 = (v0 * a_ptr[n].qs((k / 2) * 32 + (k % 2) * blocklen + i + 16));
-                    uint8_t scale = scales[j][k / 2];
-                    sumi += sumi1 * scale + sumi2 * scale;
+                    const int v1 = (int8_t)(b_ptr[n].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4);
+                    sumi1 = v0 * a_ptr[n].qs[(k / 2) * 32 + (k % 2) * blocklen + i];
+                    sumi2 = v1 * a_ptr[n].qs[(k / 2) * 32 + (k % 2) * blocklen + i + 16];
+                    sumi += scale * (sumi1 + sumi2);
                 }
                 sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[n].d[j]) * a_ptr[n].d;
             }
@@ -1045,8 +1044,8 @@ void ggml_gemm_q4_K_4x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
     UNUSED(ncols_interleaved);
     UNUSED(blocklen);
 
-    uint8_t scales[4][8];  // scales for 8 subblocks of 4 q4_k unit (4 cols)
-    uint8_t mins[4][8];    // mins for 8 subblocks of 4 q4_k unit (4 cols)
+    int8_t scales[4][8];   // scales for 8 subblocks of 4 q4_k unit (4 cols)
+    int8_t mins[4][8];     // mins for 8 subblocks of 4 q4_k unit (4 cols)
     float sumf[4][4];      // 4x4 unit: final result
     float sum_minf[4][4];  // 4x4 unit: final minus result
     int sumi1;
@@ -1080,19 +1079,19 @@ void ggml_gemm_q4_K_4x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
             }
 
             // core loop: each iteration works on an interleaved unit (four 8-byte segments from 4 cols)
-            for (int k = 0; k < (qk / (2 * blocklen); k++)) {
+            for (int k = 0; k < qk / (2 * blocklen); k++) {
                 for (int m = 0; m < 4; m++) {
                     for (int j = 0; j < ncols_interleaved; j++) {
                         sumi1 = 0;
                         sumi2 = 0;
                         sumi  = 0;
+                        int8_t scale = scales[j][k / 2];
                        for (int i = 0; i < blocklen; i++) {
                             const int v0 = (int8_t)(b_ptr[n].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xf);
-                            const int v1 = (int8_t)(b_ptr[n].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & >> 4);
-                            sumi1 = (v0 * a_ptr[n].qs((k / 2) * 128 + (k % 2) * 4 * blocklen + i));
-                            sumi2 = (v0 * a_ptr[n].qs((k / 2) * 128 + (k % 2) * 4 * blocklen + i + 64));
-                            uint8_t scale = scales[j][k / 2];
-                            sumi += sumi1 * scale + sumi2 * scale;
+                            const int v1 = (int8_t)(b_ptr[n].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4);
+                            sumi1 = v0 * a_ptr[n].qs[(k / 2) * 128 + (k % 2) * 4 * blocklen + i];
+                            sumi2 = v1 * a_ptr[n].qs[(k / 2) * 128 + (k % 2) * 4 * blocklen + i + 64];
+                            sumi += scale * (sumi1 + sumi2);
                         }
                         sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[n].d[j]) * a_ptr[n].d[m];
                     }
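
For reference, the corrected inner-loop arithmetic can be sanity-checked in isolation with a short scalar sketch. This is an illustration only: the buffers qs and q8 and the fixed scale below are hypothetical stand-ins, not the real interleaved q4_K / q8_K block layouts or struct fields. It isolates the three behavioral fixes in the hunks above: sumi2 must multiply the high nibble v1 (not v0 twice), the subblock scale is signed (int8_t, not uint8_t), and the scale can be hoisted out of the element loop and factored as scale * (sumi1 + sumi2).

    // toy_check.cpp -- scalar model of one (j, k) iteration of the fixed loop
    #include <cstdint>
    #include <cstdio>

    int main(void) {
        const int blocklen = 8;

        // Toy stand-ins: one 8-byte packed-nibble segment, paired with 16 + 16
        // activation bytes (low nibbles -> q8[0..15], high nibbles -> q8[16..31]).
        uint8_t qs[8];
        int8_t  q8[32];
        for (int i = 0; i < 8;  ++i) qs[i] = (uint8_t)(i * 0x11);  // nibbles (i, i)
        for (int i = 0; i < 32; ++i) q8[i] = (int8_t)(i - 16);

        const int8_t scale = -3;  // signed subblock scale, hoisted out of the loop

        int sumi = 0;
        for (int i = 0; i < blocklen; ++i) {
            const int v0 = qs[i] & 0xf;  // low nibble
            const int v1 = qs[i] >> 4;   // high nibble -- the v0 copy-paste fix
            const int sumi1 = v0 * q8[i];
            const int sumi2 = v1 * q8[i + 16];
            sumi += scale * (sumi1 + sumi2);
        }
        printf("sumi = %d\n", sumi);  // prints sumi = 504 with this toy data
        return 0;
    }

Compiled stand-alone (e.g. g++ toy_check.cpp), this mirrors one iteration of the fixed GEMV loop with the column interleaving and the (k / 2), (k % 2) addressing stripped out for clarity.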