Fix bug in ggml_gemm_q4_K_4x8_q8_K_generic

This commit is contained in:
yuanjia 2025-12-03 14:36:38 +08:00 committed by hongyang
parent 86be98c9d1
commit 49aa628d6b
2 changed files with 10 additions and 7 deletions

View File

@ -568,9 +568,11 @@ void ggml_gemv_q4_K_4x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
const int qk = QK_K;
const int nb = n / qk;
const int ncols_interleaved = 4;
const int blocklen = 4;
const int blocklen = 8;
assert (n % qk == 0);
assert (nc % ncols_interleaved == 0);
UNUSED(s);
UNUSED(bs);
UNUSED(vx);
@ -580,6 +582,7 @@ void ggml_gemv_q4_K_4x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
UNUSED(nb);
UNUSED(ncols_interleaved);
UNUSED(blocklen);
#if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) {
const block_q4_Kx4 *GGML_RESTRICT q4 = (const block_q4_Kx4*) vx;
@ -2508,7 +2511,7 @@ void ggml_gemm_q4_K_4x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
const int qk = QK_K;
const int nb = n / qk;
const int ncols_interleaved = 4;
const int blocklen = 8; // c implementation will use
const int blocklen = 8;
assert(n % qk == 0);
assert(nr % 4 == 0);

View File

@ -1074,7 +1074,7 @@ void ggml_gemm_q4_K_4x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
for (int j = 0; j < ncols_interleaved; j++) {
for (int i = 0; i < 8; i++) {
scales[j][i] = b_ptr[n].scales[i * 8 + j];
mins[j][i] = b_ptr[n].scales[i * 8 + j + ncols_interleaved];
mins[j][i] = b_ptr[n].scales[i * 8 + j + ncols_interleaved];
}
}
@ -1089,8 +1089,8 @@ void ggml_gemm_q4_K_4x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
for (int i = 0; i < blocklen; i++) {
const int v0 = (int8_t)(b_ptr[n].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xf);
const int v1 = (int8_t)(b_ptr[n].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4);
sumi1 = v0 * a_ptr[n].qs[(k / 2) * 128 + (k % 2) * 4 * blocklen + i];
sumi2 = v1 * a_ptr[n].qs[(k / 2) * 128 + (k % 2) * 4 * blocklen + i + 64];
sumi1 = v0 * a_ptr[n].qs[(k / 2) * 128 + (k % 2) * 4 * blocklen + m * blocklen + i];
sumi2 = v1 * a_ptr[n].qs[(k / 2) * 128 + (k % 2) * 4 * blocklen + m * blocklen + i + 64];
sumi += scale * (sumi1 + sumi2);
}
sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[n].d[j]) * a_ptr[n].d[m];
@ -1102,8 +1102,8 @@ void ggml_gemm_q4_K_4x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
for (int m = 0; m < 4; m++) {
for (int j = 0; j < ncols_interleaved; j++) {
for (int i = 0; i < QK_K / 32; i++) {
const int16_t *bsums = a_ptr[n].bsums + (i * 8) - ((i % 2) * 6) + (m * 4);
sum_minf[m][j] += mins[j][i] * (bsums[0] + bsums[1]) * GGML_CPU_FP16_TO_FP32(b_ptr[n].dmin[j]) * a_ptr[n].d[m];
const int16_t bsums = a_ptr[n].bsums[i * 2 + m * 16] + a_ptr[n].bsums[i * 2 + 1 + m * 16];
sum_minf[m][j] += mins[j][i] * bsums * GGML_CPU_FP16_TO_FP32(b_ptr[n].dmin[j]) * a_ptr[n].d[m];
}
}
}