From 49aa628d6bbda55993fda220937fcd1ffffc653c Mon Sep 17 00:00:00 2001 From: yuanjia Date: Wed, 3 Dec 2025 14:36:38 +0800 Subject: [PATCH] fix a_ptr qs/bsums indexing in ggml_gemm_q4_K_4x8_q8_K_generic --- ggml/src/ggml-cpu/arch/arm/repack.cpp | 7 +++++-- ggml/src/ggml-cpu/repack.cpp | 10 +++++----- 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/ggml/src/ggml-cpu/arch/arm/repack.cpp b/ggml/src/ggml-cpu/arch/arm/repack.cpp index 6115800361..a351d54823 100644 --- a/ggml/src/ggml-cpu/arch/arm/repack.cpp +++ b/ggml/src/ggml-cpu/arch/arm/repack.cpp @@ -568,9 +568,11 @@ void ggml_gemv_q4_K_4x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo const int qk = QK_K; const int nb = n / qk; const int ncols_interleaved = 4; - const int blocklen = 4; + const int blocklen = 8; + assert (n % qk == 0); assert (nc % ncols_interleaved == 0); + UNUSED(s); UNUSED(bs); UNUSED(vx); @@ -580,6 +582,7 @@ void ggml_gemv_q4_K_4x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo UNUSED(nb); UNUSED(ncols_interleaved); UNUSED(blocklen); + #if ! ((defined(_MSC_VER)) && ! 
defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD) if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) { const block_q4_Kx4 *GGML_RESTRICT q4 = (const block_q4_Kx4*) vx; @@ -2508,7 +2511,7 @@ void ggml_gemm_q4_K_4x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo const int qk = QK_K; const int nb = n / qk; const int ncols_interleaved = 4; - const int blocklen = 8; // c implementation will use + const int blocklen = 8; assert(n % qk == 0); assert(nr % 4 == 0); diff --git a/ggml/src/ggml-cpu/repack.cpp b/ggml/src/ggml-cpu/repack.cpp index 696a7113a6..74285101af 100644 --- a/ggml/src/ggml-cpu/repack.cpp +++ b/ggml/src/ggml-cpu/repack.cpp @@ -1074,7 +1074,7 @@ void ggml_gemm_q4_K_4x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, for (int j = 0; j < ncols_interleaved; j++) { for (int i = 0; i < 8; i++) { scales[j][i] = b_ptr[n].scales[i * 8 + j]; - mins[j][i] = b_ptr[n].scales[i * 8 + j + ncols_interleaved]; + mins[j][i] = b_ptr[n].scales[i * 8 + j + ncols_interleaved]; } } @@ -1089,8 +1089,8 @@ void ggml_gemm_q4_K_4x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, for (int i = 0; i < blocklen; i++) { const int v0 = (int8_t)(b_ptr[n].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xf); const int v1 = (int8_t)(b_ptr[n].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4); - sumi1 = v0 * a_ptr[n].qs[(k / 2) * 128 + (k % 2) * 4 * blocklen + i]; - sumi2 = v1 * a_ptr[n].qs[(k / 2) * 128 + (k % 2) * 4 * blocklen + i + 64]; + sumi1 = v0 * a_ptr[n].qs[(k / 2) * 128 + (k % 2) * 4 * blocklen + m * blocklen + i]; + sumi2 = v1 * a_ptr[n].qs[(k / 2) * 128 + (k % 2) * 4 * blocklen + m * blocklen + i + 64]; sumi += scale * (sumi1 + sumi2); } sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[n].d[j]) * a_ptr[n].d[m]; @@ -1102,8 +1102,8 @@ void ggml_gemm_q4_K_4x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, for (int m = 0; m < 4; m++) { for (int j = 0; j < ncols_interleaved; j++) { 
for (int i = 0; i < QK_K / 32; i++) { - const int16_t *bsums = a_ptr[n].bsums + (i * 8) - ((i % 2) * 6) + (m * 4); - sum_minf[m][j] += mins[j][i] * (bsums[0] + bsums[1]) * GGML_CPU_FP16_TO_FP32(b_ptr[n].dmin[j]) * a_ptr[n].d[m]; + const int16_t bsums = a_ptr[n].bsums[i * 2 + m * 16] + a_ptr[n].bsums[i * 2 + 1 + m * 16]; + sum_minf[m][j] += mins[j][i] * bsums * GGML_CPU_FP16_TO_FP32(b_ptr[n].dmin[j]) * a_ptr[n].d[m]; } } }