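Fix bugs in ggml_gemm_q4_K_4x8_q8_K_generic: add the missing per-row (m * blocklen) offset when indexing a_ptr[n].qs, and correct the bsums indexing used for the mins term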
This commit is contained in:
parent
86be98c9d1
commit
49aa628d6b
|
|
@ -568,9 +568,11 @@ void ggml_gemv_q4_K_4x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
|
|||
const int qk = QK_K;
|
||||
const int nb = n / qk;
|
||||
const int ncols_interleaved = 4;
|
||||
const int blocklen = 4;
|
||||
const int blocklen = 8;
|
||||
|
||||
assert (n % qk == 0);
|
||||
assert (nc % ncols_interleaved == 0);
|
||||
|
||||
UNUSED(s);
|
||||
UNUSED(bs);
|
||||
UNUSED(vx);
|
||||
|
|
@ -580,6 +582,7 @@ void ggml_gemv_q4_K_4x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
|
|||
UNUSED(nb);
|
||||
UNUSED(ncols_interleaved);
|
||||
UNUSED(blocklen);
|
||||
|
||||
#if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
|
||||
if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) {
|
||||
const block_q4_Kx4 *GGML_RESTRICT q4 = (const block_q4_Kx4*) vx;
|
||||
|
|
@ -2508,7 +2511,7 @@ void ggml_gemm_q4_K_4x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
|
|||
const int qk = QK_K;
|
||||
const int nb = n / qk;
|
||||
const int ncols_interleaved = 4;
|
||||
const int blocklen = 8; // c implementation will use
|
||||
const int blocklen = 8;
|
||||
|
||||
assert(n % qk == 0);
|
||||
assert(nr % 4 == 0);
|
||||
|
|
|
|||
|
|
@ -1074,7 +1074,7 @@ void ggml_gemm_q4_K_4x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
|
|||
for (int j = 0; j < ncols_interleaved; j++) {
|
||||
for (int i = 0; i < 8; i++) {
|
||||
scales[j][i] = b_ptr[n].scales[i * 8 + j];
|
||||
mins[j][i] = b_ptr[n].scales[i * 8 + j + ncols_interleaved];
|
||||
mins[j][i] = b_ptr[n].scales[i * 8 + j + ncols_interleaved];
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -1089,8 +1089,8 @@ void ggml_gemm_q4_K_4x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
|
|||
for (int i = 0; i < blocklen; i++) {
|
||||
const int v0 = (int8_t)(b_ptr[n].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xf);
|
||||
const int v1 = (int8_t)(b_ptr[n].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4);
|
||||
sumi1 = v0 * a_ptr[n].qs[(k / 2) * 128 + (k % 2) * 4 * blocklen + i];
|
||||
sumi2 = v1 * a_ptr[n].qs[(k / 2) * 128 + (k % 2) * 4 * blocklen + i + 64];
|
||||
sumi1 = v0 * a_ptr[n].qs[(k / 2) * 128 + (k % 2) * 4 * blocklen + m * blocklen + i];
|
||||
sumi2 = v1 * a_ptr[n].qs[(k / 2) * 128 + (k % 2) * 4 * blocklen + m * blocklen + i + 64];
|
||||
sumi += scale * (sumi1 + sumi2);
|
||||
}
|
||||
sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[n].d[j]) * a_ptr[n].d[m];
|
||||
|
|
@ -1102,8 +1102,8 @@ void ggml_gemm_q4_K_4x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
|
|||
for (int m = 0; m < 4; m++) {
|
||||
for (int j = 0; j < ncols_interleaved; j++) {
|
||||
for (int i = 0; i < QK_K / 32; i++) {
|
||||
const int16_t *bsums = a_ptr[n].bsums + (i * 8) - ((i % 2) * 6) + (m * 4);
|
||||
sum_minf[m][j] += mins[j][i] * (bsums[0] + bsums[1]) * GGML_CPU_FP16_TO_FP32(b_ptr[n].dmin[j]) * a_ptr[n].d[m];
|
||||
const int16_t bsums = a_ptr[n].bsums[i * 2 + m * 16] + a_ptr[n].bsums[i * 2 + 1 + m * 16];
|
||||
sum_minf[m][j] += mins[j][i] * bsums * GGML_CPU_FP16_TO_FP32(b_ptr[n].dmin[j]) * a_ptr[n].d[m];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue