From 0d6c4a0135a2616f4459ff96746c07842422acd5 Mon Sep 17 00:00:00 2001 From: Peter Engler Date: Mon, 12 Jan 2026 13:59:39 +0100 Subject: [PATCH] Recommended changes from PR --- ggml/src/ggml-cpu/arch/x86/repack.cpp | 47 +++++++++++++-------------- 1 file changed, 23 insertions(+), 24 deletions(-) diff --git a/ggml/src/ggml-cpu/arch/x86/repack.cpp b/ggml/src/ggml-cpu/arch/x86/repack.cpp index 84b8b83b00..4f1d429a23 100644 --- a/ggml/src/ggml-cpu/arch/x86/repack.cpp +++ b/ggml/src/ggml-cpu/arch/x86/repack.cpp @@ -1946,7 +1946,7 @@ void ggml_gemm_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo static const uint32_t kmask1 = 0x3f3f3f3f; static const uint32_t kmask2 = 0x0f0f0f0f; static const uint32_t kmask3 = 0x03030303; - static const uint32_t kmask_3 = 0x30303030; + static const uint32_t kmask4 = 0x30303030; assert (n % qk == 0); assert (nr % 4 == 0); @@ -2852,11 +2852,11 @@ void ggml_gemm_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo const __m256i rhs_mat_0145_13_sp2 = _mm256_shuffle_epi32(rhs_mat_0145_13, 221); //B10(28-31) B11(28-31) B10(28-31) B11(28-31) B14(28-31) B15(28-31) B14(28-31) B15(28-31) const __m256i rhs_mat_2367_13_sp2 = _mm256_shuffle_epi32(rhs_mat_2367_13, 221); //B12(28-31) B13(28-31) B12(28-31) B13(28-31) B16(28-31) B17(28-31) B16(28-31) B17(28-31) - // Scales and Mins of corresponding sub blocks from different Q8_K structures are stored together + // Scales and Mins of corresponding sub blocks from different Q4_K structures are stored together // The below block is for eg to extract first sub block's scales and mins from different Q4_K structures for the sb loop - const uint32_t utmp_03 = ((utmp[2] >> 4) & kmask2) | ((utmp[1] >> 2) & kmask_3); + const uint32_t utmp_03 = ((utmp[2] >> 4) & kmask2) | ((utmp[1] >> 2) & kmask4); const uint32_t utmp_02 = utmp[1] & kmask1; - const uint32_t utmp_01 = (utmp[2] & kmask2) | ((utmp[0] >> 2) & kmask_3); + const uint32_t utmp_01 = (utmp[2] & kmask2) | ((utmp[0] >> 2) & kmask4); const uint32_t utmp_00 = utmp[0] & kmask1; // Scales of first sub block in the sb loop @@ -2864,9 +2864,9 @@ void ggml_gemm_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo const __m256i scales_0 = _mm256_cvtepu8_epi16(_mm_unpacklo_epi8(mins_and_scales_0, mins_and_scales_0)); // The below block is for eg to extract second sub block's scales and mins from different Q4_K structures for the sb loop - const uint32_t utmp_13 = ((utmp[5] >> 4) & kmask2) | ((utmp[4] >> 2) & kmask_3); + const uint32_t utmp_13 = ((utmp[5] >> 4) & kmask2) | ((utmp[4] >> 2) & kmask4); const uint32_t utmp_12 = utmp[4] & kmask1; - const uint32_t utmp_11 = (utmp[5] & kmask2) | ((utmp[3] >> 2) & kmask_3); + const uint32_t utmp_11 = (utmp[5] & kmask2) | ((utmp[3] >> 2) & kmask4); const uint32_t utmp_10 = utmp[3] & kmask1; // Scales of second sub block in the sb loop @@ -3089,6 +3089,8 @@ void ggml_gemm_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo // dmin values - Load the eight dmin values of block_q4_Kx8 const __m256 col_dmin_f32 = GGML_F32Cx8_LOAD(b_ptr[b].dmin); + const uint32_t *utmp = (const uint32_t*) (b_ptr[b].scales); + // Loop to iterate over the eight sub blocks of a super block - two sub blocks are processed per iteration for (int sb = 0; sb < QK_K / 64; sb++) { @@ -3189,31 +3191,25 @@ void ggml_gemm_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo const __m256i rhs_mat_0145_13_sp2 = _mm256_shuffle_epi32(rhs_mat_0145_13, 221); //B10(28-31) B11(28-31) B10(28-31) B11(28-31) B14(28-31) B15(28-31) B14(28-31) B15(28-31) const __m256i rhs_mat_2367_13_sp2 = _mm256_shuffle_epi32(rhs_mat_2367_13, 221); //B12(28-31) B13(28-31) B12(28-31) B13(28-31) B16(28-31) B17(28-31) B16(28-31) B17(28-31) - uint32_t utmp_0[4], utmp_1[4]; - // Scales and Mins of corresponding sub blocks from different Q4_K structures are stored together // The below block is for eg to extract first sub block's scales and mins from different Q4_K structures for the sb loop - memcpy(utmp_0, b_ptr[b].scales + 24 * sb, 12); - utmp_0[3] = ((utmp_0[2] >> 4) & kmask2) | (((utmp_0[1] >> 6) & kmask3) << 4); - const uint32_t uaux_0 = utmp_0[1] & kmask1; - utmp_0[1] = (utmp_0[2] & kmask2) | (((utmp_0[0] >> 6) & kmask3) << 4); - utmp_0[2] = uaux_0; - utmp_0[0] &= kmask1; - - // The below block is for eg to extract second sub block's scales and mins from different Q4_K structures when sb = 1 - memcpy(utmp_1, b_ptr[b].scales + 12 + sb * 24, 12); - utmp_1[3] = ((utmp_1[2] >> 4) & kmask2) | (((utmp_1[1] >> 6) & kmask3) << 4); - const uint32_t uaux_1 = utmp_1[1] & kmask1; - utmp_1[1] = (utmp_1[2] & kmask2) | (((utmp_1[0] >> 6) & kmask3) << 4); - utmp_1[2] = uaux_1; - utmp_1[0] &= kmask1; + const uint32_t utmp_03 = ((utmp[2] >> 4) & kmask2) | ((utmp[1] >> 2) & kmask4); + const uint32_t utmp_02 = utmp[1] & kmask1; + const uint32_t utmp_01 = (utmp[2] & kmask2) | ((utmp[0] >> 2) & kmask4); + const uint32_t utmp_00 = utmp[0] & kmask1; // Scales of first sub block in the sb loop - const __m128i mins_and_scales_0 = _mm_set_epi32(utmp_0[3], utmp_0[2], utmp_0[1], utmp_0[0]); + __m128i mins_and_scales_0 = _mm_set_epi32(utmp_03, utmp_02, utmp_01, utmp_00); const __m256i scales_0 = _mm256_cvtepu8_epi16(_mm_unpacklo_epi8(mins_and_scales_0, mins_and_scales_0)); + // The below block is for eg to extract second sub block's scales and mins from different Q4_K structures for the sb loop + const uint32_t utmp_13 = ((utmp[5] >> 4) & kmask2) | ((utmp[4] >> 2) & kmask4); + const uint32_t utmp_12 = utmp[4] & kmask1; + const uint32_t utmp_11 = (utmp[5] & kmask2) | ((utmp[3] >> 2) & kmask4); + const uint32_t utmp_10 = utmp[3] & kmask1; + // Scales of second sub block in the sb loop - const __m128i mins_and_scales_1 = _mm_set_epi32(utmp_1[3], utmp_1[2], utmp_1[1], utmp_1[0]); + __m128i mins_and_scales_1 = _mm_set_epi32(utmp_13, utmp_12, utmp_11, utmp_10); const __m256i scales_1 = _mm256_cvtepu8_epi16(_mm_unpacklo_epi8(mins_and_scales_1, mins_and_scales_1)); // Mins of first and second sub block of Q4_K block are arranged side by side @@ -3225,6 +3221,8 @@ void ggml_gemm_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo const __m256i scale_0145_1 = _mm256_shuffle_epi32(scales_1, 68); const __m256i scale_2367_1 = _mm256_shuffle_epi32(scales_1, 238); + utmp += 6; + // Load the four block_q8_k quantized values interleaved with each other in chunks of eight bytes - A0,A1,A2,A3 // Loaded as set of 128 bit vectors and repeated into a 256 bit vector __m256i lhs_mat_0123_00 = _mm256_loadu_si256((const __m256i * )((a_ptr[b].qs + 256 * sb))); @@ -3396,6 +3394,7 @@ void ggml_gemm_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo UNUSED(kmask1); UNUSED(kmask2); UNUSED(kmask3); + UNUSED(kmask4); ggml_gemm_q4_K_8x8_q8_K_generic(n, s, bs, vx, vy, nr, nc); #endif }