From 0d6c4a0135a2616f4459ff96746c07842422acd5 Mon Sep 17 00:00:00 2001
From: Peter Engler <peter@engler-software.de>
Date: Mon, 12 Jan 2026 13:59:39 +0100
Subject: [PATCH] Recommended changes from PR

---
 ggml/src/ggml-cpu/arch/x86/repack.cpp | 47 +++++++++++++--------------
 1 file changed, 23 insertions(+), 24 deletions(-)

diff --git a/ggml/src/ggml-cpu/arch/x86/repack.cpp b/ggml/src/ggml-cpu/arch/x86/repack.cpp
index 84b8b83b00..4f1d429a23 100644
--- a/ggml/src/ggml-cpu/arch/x86/repack.cpp
+++ b/ggml/src/ggml-cpu/arch/x86/repack.cpp
@@ -1946,7 +1946,7 @@ void ggml_gemm_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
     static const uint32_t kmask1 = 0x3f3f3f3f;
     static const uint32_t kmask2 = 0x0f0f0f0f;
     static const uint32_t kmask3 = 0x03030303;
-    static const uint32_t kmask_3 = 0x30303030;
+    static const uint32_t kmask4 = 0x30303030;
 
     assert (n % qk == 0);
     assert (nr % 4 == 0);
@@ -2852,11 +2852,11 @@ void ggml_gemm_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
                     const __m256i rhs_mat_0145_13_sp2 = _mm256_shuffle_epi32(rhs_mat_0145_13, 221); //B10(28-31) B11(28-31) B10(28-31) B11(28-31) B14(28-31) B15(28-31) B14(28-31) B15(28-31)
                     const __m256i rhs_mat_2367_13_sp2 = _mm256_shuffle_epi32(rhs_mat_2367_13, 221); //B12(28-31) B13(28-31) B12(28-31) B13(28-31) B16(28-31) B17(28-31) B16(28-31) B17(28-31)
 
-                    // Scales and Mins of corresponding sub blocks from different Q8_K structures are stored together
+                    // Scales and Mins of corresponding sub blocks from different Q4_K structures are stored together
                     // The below block is for eg to extract first sub block's scales and mins from different Q4_K structures for the sb loop
-                    const uint32_t utmp_03 = ((utmp[2] >> 4) & kmask2) | ((utmp[1] >> 2) & kmask_3);
+                    const uint32_t utmp_03 = ((utmp[2] >> 4) & kmask2) | ((utmp[1] >> 2) & kmask4);
                     const uint32_t utmp_02 = utmp[1] & kmask1;
-                    const uint32_t utmp_01 = (utmp[2] & kmask2) | ((utmp[0] >> 2) & kmask_3);
+                    const uint32_t utmp_01 = (utmp[2] & kmask2) | ((utmp[0] >> 2) & kmask4);
                     const uint32_t utmp_00 = utmp[0] & kmask1;
 
                     // Scales of first sub block in the sb loop
@@ -2864,9 +2864,9 @@ void ggml_gemm_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
                     const __m256i scales_0 = _mm256_cvtepu8_epi16(_mm_unpacklo_epi8(mins_and_scales_0, mins_and_scales_0));
 
                     // The below block is for eg to extract second sub block's scales and mins from different Q4_K structures for the sb loop
-                    const uint32_t utmp_13 = ((utmp[5] >> 4) & kmask2) | ((utmp[4] >> 2) & kmask_3);
+                    const uint32_t utmp_13 = ((utmp[5] >> 4) & kmask2) | ((utmp[4] >> 2) & kmask4);
                     const uint32_t utmp_12 = utmp[4] & kmask1;
-                    const uint32_t utmp_11 = (utmp[5] & kmask2) | ((utmp[3] >> 2) & kmask_3);
+                    const uint32_t utmp_11 = (utmp[5] & kmask2) | ((utmp[3] >> 2) & kmask4);
                     const uint32_t utmp_10 = utmp[3] & kmask1;
 
                     // Scales of second sub block in the sb loop
@@ -3089,6 +3089,8 @@ void ggml_gemm_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
                 // dmin values - Load the eight dmin values of block_q4_Kx8
                 const __m256 col_dmin_f32 = GGML_F32Cx8_LOAD(b_ptr[b].dmin);
 
+                const uint32_t *utmp = (const uint32_t*) (b_ptr[b].scales);
+
                 // Loop to iterate over the eight sub blocks of a super block - two sub blocks are processed per iteration
                 for (int sb = 0; sb < QK_K / 64; sb++) {
 
@@ -3189,31 +3191,25 @@ void ggml_gemm_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
                     const __m256i rhs_mat_0145_13_sp2 = _mm256_shuffle_epi32(rhs_mat_0145_13, 221); //B10(28-31) B11(28-31) B10(28-31) B11(28-31) B14(28-31) B15(28-31) B14(28-31) B15(28-31)
                     const __m256i rhs_mat_2367_13_sp2 = _mm256_shuffle_epi32(rhs_mat_2367_13, 221); //B12(28-31) B13(28-31) B12(28-31) B13(28-31) B16(28-31) B17(28-31) B16(28-31) B17(28-31)
 
-                    uint32_t utmp_0[4], utmp_1[4];
-
                     // Scales and Mins of corresponding sub blocks from different Q4_K structures are stored together
                     // The below block is for eg to extract first sub block's scales and mins from different Q4_K structures for the sb loop
-                    memcpy(utmp_0, b_ptr[b].scales + 24 * sb, 12);
-                    utmp_0[3] = ((utmp_0[2] >> 4) & kmask2) | (((utmp_0[1] >> 6) & kmask3) << 4);
-                    const uint32_t uaux_0 = utmp_0[1] & kmask1;
-                    utmp_0[1] = (utmp_0[2] & kmask2) | (((utmp_0[0] >> 6) & kmask3) << 4);
-                    utmp_0[2] = uaux_0;
-                    utmp_0[0] &= kmask1;
-
-                    // The below block is for eg to extract second sub block's scales and mins from different Q4_K structures when sb = 1
-                    memcpy(utmp_1, b_ptr[b].scales + 12 + sb * 24, 12);
-                    utmp_1[3] = ((utmp_1[2] >> 4) & kmask2) | (((utmp_1[1] >> 6) & kmask3) << 4);
-                    const uint32_t uaux_1 = utmp_1[1] & kmask1;
-                    utmp_1[1] = (utmp_1[2] & kmask2) | (((utmp_1[0] >> 6) & kmask3) << 4);
-                    utmp_1[2] = uaux_1;
-                    utmp_1[0] &= kmask1;
+                    const uint32_t utmp_03 = ((utmp[2] >> 4) & kmask2) | ((utmp[1] >> 2) & kmask4);
+                    const uint32_t utmp_02 = utmp[1] & kmask1;
+                    const uint32_t utmp_01 = (utmp[2] & kmask2) | ((utmp[0] >> 2) & kmask4);
+                    const uint32_t utmp_00 = utmp[0] & kmask1;
 
                     // Scales of first sub block in the sb loop
-                    const __m128i mins_and_scales_0 = _mm_set_epi32(utmp_0[3], utmp_0[2], utmp_0[1], utmp_0[0]);
+                    __m128i mins_and_scales_0 = _mm_set_epi32(utmp_03, utmp_02, utmp_01, utmp_00);
                     const __m256i scales_0 = _mm256_cvtepu8_epi16(_mm_unpacklo_epi8(mins_and_scales_0, mins_and_scales_0));
 
+                    // The below block is for eg to extract second sub block's scales and mins from different Q4_K structures for the sb loop
+                    const uint32_t utmp_13 = ((utmp[5] >> 4) & kmask2) | ((utmp[4] >> 2) & kmask4);
+                    const uint32_t utmp_12 = utmp[4] & kmask1;
+                    const uint32_t utmp_11 = (utmp[5] & kmask2) | ((utmp[3] >> 2) & kmask4);
+                    const uint32_t utmp_10 = utmp[3] & kmask1;
+
                     // Scales of second sub block in the sb loop
-                    const __m128i mins_and_scales_1 = _mm_set_epi32(utmp_1[3], utmp_1[2], utmp_1[1], utmp_1[0]);
+                    __m128i mins_and_scales_1 = _mm_set_epi32(utmp_13, utmp_12, utmp_11, utmp_10);
                     const __m256i scales_1 = _mm256_cvtepu8_epi16(_mm_unpacklo_epi8(mins_and_scales_1, mins_and_scales_1));
 
                     // Mins of first and second sub block of Q4_K block are arranged side by side
@@ -3225,6 +3221,8 @@ void ggml_gemm_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
                     const __m256i scale_0145_1 = _mm256_shuffle_epi32(scales_1, 68);
                     const __m256i scale_2367_1 = _mm256_shuffle_epi32(scales_1, 238);
 
+                    utmp += 6;
+
                     // Load the four block_q8_k quantized values interleaved with each other in chunks of eight bytes - A0,A1,A2,A3
                     // Loaded as set of 128 bit vectors and repeated into a 256 bit vector
                     __m256i lhs_mat_0123_00 = _mm256_loadu_si256((const __m256i * )((a_ptr[b].qs + 256 * sb)));
@@ -3396,6 +3394,7 @@ void ggml_gemm_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
     UNUSED(kmask1);
     UNUSED(kmask2);
     UNUSED(kmask3);
+    UNUSED(kmask4);
     ggml_gemm_q4_K_8x8_q8_K_generic(n, s, bs, vx, vy, nr, nc);
 #endif
 }