From aedac0d7bc02587f1e1868edd56ecc1e2c73ad46 Mon Sep 17 00:00:00 2001
From: Manogna-Sree
Date: Mon, 11 Aug 2025 00:08:05 -0700
Subject: [PATCH 01/23] Initial interleaving support for Q6_K Block Interleaving
---
 ggml/src/ggml-cpu/arch-fallback.h     |   16 +
 ggml/src/ggml-cpu/arch/x86/repack.cpp | 1235 +++++++++++++++++++++++++
 ggml/src/ggml-cpu/repack.cpp          |  105 +++
 ggml/src/ggml-cpu/repack.h            |   12 +
 4 files changed, 1368 insertions(+)

diff --git a/ggml/src/ggml-cpu/arch-fallback.h b/ggml/src/ggml-cpu/arch-fallback.h
index 0775c87f98..effb7bad9b 100644
--- a/ggml/src/ggml-cpu/arch-fallback.h
+++ b/ggml/src/ggml-cpu/arch-fallback.h
@@ -41,6 +41,7 @@
 #define ggml_gemv_q4_K_8x4_q8_K_generic ggml_gemv_q4_K_8x4_q8_K
 #define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
 #define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
+#define ggml_gemv_q6_K_8x8_q8_K_generic ggml_gemv_q6_K_8x8_q8_K
 #define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
 #define ggml_gemv_iq4_nl_8x8_q8_0_generic ggml_gemv_iq4_nl_8x8_q8_0
 #define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
@@ -49,6 +50,7 @@
 #define ggml_gemm_q4_K_8x4_q8_K_generic ggml_gemm_q4_K_8x4_q8_K
 #define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
 #define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
+#define ggml_gemm_q6_K_8x8_q8_K_generic ggml_gemm_q6_K_8x8_q8_K
 #define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
 #define ggml_gemm_iq4_nl_8x8_q8_0_generic ggml_gemm_iq4_nl_8x8_q8_0
 #elif defined(__aarch64__) || defined(__arm__) || defined(_M_ARM) || defined(_M_ARM64)
@@ -57,8 +59,12 @@
 #define ggml_quantize_mat_q8_K_4x8_generic ggml_quantize_mat_q8_K_4x8
 #define ggml_gemv_iq4_nl_8x8_q8_0_generic ggml_gemv_iq4_nl_8x8_q8_0
 #define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
+#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
+#define ggml_gemv_q6_K_8x8_q8_K_generic ggml_gemv_q6_K_8x8_q8_K
 #define ggml_gemm_iq4_nl_8x8_q8_0_generic ggml_gemm_iq4_nl_8x8_q8_0
 #define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
+#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
+#define ggml_gemm_q6_K_8x8_q8_K_generic ggml_gemm_q6_K_8x8_q8_K
 #elif defined(__x86_64__) || defined(__i386__) || defined(_M_IX86) || defined(_M_X64)
 // repack.cpp
 #define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
@@ -89,6 +95,7 @@
 #define ggml_gemv_q4_K_8x4_q8_K_generic ggml_gemv_q4_K_8x4_q8_K
 #define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
 #define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
+#define ggml_gemv_q6_K_8x8_q8_K_generic ggml_gemv_q6_K_8x8_q8_K
 #define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
 #define ggml_gemv_iq4_nl_8x8_q8_0_generic ggml_gemv_iq4_nl_8x8_q8_0
 #define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
@@ -97,6 +104,7 @@
 #define ggml_gemm_q4_K_8x4_q8_K_generic ggml_gemm_q4_K_8x4_q8_K
 #define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
 #define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
+#define ggml_gemm_q6_K_8x8_q8_K_generic ggml_gemm_q6_K_8x8_q8_K
 #define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
 #define ggml_gemm_iq4_nl_8x8_q8_0_generic ggml_gemm_iq4_nl_8x8_q8_0
 #elif defined(__loongarch64)
@@ -117,6 +125,7 @@
 #define ggml_gemv_q4_K_8x4_q8_K_generic ggml_gemv_q4_K_8x4_q8_K
 #define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
 #define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
+#define 
ggml_gemv_q6_K_8x8_q8_K_generic ggml_gemv_q6_K_8x8_q8_K #define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0 #define ggml_gemv_iq4_nl_8x8_q8_0_generic ggml_gemv_iq4_nl_8x8_q8_0 #define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0 @@ -125,6 +134,7 @@ #define ggml_gemm_q4_K_8x4_q8_K_generic ggml_gemm_q4_K_8x4_q8_K #define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K #define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K +#define ggml_gemm_q6_K_8x8_q8_K_generic ggml_gemm_q6_K_8x8_q8_K #define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0 #define ggml_gemm_iq4_nl_8x8_q8_0_generic ggml_gemm_iq4_nl_8x8_q8_0 #elif defined(__riscv) @@ -152,6 +162,7 @@ #define ggml_gemv_q4_K_8x4_q8_K_generic ggml_gemv_q4_K_8x4_q8_K #define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K #define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K +#define ggml_gemv_q6_K_8x8_q8_K_generic ggml_gemv_q6_K_8x8_q8_K #define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0 #define ggml_gemv_iq4_nl_8x8_q8_0_generic ggml_gemv_iq4_nl_8x8_q8_0 #define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0 @@ -159,6 +170,7 @@ #define ggml_gemm_q4_K_8x4_q8_K_generic ggml_gemm_q4_K_8x4_q8_K #define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K #define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K +#define ggml_gemm_q6_K_8x8_q8_K_generic ggml_gemm_q6_K_8x8_q8_K #define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0 #define ggml_gemm_iq4_nl_8x8_q8_0_generic ggml_gemm_iq4_nl_8x8_q8_0 #elif defined(__s390x__) @@ -185,6 +197,7 @@ #define ggml_gemv_q4_K_8x4_q8_K_generic ggml_gemv_q4_K_8x4_q8_K #define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K #define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K +#define ggml_gemv_q6_K_8x8_q8_K_generic ggml_gemv_q6_K_8x8_q8_K #define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0 #define ggml_gemv_iq4_nl_8x8_q8_0_generic ggml_gemv_iq4_nl_8x8_q8_0 #define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0 @@ -193,6 +206,7 @@ #define ggml_gemm_q4_K_8x4_q8_K_generic ggml_gemm_q4_K_8x4_q8_K #define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K #define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K +#define ggml_gemm_q6_K_8x8_q8_K_generic ggml_gemm_q6_K_8x8_q8_K #define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0 #define ggml_gemm_iq4_nl_8x8_q8_0_generic ggml_gemm_iq4_nl_8x8_q8_0 #elif defined(__wasm__) @@ -221,6 +235,7 @@ #define ggml_gemv_q4_K_8x4_q8_K_generic ggml_gemv_q4_K_8x4_q8_K #define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K #define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K +#define ggml_gemv_q6_K_8x8_q8_K_generic ggml_gemv_q6_K_8x8_q8_K #define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0 #define ggml_gemv_iq4_nl_8x8_q8_0_generic ggml_gemv_iq4_nl_8x8_q8_0 #define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0 @@ -229,6 +244,7 @@ #define ggml_gemm_q4_K_8x4_q8_K_generic ggml_gemm_q4_K_8x4_q8_K #define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K #define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K +#define ggml_gemm_q6_K_8x8_q8_K_generic ggml_gemm_q6_K_8x8_q8_K #define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0 #define ggml_gemm_iq4_nl_8x8_q8_0_generic ggml_gemm_iq4_nl_8x8_q8_0 #endif diff --git a/ggml/src/ggml-cpu/arch/x86/repack.cpp b/ggml/src/ggml-cpu/arch/x86/repack.cpp index 7dda9eea0c..d4a29058ba 100644 --- 
a/ggml/src/ggml-cpu/arch/x86/repack.cpp +++ b/ggml/src/ggml-cpu/arch/x86/repack.cpp @@ -1938,6 +1938,373 @@ void ggml_gemv_q2_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo #endif } +void ggml_gemv_q6_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { + const int qk = QK_K; + const int nb = n / qk; + const int ncols_interleaved = 8; + const int blocklen = 8; + + assert (n % qk == 0); + assert (nc % ncols_interleaved == 0); + + UNUSED(s); + UNUSED(bs); + UNUSED(vx); + UNUSED(vy); + UNUSED(nr); + UNUSED(nc); + UNUSED(nb); + UNUSED(ncols_interleaved); + UNUSED(blocklen); + +#if defined(__AVX2__) + // Shuffle masks to rearrange delta values to multiply with appropriate scales + __m128i deltamask = _mm_set_epi8(15, 14, 7, 6, 13, 12, 5, 4, 11, 10, 3, 2, 9, 8, 1, 0); + // Permute mask used for easier vector processing at later stages + __m256i finalpermutemask = _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0); + + const __m256i m3b = _mm256_set1_epi8(3); + const __m256i m4b = _mm256_set1_epi8(0xF); + const __m256i m32s = _mm256_set1_epi8(32); + + //Mask to get appropriate scales + __m128i scalemask1 = _mm_set_epi8(14,14,6,6,12,12,4,4,10,10,2,2,8,8,0,0); + __m128i scalemask2 = _mm_set_epi8(15,15,7,7,13,13,5,5,11,11,3,3,9,9,1,1); + + int64_t b_nb = n / QK_K; + + const block_q6_Kx8 * b_ptr_start = (const block_q6_Kx8 *)vx; + const block_q8_K * a_ptr_start = (const block_q8_K *)vy; + + // Process Q8_K blocks one by one + for (int64_t y = 0; y < nr; y++) { + + // Pointers to LHS blocks of block_q8_K format + const block_q8_K * a_ptr = a_ptr_start + (y * nb); + + // Take group of eight interleaved block_q6_K structures at each pass of the loop and perform dot product operation + for(int64_t x = 0; x < nc / 8; x++) { + + // Pointers to RHS blocks + const block_q6_Kx8 * b_ptr = b_ptr_start + (x * b_nb); + + // Master FP accumulators + __m256 acc_row = _mm256_setzero_ps(); + __m256 acc_min_rows = _mm256_setzero_ps(); + + for (int64_t b = 0; b < nb; b++) { + + // Load and convert to FP32 delta from block_q8_K + const __m256 row_scale_f32 = _mm256_set1_ps((a_ptr[b].d)); + + // Load the delta values for the 8 blocks interleaved in block_q6_Kx8 + // col_scale_f32 rearranged so as to multiply with appropriate quants + const __m256 col_scale_f32 = GGML_F32Cx8_REARRANGE_LOAD(b_ptr[b].d, deltamask); + + __m256i iacc_b = _mm256_setzero_si256(); + + // Processes eight sub blocks from each Q6_K in each iteration + for(int sb = 0; sb < QK_K / 128; sb++) { + + // Load the high bits(bit 5, 6) of eight block_q6_K for eight sub blocks quantized values interleaved with each other in chunks of eight - B0,B1 ....B6,B7 + const __m256i rhs_raw_vec_qh_0123_0 = _mm256_loadu_si256((const __m256i * )(b_ptr[b].qh + sb * 256)); + const __m256i rhs_raw_vec_qh_4567_0 = _mm256_loadu_si256((const __m256i * )(b_ptr[b].qh + 32 + sb * 256)); + const __m256i rhs_raw_vec_qh_0123_1 = _mm256_loadu_si256((const __m256i * )(b_ptr[b].qh + 64 + sb * 256)); + const __m256i rhs_raw_vec_qh_4567_1 = _mm256_loadu_si256((const __m256i * )(b_ptr[b].qh + 96 + sb * 256)); + const __m256i rhs_raw_vec_qh_0123_2 = _mm256_loadu_si256((const __m256i * )(b_ptr[b].qh + 128 + sb * 256)); + const __m256i rhs_raw_vec_qh_4567_2 = _mm256_loadu_si256((const __m256i * )(b_ptr[b].qh + 160 + sb * 256)); + const __m256i rhs_raw_vec_qh_0123_3 = _mm256_loadu_si256((const __m256i * )(b_ptr[b].qh + 192 + sb * 256)); + const __m256i rhs_raw_vec_qh_4567_3 = _mm256_loadu_si256((const 
__m256i * )(b_ptr[b].qh + 224 + sb * 256)); + + // 2-bit -> 8-bit + // Values of the 0th,2nd,4th,6th sub blocks of eight block_q6_K structures for the sb loop + const __m256i rhs_vec_qh_0123_00 = _mm256_slli_epi16(_mm256_and_si256(rhs_raw_vec_qh_0123_0, m3b), 4); //B00(0-7) B01(0-7) B02(0-7) B03(0-7) + const __m256i rhs_vec_qh_0123_20 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_qh_0123_0, 2), m3b), 4); //B20(0-7) B21(0-7) B22(0-7) B23(0-7) + const __m256i rhs_vec_qh_0123_40 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_qh_0123_0, 4), m3b), 4); //B40(0-7) B41(0-7) B42(0-7) B43(0-7) + const __m256i rhs_vec_qh_0123_60 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_qh_0123_0, 6), m3b), 4); //B60(0-7) B61(0-7) B62(0-7) B63(0-7) + + const __m256i rhs_vec_qh_4567_00 = _mm256_slli_epi16(_mm256_and_si256(rhs_raw_vec_qh_4567_0, m3b), 4); //B04(0-7) B05(0-7) B06(0-7) B07(0-7) + const __m256i rhs_vec_qh_4567_20 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_qh_4567_0, 2), m3b), 4); //B24(0-7) B25(0-7) B26(0-7) B27(0-7) + const __m256i rhs_vec_qh_4567_40 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_qh_4567_0, 4), m3b), 4); //B44(0-7) B45(0-7) B46(0-7) B47(0-7) + const __m256i rhs_vec_qh_4567_60 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_qh_4567_0, 6), m3b), 4); //B64(0-7) B65(0-7) B66(0-7) B67(0-7) + + const __m256i rhs_vec_qh_0123_01 = _mm256_slli_epi16(_mm256_and_si256(rhs_raw_vec_qh_0123_1, m3b), 4); //B00(8-15) B01(8-15) B02(8-15) B03(8-15) + const __m256i rhs_vec_qh_0123_21 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_qh_0123_1, 2), m3b), 4); //B20(8-15) B21(8-15) B22(8-15) B23(8-15) + const __m256i rhs_vec_qh_0123_41 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_qh_0123_1, 4), m3b), 4); //B40(8-15) B41(8-15) B42(8-15) B43(8-15) + const __m256i rhs_vec_qh_0123_61 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_qh_0123_1, 6), m3b), 4); //B60(8-15) B61(8-15) B62(8-15) B63(8-15) + + const __m256i rhs_vec_qh_4567_01 = _mm256_slli_epi16(_mm256_and_si256(rhs_raw_vec_qh_4567_1, m3b), 4); //B04(8-15) B05(8-15) B06(8-15) B07(8-15) + const __m256i rhs_vec_qh_4567_21 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_qh_4567_1, 2), m3b), 4); //B24(8-15) B25(8-15) B26(8-15) B27(8-15) + const __m256i rhs_vec_qh_4567_41 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_qh_4567_1, 4), m3b), 4); //B44(8-15) B45(8-15) B46(8-15) B47(8-15) + const __m256i rhs_vec_qh_4567_61 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_qh_4567_1, 6), m3b), 4); //B64(8-15) B65(8-15) B66(8-15) B67(8-15) + + // Values of the 1st,3rd,5th,7th sub blocks of eight block_q6_K structures for the sb loop + const __m256i rhs_vec_qh_0123_10 = _mm256_slli_epi16(_mm256_and_si256(rhs_raw_vec_qh_0123_2, m3b), 4); //B10(0-7) B11(0-7) B12(0-7) B13(0-7) + const __m256i rhs_vec_qh_0123_30 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_qh_0123_2, 2), m3b), 4); //B30(0-7) B31(0-7) B32(0-7) B33(0-7) + const __m256i rhs_vec_qh_0123_50 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_qh_0123_2, 4), m3b), 4); //B50(0-7) B51(0-7) B52(0-7) B53(0-7) + const __m256i rhs_vec_qh_0123_70 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_qh_0123_2, 6), m3b), 4); //B70(0-7) B71(0-7) B72(0-7) B73(0-7) + + const __m256i rhs_vec_qh_4567_10 = 
_mm256_slli_epi16(_mm256_and_si256(rhs_raw_vec_qh_4567_2, m3b), 4); //B14(0-7) B15(0-7) B16(0-7) B17(0-7) + const __m256i rhs_vec_qh_4567_30 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_qh_4567_2, 2), m3b), 4); //B34(0-7) B35(0-7) B36(0-7) B37(0-7) + const __m256i rhs_vec_qh_4567_50 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_qh_4567_2, 4), m3b), 4); //B54(0-7) B55(0-7) B56(0-7) B57(0-7) + const __m256i rhs_vec_qh_4567_70 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_qh_4567_2, 6), m3b), 4); //B74(0-7) B75(0-7) B76(0-7) B77(0-7) + + const __m256i rhs_vec_qh_0123_11 = _mm256_slli_epi16(_mm256_and_si256(rhs_raw_vec_qh_0123_3, m3b), 4); //B10(8-15) B11(8-15) B12(8-15) B13(8-15) + const __m256i rhs_vec_qh_0123_31 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_qh_0123_3, 2), m3b), 4); //B30(8-15) B31(8-15) B32(8-15) B33(8-15) + const __m256i rhs_vec_qh_0123_51 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_qh_0123_3, 4), m3b), 4); //B50(8-15) B51(8-15) B52(8-15) B53(8-15) + const __m256i rhs_vec_qh_0123_71 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_qh_0123_3, 6), m3b), 4); //B70(8-15) B71(8-15) B72(8-15) B73(8-15) + + const __m256i rhs_vec_qh_4567_11 = _mm256_slli_epi16(_mm256_and_si256(rhs_raw_vec_qh_4567_3, m3b), 4); //B14(8-15) B15(8-15) B16(8-15) B17(8-15) + const __m256i rhs_vec_qh_4567_31 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_qh_4567_3, 2), m3b), 4); //B34(8-15) B35(8-15) B36(8-15) B37(8-15) + const __m256i rhs_vec_qh_4567_51 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_qh_4567_3, 4), m3b), 4); //B54(8-15) B55(8-15) B56(8-15) B57(8-15) + const __m256i rhs_vec_qh_4567_71 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_qh_4567_3, 6), m3b), 4); //B74(8-15) B75(8-15) B76(8-15) B77(8-15) + + // Load the lower bits(bits 0 - 3) of eight block_q6_K for eight sub blocks quantized values interleaved with each other in chunks of eight - B0,B1 ....B6,B7 + const __m256i rhs_raw_vec_ql_0123_0 = _mm256_loadu_si256((const __m256i * )(b_ptr[b].ql + sb * 512)); // 0 - 8, +64 + const __m256i rhs_raw_vec_ql_4567_0 = _mm256_loadu_si256((const __m256i * )(b_ptr[b].ql + 32 + sb * 512)); // 0 - 8 + const __m256i rhs_raw_vec_ql_0123_1 = _mm256_loadu_si256((const __m256i * )(b_ptr[b].ql + 64 + sb * 512)); // 8 - 15 + const __m256i rhs_raw_vec_ql_4567_1 = _mm256_loadu_si256((const __m256i * )(b_ptr[b].ql + 96 + sb * 512)); // 8 - 15 + const __m256i rhs_raw_vec_ql_0123_2 = _mm256_loadu_si256((const __m256i * )(b_ptr[b].ql + 128 + sb * 512)); // 16 - 23 + const __m256i rhs_raw_vec_ql_4567_2 = _mm256_loadu_si256((const __m256i * )(b_ptr[b].ql + 160 + sb * 512)); // 16 - 23 + const __m256i rhs_raw_vec_ql_0123_3 = _mm256_loadu_si256((const __m256i * )(b_ptr[b].ql + 192 + sb * 512)); // 24 - 31 + const __m256i rhs_raw_vec_ql_4567_3 = _mm256_loadu_si256((const __m256i * )(b_ptr[b].ql + 224 + sb * 512)); // 24 - 31 + const __m256i rhs_raw_vec_ql_0123_4 = _mm256_loadu_si256((const __m256i * )(b_ptr[b].ql + 256 + sb * 512)); + const __m256i rhs_raw_vec_ql_4567_4 = _mm256_loadu_si256((const __m256i * )(b_ptr[b].ql + 288 + sb * 512)); + const __m256i rhs_raw_vec_ql_0123_5 = _mm256_loadu_si256((const __m256i * )(b_ptr[b].ql + 320 + sb * 512)); + const __m256i rhs_raw_vec_ql_4567_5 = _mm256_loadu_si256((const __m256i * )(b_ptr[b].ql + 352 + sb * 512)); + const __m256i rhs_raw_vec_ql_0123_6 = _mm256_loadu_si256((const __m256i * 
)(b_ptr[b].ql + 384 + sb * 512)); + const __m256i rhs_raw_vec_ql_4567_6 = _mm256_loadu_si256((const __m256i * )(b_ptr[b].ql + 416 + sb * 512)); + const __m256i rhs_raw_vec_ql_0123_7 = _mm256_loadu_si256((const __m256i * )(b_ptr[b].ql + 448 + sb * 512)); + const __m256i rhs_raw_vec_ql_4567_7 = _mm256_loadu_si256((const __m256i * )(b_ptr[b].ql + 480 + sb * 512)); + + // 0 -7, 64 - 71 + const __m256i rhs_vec_0123_00 = _mm256_or_si256(_mm256_and_si256(rhs_raw_vec_ql_0123_0, m4b), rhs_vec_qh_0123_00); + const __m256i rhs_vec_0123_40 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_ql_0123_0, 4), m4b), rhs_vec_qh_0123_40); + + const __m256i rhs_vec_4567_00 = _mm256_or_si256(_mm256_and_si256(rhs_raw_vec_ql_4567_0, m4b), rhs_vec_qh_4567_00); + const __m256i rhs_vec_4567_40 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_ql_4567_0, 4), m4b), rhs_vec_qh_4567_40); + + // 8 - 15, 72 - 79 + const __m256i rhs_vec_0123_01 = _mm256_or_si256(_mm256_and_si256(rhs_raw_vec_ql_0123_1, m4b), rhs_vec_qh_0123_01); + const __m256i rhs_vec_0123_41 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_ql_0123_1, 4), m4b), rhs_vec_qh_0123_41); + + const __m256i rhs_vec_4567_01 = _mm256_or_si256(_mm256_and_si256(rhs_raw_vec_ql_4567_1, m4b), rhs_vec_qh_4567_01); + const __m256i rhs_vec_4567_41 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_ql_4567_1, 4), m4b), rhs_vec_qh_4567_41); + + // 16 - 23, 80 - 87 + const __m256i rhs_vec_0123_10 = _mm256_or_si256(_mm256_and_si256(rhs_raw_vec_ql_0123_2, m4b), rhs_vec_qh_0123_10); + const __m256i rhs_vec_0123_50 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_ql_0123_2, 4), m4b), rhs_vec_qh_0123_50); + + const __m256i rhs_vec_4567_10 = _mm256_or_si256(_mm256_and_si256(rhs_raw_vec_ql_4567_2, m4b), rhs_vec_qh_4567_10); + const __m256i rhs_vec_4567_50 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_ql_4567_2, 4), m4b), rhs_vec_qh_4567_50); + + // 24 - 31, 88 - 95 + const __m256i rhs_vec_0123_11 = _mm256_or_si256(_mm256_and_si256(rhs_raw_vec_ql_0123_3, m4b), rhs_vec_qh_0123_11); + const __m256i rhs_vec_0123_51 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_ql_0123_3, 4), m4b), rhs_vec_qh_0123_51); + + const __m256i rhs_vec_4567_11 = _mm256_or_si256(_mm256_and_si256(rhs_raw_vec_ql_4567_3, m4b), rhs_vec_qh_4567_11); + const __m256i rhs_vec_4567_51 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_ql_4567_3, 4), m4b), rhs_vec_qh_4567_51); + + // 32 - 39, 96 - 103 + const __m256i rhs_vec_0123_20 = _mm256_or_si256(_mm256_and_si256(rhs_raw_vec_ql_0123_4, m4b), rhs_vec_qh_0123_20); + const __m256i rhs_vec_0123_60 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_ql_0123_4, 4), m4b), rhs_vec_qh_0123_60); + + const __m256i rhs_vec_4567_20 = _mm256_or_si256(_mm256_and_si256(rhs_raw_vec_ql_4567_4, m4b), rhs_vec_qh_4567_20); + const __m256i rhs_vec_4567_60 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_ql_4567_4, 4), m4b), rhs_vec_qh_4567_60); + + // 40 - 47, 104 - 111 + const __m256i rhs_vec_0123_21 = _mm256_or_si256(_mm256_and_si256(rhs_raw_vec_ql_0123_5, m4b), rhs_vec_qh_0123_21); + const __m256i rhs_vec_0123_61 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_ql_0123_5, 4), m4b), rhs_vec_qh_0123_61); + + const __m256i rhs_vec_4567_21 = _mm256_or_si256(_mm256_and_si256(rhs_raw_vec_ql_4567_5, m4b), rhs_vec_qh_4567_21); + const __m256i rhs_vec_4567_61 = 
_mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_ql_4567_5, 4), m4b), rhs_vec_qh_4567_61); + + // 48 - 55, 112 - 119 + const __m256i rhs_vec_0123_30 = _mm256_or_si256(_mm256_and_si256(rhs_raw_vec_ql_0123_6, m4b), rhs_vec_qh_0123_30); + const __m256i rhs_vec_0123_70 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_ql_0123_6, 4), m4b), rhs_vec_qh_0123_70); + + const __m256i rhs_vec_4567_30 = _mm256_or_si256(_mm256_and_si256(rhs_raw_vec_ql_4567_6, m4b), rhs_vec_qh_4567_30); + const __m256i rhs_vec_4567_70 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_ql_4567_6, 4), m4b), rhs_vec_qh_4567_70); + + // 56 - 63, 120 - 127 + const __m256i rhs_vec_0123_31 = _mm256_or_si256(_mm256_and_si256(rhs_raw_vec_ql_0123_7, m4b), rhs_vec_qh_0123_31); + const __m256i rhs_vec_0123_71 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_ql_0123_7, 4), m4b), rhs_vec_qh_0123_71); + + const __m256i rhs_vec_4567_31 = _mm256_or_si256(_mm256_and_si256(rhs_raw_vec_ql_4567_7, m4b), rhs_vec_qh_4567_31); + const __m256i rhs_vec_4567_71 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_ql_4567_7, 4), m4b), rhs_vec_qh_4567_71); + + //Scales of corresponding sub blocks from different Q6_K structures are stored together + //s00 s01 s10 s11 s20 s21 s30 s31 s40 s41 s50 s51 s60 s61 s70 s71 //s02 s03 //s04 s05 //s06 s07 + + const __m128i scales_01 = _mm_loadu_si128((const __m128i *)(b_ptr[b].scales + sb * 64)); + const __m128i scales_23 = _mm_loadu_si128((const __m128i *)(b_ptr[b].scales + 16 + sb * 64)); + const __m128i scales_45 = _mm_loadu_si128((const __m128i *)(b_ptr[b].scales + 32 + sb * 64)); + const __m128i scales_67 = _mm_loadu_si128((const __m128i *)(b_ptr[b].scales + 48 + sb * 64)); + + // Scales of sub blocks in the sb loop + // Scales of the 0th sub block from each super block + __m128i scales_rearrange_0 = _mm_shuffle_epi8(scales_01, scalemask1); + __m256i scales_0 = _mm256_cvtepi8_epi16(scales_rearrange_0); + + // Scales of the 1st sub block from each super block + __m128i scales_rearrange_1 = _mm_shuffle_epi8(scales_01, scalemask2); + __m256i scales_1 = _mm256_cvtepi8_epi16(scales_rearrange_1); + + // Scales of the 2nd sub block from each super block + __m128i scales_rearrange_2 = _mm_shuffle_epi8(scales_23, scalemask1); + __m256i scales_2 = _mm256_cvtepi8_epi16(scales_rearrange_2); + + // Scales of the 3rd sub block from each super block + __m128i scales_rearrange_3 = _mm_shuffle_epi8(scales_23, scalemask2); + __m256i scales_3 = _mm256_cvtepi8_epi16(scales_rearrange_3); + + // Scales of the 4th sub block from each super block + __m128i scales_rearrange_4 = _mm_shuffle_epi8(scales_45, scalemask1); + __m256i scales_4 = _mm256_cvtepi8_epi16(scales_rearrange_4); + + // Scales of the 5th sub block from each super block + __m128i scales_rearrange_5 = _mm_shuffle_epi8(scales_45, scalemask2); + __m256i scales_5 = _mm256_cvtepi8_epi16(scales_rearrange_5); + + // Scales of the 6th sub block from each super block + __m128i scales_rearrange_6 = _mm_shuffle_epi8(scales_67, scalemask1); + __m256i scales_6 = _mm256_cvtepi8_epi16(scales_rearrange_6); + + // Scales of the 7th sub block from each super block + __m128i scales_rearrange_7 = _mm_shuffle_epi8(scales_67, scalemask2); + __m256i scales_7 = _mm256_cvtepi8_epi16(scales_rearrange_7); + + // Load the sub block values corresponding to sb in block_q8_K in batches of 16 bytes and replicate the same across 256 bit vector + __m256i lhs_vec_0 = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i 
*)(a_ptr[b].qs + sb * 128))); + __m256i lhs_vec_1 = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(a_ptr[b].qs + 16 + sb * 128))); + __m256i lhs_vec_2 = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(a_ptr[b].qs + 32 + sb * 128))); + __m256i lhs_vec_3 = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(a_ptr[b].qs + 48 + sb * 128))); + __m256i lhs_vec_4 = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(a_ptr[b].qs + 64 + sb * 128))); + __m256i lhs_vec_5 = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(a_ptr[b].qs + 80 + sb * 128))); + __m256i lhs_vec_6 = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(a_ptr[b].qs + 96 + sb * 128))); + __m256i lhs_vec_7 = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(a_ptr[b].qs + 112 + sb * 128))); + + lhs_vec_0 = _mm256_permute2f128_si256(lhs_vec_0, lhs_vec_0, 0); + lhs_vec_1 = _mm256_permute2f128_si256(lhs_vec_1, lhs_vec_1, 0); + lhs_vec_2 = _mm256_permute2f128_si256(lhs_vec_2, lhs_vec_2, 0); + lhs_vec_3 = _mm256_permute2f128_si256(lhs_vec_3, lhs_vec_3, 0); + lhs_vec_4 = _mm256_permute2f128_si256(lhs_vec_4, lhs_vec_4, 0); + lhs_vec_5 = _mm256_permute2f128_si256(lhs_vec_5, lhs_vec_5, 0); + lhs_vec_6 = _mm256_permute2f128_si256(lhs_vec_6, lhs_vec_6, 0); + lhs_vec_7 = _mm256_permute2f128_si256(lhs_vec_7, lhs_vec_7, 0); + + __m256i lhs_vec_s_0 = _mm256_maddubs_epi16(lhs_vec_0, m32s); + __m256i lhs_vec_s_1 = _mm256_maddubs_epi16(lhs_vec_1, m32s); + __m256i lhs_vec_s_2 = _mm256_maddubs_epi16(lhs_vec_2, m32s); + __m256i lhs_vec_s_3 = _mm256_maddubs_epi16(lhs_vec_3, m32s); + __m256i lhs_vec_s_4 = _mm256_maddubs_epi16(lhs_vec_4, m32s); + __m256i lhs_vec_s_5 = _mm256_maddubs_epi16(lhs_vec_5, m32s); + __m256i lhs_vec_s_6 = _mm256_maddubs_epi16(lhs_vec_6, m32s); + __m256i lhs_vec_s_7 = _mm256_maddubs_epi16(lhs_vec_7, m32s); + + __m256i iacc_0 = _mm256_setzero_si256(); + __m256i iacc_1 = _mm256_setzero_si256(); + __m256i iacc_2 = _mm256_setzero_si256(); + __m256i iacc_3 = _mm256_setzero_si256(); + __m256i iacc_4 = _mm256_setzero_si256(); + __m256i iacc_5 = _mm256_setzero_si256(); + __m256i iacc_6 = _mm256_setzero_si256(); + __m256i iacc_7 = _mm256_setzero_si256(); + + // Dot product done within 32 bit lanes and accumulated in the same vector + // First done for 0th sub block and then for seven (1st - 7th) other sub blocks processed for each sb (sb < QK_K/128 loop) + // B0(0-3) B4(0-3) B1(0-3) B5(0-3) B2(0-3) B6(0-3) B3(0-3) B7(0-3) with A0(0-3) + // B0(4-7) B4(4-7) B1(4-7) B5(4-7) B2(4-7) B6(4-7) B3(4-7) B7(4-7) with A0(4-7) + // B0(8-11) B4(8-11) B1(8-11) B5(8-11) B2(8-11) B6(8-11) B3(8-11) B7(8-11) with A0(8-11) + // B0(12-15) B4(12-15) B1(12-15) B5(12-15) B2(12-15) B6(12-15) B3(12-15) B7(12-15) with A0(12-15) + + iacc_0 = _mm256_add_epi16(iacc_0, _mm256_sub_epi16(_mm256_maddubs_epi16(_mm256_blend_epi32(rhs_vec_0123_00 ,_mm256_shuffle_epi32(rhs_vec_4567_00, 177), 170), _mm256_shuffle_epi32(lhs_vec_0, 0)), _mm256_shuffle_epi32(lhs_vec_s_0, 0))); + iacc_0 = _mm256_add_epi16(iacc_0, _mm256_sub_epi16(_mm256_maddubs_epi16(_mm256_blend_epi32(_mm256_shuffle_epi32(rhs_vec_0123_00, 177) ,rhs_vec_4567_00, 170), _mm256_shuffle_epi32(lhs_vec_0, 85)), _mm256_shuffle_epi32(lhs_vec_s_0, 85))); + + iacc_0 = _mm256_add_epi16(iacc_0, _mm256_sub_epi16(_mm256_maddubs_epi16(_mm256_blend_epi32(rhs_vec_0123_01 ,_mm256_shuffle_epi32(rhs_vec_4567_01, 177), 170), _mm256_shuffle_epi32(lhs_vec_0, 170)), _mm256_shuffle_epi32(lhs_vec_s_0, 170))); + iacc_0 = _mm256_add_epi16(iacc_0, 
_mm256_sub_epi16(_mm256_maddubs_epi16(_mm256_blend_epi32(_mm256_shuffle_epi32(rhs_vec_0123_01, 177) ,rhs_vec_4567_01, 170), _mm256_shuffle_epi32(lhs_vec_0, 255)), _mm256_shuffle_epi32(lhs_vec_s_0, 255))); + + iacc_0 = _mm256_madd_epi16(iacc_0, scales_0); + + iacc_1 = _mm256_add_epi16(iacc_1, _mm256_sub_epi16(_mm256_maddubs_epi16(_mm256_blend_epi32(rhs_vec_0123_10 ,_mm256_shuffle_epi32(rhs_vec_4567_10, 177), 170), _mm256_shuffle_epi32(lhs_vec_1, 0)), _mm256_shuffle_epi32(lhs_vec_s_1, 0))); + iacc_1 = _mm256_add_epi16(iacc_1, _mm256_sub_epi16(_mm256_maddubs_epi16(_mm256_blend_epi32(_mm256_shuffle_epi32(rhs_vec_0123_10, 177) ,rhs_vec_4567_10, 170), _mm256_shuffle_epi32(lhs_vec_1, 85)), _mm256_shuffle_epi32(lhs_vec_s_1, 85))); + + iacc_1 = _mm256_add_epi16(iacc_1, _mm256_sub_epi16(_mm256_maddubs_epi16(_mm256_blend_epi32(rhs_vec_0123_11 ,_mm256_shuffle_epi32(rhs_vec_4567_11, 177), 170), _mm256_shuffle_epi32(lhs_vec_1, 170)), _mm256_shuffle_epi32(lhs_vec_s_1, 170))); + iacc_1 = _mm256_add_epi16(iacc_1, _mm256_sub_epi16(_mm256_maddubs_epi16(_mm256_blend_epi32(_mm256_shuffle_epi32(rhs_vec_0123_11, 177) ,rhs_vec_4567_11, 170), _mm256_shuffle_epi32(lhs_vec_1, 255)), _mm256_shuffle_epi32(lhs_vec_s_1, 255))); + + iacc_1 = _mm256_madd_epi16(iacc_1, scales_1); + + iacc_2 = _mm256_add_epi16(iacc_2, _mm256_sub_epi16(_mm256_maddubs_epi16(_mm256_blend_epi32(rhs_vec_0123_20 ,_mm256_shuffle_epi32(rhs_vec_4567_20, 177), 170), _mm256_shuffle_epi32(lhs_vec_2, 0)), _mm256_shuffle_epi32(lhs_vec_s_2, 0))); + iacc_2 = _mm256_add_epi16(iacc_2, _mm256_sub_epi16(_mm256_maddubs_epi16(_mm256_blend_epi32(_mm256_shuffle_epi32(rhs_vec_0123_20, 177) ,rhs_vec_4567_20, 170), _mm256_shuffle_epi32(lhs_vec_2, 85)), _mm256_shuffle_epi32(lhs_vec_s_2, 85))); + + iacc_2 = _mm256_add_epi16(iacc_2, _mm256_sub_epi16(_mm256_maddubs_epi16(_mm256_blend_epi32(rhs_vec_0123_21 ,_mm256_shuffle_epi32(rhs_vec_4567_21, 177), 170), _mm256_shuffle_epi32(lhs_vec_2, 170)), _mm256_shuffle_epi32(lhs_vec_s_2, 170))); + iacc_2 = _mm256_add_epi16(iacc_2, _mm256_sub_epi16(_mm256_maddubs_epi16(_mm256_blend_epi32(_mm256_shuffle_epi32(rhs_vec_0123_21, 177) ,rhs_vec_4567_21, 170), _mm256_shuffle_epi32(lhs_vec_2, 255)), _mm256_shuffle_epi32(lhs_vec_s_2, 255))); + + iacc_2 = _mm256_madd_epi16(iacc_2, scales_2); + + iacc_3 = _mm256_add_epi16(iacc_3, _mm256_sub_epi16(_mm256_maddubs_epi16(_mm256_blend_epi32(rhs_vec_0123_30 ,_mm256_shuffle_epi32(rhs_vec_4567_30, 177), 170), _mm256_shuffle_epi32(lhs_vec_3, 0)), _mm256_shuffle_epi32(lhs_vec_s_3, 0))); + iacc_3 = _mm256_add_epi16(iacc_3, _mm256_sub_epi16(_mm256_maddubs_epi16(_mm256_blend_epi32(_mm256_shuffle_epi32(rhs_vec_0123_30, 177) ,rhs_vec_4567_30, 170), _mm256_shuffle_epi32(lhs_vec_3, 85)), _mm256_shuffle_epi32(lhs_vec_s_3, 85))); + + iacc_3 = _mm256_add_epi16(iacc_3, _mm256_sub_epi16(_mm256_maddubs_epi16(_mm256_blend_epi32(rhs_vec_0123_31 ,_mm256_shuffle_epi32(rhs_vec_4567_31, 177), 170), _mm256_shuffle_epi32(lhs_vec_3, 170)), _mm256_shuffle_epi32(lhs_vec_s_3, 170))); + iacc_3 = _mm256_add_epi16(iacc_3, _mm256_sub_epi16(_mm256_maddubs_epi16(_mm256_blend_epi32(_mm256_shuffle_epi32(rhs_vec_0123_31, 177) ,rhs_vec_4567_31, 170), _mm256_shuffle_epi32(lhs_vec_3, 255)), _mm256_shuffle_epi32(lhs_vec_s_3, 255))); + + iacc_3 = _mm256_madd_epi16(iacc_3, scales_3); + + iacc_4 = _mm256_add_epi16(iacc_4, _mm256_sub_epi16(_mm256_maddubs_epi16(_mm256_blend_epi32(rhs_vec_0123_40 ,_mm256_shuffle_epi32(rhs_vec_4567_40, 177), 170), _mm256_shuffle_epi32(lhs_vec_4, 0)), _mm256_shuffle_epi32(lhs_vec_s_4, 0))); + iacc_4 = 
_mm256_add_epi16(iacc_4, _mm256_sub_epi16(_mm256_maddubs_epi16(_mm256_blend_epi32(_mm256_shuffle_epi32(rhs_vec_0123_40, 177) ,rhs_vec_4567_40, 170), _mm256_shuffle_epi32(lhs_vec_4, 85)), _mm256_shuffle_epi32(lhs_vec_s_4, 85))); + + iacc_4 = _mm256_add_epi16(iacc_4, _mm256_sub_epi16(_mm256_maddubs_epi16(_mm256_blend_epi32(rhs_vec_0123_41 ,_mm256_shuffle_epi32(rhs_vec_4567_41, 177), 170), _mm256_shuffle_epi32(lhs_vec_4, 170)), _mm256_shuffle_epi32(lhs_vec_s_4, 170))); + iacc_4 = _mm256_add_epi16(iacc_4, _mm256_sub_epi16(_mm256_maddubs_epi16(_mm256_blend_epi32(_mm256_shuffle_epi32(rhs_vec_0123_41, 177) ,rhs_vec_4567_41, 170), _mm256_shuffle_epi32(lhs_vec_4, 255)), _mm256_shuffle_epi32(lhs_vec_s_4, 255))); + + iacc_4 = _mm256_madd_epi16(iacc_4, scales_4); + + iacc_5 = _mm256_add_epi16(iacc_5, _mm256_sub_epi16(_mm256_maddubs_epi16(_mm256_blend_epi32(rhs_vec_0123_50 ,_mm256_shuffle_epi32(rhs_vec_4567_50, 177), 170), _mm256_shuffle_epi32(lhs_vec_5, 0)), _mm256_shuffle_epi32(lhs_vec_s_5, 0))); + iacc_5 = _mm256_add_epi16(iacc_5, _mm256_sub_epi16(_mm256_maddubs_epi16(_mm256_blend_epi32(_mm256_shuffle_epi32(rhs_vec_0123_50, 177) ,rhs_vec_4567_50, 170), _mm256_shuffle_epi32(lhs_vec_5, 85)), _mm256_shuffle_epi32(lhs_vec_s_5, 85))); + + iacc_5 = _mm256_add_epi16(iacc_5, _mm256_sub_epi16(_mm256_maddubs_epi16(_mm256_blend_epi32(rhs_vec_0123_51 ,_mm256_shuffle_epi32(rhs_vec_4567_51, 177), 170), _mm256_shuffle_epi32(lhs_vec_5, 170)), _mm256_shuffle_epi32(lhs_vec_s_5, 170))); + iacc_5 = _mm256_add_epi16(iacc_5, _mm256_sub_epi16(_mm256_maddubs_epi16(_mm256_blend_epi32(_mm256_shuffle_epi32(rhs_vec_0123_51, 177) ,rhs_vec_4567_51, 170), _mm256_shuffle_epi32(lhs_vec_5, 255)), _mm256_shuffle_epi32(lhs_vec_s_5, 255))); + + iacc_5 = _mm256_madd_epi16(iacc_5, scales_5); + + iacc_6 = _mm256_add_epi16(iacc_6, _mm256_sub_epi16(_mm256_maddubs_epi16(_mm256_blend_epi32(rhs_vec_0123_60 ,_mm256_shuffle_epi32(rhs_vec_4567_60, 177), 170), _mm256_shuffle_epi32(lhs_vec_6, 0)), _mm256_shuffle_epi32(lhs_vec_s_6, 0))); + iacc_6 = _mm256_add_epi16(iacc_6, _mm256_sub_epi16(_mm256_maddubs_epi16(_mm256_blend_epi32(_mm256_shuffle_epi32(rhs_vec_0123_60, 177) ,rhs_vec_4567_60, 170), _mm256_shuffle_epi32(lhs_vec_6, 85)), _mm256_shuffle_epi32(lhs_vec_s_6, 85))); + + iacc_6 = _mm256_add_epi16(iacc_6, _mm256_sub_epi16(_mm256_maddubs_epi16(_mm256_blend_epi32(rhs_vec_0123_61 ,_mm256_shuffle_epi32(rhs_vec_4567_61, 177), 170), _mm256_shuffle_epi32(lhs_vec_6, 170)), _mm256_shuffle_epi32(lhs_vec_s_6, 170))); + iacc_6 = _mm256_add_epi16(iacc_6, _mm256_sub_epi16(_mm256_maddubs_epi16(_mm256_blend_epi32(_mm256_shuffle_epi32(rhs_vec_0123_61, 177) ,rhs_vec_4567_61, 170), _mm256_shuffle_epi32(lhs_vec_6, 255)), _mm256_shuffle_epi32(lhs_vec_s_6, 255))); + + iacc_6 = _mm256_madd_epi16(iacc_6, scales_6); + + iacc_7 = _mm256_add_epi16(iacc_7, _mm256_sub_epi16(_mm256_maddubs_epi16(_mm256_blend_epi32(rhs_vec_0123_70 ,_mm256_shuffle_epi32(rhs_vec_4567_70, 177), 170), _mm256_shuffle_epi32(lhs_vec_7, 0)), _mm256_shuffle_epi32(lhs_vec_s_7, 0))); + iacc_7 = _mm256_add_epi16(iacc_7, _mm256_sub_epi16(_mm256_maddubs_epi16(_mm256_blend_epi32(_mm256_shuffle_epi32(rhs_vec_0123_70, 177) ,rhs_vec_4567_70, 170), _mm256_shuffle_epi32(lhs_vec_7, 85)), _mm256_shuffle_epi32(lhs_vec_s_7, 85))); + + iacc_7 = _mm256_add_epi16(iacc_7, _mm256_sub_epi16(_mm256_maddubs_epi16(_mm256_blend_epi32(rhs_vec_0123_71 ,_mm256_shuffle_epi32(rhs_vec_4567_71, 177), 170), _mm256_shuffle_epi32(lhs_vec_7, 170)), _mm256_shuffle_epi32(lhs_vec_s_7, 170))); + iacc_7 = _mm256_add_epi16(iacc_7, 
_mm256_sub_epi16(_mm256_maddubs_epi16(_mm256_blend_epi32(_mm256_shuffle_epi32(rhs_vec_0123_71, 177) ,rhs_vec_4567_71, 170), _mm256_shuffle_epi32(lhs_vec_7, 255)), _mm256_shuffle_epi32(lhs_vec_s_7, 255))); + + iacc_7 = _mm256_madd_epi16(iacc_7, scales_7); + + // Accumulate the iacc value for one sb + __m256i iacc_sb = _mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(iacc_0, iacc_1), _mm256_add_epi32(iacc_2, iacc_3)), _mm256_add_epi32(_mm256_add_epi32(iacc_4, iacc_5), _mm256_add_epi32(iacc_6, iacc_7))); + + // Accumulate for the complete block + iacc_b = _mm256_add_epi32(iacc_b, iacc_sb); + } + + //Multiply-Add with scale values for complete super block + acc_row = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_b), _mm256_mul_ps(col_scale_f32, row_scale_f32), acc_row); + } + // Accumulated output values permuted so as to be stored in appropriate order post accumulation + acc_row = _mm256_permutevar8x32_ps(acc_row, finalpermutemask); + _mm256_storeu_ps(s + (y * nr + x * 8), acc_row); + } + } +#else + + ggml_gemv_q6_K_8x8_q8_K_generic(n, s, bs, vx, vy, nr, nc); + +#endif +} + + void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { #if defined(__AVX2__) || defined(__AVX512F__) { @@ -6305,3 +6672,871 @@ void ggml_gemm_q2_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo #endif } + +void ggml_gemm_q6_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { + const int qk = QK_K; + const int nb = n / qk; + const int ncols_interleaved = 8; + const int blocklen = 8; + + assert (n % qk == 0); + assert (nr % 4 == 0); + assert (nc % ncols_interleaved == 0); + + UNUSED(s); + UNUSED(bs); + UNUSED(vx); + UNUSED(vy); + UNUSED(nr); + UNUSED(nc); + UNUSED(nb); + UNUSED(ncols_interleaved); + UNUSED(blocklen); + +#if defined(__AVX2__) + const block_q6_Kx8 * b_ptr_start = (const block_q6_Kx8 * ) vx; + const block_q8_Kx4 * a_ptr_start = (const block_q8_Kx4 * ) vy; + int64_t b_nb = n / QK_K; + int64_t y = 0; + + // Mask to mask out nibbles from packed bytes + // Permute mask used for easier vector processing at later stages + __m256i requiredOrder = _mm256_set_epi32(3, 2, 1, 0, 7, 6, 5, 4); + int64_t xstart = 0; + int anr = nr - nr % 16;; // Used to align nr with boundary of 16 + + // Mask to mask out nibbles from packed bytes + const __m256i m4 = _mm256_set1_epi8(0xF); + const __m256i m2 = _mm256_set1_epi8(3); + const __m256i m32s = _mm256_set1_epi8(32); + + //Mask to get appropriate scales + __m128i scalesmask1_sse = _mm_set_epi8(14,14,12,12,10,10,8,8,6,6,4,4,2,2,0,0); + __m128i scalesmask2_sse = _mm_set_epi8(15,15,13,13,11,11,9,9,7,7,5,5,3,3,1,1); + + + __m256i scalesmask1 = _mm256_castsi128_si256(scalesmask1_sse); + scalesmask1 = _mm256_permute2f128_si256(scalesmask1, scalesmask1, 0); + __m256i scalesmask2 = _mm256_castsi128_si256(scalesmask2_sse); + scalesmask2 = _mm256_permute2f128_si256(scalesmask2, scalesmask2, 0); + + for (; y < anr / 4; y += 4){ + + const block_q8_Kx4 * a_ptrs[4]; + + a_ptrs[0] = a_ptr_start + (y * nb); + for (int i = 0; i < 3; ++i) { + a_ptrs[i + 1] = a_ptrs[i] + nb; + } + // Take group of eight block_q6_kx8 structures at each pass of the loop and perform dot product operation + for (int64_t x = xstart; x < nc / 8; x++) { + + const block_q6_Kx8 * b_ptr = b_ptr_start + (x * b_nb); + + // Master FP accumulators + __m256 acc_rows[16]; + for (int i = 0; i < 16; i++) { + acc_rows[i] = _mm256_setzero_ps(); + } + 
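+ // Note on the Q6_K layout (a short recap of the generic dequantization, added for readability):
+ // each weight is 6 bits wide, stored as a low nibble in ql plus two high bits in qh,
+ // i.e. q = (ql & 0xF) | ((qh & 3) << 4), with the signed value being q - 32.
+ // The loads below rebuild q in-register by shifting the qh bits left by 4 (after masking with m2)
+ // and OR-ing them onto the masked low nibbles (m4); m32s (= 32) is there to account for the
+ // -32 offset on the activation side, as in the GEMV kernel above.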
+ // For super block + for (int64_t b = 0; b < nb; b++) { + // Delta values - Load the eight scale values of block_q6_kx8 + const __m256 col_scale_f32 = GGML_F32Cx8_LOAD(b_ptr[b].d); + + for (int sb = 0; sb < QK_K / 128; sb++) { + const __m256i rhs_raw_mat_0123_0 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].ql + sb * 512)); + const __m256i rhs_raw_mat_4567_0 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].ql + 32 + sb * 512)); + const __m256i rhs_raw_mat_0123_1 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].ql + 64 + sb * 512)); + const __m256i rhs_raw_mat_4567_1 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].ql + 96 + sb * 512)); + const __m256i rhs_raw_mat_0123_2 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].ql + 128 + sb * 512)); + const __m256i rhs_raw_mat_4567_2 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].ql + 160 + sb * 512)); + const __m256i rhs_raw_mat_0123_3 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].ql + 192 + sb * 512)); + const __m256i rhs_raw_mat_4567_3 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].ql + 224 + sb * 512)); + + const __m256i rhs_raw_mat_0123_4 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].ql + 256 + sb * 512)); + const __m256i rhs_raw_mat_4567_4 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].ql + 288 + sb * 512)); + const __m256i rhs_raw_mat_0123_5 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].ql + 320 + sb * 512)); + const __m256i rhs_raw_mat_4567_5 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].ql + 352 + sb * 512)); + const __m256i rhs_raw_mat_0123_6 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].ql + 384 + sb * 512)); + const __m256i rhs_raw_mat_4567_6 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].ql + 416 + sb * 512)); + const __m256i rhs_raw_mat_0123_7 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].ql + 448 + sb * 512)); + const __m256i rhs_raw_mat_4567_7 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].ql + 480 + sb * 512)); + + const __m256i rhs_raw_hbit_0123_0 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qh + sb * 256)); + const __m256i rhs_raw_hbit_4567_0 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qh + 32 + sb * 256)); + const __m256i rhs_raw_hbit_0123_1 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qh + 64 + sb * 256)); + const __m256i rhs_raw_hbit_4567_1 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qh + 96 + sb * 256)); + const __m256i rhs_raw_hbit_0123_2 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qh + 128 + sb * 256)); + const __m256i rhs_raw_hbit_4567_2 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qh + 160 + sb * 256)); + const __m256i rhs_raw_hbit_0123_3 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qh + 192 + sb * 256)); + const __m256i rhs_raw_hbit_4567_3 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qh + 224 + sb * 256)); + + // Indices 0 through 7 (first block): + const __m256i rhs_raw_mat_0145_0 = _mm256_blend_epi32(rhs_raw_mat_0123_0, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_0, requiredOrder), 240); + const __m256i rhs_raw_mat_2367_0 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_0, requiredOrder), rhs_raw_mat_4567_0, 240); + const __m256i rhs_raw_mat_0145_1 = _mm256_blend_epi32(rhs_raw_mat_0123_1, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_1, requiredOrder), 240); + const __m256i rhs_raw_mat_2367_1 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_1, requiredOrder), rhs_raw_mat_4567_1, 240); + const __m256i rhs_raw_mat_0145_2 = _mm256_blend_epi32(rhs_raw_mat_0123_2, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_2, requiredOrder), 240); + const __m256i 
rhs_raw_mat_2367_2 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_2, requiredOrder), rhs_raw_mat_4567_2, 240); + const __m256i rhs_raw_mat_0145_3 = _mm256_blend_epi32(rhs_raw_mat_0123_3, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_3, requiredOrder), 240); + const __m256i rhs_raw_mat_2367_3 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_3, requiredOrder), rhs_raw_mat_4567_3, 240); + + // Indices 4 through 7 (second block): + const __m256i rhs_raw_mat_0145_4 = _mm256_blend_epi32(rhs_raw_mat_0123_4, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_4, requiredOrder), 240); + const __m256i rhs_raw_mat_2367_4 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_4, requiredOrder), rhs_raw_mat_4567_4, 240); + const __m256i rhs_raw_mat_0145_5 = _mm256_blend_epi32(rhs_raw_mat_0123_5, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_5, requiredOrder), 240); + const __m256i rhs_raw_mat_2367_5 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_5, requiredOrder), rhs_raw_mat_4567_5, 240); + const __m256i rhs_raw_mat_0145_6 = _mm256_blend_epi32(rhs_raw_mat_0123_6, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_6, requiredOrder), 240); + const __m256i rhs_raw_mat_2367_6 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_6, requiredOrder), rhs_raw_mat_4567_6, 240); + const __m256i rhs_raw_mat_0145_7 = _mm256_blend_epi32(rhs_raw_mat_0123_7, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_7, requiredOrder), 240); + const __m256i rhs_raw_mat_2367_7 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_7, requiredOrder), rhs_raw_mat_4567_7, 240); + + const __m256i rhs_raw_hbit_0145_0 = _mm256_blend_epi32(rhs_raw_hbit_0123_0, _mm256_permutevar8x32_epi32(rhs_raw_hbit_4567_0, requiredOrder), 240); + const __m256i rhs_raw_hbit_2367_0 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_hbit_0123_0, requiredOrder), rhs_raw_hbit_4567_0, 240); + const __m256i rhs_raw_hbit_0145_1 = _mm256_blend_epi32(rhs_raw_hbit_0123_1, _mm256_permutevar8x32_epi32(rhs_raw_hbit_4567_1, requiredOrder), 240); + const __m256i rhs_raw_hbit_2367_1 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_hbit_0123_1, requiredOrder), rhs_raw_hbit_4567_1, 240); + const __m256i rhs_raw_hbit_0145_2 = _mm256_blend_epi32(rhs_raw_hbit_0123_2, _mm256_permutevar8x32_epi32(rhs_raw_hbit_4567_2, requiredOrder), 240); + const __m256i rhs_raw_hbit_2367_2 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_hbit_0123_2, requiredOrder), rhs_raw_hbit_4567_2, 240); + const __m256i rhs_raw_hbit_0145_3 = _mm256_blend_epi32(rhs_raw_hbit_0123_3, _mm256_permutevar8x32_epi32(rhs_raw_hbit_4567_3, requiredOrder), 240); + const __m256i rhs_raw_hbit_2367_3 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_hbit_0123_3, requiredOrder), rhs_raw_hbit_4567_3, 240); + + // 2-bit -> 8-bit + // Values of the 0th,2nd,4th,6th sub blocks of eight block_q6_K structures for the sb loop + const __m256i rhs_hbit_0145_00 = _mm256_slli_epi16(_mm256_and_si256(rhs_raw_hbit_0145_0, m2), 4); //B00(0-7) B01(0-7) B04(0-7) B05(0-7) + const __m256i rhs_hbit_0145_20 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0145_0, 2), m2), 4); //B20(0-7) B21(0-7) B24(0-7) B25(0-7) + const __m256i rhs_hbit_0145_40 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0145_0, 4), m2), 4); //B40(0-7) B41(0-7) B44(0-7) B45(0-7) + const __m256i rhs_hbit_0145_60 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0145_0, 6), m2), 4); //B60(0-7) B61(0-7) B64(0-7) 
B65(0-7) + + const __m256i rhs_hbit_2367_00 = _mm256_slli_epi16(_mm256_and_si256(rhs_raw_hbit_2367_0, m2), 4); //B02(0-7) B03(0-7) B06(0-7) B07(0-7) + const __m256i rhs_hbit_2367_20 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_2367_0, 2), m2), 4); //B22(0-7) B23(0-7) B26(0-7) B27(0-7) + const __m256i rhs_hbit_2367_40 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_2367_0, 4), m2), 4); //B42(0-7) B43(0-7) B46(0-7) B47(0-7) + const __m256i rhs_hbit_2367_60 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_2367_0, 6), m2), 4); //B62(0-7) B63(0-7) B66(0-7) B67(0-7) + + const __m256i rhs_hbit_0145_01 = _mm256_slli_epi16(_mm256_and_si256(rhs_raw_hbit_0145_1, m2), 4); //B00(8-15) B01(8-15) B04(8-15) B05(8-15) + const __m256i rhs_hbit_0145_21 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0145_1, 2), m2), 4); //B20(8-15) B21(8-15) B24(8-15) B25(8-15) + const __m256i rhs_hbit_0145_41 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0145_1, 4), m2), 4); //B40(8-15) B41(8-15) B44(8-15) B45(8-15) + const __m256i rhs_hbit_0145_61 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0145_1, 6), m2), 4); //B60(8-15) B61(8-15) B64(8-15) B65(8-15) + + const __m256i rhs_hbit_2367_01 = _mm256_slli_epi16(_mm256_and_si256(rhs_raw_hbit_2367_1, m2), 4); //B02(8-15) B03(8-15) B06(8-15) B07(8-15) + const __m256i rhs_hbit_2367_21 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_2367_1, 2), m2), 4); //B22(8-15) B23(8-15) B26(8-15) B27(8-15) + const __m256i rhs_hbit_2367_41 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_2367_1, 4), m2), 4); //B42(8-15) B43(8-15) B46(8-15) B47(8-15) + const __m256i rhs_hbit_2367_61 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_2367_1, 6), m2), 4); //B62(8-15) B63(8-15) B66(8-15) B67(8-15) + + // Values of the 1st,3rd,5th,7th sub blocks of eight block_q6_K structures for the sb loop + const __m256i rhs_hbit_0145_10 = _mm256_slli_epi16(_mm256_and_si256(rhs_raw_hbit_0145_2, m2), 4); //B10(0-7) B11(0-7) B14(0-7) B15(0-7) + const __m256i rhs_hbit_0145_30 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0145_2, 2), m2), 4); //B30(0-7) B31(0-7) B34(0-7) B35(0-7) + const __m256i rhs_hbit_0145_50 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0145_2, 4), m2), 4); //B50(0-7) B51(0-7) B54(0-7) B55(0-7) + const __m256i rhs_hbit_0145_70 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0145_2, 6), m2), 4); //B70(0-7) B71(0-7) B74(0-7) B75(0-7) + + const __m256i rhs_hbit_2367_10 = _mm256_slli_epi16(_mm256_and_si256(rhs_raw_hbit_2367_2, m2), 4); //B12(0-7) B13(0-7) B16(0-7) B17(0-7) + const __m256i rhs_hbit_2367_30 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_2367_2, 2), m2), 4); //B32(0-7) B33(0-7) B36(0-7) B37(0-7) + const __m256i rhs_hbit_2367_50 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_2367_2, 4), m2), 4); //B52(0-7) B53(0-7) B56(0-7) B57(0-7) + const __m256i rhs_hbit_2367_70 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_2367_2, 6), m2), 4); //B72(0-7) B73(0-7) B76(0-7) B77(0-7) + + const __m256i rhs_hbit_0145_11 = _mm256_slli_epi16(_mm256_and_si256(rhs_raw_hbit_0145_3, m2), 4); //B10(8-15) B11(8-15) B14(8-15) B15(8-15) + const __m256i rhs_hbit_0145_31 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0145_3, 2), m2), 4); //B30(8-15) B31(8-15) B34(8-15) B35(8-15) + const 
__m256i rhs_hbit_0145_51 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0145_3, 4), m2), 4); //B50(8-15) B51(8-15) B54(8-15) B55(8-15) + const __m256i rhs_hbit_0145_71 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0145_3, 6), m2), 4); //B70(8-15) B71(8-15) B74(8-15) B75(8-15) + + const __m256i rhs_hbit_2367_11 = _mm256_slli_epi16(_mm256_and_si256(rhs_raw_hbit_2367_3, m2), 4); //B12(8-15) B13(8-15) B16(8-15) B17(8-15) + const __m256i rhs_hbit_2367_31 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_2367_3, 2), m2), 4); //B32(8-15) B33(8-15) B36(8-15) B37(8-15) + const __m256i rhs_hbit_2367_51 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_2367_3, 4), m2), 4); //B52(8-15) B53(8-15) B56(8-15) B57(8-15) + const __m256i rhs_hbit_2367_71 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_2367_3, 6), m2), 4); //B72(8-15) B73(8-15) B76(8-15) B77(8-15) + + // 0 -7, 64 - 71 + const __m256i rhs_mat_0145_00 = _mm256_or_si256(_mm256_and_si256(rhs_raw_mat_0145_0, m4), rhs_hbit_0145_00); + const __m256i rhs_mat_0145_40 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_0145_0, 4), m4), rhs_hbit_0145_40); + + const __m256i rhs_mat_2367_00 = _mm256_or_si256(_mm256_and_si256(rhs_raw_mat_2367_0, m4), rhs_hbit_2367_00); + const __m256i rhs_mat_2367_40 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_2367_0, 4), m4), rhs_hbit_2367_40); + + // 8 - 15, 72 - 79 + const __m256i rhs_mat_0145_01 = _mm256_or_si256(_mm256_and_si256(rhs_raw_mat_0145_1, m4), rhs_hbit_0145_01); + const __m256i rhs_mat_0145_41 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_0145_1, 4), m4), rhs_hbit_0145_41); + + const __m256i rhs_mat_2367_01 = _mm256_or_si256(_mm256_and_si256(rhs_raw_mat_2367_1, m4), rhs_hbit_2367_01); + const __m256i rhs_mat_2367_41 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_2367_1, 4), m4), rhs_hbit_2367_41); + + // 16 - 23, 80 - 87 + const __m256i rhs_mat_0145_10 = _mm256_or_si256(_mm256_and_si256(rhs_raw_mat_0145_2, m4), rhs_hbit_0145_10); + const __m256i rhs_mat_0145_50 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_0145_2, 4), m4), rhs_hbit_0145_50); + + const __m256i rhs_mat_2367_10 = _mm256_or_si256(_mm256_and_si256(rhs_raw_mat_2367_2, m4), rhs_hbit_2367_10); + const __m256i rhs_mat_2367_50 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_2367_2, 4), m4), rhs_hbit_2367_50); + + // 24 - 31, 88 - 95 + const __m256i rhs_mat_0145_11 = _mm256_or_si256(_mm256_and_si256(rhs_raw_mat_0145_3, m4), rhs_hbit_0145_11); + const __m256i rhs_mat_0145_51 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_0145_3, 4), m4), rhs_hbit_0145_51); + + const __m256i rhs_mat_2367_11 = _mm256_or_si256(_mm256_and_si256(rhs_raw_mat_2367_3, m4), rhs_hbit_2367_11); + const __m256i rhs_mat_2367_51 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_2367_3, 4), m4), rhs_hbit_2367_51); + + // 32 - 39, 96 - 103 + const __m256i rhs_mat_0145_20 = _mm256_or_si256(_mm256_and_si256(rhs_raw_mat_0145_4, m4), rhs_hbit_0145_20); + const __m256i rhs_mat_0145_60 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_0145_4, 4), m4), rhs_hbit_0145_60); + + const __m256i rhs_mat_2367_20 = _mm256_or_si256(_mm256_and_si256(rhs_raw_mat_2367_4, m4), rhs_hbit_2367_20); + const __m256i rhs_mat_2367_60 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_2367_4, 4), m4), rhs_hbit_2367_60); + + // 40 - 47, 104 - 111 + 
const __m256i rhs_mat_0145_21 = _mm256_or_si256(_mm256_and_si256(rhs_raw_mat_0145_5, m4), rhs_hbit_0145_21); + const __m256i rhs_mat_0145_61 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_0145_5, 4), m4), rhs_hbit_0145_61); + + const __m256i rhs_mat_2367_21 = _mm256_or_si256(_mm256_and_si256(rhs_raw_mat_2367_5, m4), rhs_hbit_2367_21); + const __m256i rhs_mat_2367_61 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_2367_5, 4), m4), rhs_hbit_2367_61); + + // 48 - 55, 112 - 119 + const __m256i rhs_mat_0145_30 = _mm256_or_si256(_mm256_and_si256(rhs_raw_mat_0145_6, m4), rhs_hbit_0145_30); + const __m256i rhs_mat_0145_70 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_0145_6, 4), m4), rhs_hbit_0145_70); + + const __m256i rhs_mat_2367_30 = _mm256_or_si256(_mm256_and_si256(rhs_raw_mat_2367_6, m4), rhs_hbit_2367_30); + const __m256i rhs_mat_2367_70 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_2367_6, 4), m4), rhs_hbit_2367_70); + + // 56 - 63, 120 - 127 + const __m256i rhs_mat_0145_31 = _mm256_or_si256(_mm256_and_si256(rhs_raw_mat_0145_7, m4), rhs_hbit_0145_31); + const __m256i rhs_mat_0145_71 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_0145_7, 4), m4), rhs_hbit_0145_71); + + const __m256i rhs_mat_2367_31 = _mm256_or_si256(_mm256_and_si256(rhs_raw_mat_2367_7, m4), rhs_hbit_2367_31); + const __m256i rhs_mat_2367_71 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_2367_7, 4), m4), rhs_hbit_2367_71); + + + // Shuffle pattern one - right side input + const __m256i rhs_mat_0145_00_sp1 = _mm256_shuffle_epi32(rhs_mat_0145_00, 136); //B00(0-3) B01(0-3) B00(0-3) B01(0-3) B04(0-3) B05(0-3) B04(0-3) B05(0-3) + const __m256i rhs_mat_2367_00_sp1 = _mm256_shuffle_epi32(rhs_mat_2367_00, 136); //B02(0-3) B03(0-3) B02(0-3) B03(0-3) B06(0-3) B07(0-3) B06(0-3) B07(0-3) + + const __m256i rhs_mat_0145_01_sp1 = _mm256_shuffle_epi32(rhs_mat_0145_01, 136); //B00(8-11) B01(8-11) B00(8-11) B01(8-11) B04(8-11) B05(8-11) B04(8-11) B05(8-11) + const __m256i rhs_mat_2367_01_sp1 = _mm256_shuffle_epi32(rhs_mat_2367_01, 136); //B02(8-11) B03(8-11) B02(8-11) B03(8-11) B06(8-11) B07(8-11) B06(8-11) B07(8-11) + + const __m256i rhs_mat_0145_10_sp1 = _mm256_shuffle_epi32(rhs_mat_0145_10, 136); //B10(0-3) B11(0-3) B10(0-3) B11(0-3) B14(0-3) B15(0-3) B14(0-3) B15(0-3) + const __m256i rhs_mat_2367_10_sp1 = _mm256_shuffle_epi32(rhs_mat_2367_10, 136); //B12(0-3) B13(0-3) B12(0-3) B13(0-3) B16(0-3) B17(0-3) B16(0-3) B17(0-3) + + const __m256i rhs_mat_0145_11_sp1 = _mm256_shuffle_epi32(rhs_mat_0145_11, 136); //B10(8-11) B11(8-11) B10(8-11) B11(8-11) B14(8-11) B15(8-11) B14(8-11) B15(8-11) + const __m256i rhs_mat_2367_11_sp1 = _mm256_shuffle_epi32(rhs_mat_2367_11, 136); //B12(8-11) B13(8-11) B12(8-11) B13(8-11) B16(8-11) B17(8-11) B16(8-11) B17(8-11) + + const __m256i rhs_mat_0145_20_sp1 = _mm256_shuffle_epi32(rhs_mat_0145_20, 136); //B20(0-3) B21(0-3) B20(0-3) B21(0-3) B24(0-3) B25(0-3) B24(0-3) B25(0-3) + const __m256i rhs_mat_2367_20_sp1 = _mm256_shuffle_epi32(rhs_mat_2367_20, 136); //B22(0-3) B23(0-3) B22(0-3) B23(0-3) B26(0-3) B27(0-3) B26(0-3) B27(0-3) + + const __m256i rhs_mat_0145_21_sp1 = _mm256_shuffle_epi32(rhs_mat_0145_21, 136); //B20(8-11) B21(8-11) B20(8-11) B21(8-11) B24(8-11) B25(8-11) B24(8-11) B25(8-11) + const __m256i rhs_mat_2367_21_sp1 = _mm256_shuffle_epi32(rhs_mat_2367_21, 136); //B22(8-11) B23(8-11) B22(8-11) B23(8-11) B26(8-11) B27(8-11) B26(8-11) B27(8-11) + + const __m256i rhs_mat_0145_30_sp1 = 
_mm256_shuffle_epi32(rhs_mat_0145_30, 136); //B30(0-3) B31(0-3) B30(0-3) B31(0-3) B34(0-3) B35(0-3) B34(0-3) B35(0-3) + const __m256i rhs_mat_2367_30_sp1 = _mm256_shuffle_epi32(rhs_mat_2367_30, 136); //B32(0-3) B33(0-3) B32(0-3) B33(0-3) B36(0-3) B37(0-3) B36(0-3) B37(0-3) + + const __m256i rhs_mat_0145_31_sp1 = _mm256_shuffle_epi32(rhs_mat_0145_31, 136); //B30(8-11) B31(8-11) B30(8-11) B31(8-11) B34(8-11) B35(8-11) B34(8-11) B35(8-11 + const __m256i rhs_mat_2367_31_sp1 = _mm256_shuffle_epi32(rhs_mat_2367_31, 136); //B32(8-11) B33(8-11) B32(8-11) B33(8-11) B36(8-11) B37(8-11) B36(8-11) B37(8-11) + + const __m256i rhs_mat_0145_40_sp1 = _mm256_shuffle_epi32(rhs_mat_0145_40, 136); //B40(0-3) B41(0-3) B40(0-3) B41(0-3) B44(0-3) B45(0-3) B44(0-3) B45(0-3) + const __m256i rhs_mat_2367_40_sp1 = _mm256_shuffle_epi32(rhs_mat_2367_40, 136); //B42(0-3) B43(0-3) B42(0-3) B43(0-3) B46(0-3) B47(0-3) B46(0-3) B47(0-3) + + const __m256i rhs_mat_0145_41_sp1 = _mm256_shuffle_epi32(rhs_mat_0145_41, 136); //B40(8-11) B41(8-11) B40(8-11) B41(8-11) B44(8-11) B45(8-11) B44(8-11) B45(8-11) + const __m256i rhs_mat_2367_41_sp1 = _mm256_shuffle_epi32(rhs_mat_2367_41, 136); //B42(8-11) B43(8-11) B42(8-11) B43(8-11) B46(8-11) B47(8-11) B46(8-11) B47(8-11) + + const __m256i rhs_mat_0145_50_sp1 = _mm256_shuffle_epi32(rhs_mat_0145_50, 136); //B50(0-3) B51(0-3) B50(0-3) B51(0-3) B54(0-3) B55(0-3) B54(0-3) B55(0-3) + const __m256i rhs_mat_2367_50_sp1 = _mm256_shuffle_epi32(rhs_mat_2367_50, 136); //B52(0-3) B53(0-3) B52(0-3) B53(0-3) B56(0-3) B57(0-3) B56(0-3) B57(0-3) + + const __m256i rhs_mat_0145_51_sp1 = _mm256_shuffle_epi32(rhs_mat_0145_51, 136); //B50(8-11) B51(8-11) B50(8-11) B51(8-11) B54(8-11) B55(8-11) B54(8-11) B55(8-11) + const __m256i rhs_mat_2367_51_sp1 = _mm256_shuffle_epi32(rhs_mat_2367_51, 136); //B52(8-11) B53(8-11) B52(8-11) B53(8-11) B56(8-11) B57(8-11) B56(8-11) B57(8-11) + + const __m256i rhs_mat_0145_60_sp1 = _mm256_shuffle_epi32(rhs_mat_0145_60, 136); //B60(0-3) B61(0-3) B60(0-3) B61(0-3) B64(0-3) B65(0-3) B64(0-3) B65(0-3) + const __m256i rhs_mat_2367_60_sp1 = _mm256_shuffle_epi32(rhs_mat_2367_60, 136); //B62(0-3) B63(0-3) B62(0-3) B63(0-3) B66(0-3) B67(0-3) B66(0-3) B67(0-3) + + const __m256i rhs_mat_0145_61_sp1 = _mm256_shuffle_epi32(rhs_mat_0145_61, 136); //B60(8-11) B61(8-11) B60(8-11) B61(8-11) B64(8-11) B65(8-11) B64(8-11) B65(8-11) + const __m256i rhs_mat_2367_61_sp1 = _mm256_shuffle_epi32(rhs_mat_2367_61, 136); //B62(8-11) B63(8-11) B62(8-11) B63(8-11) B66(8-11) B67(8-11) B66(8-11) B67(8-11) + + const __m256i rhs_mat_0145_70_sp1 = _mm256_shuffle_epi32(rhs_mat_0145_70, 136); //B70(0-3) B71(0-3) B70(0-3) B71(0-3) B74(0-3) B75(0-3) B74(0-3) B75(0-3) + const __m256i rhs_mat_2367_70_sp1 = _mm256_shuffle_epi32(rhs_mat_2367_70, 136); //B72(0-3) B73(0-3) B72(0-3) B73(0-3) B76(0-3) B77(0-3) B76(0-3) B77(0-3) + + const __m256i rhs_mat_0145_71_sp1 = _mm256_shuffle_epi32(rhs_mat_0145_71, 136); //B70(8-11) B71(8-11) B70(8-11) B71(8-11) B74(8-11) B75(8-11) B74(8-11) B75(8-11) + const __m256i rhs_mat_2367_71_sp1 = _mm256_shuffle_epi32(rhs_mat_2367_71, 136); //B72(8-11) B73(8-11) B72(8-11) B73(8-11) B76(8-11) B77(8-11) B76(8-11) B77(8-11) + + + // Shuffle pattern two - right side input + const __m256i rhs_mat_0145_00_sp2 = _mm256_shuffle_epi32(rhs_mat_0145_00, 221); //B00(4-7) B01(4-7) B00(4-7) B01(4-7) B04(4-7) B05(4-7) B04(4-7) B05(4-7) + const __m256i rhs_mat_2367_00_sp2 = _mm256_shuffle_epi32(rhs_mat_2367_00, 221); //B02(4-7) B03(4-7) B02(4-7) B03(4-7) B06(4-7) B07(4-7) B06(4-7) B07(4-7) + + const 
__m256i rhs_mat_0145_01_sp2 = _mm256_shuffle_epi32(rhs_mat_0145_01, 221); //B00(12-15) B01(12-15) B00(12-15) B01(12-15) B04(12-15) B05(12-15) B04(12-15) B05(12-15) + const __m256i rhs_mat_2367_01_sp2 = _mm256_shuffle_epi32(rhs_mat_2367_01, 221); //B02(12-15) B03(12-15) B02(12-15) B03(12-15) B06(12-15) B07(12-15) B06(12-15) B07(12-15) + + const __m256i rhs_mat_0145_10_sp2 = _mm256_shuffle_epi32(rhs_mat_0145_10, 221); //B10(4-7) B11(4-7) B10(4-7) B11(4-7) B14(4-7) B15(4-7) B14(4-7) B15(4-7) + const __m256i rhs_mat_2367_10_sp2 = _mm256_shuffle_epi32(rhs_mat_2367_10, 221); //B12(4-7) B13(4-7) B12(4-7) B13(4-7) B16(4-7) B17(4-7) B16(4-7) B17(4-7) + + const __m256i rhs_mat_0145_11_sp2 = _mm256_shuffle_epi32(rhs_mat_0145_11, 221); //B10(12-15) B11(12-15) B10(12-15) B11(12-15) B14(12-15) B15(12-15) B14(12-15) B15(12-15) + const __m256i rhs_mat_2367_11_sp2 = _mm256_shuffle_epi32(rhs_mat_2367_11, 221); //B12(12-15) B13(12-15) B12(12-15) B13(12-15) B16(12-15) B17(12-15) B16(12-15) B17(12-15) + + const __m256i rhs_mat_0145_20_sp2 = _mm256_shuffle_epi32(rhs_mat_0145_20, 221); //B20(4-7) B21(4-7) B20(4-7) B21(4-7) B24(4-7) B25(4-7) B24(4-7) B25(4-7) + const __m256i rhs_mat_2367_20_sp2 = _mm256_shuffle_epi32(rhs_mat_2367_20, 221); //B22(4-7) B23(4-7) B22(4-7) B23(4-7) B26(4-7) B27(4-7) B26(4-7) B27(4-7) + + const __m256i rhs_mat_0145_21_sp2 = _mm256_shuffle_epi32(rhs_mat_0145_21, 221); //B20(12-15) B21(12-15) B20(12-15) B21(12-15) B24(12-15) B25(12-15) B24(12-15) B25(12-15) + const __m256i rhs_mat_2367_21_sp2 = _mm256_shuffle_epi32(rhs_mat_2367_21, 221); //B22(12-15) B23(12-15) B22(12-15) B23(12-15) B26(12-15) B27(12-15) B26(12-15) B27(12-15) + + const __m256i rhs_mat_0145_30_sp2 = _mm256_shuffle_epi32(rhs_mat_0145_30, 221); //B30(4-7) B31(4-7) B30(4-7) B31(4-7) B34(4-7) B35(4-7) B34(4-7) B35(4-7) + const __m256i rhs_mat_2367_30_sp2 = _mm256_shuffle_epi32(rhs_mat_2367_30, 221); //B32(4-7) B33(4-7) B32(4-7) B33(4-7) B36(4-7) B37(4-7) B36(4-7) B37(4-7) + + const __m256i rhs_mat_0145_31_sp2 = _mm256_shuffle_epi32(rhs_mat_0145_31, 221); //B30(12-15) B31(12-15) B30(12-15) B31(12-15) B34(12-15) B35(12-15) B34(12-15) B35(12-15) + const __m256i rhs_mat_2367_31_sp2 = _mm256_shuffle_epi32(rhs_mat_2367_31, 221); //B32(12-15) B33(12-15) B32(12-15) B33(12-15) B36(12-15) B37(12-15) B36(12-15) B37(12-15) + + const __m256i rhs_mat_0145_40_sp2 = _mm256_shuffle_epi32(rhs_mat_0145_40, 221); //B40(4-7) B41(4-7) B40(4-7) B41(4-7) B44(4-7) B45(4-7) B44(4-7) B45(4-7) + const __m256i rhs_mat_2367_40_sp2 = _mm256_shuffle_epi32(rhs_mat_2367_40, 221); //B42(4-7) B43(4-7) B42(4-7) B43(4-7) B46(4-7) B47(4-7) B46(4-7) B47(4-7) + + const __m256i rhs_mat_0145_41_sp2 = _mm256_shuffle_epi32(rhs_mat_0145_41, 221); //B40(12-15) B41(12-15) B40(12-15) B41(12-15) B44(12-15) B45(12-15) B44(12-15) B45(12-15) + const __m256i rhs_mat_2367_41_sp2 = _mm256_shuffle_epi32(rhs_mat_2367_41, 221); //B42(12-15) B43(12-15) B42(12-15) B43(12-15) B46(12-15) B47(12-15) B46(12-15) B47(12-15) + + const __m256i rhs_mat_0145_50_sp2 = _mm256_shuffle_epi32(rhs_mat_0145_50, 221); //B50(4-7) B51(4-7) B50(4-7) B51(4-7) B54(4-7) B55(4-7) B54(4-7) B55(4-7) + const __m256i rhs_mat_2367_50_sp2 = _mm256_shuffle_epi32(rhs_mat_2367_50, 221); //B52(4-7) B53(4-7) B52(4-7) B53(4-7) B56(4-7) B57(4-7) B56(4-7) B57(4-7) + + const __m256i rhs_mat_0145_51_sp2 = _mm256_shuffle_epi32(rhs_mat_0145_51, 221); //B50(12-15) B51(12-15) B50(12-15) B51(12-15) B54(12-15) B55(12-15) B54(12-15) B55(12-15) + const __m256i rhs_mat_2367_51_sp2 = _mm256_shuffle_epi32(rhs_mat_2367_51, 221); 
//B52(12-15) B53(12-15) B52(12-15) B53(12-15) B56(12-15) B57(12-15) B56(12-15) B57(12-15) + + const __m256i rhs_mat_0145_60_sp2 = _mm256_shuffle_epi32(rhs_mat_0145_60, 221); //B60(4-7) B61(4-7) B60(4-7) B61(4-7) B64(4-7) B65(4-7) B64(4-7) B65(4-7) + const __m256i rhs_mat_2367_60_sp2 = _mm256_shuffle_epi32(rhs_mat_2367_60, 221); //B62(4-7) B63(4-7) B62(4-7) B63(4-7) B66(4-7) B67(4-7) B66(4-7) B67(4-7) + + const __m256i rhs_mat_0145_61_sp2 = _mm256_shuffle_epi32(rhs_mat_0145_61, 221); //B60(12-15) B61(12-15) B60(12-15) B61(12-15) B64(12-15) B65(12-15) B64(12-15) B65(12-15) + const __m256i rhs_mat_2367_61_sp2 = _mm256_shuffle_epi32(rhs_mat_2367_61, 221); //B62(12-15) B63(12-15) B62(12-15) B63(12-15) B66(12-15) B67(12-15) B66(12-15) B67(12-15) + + const __m256i rhs_mat_0145_70_sp2 = _mm256_shuffle_epi32(rhs_mat_0145_70, 221); //B70(4-7) B71(4-7) B70(4-7) B71(4-7) B74(4-7) B75(4-7) B74(4-7) B75(4-7) + const __m256i rhs_mat_2367_70_sp2 = _mm256_shuffle_epi32(rhs_mat_2367_70, 221); //B72(4-7) B73(4-7) B72(4-7) B73(4-7) B76(4-7) B77(4-7) B76(4-7) B77(4-7) + + const __m256i rhs_mat_0145_71_sp2 = _mm256_shuffle_epi32(rhs_mat_0145_71, 221); //B70(12-15) B71(12-15) B70(12-15) B71(12-15) B74(12-15) B75(12-15) B74(12-15) B75(12-15) + const __m256i rhs_mat_2367_71_sp2 = _mm256_shuffle_epi32(rhs_mat_2367_71, 221); //B72(12-15) B73(12-15) B72(12-15) B73(12-15) B76(12-15) B77(12-15) B76(12-15) B77(12-15) + + //Scales of corresponding sub blocks from different Q6_K structures are stored together + //s00 s01 s10 s11 s20 s21 ...... s70 s71 + // Combine mins and scales for sub-blocks: 0-1, 2-3, 4-5, 6-7 in the sb loop + const __m128i scales_01 = _mm_loadu_si128((const __m128i *)(b_ptr[b].scales + sb * 64)); + const __m128i scales_23 = _mm_loadu_si128((const __m128i *)(b_ptr[b].scales + 16 + sb * 64)); + const __m128i scales_45 = _mm_loadu_si128((const __m128i *)(b_ptr[b].scales + 32 + sb * 64)); + const __m128i scales_67 = _mm_loadu_si128((const __m128i *)(b_ptr[b].scales + 48 + sb * 64)); + + const __m256i scales_0 = _mm256_cvtepi8_epi16(_mm_shuffle_epi8(scales_01, scalesmask1_sse)); + const __m256i scales_1 = _mm256_cvtepi8_epi16(_mm_shuffle_epi8(scales_01, scalesmask2_sse)); + const __m256i scales_2 = _mm256_cvtepi8_epi16(_mm_shuffle_epi8(scales_23, scalesmask1_sse)); + const __m256i scales_3 = _mm256_cvtepi8_epi16(_mm_shuffle_epi8(scales_23, scalesmask2_sse)); + const __m256i scales_4 = _mm256_cvtepi8_epi16(_mm_shuffle_epi8(scales_45, scalesmask1_sse)); + const __m256i scales_5 = _mm256_cvtepi8_epi16(_mm_shuffle_epi8(scales_45, scalesmask2_sse)); + const __m256i scales_6 = _mm256_cvtepi8_epi16(_mm_shuffle_epi8(scales_67, scalesmask1_sse)); + const __m256i scales_7 = _mm256_cvtepi8_epi16(_mm_shuffle_epi8(scales_67, scalesmask2_sse)); + + const __m256i scale_0145_0 = _mm256_shuffle_epi32(scales_0, 68); + const __m256i scale_2367_0 = _mm256_shuffle_epi32(scales_0, 238); + + const __m256i scale_0145_1 = _mm256_shuffle_epi32(scales_1, 68); + const __m256i scale_2367_1 = _mm256_shuffle_epi32(scales_1, 238); + + const __m256i scale_0145_2 = _mm256_shuffle_epi32(scales_2, 68); + const __m256i scale_2367_2 = _mm256_shuffle_epi32(scales_2, 238); + + const __m256i scale_0145_3 = _mm256_shuffle_epi32(scales_3, 68); + const __m256i scale_2367_3 = _mm256_shuffle_epi32(scales_3, 238); + + const __m256i scale_0145_4 = _mm256_shuffle_epi32(scales_4, 68); + const __m256i scale_2367_4 = _mm256_shuffle_epi32(scales_4, 238); + + const __m256i scale_0145_5 = _mm256_shuffle_epi32(scales_5, 68); + const __m256i scale_2367_5 = 
_mm256_shuffle_epi32(scales_5, 238); + + const __m256i scale_0145_6 = _mm256_shuffle_epi32(scales_6, 68); + const __m256i scale_2367_6 = _mm256_shuffle_epi32(scales_6, 238); + + const __m256i scale_0145_7 = _mm256_shuffle_epi32(scales_7, 68); + const __m256i scale_2367_7 = _mm256_shuffle_epi32(scales_7, 238); + + for (int rp = 0; rp < 4; rp++) { + // Load the four block_q8_k quantized values interleaved with each other in chunks of eight bytes - A0,A1,A2,A3 + // Loaded as set of 128 bit vectors and repeated into a 256 bit vector + __m256i lhs_mat_0123_00 = _mm256_loadu_si256((const __m256i * )((a_ptrs[rp][b].qs + 512 * sb))); + __m256i lhs_mat_01_00 = _mm256_permute2f128_si256(lhs_mat_0123_00, lhs_mat_0123_00, 0); + __m256i lhs_mat_23_00 = _mm256_permute2f128_si256(lhs_mat_0123_00, lhs_mat_0123_00, 17); + __m256i lhs_mat_0123_01 = _mm256_loadu_si256((const __m256i * )((a_ptrs[rp][b].qs + 32 + 512 * sb))); + __m256i lhs_mat_01_01 = _mm256_permute2f128_si256(lhs_mat_0123_01, lhs_mat_0123_01, 0); + __m256i lhs_mat_23_01 = _mm256_permute2f128_si256(lhs_mat_0123_01, lhs_mat_0123_01, 17); + __m256i lhs_mat_0123_10 = _mm256_loadu_si256((const __m256i * )((a_ptrs[rp][b].qs + 64 + 512 * sb))); + __m256i lhs_mat_01_10 = _mm256_permute2f128_si256(lhs_mat_0123_10, lhs_mat_0123_10, 0); + __m256i lhs_mat_23_10 = _mm256_permute2f128_si256(lhs_mat_0123_10, lhs_mat_0123_10, 17); + __m256i lhs_mat_0123_11 = _mm256_loadu_si256((const __m256i * )((a_ptrs[rp][b].qs + 96 + 512 * sb))); + __m256i lhs_mat_01_11 = _mm256_permute2f128_si256(lhs_mat_0123_11, lhs_mat_0123_11, 0); + __m256i lhs_mat_23_11 = _mm256_permute2f128_si256(lhs_mat_0123_11, lhs_mat_0123_11, 17); + __m256i lhs_mat_0123_20 = _mm256_loadu_si256((const __m256i * )((a_ptrs[rp][b].qs + 128 + 512 * sb))); + __m256i lhs_mat_01_20 = _mm256_permute2f128_si256(lhs_mat_0123_20, lhs_mat_0123_20, 0); + __m256i lhs_mat_23_20 = _mm256_permute2f128_si256(lhs_mat_0123_20, lhs_mat_0123_20, 17); + __m256i lhs_mat_0123_21 = _mm256_loadu_si256((const __m256i * )((a_ptrs[rp][b].qs + 160 + 512 * sb))); + __m256i lhs_mat_01_21 = _mm256_permute2f128_si256(lhs_mat_0123_21, lhs_mat_0123_21, 0); + __m256i lhs_mat_23_21 = _mm256_permute2f128_si256(lhs_mat_0123_21, lhs_mat_0123_21, 17); + __m256i lhs_mat_0123_30 = _mm256_loadu_si256((const __m256i * )((a_ptrs[rp][b].qs + 192 + 512 * sb))); + __m256i lhs_mat_01_30 = _mm256_permute2f128_si256(lhs_mat_0123_30, lhs_mat_0123_30, 0); + __m256i lhs_mat_23_30 = _mm256_permute2f128_si256(lhs_mat_0123_30, lhs_mat_0123_30, 17); + __m256i lhs_mat_0123_31 = _mm256_loadu_si256((const __m256i * )((a_ptrs[rp][b].qs + 224 + 512 * sb))); + __m256i lhs_mat_01_31 = _mm256_permute2f128_si256(lhs_mat_0123_31, lhs_mat_0123_31, 0); + __m256i lhs_mat_23_31 = _mm256_permute2f128_si256(lhs_mat_0123_31, lhs_mat_0123_31, 17); + + __m256i lhs_mat_0123_40 = _mm256_loadu_si256((const __m256i * )((a_ptrs[rp][b].qs + 256 + 512 * sb))); + __m256i lhs_mat_01_40 = _mm256_permute2f128_si256(lhs_mat_0123_40, lhs_mat_0123_40, 0); + __m256i lhs_mat_23_40 = _mm256_permute2f128_si256(lhs_mat_0123_40, lhs_mat_0123_40, 17); + __m256i lhs_mat_0123_41 = _mm256_loadu_si256((const __m256i * )((a_ptrs[rp][b].qs + 288 + 512 * sb))); + __m256i lhs_mat_01_41 = _mm256_permute2f128_si256(lhs_mat_0123_41, lhs_mat_0123_41, 0); + __m256i lhs_mat_23_41 = _mm256_permute2f128_si256(lhs_mat_0123_41, lhs_mat_0123_41, 17); + __m256i lhs_mat_0123_50 = _mm256_loadu_si256((const __m256i * )((a_ptrs[rp][b].qs + 320 + 512 * sb))); + __m256i lhs_mat_01_50 = 
_mm256_permute2f128_si256(lhs_mat_0123_50, lhs_mat_0123_50, 0); + __m256i lhs_mat_23_50 = _mm256_permute2f128_si256(lhs_mat_0123_50, lhs_mat_0123_50, 17); + __m256i lhs_mat_0123_51 = _mm256_loadu_si256((const __m256i * )((a_ptrs[rp][b].qs + 352 + 512 * sb))); + __m256i lhs_mat_01_51 = _mm256_permute2f128_si256(lhs_mat_0123_51, lhs_mat_0123_51, 0); + __m256i lhs_mat_23_51 = _mm256_permute2f128_si256(lhs_mat_0123_51, lhs_mat_0123_51, 17); + __m256i lhs_mat_0123_60 = _mm256_loadu_si256((const __m256i * )((a_ptrs[rp][b].qs + 384 + 512 * sb))); + __m256i lhs_mat_01_60 = _mm256_permute2f128_si256(lhs_mat_0123_60, lhs_mat_0123_60, 0); + __m256i lhs_mat_23_60 = _mm256_permute2f128_si256(lhs_mat_0123_60, lhs_mat_0123_60, 17); + __m256i lhs_mat_0123_61 = _mm256_loadu_si256((const __m256i * )((a_ptrs[rp][b].qs + 416 + 512 * sb))); + __m256i lhs_mat_01_61 = _mm256_permute2f128_si256(lhs_mat_0123_61, lhs_mat_0123_61, 0); + __m256i lhs_mat_23_61 = _mm256_permute2f128_si256(lhs_mat_0123_61, lhs_mat_0123_61, 17); + __m256i lhs_mat_0123_70 = _mm256_loadu_si256((const __m256i * )((a_ptrs[rp][b].qs + 448 + 512 * sb))); + __m256i lhs_mat_01_70 = _mm256_permute2f128_si256(lhs_mat_0123_70, lhs_mat_0123_70, 0); + __m256i lhs_mat_23_70 = _mm256_permute2f128_si256(lhs_mat_0123_70, lhs_mat_0123_70, 17); + __m256i lhs_mat_0123_71 = _mm256_loadu_si256((const __m256i * )((a_ptrs[rp][b].qs + 480 + 512 * sb))); + __m256i lhs_mat_01_71 = _mm256_permute2f128_si256(lhs_mat_0123_71, lhs_mat_0123_71, 0); + __m256i lhs_mat_23_71 = _mm256_permute2f128_si256(lhs_mat_0123_71, lhs_mat_0123_71, 17); + + __m256i lhs_mat_s_01_00 = _mm256_maddubs_epi16(lhs_mat_01_00, m32s); + __m256i lhs_mat_s_23_00 = _mm256_maddubs_epi16(lhs_mat_23_00, m32s); + __m256i lhs_mat_s_01_01 = _mm256_maddubs_epi16(lhs_mat_01_01, m32s); + __m256i lhs_mat_s_23_01 = _mm256_maddubs_epi16(lhs_mat_23_01, m32s); + __m256i lhs_mat_s_01_10 = _mm256_maddubs_epi16(lhs_mat_01_10, m32s); + __m256i lhs_mat_s_23_10 = _mm256_maddubs_epi16(lhs_mat_23_10, m32s); + __m256i lhs_mat_s_01_11 = _mm256_maddubs_epi16(lhs_mat_01_11, m32s); + __m256i lhs_mat_s_23_11 = _mm256_maddubs_epi16(lhs_mat_23_11, m32s); + __m256i lhs_mat_s_01_20 = _mm256_maddubs_epi16(lhs_mat_01_20, m32s); + __m256i lhs_mat_s_23_20 = _mm256_maddubs_epi16(lhs_mat_23_20, m32s); + __m256i lhs_mat_s_01_21 = _mm256_maddubs_epi16(lhs_mat_01_21, m32s); + __m256i lhs_mat_s_23_21 = _mm256_maddubs_epi16(lhs_mat_23_21, m32s); + __m256i lhs_mat_s_01_30 = _mm256_maddubs_epi16(lhs_mat_01_30, m32s); + __m256i lhs_mat_s_23_30 = _mm256_maddubs_epi16(lhs_mat_23_30, m32s); + __m256i lhs_mat_s_01_31 = _mm256_maddubs_epi16(lhs_mat_01_31, m32s); + __m256i lhs_mat_s_23_31 = _mm256_maddubs_epi16(lhs_mat_23_31, m32s); + __m256i lhs_mat_s_01_40 = _mm256_maddubs_epi16(lhs_mat_01_40, m32s); + __m256i lhs_mat_s_23_40 = _mm256_maddubs_epi16(lhs_mat_23_40, m32s); + __m256i lhs_mat_s_01_41 = _mm256_maddubs_epi16(lhs_mat_01_41, m32s); + __m256i lhs_mat_s_23_41 = _mm256_maddubs_epi16(lhs_mat_23_41, m32s); + __m256i lhs_mat_s_01_50 = _mm256_maddubs_epi16(lhs_mat_01_50, m32s); + __m256i lhs_mat_s_23_50 = _mm256_maddubs_epi16(lhs_mat_23_50, m32s); + __m256i lhs_mat_s_01_51 = _mm256_maddubs_epi16(lhs_mat_01_51, m32s); + __m256i lhs_mat_s_23_51 = _mm256_maddubs_epi16(lhs_mat_23_51, m32s); + __m256i lhs_mat_s_01_60 = _mm256_maddubs_epi16(lhs_mat_01_60, m32s); + __m256i lhs_mat_s_23_60 = _mm256_maddubs_epi16(lhs_mat_23_60, m32s); + __m256i lhs_mat_s_01_61 = _mm256_maddubs_epi16(lhs_mat_01_61, m32s); + __m256i lhs_mat_s_23_61 = 
_mm256_maddubs_epi16(lhs_mat_23_61, m32s); + __m256i lhs_mat_s_01_70 = _mm256_maddubs_epi16(lhs_mat_01_70, m32s); + __m256i lhs_mat_s_23_70 = _mm256_maddubs_epi16(lhs_mat_23_70, m32s); + __m256i lhs_mat_s_01_71 = _mm256_maddubs_epi16(lhs_mat_01_71, m32s); + __m256i lhs_mat_s_23_71 = _mm256_maddubs_epi16(lhs_mat_23_71, m32s); + + // Shuffle pattern one - left side input + const __m256i lhs_mat_01_00_sp1 = _mm256_shuffle_epi32(lhs_mat_01_00, 160); //A00(0-3) A00(0-3) A01(0-3) A01(0-3) A00(0-3) A00(0-3) A01(0-3) A01(0-3) + const __m256i lhs_mat_23_00_sp1 = _mm256_shuffle_epi32(lhs_mat_23_00, 160); //A02(0-3) A03(0-3) A02(0-3) A03(0-3) A02(0-3) A03(0-3) A02(0-3) A03(0-3) + + const __m256i lhs_mat_01_01_sp1 = _mm256_shuffle_epi32(lhs_mat_01_01, 160); //A00(8-11) A00(8-11) A01(8-11) A01(8-11) A00(8-11) A00(8-11) A01(8-11) A01(8-11) + const __m256i lhs_mat_23_01_sp1 = _mm256_shuffle_epi32(lhs_mat_23_01, 160); //A02(8-11) A03(8-11) A02(8-11) A03(8-11) A02(8-11) A03(8-11) A02(8-11) A03(8-11) + + const __m256i lhs_mat_01_10_sp1 = _mm256_shuffle_epi32(lhs_mat_01_10, 160); //A10(0-3) A10(0-3) A11(0-3) A11(0-3) A10(0-3) A10(0-3) A11(0-3) A11(0-3) + const __m256i lhs_mat_23_10_sp1 = _mm256_shuffle_epi32(lhs_mat_23_10, 160); //A12(0-3) A13(0-3) A12(0-3) A13(0-3) A12(0-3) A13(0-3) A12(0-3) A13(0-3) + + const __m256i lhs_mat_01_11_sp1 = _mm256_shuffle_epi32(lhs_mat_01_11, 160); //A10(8-11) A10(8-11) A11(8-11) A11(8-11) A10(8-11) A10(8-11) A11(8-11) A11(8-11) + const __m256i lhs_mat_23_11_sp1 = _mm256_shuffle_epi32(lhs_mat_23_11, 160); //A12(8-11) A13(8-11) A12(8-11) A13(8-11) A12(8-11) A13(8-11) A12(8-11) A13(8-11) + + const __m256i lhs_mat_01_20_sp1 = _mm256_shuffle_epi32(lhs_mat_01_20, 160); //A20(0-3) A20(0-3) A21(0-3) A21(0-3) A20(0-3) A20(0-3) A21(0-3) A21(0-3) + const __m256i lhs_mat_23_20_sp1 = _mm256_shuffle_epi32(lhs_mat_23_20, 160); //A22(0-3) A23(0-3) A22(0-3) A23(0-3) A22(0-3) A23(0-3) A22(0-3) A23(0-3) + + const __m256i lhs_mat_01_21_sp1 = _mm256_shuffle_epi32(lhs_mat_01_21, 160); //A20(8-11) A20(8-11) A21(8-11) A21(8-11) A20(8-11) A20(8-11) A21(8-11) A21(8-11) + const __m256i lhs_mat_23_21_sp1 = _mm256_shuffle_epi32(lhs_mat_23_21, 160); //A22(8-11) A23(8-11) A22(8-11) A23(8-11) A22(8-11) A23(8-11) A22(8-11) A23(8-11) + + const __m256i lhs_mat_01_30_sp1 = _mm256_shuffle_epi32(lhs_mat_01_30, 160); //A30(0-3) A30(0-3) A31(0-3) A31(0-3) A30(0-3) A30(0-3) A31(0-3) A31(0-3) + const __m256i lhs_mat_23_30_sp1 = _mm256_shuffle_epi32(lhs_mat_23_30, 160); //A32(0-3) A33(0-3) A32(0-3) A33(0-3) A32(0-3) A33(0-3) A32(0-3) A33(0-3) + + const __m256i lhs_mat_01_31_sp1 = _mm256_shuffle_epi32(lhs_mat_01_31, 160); //A30(8-11) A30(8-11) A31(8-11) A31(8-11) A30(8-11) A30(8-11) A31(8-11) A31(8-11) + const __m256i lhs_mat_23_31_sp1 = _mm256_shuffle_epi32(lhs_mat_23_31, 160); //A32(8-11) A33(8-11) A32(8-11) A33(8-11) A32(8-11) A33(8-11) A32(8-11) A33(8-11) + + const __m256i lhs_mat_01_40_sp1 = _mm256_shuffle_epi32(lhs_mat_01_40, 160); //A40(0-3) A40(0-3) A41(0-3) A41(0-3) A40(0-3) A40(0-3) A41(0-3) A41(0-3) + const __m256i lhs_mat_23_40_sp1 = _mm256_shuffle_epi32(lhs_mat_23_40, 160); //A42(0-3) A43(0-3) A42(0-3) A43(0-3) A42(0-3) A43(0-3) A42(0-3) A43(0-3) + + const __m256i lhs_mat_01_41_sp1 = _mm256_shuffle_epi32(lhs_mat_01_41, 160); //A40(8-11) A40(8-11) A41(8-11) A41(8-11) A40(8-11) A40(8-11) A41(8-11) A41(8-11) + const __m256i lhs_mat_23_41_sp1 = _mm256_shuffle_epi32(lhs_mat_23_41, 160); //A42(8-11) A43(8-11) A42(8-11) A43(8-11) A42(8-11) A43(8-11) A42(8-11) A43(8-11) + + const __m256i lhs_mat_01_50_sp1 = 
_mm256_shuffle_epi32(lhs_mat_01_50, 160); //A50(0-3) A50(0-3) A51(0-3) A51(0-3) A50(0-3) A50(0-3) A51(0-3) A51(0-3) + const __m256i lhs_mat_23_50_sp1 = _mm256_shuffle_epi32(lhs_mat_23_50, 160); //A52(0-3) A53(0-3) A52(0-3) A53(0-3) A52(0-3) A53(0-3) A52(0-3) A53(0-3) + + const __m256i lhs_mat_01_51_sp1 = _mm256_shuffle_epi32(lhs_mat_01_51, 160); //A50(8-11) A50(8-11) A51(8-11) A51(8-11) A50(8-11) A50(8-11) A51(8-11) A51(8-11) + const __m256i lhs_mat_23_51_sp1 = _mm256_shuffle_epi32(lhs_mat_23_51, 160); //A52(8-11) A53(8-11) A52(8-11) A53(8-11) A52(8-11) A53(8-11) A52(8-11) A53(8-11) + + const __m256i lhs_mat_01_60_sp1 = _mm256_shuffle_epi32(lhs_mat_01_60, 160); //A60(0-3) A60(0-3) A61(0-3) A61(0-3) A60(0-3) A60(0-3) A61(0-3) A61(0-3) + const __m256i lhs_mat_23_60_sp1 = _mm256_shuffle_epi32(lhs_mat_23_60, 160); //A62(0-3) A63(0-3) A62(0-3) A63(0-3) A62(0-3) A63(0-3) A62(0-3) A63(0-3) + + const __m256i lhs_mat_01_61_sp1 = _mm256_shuffle_epi32(lhs_mat_01_61, 160); //A60(8-11) A60(8-11) A61(8-11) A61(8-11) A60(8-11) A60(8-11) A61(8-11) A61(8-11) + const __m256i lhs_mat_23_61_sp1 = _mm256_shuffle_epi32(lhs_mat_23_61, 160); //A62(8-11) A63(8-11) A62(8-11) A63(8-11) A62(8-11) A63(8-11) A62(8-11) A63(8-11) + + const __m256i lhs_mat_01_70_sp1 = _mm256_shuffle_epi32(lhs_mat_01_70, 160); //A70(0-3) A70(0-3) A71(0-3) A71(0-3) A70(0-3) A70(0-3) A71(0-3) A71(0-3) + const __m256i lhs_mat_23_70_sp1 = _mm256_shuffle_epi32(lhs_mat_23_70, 160); //A72(0-3) A73(0-3) A72(0-3) A73(0-3) A72(0-3) A73(0-3) A72(0-3) A73(0-3) + + const __m256i lhs_mat_01_71_sp1 = _mm256_shuffle_epi32(lhs_mat_01_71, 160); //A70(8-11) A70(8-11) A71(8-11) A71(8-11) A70(8-11) A70(8-11) A71(8-11) A71(8-11) + const __m256i lhs_mat_23_71_sp1 = _mm256_shuffle_epi32(lhs_mat_23_71, 160); //A72(8-11) A73(8-11) A72(8-11) A73(8-11) A72(8-11) A73(8-11) A72(8-11) A73(8-11) + + // Shuffle pattern two- left side input + const __m256i lhs_mat_01_00_sp2 = _mm256_shuffle_epi32(lhs_mat_01_00, 245); //A00(4-7) A00(4-7) A01(4-7) A01(4-7) A00(4-7) A00(4-7) A01(4-7) A01(4-7) + const __m256i lhs_mat_23_00_sp2 = _mm256_shuffle_epi32(lhs_mat_23_00, 245); //A02(4-7) A03(4-7) A02(4-7) A03(4-7) A02(4-7) A03(4-7) A02(4-7) A03(4-7) + + const __m256i lhs_mat_01_01_sp2 = _mm256_shuffle_epi32(lhs_mat_01_01, 245); //A00(12-15) A00(12-15) A01(12-15) A01(12-15) A00(12-15) A00(12-15) A01(12-15) A01(12-15) + const __m256i lhs_mat_23_01_sp2 = _mm256_shuffle_epi32(lhs_mat_23_01, 245); //A02(12-15) A03(12-15) A02(12-15) A03(12-15) A02(12-15) A03(12-15) A02(12-15) A03(12-15) + + const __m256i lhs_mat_01_10_sp2 = _mm256_shuffle_epi32(lhs_mat_01_10, 245); //A10(4-7) A10(4-7) A11(4-7) A11(4-7) A10(4-7) A10(4-7) A11(4-7) A11(4-7) + const __m256i lhs_mat_23_10_sp2 = _mm256_shuffle_epi32(lhs_mat_23_10, 245); //A12(4-7) A13(4-7) A12(4-7) A13(4-7) A12(4-7) A13(4-7) A12(4-7) A13(4-7) + + const __m256i lhs_mat_01_11_sp2 = _mm256_shuffle_epi32(lhs_mat_01_11, 245); //A10(12-15) A10(12-15) A11(12-15) A11(12-15) A10(12-15) A10(12-15) A11(12-15) A11(12-15) + const __m256i lhs_mat_23_11_sp2 = _mm256_shuffle_epi32(lhs_mat_23_11, 245); //A12(12-15) A13(12-15) A12(12-15) A13(12-15) A12(12-15) A13(12-15) A12(12-15) A13(12-15) + + const __m256i lhs_mat_01_20_sp2 = _mm256_shuffle_epi32(lhs_mat_01_20, 245); //A20(4-7) A20(4-7) A21(4-7) A21(4-7) A20(4-7) A20(4-7) A21(4-7) A21(4-7) + const __m256i lhs_mat_23_20_sp2 = _mm256_shuffle_epi32(lhs_mat_23_20, 245); //A22(4-7) A23(4-7) A22(4-7) A23(4-7) A22(4-7) A23(4-7) A22(4-7) A23(4-7) + + const __m256i lhs_mat_01_21_sp2 = 
_mm256_shuffle_epi32(lhs_mat_01_21, 245); //A20(12-15) A20(12-15) A21(12-15) A21(12-15) A20(12-15) A20(12-15) A21(12-15) A21(12-15) + const __m256i lhs_mat_23_21_sp2 = _mm256_shuffle_epi32(lhs_mat_23_21, 245); //A22(12-15) A23(12-15) A22(12-15) A23(12-15) A22(12-15) A23(12-15) A22(12-15) A23(12-15) + + const __m256i lhs_mat_01_30_sp2 = _mm256_shuffle_epi32(lhs_mat_01_30, 245); //A30(4-7) A30(4-7) A31(4-7) A31(4-7) A30(4-7) A30(4-7) A31(4-7) A31(4-7) + const __m256i lhs_mat_23_30_sp2 = _mm256_shuffle_epi32(lhs_mat_23_30, 245); //A32(4-7) A33(4-7) A32(4-7) A33(4-7) A32(4-7) A33(4-7) A32(4-7) A33(4-7) + + const __m256i lhs_mat_01_31_sp2 = _mm256_shuffle_epi32(lhs_mat_01_31, 245); //A30(12-15) A30(12-15) A31(12-15) A31(12-15) A30(12-15) A30(12-15) A31(12-15) A31(12-15) + const __m256i lhs_mat_23_31_sp2 = _mm256_shuffle_epi32(lhs_mat_23_31, 245); //A32(12-15) A33(12-15) A32(12-15) A33(12-15) A32(12-15) A33(12-15) A32(12-15) A33(12-15) + + const __m256i lhs_mat_01_40_sp2 = _mm256_shuffle_epi32(lhs_mat_01_40, 245); //A40(4-7) A40(4-7) A41(4-7) A41(4-7) A40(4-7) A40(4-7) A41(4-7) A41(4-7) + const __m256i lhs_mat_23_40_sp2 = _mm256_shuffle_epi32(lhs_mat_23_40, 245); //A42(4-7) A43(4-7) A42(4-7) A43(4-7) A42(4-7) A43(4-7) A42(4-7) A43(4-7) + + const __m256i lhs_mat_01_41_sp2 = _mm256_shuffle_epi32(lhs_mat_01_41, 245); //A40(12-15) A40(12-15) A41(12-15) A41(12-15) A40(12-15) A40(12-15) A41(12-15) A41(12-15) + const __m256i lhs_mat_23_41_sp2 = _mm256_shuffle_epi32(lhs_mat_23_41, 245); //A42(12-15) A43(12-15) A42(12-15) A43(12-15) A42(12-15) A43(12-15) A42(12-15) A43(12-15) + + const __m256i lhs_mat_01_50_sp2 = _mm256_shuffle_epi32(lhs_mat_01_50, 245); //A50(4-7) A50(4-7) A51(4-7) A51(4-7) A50(4-7) A50(4-7) A51(4-7) A51(4-7) + const __m256i lhs_mat_23_50_sp2 = _mm256_shuffle_epi32(lhs_mat_23_50, 245); //A52(4-7) A53(4-7) A52(4-7) A53(4-7) A52(4-7) A53(4-7) A52(4-7) A53(4-7) + + const __m256i lhs_mat_01_51_sp2 = _mm256_shuffle_epi32(lhs_mat_01_51, 245); //A50(12-15) A50(12-15) A51(12-15) A51(12-15) A50(12-15) A50(12-15) A51(12-15) A51(12-15) + const __m256i lhs_mat_23_51_sp2 = _mm256_shuffle_epi32(lhs_mat_23_51, 245); //A52(12-15) A53(12-15) A52(12-15) A53(12-15) A52(12-15) A53(12-15) A52(12-15) A53(12-15) + + const __m256i lhs_mat_01_60_sp2 = _mm256_shuffle_epi32(lhs_mat_01_60, 245); //A60(4-7) A60(4-7) A61(4-7) A61(4-7) A60(4-7) A60(4-7) A61(4-7) A61(4-7) + const __m256i lhs_mat_23_60_sp2 = _mm256_shuffle_epi32(lhs_mat_23_60, 245); //A62(4-7) A63(4-7) A62(4-7) A63(4-7) A62(4-7) A63(4-7) A62(4-7) A63(4-7) + + const __m256i lhs_mat_01_61_sp2 = _mm256_shuffle_epi32(lhs_mat_01_61, 245); //A60(12-15) A60(12-15) A61(12-15) A61(12-15) A60(12-15) A60(12-15) A61(12-15) A61(12-15) + const __m256i lhs_mat_23_61_sp2 = _mm256_shuffle_epi32(lhs_mat_23_61, 245); //A62(12-15) A63(12-15) A62(12-15) A63(12-15) A62(12-15) A63(12-15) A62(12-15) A63(12-15) + + const __m256i lhs_mat_01_70_sp2 = _mm256_shuffle_epi32(lhs_mat_01_70, 245); //A70(4-7) A70(4-7) A71(4-7) A71(4-7) A70(4-7) A70(4-7) A71(4-7) A71(4-7) + const __m256i lhs_mat_23_70_sp2 = _mm256_shuffle_epi32(lhs_mat_23_70, 245); //A72(4-7) A73(4-7) A72(4-7) A73(4-7) A72(4-7) A73(4-7) A72(4-7) A73(4-7) + + const __m256i lhs_mat_01_71_sp2 = _mm256_shuffle_epi32(lhs_mat_01_71, 245); //A70(12-15) A70(12-15) A71(12-15) A71(12-15) A70(12-15) A70(12-15) A71(12-15) A71(12-15) + const __m256i lhs_mat_23_71_sp2 = _mm256_shuffle_epi32(lhs_mat_23_71, 245); //A72(12-15) A73(12-15) A72(12-15) A73(12-15) A72(12-15) A73(12-15) A72(12-15) A73(12-15) + + // Shuffle pattern one - left 
side input + const __m256i lhs_mat_s_01_00_sp1 = _mm256_shuffle_epi32(lhs_mat_s_01_00, 160); //A00(0-3) A00(0-3) A01(0-3) A01(0-3) A00(0-3) A00(0-3) A01(0-3) A01(0-3) + const __m256i lhs_mat_s_23_00_sp1 = _mm256_shuffle_epi32(lhs_mat_s_23_00, 160); //A02(0-3) A03(0-3) A02(0-3) A03(0-3) A02(0-3) A03(0-3) A02(0-3) A03(0-3) + + const __m256i lhs_mat_s_01_01_sp1 = _mm256_shuffle_epi32(lhs_mat_s_01_01, 160); //A00(8-11) A00(8-11) A01(8-11) A01(8-11) A00(8-11) A00(8-11) A01(8-11) A01(8-11) + const __m256i lhs_mat_s_23_01_sp1 = _mm256_shuffle_epi32(lhs_mat_s_23_01, 160); //A02(8-11) A03(8-11) A02(8-11) A03(8-11) A02(8-11) A03(8-11) A02(8-11) A03(8-11) + + const __m256i lhs_mat_s_01_10_sp1 = _mm256_shuffle_epi32(lhs_mat_s_01_10, 160); //A10(0-3) A10(0-3) A11(0-3) A11(0-3) A10(0-3) A10(0-3) A11(0-3) A11(0-3) + const __m256i lhs_mat_s_23_10_sp1 = _mm256_shuffle_epi32(lhs_mat_s_23_10, 160); //A12(0-3) A13(0-3) A12(0-3) A13(0-3) A12(0-3) A13(0-3) A12(0-3) A13(0-3) + + const __m256i lhs_mat_s_01_11_sp1 = _mm256_shuffle_epi32(lhs_mat_s_01_11, 160); //A10(8-11) A10(8-11) A11(8-11) A11(8-11) A10(8-11) A10(8-11) A11(8-11) A11(8-11) + const __m256i lhs_mat_s_23_11_sp1 = _mm256_shuffle_epi32(lhs_mat_s_23_11, 160); //A12(8-11) A13(8-11) A12(8-11) A13(8-11) A12(8-11) A13(8-11) A12(8-11) A13(8-11) + + const __m256i lhs_mat_s_01_20_sp1 = _mm256_shuffle_epi32(lhs_mat_s_01_20, 160); //A20(0-3) A20(0-3) A21(0-3) A21(0-3) A20(0-3) A20(0-3) A21(0-3) A21(0-3) + const __m256i lhs_mat_s_23_20_sp1 = _mm256_shuffle_epi32(lhs_mat_s_23_20, 160); //A22(0-3) A23(0-3) A22(0-3) A23(0-3) A22(0-3) A23(0-3) A22(0-3) A23(0-3) + + const __m256i lhs_mat_s_01_21_sp1 = _mm256_shuffle_epi32(lhs_mat_s_01_21, 160); //A20(8-11) A20(8-11) A21(8-11) A21(8-11) A20(8-11) A20(8-11) A21(8-11) A21(8-11) + const __m256i lhs_mat_s_23_21_sp1 = _mm256_shuffle_epi32(lhs_mat_s_23_21, 160); //A22(8-11) A23(8-11) A22(8-11) A23(8-11) A22(8-11) A23(8-11) A22(8-11) A23(8-11) + + const __m256i lhs_mat_s_01_30_sp1 = _mm256_shuffle_epi32(lhs_mat_s_01_30, 160); //A30(0-3) A30(0-3) A31(0-3) A31(0-3) A30(0-3) A30(0-3) A31(0-3) A31(0-3) + const __m256i lhs_mat_s_23_30_sp1 = _mm256_shuffle_epi32(lhs_mat_s_23_30, 160); //A32(0-3) A33(0-3) A32(0-3) A33(0-3) A32(0-3) A33(0-3) A32(0-3) A33(0-3) + + const __m256i lhs_mat_s_01_31_sp1 = _mm256_shuffle_epi32(lhs_mat_s_01_31, 160); //A30(8-11) A30(8-11) A31(8-11) A31(8-11) A30(8-11) A30(8-11) A31(8-11) A31(8-11) + const __m256i lhs_mat_s_23_31_sp1 = _mm256_shuffle_epi32(lhs_mat_s_23_31, 160); //A32(8-11) A33(8-11) A32(8-11) A33(8-11) A32(8-11) A33(8-11) A32(8-11) A33(8-11) + + const __m256i lhs_mat_s_01_40_sp1 = _mm256_shuffle_epi32(lhs_mat_s_01_40, 160); //A40(0-3) A40(0-3) A41(0-3) A41(0-3) A40(0-3) A40(0-3) A41(0-3) A41(0-3) + const __m256i lhs_mat_s_23_40_sp1 = _mm256_shuffle_epi32(lhs_mat_s_23_40, 160); //A42(0-3) A43(0-3) A42(0-3) A43(0-3) A42(0-3) A43(0-3) A42(0-3) A43(0-3) + + const __m256i lhs_mat_s_01_41_sp1 = _mm256_shuffle_epi32(lhs_mat_s_01_41, 160); //A40(8-11) A40(8-11) A41(8-11) A41(8-11) A40(8-11) A40(8-11) A41(8-11) A41(8-11) + const __m256i lhs_mat_s_23_41_sp1 = _mm256_shuffle_epi32(lhs_mat_s_23_41, 160); //A42(8-11) A43(8-11) A42(8-11) A43(8-11) A42(8-11) A43(8-11) A42(8-11) A43(8-11) + + const __m256i lhs_mat_s_01_50_sp1 = _mm256_shuffle_epi32(lhs_mat_s_01_50, 160); //A50(0-3) A50(0-3) A51(0-3) A51(0-3) A50(0-3) A50(0-3) A51(0-3) A51(0-3) + const __m256i lhs_mat_s_23_50_sp1 = _mm256_shuffle_epi32(lhs_mat_s_23_50, 160); //A52(0-3) A53(0-3) A52(0-3) A53(0-3) A52(0-3) A53(0-3) A52(0-3) A53(0-3) + + const 
__m256i lhs_mat_s_01_51_sp1 = _mm256_shuffle_epi32(lhs_mat_s_01_51, 160); //A50(8-11) A50(8-11) A51(8-11) A51(8-11) A50(8-11) A50(8-11) A51(8-11) A51(8-11) + const __m256i lhs_mat_s_23_51_sp1 = _mm256_shuffle_epi32(lhs_mat_s_23_51, 160); //A52(8-11) A53(8-11) A52(8-11) A53(8-11) A52(8-11) A53(8-11) A52(8-11) A53(8-11) + + const __m256i lhs_mat_s_01_60_sp1 = _mm256_shuffle_epi32(lhs_mat_s_01_60, 160); //A60(0-3) A60(0-3) A61(0-3) A61(0-3) A60(0-3) A60(0-3) A61(0-3) A61(0-3) + const __m256i lhs_mat_s_23_60_sp1 = _mm256_shuffle_epi32(lhs_mat_s_23_60, 160); //A62(0-3) A63(0-3) A62(0-3) A63(0-3) A62(0-3) A63(0-3) A62(0-3) A63(0-3) + + const __m256i lhs_mat_s_01_61_sp1 = _mm256_shuffle_epi32(lhs_mat_s_01_61, 160); //A60(8-11) A60(8-11) A61(8-11) A61(8-11) A60(8-11) A60(8-11) A61(8-11) A61(8-11) + const __m256i lhs_mat_s_23_61_sp1 = _mm256_shuffle_epi32(lhs_mat_s_23_61, 160); //A62(8-11) A63(8-11) A62(8-11) A63(8-11) A62(8-11) A63(8-11) A62(8-11) A63(8-11) + + const __m256i lhs_mat_s_01_70_sp1 = _mm256_shuffle_epi32(lhs_mat_s_01_70, 160); //A70(0-3) A70(0-3) A71(0-3) A71(0-3) A70(0-3) A70(0-3) A71(0-3) A71(0-3) + const __m256i lhs_mat_s_23_70_sp1 = _mm256_shuffle_epi32(lhs_mat_s_23_70, 160); //A72(0-3) A73(0-3) A72(0-3) A73(0-3) A72(0-3) A73(0-3) A72(0-3) A73(0-3) + + const __m256i lhs_mat_s_01_71_sp1 = _mm256_shuffle_epi32(lhs_mat_s_01_71, 160); //A70(8-11) A70(8-11) A71(8-11) A71(8-11) A70(8-11) A70(8-11) A71(8-11) A71(8-11) + const __m256i lhs_mat_s_23_71_sp1 = _mm256_shuffle_epi32(lhs_mat_s_23_71, 160); //A72(8-11) A73(8-11) A72(8-11) A73(8-11) A72(8-11) A73(8-11) A72(8-11) A73(8-11) + + // Shuffle pattern two- left side input + const __m256i lhs_mat_s_01_00_sp2 = _mm256_shuffle_epi32(lhs_mat_s_01_00, 245); //A00(4-7) A00(4-7) A01(4-7) A01(4-7) A00(4-7) A00(4-7) A01(4-7) A01(4-7) + const __m256i lhs_mat_s_23_00_sp2 = _mm256_shuffle_epi32(lhs_mat_s_23_00, 245); //A02(4-7) A03(4-7) A02(4-7) A03(4-7) A02(4-7) A03(4-7) A02(4-7) A03(4-7) + + const __m256i lhs_mat_s_01_01_sp2 = _mm256_shuffle_epi32(lhs_mat_s_01_01, 245); //A00(12-15) A00(12-15) A01(12-15) A01(12-15) A00(12-15) A00(12-15) A01(12-15) A01(12-15) + const __m256i lhs_mat_s_23_01_sp2 = _mm256_shuffle_epi32(lhs_mat_s_23_01, 245); //A02(12-15) A03(12-15) A02(12-15) A03(12-15) A02(12-15) A03(12-15) A02(12-15) A03(12-15) + + const __m256i lhs_mat_s_01_10_sp2 = _mm256_shuffle_epi32(lhs_mat_s_01_10, 245); //A10(4-7) A10(4-7) A11(4-7) A11(4-7) A10(4-7) A10(4-7) A11(4-7) A11(4-7) + const __m256i lhs_mat_s_23_10_sp2 = _mm256_shuffle_epi32(lhs_mat_s_23_10, 245); //A12(4-7) A13(4-7) A12(4-7) A13(4-7) A12(4-7) A13(4-7) A12(4-7) A13(4-7) + + const __m256i lhs_mat_s_01_11_sp2 = _mm256_shuffle_epi32(lhs_mat_s_01_11, 245); //A10(12-15) A10(12-15) A11(12-15) A11(12-15) A10(12-15) A10(12-15) A11(12-15) A11(12-15) + const __m256i lhs_mat_s_23_11_sp2 = _mm256_shuffle_epi32(lhs_mat_s_23_11, 245); //A12(12-15) A13(12-15) A12(12-15) A13(12-15) A12(12-15) A13(12-15) A12(12-15) A13(12-15) + + const __m256i lhs_mat_s_01_20_sp2 = _mm256_shuffle_epi32(lhs_mat_s_01_20, 245); //A20(4-7) A20(4-7) A21(4-7) A21(4-7) A20(4-7) A20(4-7) A21(4-7) A21(4-7) + const __m256i lhs_mat_s_23_20_sp2 = _mm256_shuffle_epi32(lhs_mat_s_23_20, 245); //A22(4-7) A23(4-7) A22(4-7) A23(4-7) A22(4-7) A23(4-7) A22(4-7) A23(4-7) + + const __m256i lhs_mat_s_01_21_sp2 = _mm256_shuffle_epi32(lhs_mat_s_01_21, 245); //A20(12-15) A20(12-15) A21(12-15) A21(12-15) A20(12-15) A20(12-15) A21(12-15) A21(12-15) + const __m256i lhs_mat_s_23_21_sp2 = _mm256_shuffle_epi32(lhs_mat_s_23_21, 245); //A22(12-15) 
A23(12-15) A22(12-15) A23(12-15) A22(12-15) A23(12-15) A22(12-15) A23(12-15) + + const __m256i lhs_mat_s_01_30_sp2 = _mm256_shuffle_epi32(lhs_mat_s_01_30, 245); //A30(4-7) A30(4-7) A31(4-7) A31(4-7) A30(4-7) A30(4-7) A31(4-7) A31(4-7) + const __m256i lhs_mat_s_23_30_sp2 = _mm256_shuffle_epi32(lhs_mat_s_23_30, 245); //A32(4-7) A33(4-7) A32(4-7) A33(4-7) A32(4-7) A33(4-7) A32(4-7) A33(4-7) + + const __m256i lhs_mat_s_01_31_sp2 = _mm256_shuffle_epi32(lhs_mat_s_01_31, 245); //A30(12-15) A30(12-15) A31(12-15) A31(12-15) A30(12-15) A30(12-15) A31(12-15) A31(12-15) + const __m256i lhs_mat_s_23_31_sp2 = _mm256_shuffle_epi32(lhs_mat_s_23_31, 245); //A32(12-15) A33(12-15) A32(12-15) A33(12-15) A32(12-15) A33(12-15) A32(12-15) A33(12-15) + + const __m256i lhs_mat_s_01_40_sp2 = _mm256_shuffle_epi32(lhs_mat_s_01_40, 245); //A40(4-7) A40(4-7) A41(4-7) A41(4-7) A40(4-7) A40(4-7) A41(4-7) A41(4-7) + const __m256i lhs_mat_s_23_40_sp2 = _mm256_shuffle_epi32(lhs_mat_s_23_40, 245); //A42(4-7) A43(4-7) A42(4-7) A43(4-7) A42(4-7) A43(4-7) A42(4-7) A43(4-7) + + const __m256i lhs_mat_s_01_41_sp2 = _mm256_shuffle_epi32(lhs_mat_s_01_41, 245); //A40(12-15) A40(12-15) A41(12-15) A41(12-15) A40(12-15) A40(12-15) A41(12-15) A41(12-15) + const __m256i lhs_mat_s_23_41_sp2 = _mm256_shuffle_epi32(lhs_mat_s_23_41, 245); //A42(12-15) A43(12-15) A42(12-15) A43(12-15) A42(12-15) A43(12-15) A42(12-15) A43(12-15) + + const __m256i lhs_mat_s_01_50_sp2 = _mm256_shuffle_epi32(lhs_mat_s_01_50, 245); //A50(4-7) A50(4-7) A51(4-7) A51(4-7) A50(4-7) A50(4-7) A51(4-7) A51(4-7) + const __m256i lhs_mat_s_23_50_sp2 = _mm256_shuffle_epi32(lhs_mat_s_23_50, 245); //A52(4-7) A53(4-7) A52(4-7) A53(4-7) A52(4-7) A53(4-7) A52(4-7) A53(4-7) + + const __m256i lhs_mat_s_01_51_sp2 = _mm256_shuffle_epi32(lhs_mat_s_01_51, 245); //A50(12-15) A50(12-15) A51(12-15) A51(12-15) A50(12-15) A50(12-15) A51(12-15) A51(12-15) + const __m256i lhs_mat_s_23_51_sp2 = _mm256_shuffle_epi32(lhs_mat_s_23_51, 245); //A52(12-15) A53(12-15) A52(12-15) A53(12-15) A52(12-15) A53(12-15) A52(12-15) A53(12-15) + + const __m256i lhs_mat_s_01_60_sp2 = _mm256_shuffle_epi32(lhs_mat_s_01_60, 245); //A60(4-7) A60(4-7) A61(4-7) A61(4-7) A60(4-7) A60(4-7) A61(4-7) A61(4-7) + const __m256i lhs_mat_s_23_60_sp2 = _mm256_shuffle_epi32(lhs_mat_s_23_60, 245); //A62(4-7) A63(4-7) A62(4-7) A63(4-7) A62(4-7) A63(4-7) A62(4-7) A63(4-7) + + const __m256i lhs_mat_s_01_61_sp2 = _mm256_shuffle_epi32(lhs_mat_s_01_61, 245); //A60(12-15) A60(12-15) A61(12-15) A61(12-15) A60(12-15) A60(12-15) A61(12-15) A61(12-15) + const __m256i lhs_mat_s_23_61_sp2 = _mm256_shuffle_epi32(lhs_mat_s_23_61, 245); //A62(12-15) A63(12-15) A62(12-15) A63(12-15) A62(12-15) A63(12-15) A62(12-15) A63(12-15) + + const __m256i lhs_mat_s_01_70_sp2 = _mm256_shuffle_epi32(lhs_mat_s_01_70, 245); //A70(4-7) A70(4-7) A71(4-7) A71(4-7) A70(4-7) A70(4-7) A71(4-7) A71(4-7) + const __m256i lhs_mat_s_23_70_sp2 = _mm256_shuffle_epi32(lhs_mat_s_23_70, 245); //A72(4-7) A73(4-7) A72(4-7) A73(4-7) A72(4-7) A73(4-7) A72(4-7) A73(4-7) + + const __m256i lhs_mat_s_01_71_sp2 = _mm256_shuffle_epi32(lhs_mat_s_01_71, 245); //A70(12-15) A70(12-15) A71(12-15) A71(12-15) A70(12-15) A70(12-15) A71(12-15) A71(12-15) + const __m256i lhs_mat_s_23_71_sp2 = _mm256_shuffle_epi32(lhs_mat_s_23_71, 245); //A72(12-15) A73(12-15) A72(12-15) A73(12-15) A72(12-15) A73(12-15) A72(12-15) A73(12-15) + + // The values arranged in shuffle patterns are operated with dot product operation within 32 bit lane i.e corresponding bytes and multiplied and added into 32 bit integers 
within 32 bit lane + __m256i iacc_mat_00_0_sp1 = _mm256_add_epi16(_mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_0145_00_sp1, lhs_mat_01_00_sp1), lhs_mat_s_01_00_sp1), _mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_0145_01_sp1, lhs_mat_01_01_sp1), lhs_mat_s_01_01_sp1)); + __m256i iacc_mat_01_0_sp1 = _mm256_add_epi16(_mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_2367_00_sp1, lhs_mat_01_00_sp1), lhs_mat_s_01_00_sp1), _mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_2367_01_sp1, lhs_mat_01_01_sp1), lhs_mat_s_01_01_sp1)); + + __m256i iacc_mat_10_0_sp1 = _mm256_add_epi16(_mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_0145_00_sp1, lhs_mat_23_00_sp1), lhs_mat_s_23_00_sp1), _mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_0145_01_sp1, lhs_mat_23_01_sp1), lhs_mat_s_23_01_sp1)); + __m256i iacc_mat_11_0_sp1 = _mm256_add_epi16(_mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_2367_00_sp1, lhs_mat_23_00_sp1), lhs_mat_s_23_00_sp1), _mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_2367_01_sp1, lhs_mat_23_01_sp1), lhs_mat_s_23_01_sp1)); + + __m256i iacc_mat_00_1_sp1 = _mm256_add_epi16(_mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_0145_10_sp1, lhs_mat_01_10_sp1), lhs_mat_s_01_10_sp1), _mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_0145_11_sp1, lhs_mat_01_11_sp1), lhs_mat_s_01_11_sp1)); + __m256i iacc_mat_01_1_sp1 = _mm256_add_epi16(_mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_2367_10_sp1, lhs_mat_01_10_sp1), lhs_mat_s_01_10_sp1), _mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_2367_11_sp1, lhs_mat_01_11_sp1), lhs_mat_s_01_11_sp1)); + + __m256i iacc_mat_10_1_sp1 = _mm256_add_epi16(_mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_0145_10_sp1, lhs_mat_23_10_sp1), lhs_mat_s_23_10_sp1), _mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_0145_11_sp1, lhs_mat_23_11_sp1), lhs_mat_s_23_11_sp1)); + __m256i iacc_mat_11_1_sp1 = _mm256_add_epi16(_mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_2367_10_sp1, lhs_mat_23_10_sp1), lhs_mat_s_23_10_sp1), _mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_2367_11_sp1, lhs_mat_23_11_sp1), lhs_mat_s_23_11_sp1)); + + __m256i iacc_mat_00_2_sp1 = _mm256_add_epi16(_mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_0145_20_sp1, lhs_mat_01_20_sp1), lhs_mat_s_01_20_sp1), _mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_0145_21_sp1, lhs_mat_01_21_sp1), lhs_mat_s_01_21_sp1)); + __m256i iacc_mat_01_2_sp1 = _mm256_add_epi16(_mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_2367_20_sp1, lhs_mat_01_20_sp1), lhs_mat_s_01_20_sp1), _mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_2367_21_sp1, lhs_mat_01_21_sp1), lhs_mat_s_01_21_sp1)); + + __m256i iacc_mat_10_2_sp1 = _mm256_add_epi16(_mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_0145_20_sp1, lhs_mat_23_20_sp1), lhs_mat_s_23_20_sp1), _mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_0145_21_sp1, lhs_mat_23_21_sp1), lhs_mat_s_23_21_sp1)); + __m256i iacc_mat_11_2_sp1 = _mm256_add_epi16(_mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_2367_20_sp1, lhs_mat_23_20_sp1), lhs_mat_s_23_20_sp1), _mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_2367_21_sp1, lhs_mat_23_21_sp1), lhs_mat_s_23_21_sp1)); + + __m256i iacc_mat_00_3_sp1 = _mm256_add_epi16(_mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_0145_30_sp1, lhs_mat_01_30_sp1), lhs_mat_s_01_30_sp1), _mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_0145_31_sp1, lhs_mat_01_31_sp1), lhs_mat_s_01_31_sp1)); + __m256i iacc_mat_01_3_sp1 = _mm256_add_epi16(_mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_2367_30_sp1, lhs_mat_01_30_sp1), lhs_mat_s_01_30_sp1), _mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_2367_31_sp1, lhs_mat_01_31_sp1), lhs_mat_s_01_31_sp1)); + + __m256i 
iacc_mat_10_3_sp1 = _mm256_add_epi16(_mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_0145_30_sp1, lhs_mat_23_30_sp1), lhs_mat_s_23_30_sp1), _mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_0145_31_sp1, lhs_mat_23_31_sp1), lhs_mat_s_23_31_sp1)); + __m256i iacc_mat_11_3_sp1 = _mm256_add_epi16(_mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_2367_30_sp1, lhs_mat_23_30_sp1), lhs_mat_s_23_30_sp1), _mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_2367_31_sp1, lhs_mat_23_31_sp1), lhs_mat_s_23_31_sp1)); + + __m256i iacc_mat_00_4_sp1 = _mm256_add_epi16(_mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_0145_40_sp1, lhs_mat_01_40_sp1), lhs_mat_s_01_40_sp1), _mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_0145_41_sp1, lhs_mat_01_41_sp1), lhs_mat_s_01_41_sp1)); + __m256i iacc_mat_01_4_sp1 = _mm256_add_epi16(_mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_2367_40_sp1, lhs_mat_01_40_sp1), lhs_mat_s_01_40_sp1), _mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_2367_41_sp1, lhs_mat_01_41_sp1), lhs_mat_s_01_41_sp1)); + + __m256i iacc_mat_10_4_sp1 = _mm256_add_epi16(_mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_0145_40_sp1, lhs_mat_23_40_sp1), lhs_mat_s_23_40_sp1), _mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_0145_41_sp1, lhs_mat_23_41_sp1), lhs_mat_s_23_41_sp1)); + __m256i iacc_mat_11_4_sp1 = _mm256_add_epi16(_mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_2367_40_sp1, lhs_mat_23_40_sp1), lhs_mat_s_23_40_sp1), _mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_2367_41_sp1, lhs_mat_23_41_sp1), lhs_mat_s_23_41_sp1)); + + __m256i iacc_mat_00_5_sp1 = _mm256_add_epi16(_mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_0145_50_sp1, lhs_mat_01_50_sp1), lhs_mat_s_01_50_sp1), _mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_0145_51_sp1, lhs_mat_01_51_sp1), lhs_mat_s_01_51_sp1)); + __m256i iacc_mat_01_5_sp1 = _mm256_add_epi16(_mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_2367_50_sp1, lhs_mat_01_50_sp1), lhs_mat_s_01_50_sp1), _mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_2367_51_sp1, lhs_mat_01_51_sp1), lhs_mat_s_01_51_sp1)); + + __m256i iacc_mat_10_5_sp1 = _mm256_add_epi16(_mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_0145_50_sp1, lhs_mat_23_50_sp1), lhs_mat_s_23_50_sp1), _mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_0145_51_sp1, lhs_mat_23_51_sp1), lhs_mat_s_23_51_sp1)); + __m256i iacc_mat_11_5_sp1 = _mm256_add_epi16(_mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_2367_50_sp1, lhs_mat_23_50_sp1), lhs_mat_s_23_50_sp1), _mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_2367_51_sp1, lhs_mat_23_51_sp1), lhs_mat_s_23_51_sp1)); + + __m256i iacc_mat_00_6_sp1 = _mm256_add_epi16(_mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_0145_60_sp1, lhs_mat_01_60_sp1), lhs_mat_s_01_60_sp1), _mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_0145_61_sp1, lhs_mat_01_61_sp1), lhs_mat_s_01_61_sp1)); + __m256i iacc_mat_01_6_sp1 = _mm256_add_epi16(_mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_2367_60_sp1, lhs_mat_01_60_sp1), lhs_mat_s_01_60_sp1), _mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_2367_61_sp1, lhs_mat_01_61_sp1), lhs_mat_s_01_61_sp1)); + + __m256i iacc_mat_10_6_sp1 = _mm256_add_epi16(_mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_0145_60_sp1, lhs_mat_23_60_sp1), lhs_mat_s_23_60_sp1), _mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_0145_61_sp1, lhs_mat_23_61_sp1), lhs_mat_s_23_61_sp1)); + __m256i iacc_mat_11_6_sp1 = _mm256_add_epi16(_mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_2367_60_sp1, lhs_mat_23_60_sp1), lhs_mat_s_23_60_sp1), _mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_2367_61_sp1, lhs_mat_23_61_sp1), lhs_mat_s_23_61_sp1)); + + __m256i iacc_mat_00_7_sp1 = 
_mm256_add_epi16(_mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_0145_70_sp1, lhs_mat_01_70_sp1), lhs_mat_s_01_70_sp1), _mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_0145_71_sp1, lhs_mat_01_71_sp1), lhs_mat_s_01_71_sp1)); + __m256i iacc_mat_01_7_sp1 = _mm256_add_epi16(_mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_2367_70_sp1, lhs_mat_01_70_sp1), lhs_mat_s_01_70_sp1), _mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_2367_71_sp1, lhs_mat_01_71_sp1), lhs_mat_s_01_71_sp1)); + + __m256i iacc_mat_10_7_sp1 = _mm256_add_epi16(_mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_0145_70_sp1, lhs_mat_23_70_sp1), lhs_mat_s_23_70_sp1), _mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_0145_71_sp1, lhs_mat_23_71_sp1), lhs_mat_s_23_71_sp1)); + __m256i iacc_mat_11_7_sp1 = _mm256_add_epi16(_mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_2367_70_sp1, lhs_mat_23_70_sp1), lhs_mat_s_23_70_sp1), _mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_2367_71_sp1, lhs_mat_23_71_sp1), lhs_mat_s_23_71_sp1)); + + __m256i iacc_mat_00_0_sp2 = _mm256_add_epi16(_mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_0145_00_sp2, lhs_mat_01_00_sp2), lhs_mat_s_01_00_sp2), _mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_0145_01_sp2, lhs_mat_01_01_sp2), lhs_mat_s_01_01_sp2)); + __m256i iacc_mat_01_0_sp2 = _mm256_add_epi16(_mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_2367_00_sp2, lhs_mat_01_00_sp2), lhs_mat_s_01_00_sp2), _mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_2367_01_sp2, lhs_mat_01_01_sp2), lhs_mat_s_01_01_sp2)); + + __m256i iacc_mat_10_0_sp2 = _mm256_add_epi16(_mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_0145_00_sp2, lhs_mat_23_00_sp2), lhs_mat_s_23_00_sp2), _mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_0145_01_sp2, lhs_mat_23_01_sp2), lhs_mat_s_23_01_sp2)); + __m256i iacc_mat_11_0_sp2 = _mm256_add_epi16(_mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_2367_00_sp2, lhs_mat_23_00_sp2), lhs_mat_s_23_00_sp2), _mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_2367_01_sp2, lhs_mat_23_01_sp2), lhs_mat_s_23_01_sp2)); + + __m256i iacc_mat_00_1_sp2 = _mm256_add_epi16(_mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_0145_10_sp2, lhs_mat_01_10_sp2), lhs_mat_s_01_10_sp2), _mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_0145_11_sp2, lhs_mat_01_11_sp2), lhs_mat_s_01_11_sp2)); + __m256i iacc_mat_01_1_sp2 = _mm256_add_epi16(_mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_2367_10_sp2, lhs_mat_01_10_sp2), lhs_mat_s_01_10_sp2), _mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_2367_11_sp2, lhs_mat_01_11_sp2), lhs_mat_s_01_11_sp2)); + + __m256i iacc_mat_10_1_sp2 = _mm256_add_epi16(_mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_0145_10_sp2, lhs_mat_23_10_sp2), lhs_mat_s_23_10_sp2), _mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_0145_11_sp2, lhs_mat_23_11_sp2), lhs_mat_s_23_11_sp2)); + __m256i iacc_mat_11_1_sp2 = _mm256_add_epi16(_mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_2367_10_sp2, lhs_mat_23_10_sp2), lhs_mat_s_23_10_sp2), _mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_2367_11_sp2, lhs_mat_23_11_sp2), lhs_mat_s_23_11_sp2)); + + __m256i iacc_mat_00_2_sp2 = _mm256_add_epi16(_mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_0145_20_sp2, lhs_mat_01_20_sp2), lhs_mat_s_01_20_sp2), _mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_0145_21_sp2, lhs_mat_01_21_sp2), lhs_mat_s_01_21_sp2)); + __m256i iacc_mat_01_2_sp2 = _mm256_add_epi16(_mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_2367_20_sp2, lhs_mat_01_20_sp2), lhs_mat_s_01_20_sp2), _mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_2367_21_sp2, lhs_mat_01_21_sp2), lhs_mat_s_01_21_sp2)); + + __m256i iacc_mat_10_2_sp2 = 
_mm256_add_epi16(_mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_0145_20_sp2, lhs_mat_23_20_sp2), lhs_mat_s_23_20_sp2), _mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_0145_21_sp2, lhs_mat_23_21_sp2), lhs_mat_s_23_21_sp2)); + __m256i iacc_mat_11_2_sp2 = _mm256_add_epi16(_mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_2367_20_sp2, lhs_mat_23_20_sp2), lhs_mat_s_23_20_sp2), _mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_2367_21_sp2, lhs_mat_23_21_sp2), lhs_mat_s_23_21_sp2)); + + __m256i iacc_mat_00_3_sp2 = _mm256_add_epi16(_mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_0145_30_sp2, lhs_mat_01_30_sp2), lhs_mat_s_01_30_sp2), _mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_0145_31_sp2, lhs_mat_01_31_sp2), lhs_mat_s_01_31_sp2)); + __m256i iacc_mat_01_3_sp2 = _mm256_add_epi16(_mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_2367_30_sp2, lhs_mat_01_30_sp2), lhs_mat_s_01_30_sp2), _mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_2367_31_sp2, lhs_mat_01_31_sp2), lhs_mat_s_01_31_sp2)); + + __m256i iacc_mat_10_3_sp2 = _mm256_add_epi16(_mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_0145_30_sp2, lhs_mat_23_30_sp2), lhs_mat_s_23_30_sp2), _mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_0145_31_sp2, lhs_mat_23_31_sp2), lhs_mat_s_23_31_sp2)); + __m256i iacc_mat_11_3_sp2 = _mm256_add_epi16(_mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_2367_30_sp2, lhs_mat_23_30_sp2), lhs_mat_s_23_30_sp2), _mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_2367_31_sp2, lhs_mat_23_31_sp2), lhs_mat_s_23_31_sp2)); + + __m256i iacc_mat_00_4_sp2 = _mm256_add_epi16(_mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_0145_40_sp2, lhs_mat_01_40_sp2), lhs_mat_s_01_40_sp2), _mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_0145_41_sp2, lhs_mat_01_41_sp2), lhs_mat_s_01_41_sp2)); + __m256i iacc_mat_01_4_sp2 = _mm256_add_epi16(_mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_2367_40_sp2, lhs_mat_01_40_sp2), lhs_mat_s_01_40_sp2), _mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_2367_41_sp2, lhs_mat_01_41_sp2), lhs_mat_s_01_41_sp2)); + + __m256i iacc_mat_10_4_sp2 = _mm256_add_epi16(_mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_0145_40_sp2, lhs_mat_23_40_sp2), lhs_mat_s_23_40_sp2), _mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_0145_41_sp2, lhs_mat_23_41_sp2), lhs_mat_s_23_41_sp2)); + __m256i iacc_mat_11_4_sp2 = _mm256_add_epi16(_mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_2367_40_sp2, lhs_mat_23_40_sp2), lhs_mat_s_23_40_sp2), _mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_2367_41_sp2, lhs_mat_23_41_sp2), lhs_mat_s_23_41_sp2)); + + __m256i iacc_mat_00_5_sp2 = _mm256_add_epi16(_mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_0145_50_sp2, lhs_mat_01_50_sp2), lhs_mat_s_01_50_sp2), _mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_0145_51_sp2, lhs_mat_01_51_sp2), lhs_mat_s_01_51_sp2)); + __m256i iacc_mat_01_5_sp2 = _mm256_add_epi16(_mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_2367_50_sp2, lhs_mat_01_50_sp2), lhs_mat_s_01_50_sp2), _mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_2367_51_sp2, lhs_mat_01_51_sp2), lhs_mat_s_01_51_sp2)); + + __m256i iacc_mat_10_5_sp2 = _mm256_add_epi16(_mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_0145_50_sp2, lhs_mat_23_50_sp2), lhs_mat_s_23_50_sp2), _mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_0145_51_sp2, lhs_mat_23_51_sp2), lhs_mat_s_23_51_sp2)); + __m256i iacc_mat_11_5_sp2 = _mm256_add_epi16(_mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_2367_50_sp2, lhs_mat_23_50_sp2), lhs_mat_s_23_50_sp2), _mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_2367_51_sp2, lhs_mat_23_51_sp2), lhs_mat_s_23_51_sp2)); + + __m256i iacc_mat_00_6_sp2 = 
_mm256_add_epi16(_mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_0145_60_sp2, lhs_mat_01_60_sp2), lhs_mat_s_01_60_sp2), _mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_0145_61_sp2, lhs_mat_01_61_sp2), lhs_mat_s_01_61_sp2)); + __m256i iacc_mat_01_6_sp2 = _mm256_add_epi16(_mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_2367_60_sp2, lhs_mat_01_60_sp2), lhs_mat_s_01_60_sp2), _mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_2367_61_sp2, lhs_mat_01_61_sp2), lhs_mat_s_01_61_sp2)); + + __m256i iacc_mat_10_6_sp2 = _mm256_add_epi16(_mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_0145_60_sp2, lhs_mat_23_60_sp2), lhs_mat_s_23_60_sp2), _mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_0145_61_sp2, lhs_mat_23_61_sp2), lhs_mat_s_23_61_sp2)); + __m256i iacc_mat_11_6_sp2 = _mm256_add_epi16(_mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_2367_60_sp2, lhs_mat_23_60_sp2), lhs_mat_s_23_60_sp2), _mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_2367_61_sp2, lhs_mat_23_61_sp2), lhs_mat_s_23_61_sp2)); + + __m256i iacc_mat_00_7_sp2 = _mm256_add_epi16(_mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_0145_70_sp2, lhs_mat_01_70_sp2), lhs_mat_s_01_70_sp2), _mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_0145_71_sp2, lhs_mat_01_71_sp2), lhs_mat_s_01_71_sp2)); + __m256i iacc_mat_01_7_sp2 = _mm256_add_epi16(_mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_2367_70_sp2, lhs_mat_01_70_sp2), lhs_mat_s_01_70_sp2), _mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_2367_71_sp2, lhs_mat_01_71_sp2), lhs_mat_s_01_71_sp2)); + + __m256i iacc_mat_10_7_sp2 = _mm256_add_epi16(_mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_0145_70_sp2, lhs_mat_23_70_sp2), lhs_mat_s_23_70_sp2), _mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_0145_71_sp2, lhs_mat_23_71_sp2), lhs_mat_s_23_71_sp2)); + __m256i iacc_mat_11_7_sp2 = _mm256_add_epi16(_mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_2367_70_sp2, lhs_mat_23_70_sp2), lhs_mat_s_23_70_sp2), _mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_2367_71_sp2, lhs_mat_23_71_sp2), lhs_mat_s_23_71_sp2)); + + // Combine results from both shuffle patterns for each output block + __m256i iacc_mat_00_0 = _mm256_add_epi16(iacc_mat_00_0_sp1, iacc_mat_00_0_sp2); + __m256i iacc_mat_01_0 = _mm256_add_epi16(iacc_mat_01_0_sp1, iacc_mat_01_0_sp2); + __m256i iacc_mat_10_0 = _mm256_add_epi16(iacc_mat_10_0_sp1, iacc_mat_10_0_sp2); + __m256i iacc_mat_11_0 = _mm256_add_epi16(iacc_mat_11_0_sp1, iacc_mat_11_0_sp2); + + __m256i iacc_mat_00_1 = _mm256_add_epi16(iacc_mat_00_1_sp1, iacc_mat_00_1_sp2); + __m256i iacc_mat_01_1 = _mm256_add_epi16(iacc_mat_01_1_sp1, iacc_mat_01_1_sp2); + __m256i iacc_mat_10_1 = _mm256_add_epi16(iacc_mat_10_1_sp1, iacc_mat_10_1_sp2); + __m256i iacc_mat_11_1 = _mm256_add_epi16(iacc_mat_11_1_sp1, iacc_mat_11_1_sp2); + + __m256i iacc_mat_00_2 = _mm256_add_epi16(iacc_mat_00_2_sp1, iacc_mat_00_2_sp2); + __m256i iacc_mat_01_2 = _mm256_add_epi16(iacc_mat_01_2_sp1, iacc_mat_01_2_sp2); + __m256i iacc_mat_10_2 = _mm256_add_epi16(iacc_mat_10_2_sp1, iacc_mat_10_2_sp2); + __m256i iacc_mat_11_2 = _mm256_add_epi16(iacc_mat_11_2_sp1, iacc_mat_11_2_sp2); + + __m256i iacc_mat_00_3 = _mm256_add_epi16(iacc_mat_00_3_sp1, iacc_mat_00_3_sp2); + __m256i iacc_mat_01_3 = _mm256_add_epi16(iacc_mat_01_3_sp1, iacc_mat_01_3_sp2); + __m256i iacc_mat_10_3 = _mm256_add_epi16(iacc_mat_10_3_sp1, iacc_mat_10_3_sp2); + __m256i iacc_mat_11_3 = _mm256_add_epi16(iacc_mat_11_3_sp1, iacc_mat_11_3_sp2); + + __m256i iacc_mat_00_4 = _mm256_add_epi16(iacc_mat_00_4_sp1, iacc_mat_00_4_sp2); + __m256i iacc_mat_01_4 = _mm256_add_epi16(iacc_mat_01_4_sp1, iacc_mat_01_4_sp2); + __m256i 
iacc_mat_10_4 = _mm256_add_epi16(iacc_mat_10_4_sp1, iacc_mat_10_4_sp2); + __m256i iacc_mat_11_4 = _mm256_add_epi16(iacc_mat_11_4_sp1, iacc_mat_11_4_sp2); + + __m256i iacc_mat_00_5 = _mm256_add_epi16(iacc_mat_00_5_sp1, iacc_mat_00_5_sp2); + __m256i iacc_mat_01_5 = _mm256_add_epi16(iacc_mat_01_5_sp1, iacc_mat_01_5_sp2); + __m256i iacc_mat_10_5 = _mm256_add_epi16(iacc_mat_10_5_sp1, iacc_mat_10_5_sp2); + __m256i iacc_mat_11_5 = _mm256_add_epi16(iacc_mat_11_5_sp1, iacc_mat_11_5_sp2); + + __m256i iacc_mat_00_6 = _mm256_add_epi16(iacc_mat_00_6_sp1, iacc_mat_00_6_sp2); + __m256i iacc_mat_01_6 = _mm256_add_epi16(iacc_mat_01_6_sp1, iacc_mat_01_6_sp2); + __m256i iacc_mat_10_6 = _mm256_add_epi16(iacc_mat_10_6_sp1, iacc_mat_10_6_sp2); + __m256i iacc_mat_11_6 = _mm256_add_epi16(iacc_mat_11_6_sp1, iacc_mat_11_6_sp2); + + __m256i iacc_mat_00_7 = _mm256_add_epi16(iacc_mat_00_7_sp1, iacc_mat_00_7_sp2); + __m256i iacc_mat_01_7 = _mm256_add_epi16(iacc_mat_01_7_sp1, iacc_mat_01_7_sp2); + __m256i iacc_mat_10_7 = _mm256_add_epi16(iacc_mat_10_7_sp1, iacc_mat_10_7_sp2); + __m256i iacc_mat_11_7 = _mm256_add_epi16(iacc_mat_11_7_sp1, iacc_mat_11_7_sp2); + + // Output of both shuffle patterns are added in order to sum dot product outputs of all 32 values in block + iacc_mat_00_0 = _mm256_madd_epi16(iacc_mat_00_0, scale_0145_0); + iacc_mat_01_0 = _mm256_madd_epi16(iacc_mat_01_0, scale_2367_0); + iacc_mat_10_0 = _mm256_madd_epi16(iacc_mat_10_0, scale_0145_0); + iacc_mat_11_0 = _mm256_madd_epi16(iacc_mat_11_0, scale_2367_0); + + iacc_mat_00_1 = _mm256_madd_epi16(iacc_mat_00_1, scale_0145_1); + iacc_mat_01_1 = _mm256_madd_epi16(iacc_mat_01_1, scale_2367_1); + iacc_mat_10_1 = _mm256_madd_epi16(iacc_mat_10_1, scale_0145_1); + iacc_mat_11_1 = _mm256_madd_epi16(iacc_mat_11_1, scale_2367_1); + + iacc_mat_00_2 = _mm256_madd_epi16(iacc_mat_00_2, scale_0145_2); + iacc_mat_01_2 = _mm256_madd_epi16(iacc_mat_01_2, scale_2367_2); + iacc_mat_10_2 = _mm256_madd_epi16(iacc_mat_10_2, scale_0145_2); + iacc_mat_11_2 = _mm256_madd_epi16(iacc_mat_11_2, scale_2367_2); + + iacc_mat_00_3 = _mm256_madd_epi16(iacc_mat_00_3, scale_0145_3); + iacc_mat_01_3 = _mm256_madd_epi16(iacc_mat_01_3, scale_2367_3); + iacc_mat_10_3 = _mm256_madd_epi16(iacc_mat_10_3, scale_0145_3); + iacc_mat_11_3 = _mm256_madd_epi16(iacc_mat_11_3, scale_2367_3); + + iacc_mat_00_4 = _mm256_madd_epi16(iacc_mat_00_4, scale_0145_4); + iacc_mat_01_4 = _mm256_madd_epi16(iacc_mat_01_4, scale_2367_4); + iacc_mat_10_4 = _mm256_madd_epi16(iacc_mat_10_4, scale_0145_4); + iacc_mat_11_4 = _mm256_madd_epi16(iacc_mat_11_4, scale_2367_4); + + iacc_mat_00_5 = _mm256_madd_epi16(iacc_mat_00_5, scale_0145_5); + iacc_mat_01_5 = _mm256_madd_epi16(iacc_mat_01_5, scale_2367_5); + iacc_mat_10_5 = _mm256_madd_epi16(iacc_mat_10_5, scale_0145_5); + iacc_mat_11_5 = _mm256_madd_epi16(iacc_mat_11_5, scale_2367_5); + + iacc_mat_00_6 = _mm256_madd_epi16(iacc_mat_00_6, scale_0145_6); + iacc_mat_01_6 = _mm256_madd_epi16(iacc_mat_01_6, scale_2367_6); + iacc_mat_10_6 = _mm256_madd_epi16(iacc_mat_10_6, scale_0145_6); + iacc_mat_11_6 = _mm256_madd_epi16(iacc_mat_11_6, scale_2367_6); + + iacc_mat_00_7 = _mm256_madd_epi16(iacc_mat_00_7, scale_0145_7); + iacc_mat_01_7 = _mm256_madd_epi16(iacc_mat_01_7, scale_2367_7); + iacc_mat_10_7 = _mm256_madd_epi16(iacc_mat_10_7, scale_0145_7); + iacc_mat_11_7 = _mm256_madd_epi16(iacc_mat_11_7, scale_2367_7); + + __m256i iacc_mat_00 = _mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(iacc_mat_00_0, iacc_mat_00_1), _mm256_add_epi32(iacc_mat_00_2, iacc_mat_00_3)), 
_mm256_add_epi32(_mm256_add_epi32(iacc_mat_00_4, iacc_mat_00_5), _mm256_add_epi32(iacc_mat_00_6, iacc_mat_00_7))); + __m256i iacc_mat_01 = _mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(iacc_mat_01_0, iacc_mat_01_1), _mm256_add_epi32(iacc_mat_01_2, iacc_mat_01_3)), _mm256_add_epi32(_mm256_add_epi32(iacc_mat_01_4, iacc_mat_01_5), _mm256_add_epi32(iacc_mat_01_6, iacc_mat_01_7))); + __m256i iacc_mat_10 = _mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(iacc_mat_10_0, iacc_mat_10_1), _mm256_add_epi32(iacc_mat_10_2, iacc_mat_10_3)), _mm256_add_epi32(_mm256_add_epi32(iacc_mat_10_4, iacc_mat_10_5), _mm256_add_epi32(iacc_mat_10_6, iacc_mat_10_7))); + __m256i iacc_mat_11 = _mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(iacc_mat_11_0, iacc_mat_11_1), _mm256_add_epi32(iacc_mat_11_2, iacc_mat_11_3)), _mm256_add_epi32(_mm256_add_epi32(iacc_mat_11_4, iacc_mat_11_5), _mm256_add_epi32(iacc_mat_11_6, iacc_mat_11_7))); + + // Straighten out to make 4 row vectors + __m256i iacc_row_0 = _mm256_blend_epi32(iacc_mat_00, _mm256_shuffle_epi32(iacc_mat_01, 78), 204); + __m256i iacc_row_1 = _mm256_blend_epi32(_mm256_shuffle_epi32(iacc_mat_00, 78), iacc_mat_01, 204); + __m256i iacc_row_2 = _mm256_blend_epi32(iacc_mat_10, _mm256_shuffle_epi32(iacc_mat_11, 78), 204); + __m256i iacc_row_3 = _mm256_blend_epi32(_mm256_shuffle_epi32(iacc_mat_10, 78), iacc_mat_11, 204); + + // Load the scale(d) values for all the 4 Q8_k blocks and repeat it across lanes + const __m128 row_scale_f32_sse = _mm_load_ps(a_ptrs[rp][b].d); + const __m256 row_scale_f32 = _mm256_set_m128(row_scale_f32_sse, row_scale_f32_sse); + + // Multiply with appropiate scales and accumulate (for both d and dmin) below + acc_rows[rp * 4] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_0), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 0)), acc_rows[rp * 4]); + acc_rows[rp * 4 + 1] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_1), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 85)), acc_rows[rp * 4 + 1]); + acc_rows[rp * 4 + 2] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_2), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 170)), acc_rows[rp * 4 + 2]); + acc_rows[rp * 4 + 3] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_3), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 255)), acc_rows[rp * 4 + 3]); + } + } + } + + // Store the accumulated values + for (int i = 0; i < 16; i++) { + _mm256_storeu_ps((float * )(s + ((y * 4 + i) * bs + x * 8)), acc_rows[i]); + } + } + } + +#else + + ggml_gemm_q6_K_8x8_q8_K_generic(n, s, bs, vx, vy, nr, nc); + + +#endif +} \ No newline at end of file diff --git a/ggml/src/ggml-cpu/repack.cpp b/ggml/src/ggml-cpu/repack.cpp index b70ea7d78b..f88cd7f627 100644 --- a/ggml/src/ggml-cpu/repack.cpp +++ b/ggml/src/ggml-cpu/repack.cpp @@ -616,6 +616,10 @@ void ggml_gemv_q2_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, } } +void ggml_gemv_q6_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { + +} + void ggml_gemv_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { const int qk = QK8_0; const int nb = n / qk; @@ -1118,6 +1122,9 @@ void ggml_gemm_q2_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, } } +void ggml_gemm_q6_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT 
vx, const void * GGML_RESTRICT vy, int nr, int nc) { + +} void ggml_gemm_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { const int qk = QK8_0; const int nb = n / qk; @@ -1410,6 +1417,51 @@ static block_q2_Kx8 make_block_q2_Kx8(block_q2_K * in, unsigned int blck_size_in } + +static block_q6_Kx8 make_block_q6_Kx8(block_q6_K* in, unsigned int blck_size_interleave) { + block_q6_Kx8 out; + + // Delta (scale) values of the eight Q6_K structures are copied onto the output interleaved structure + for (int i = 0; i < 8; i++) { + out.d[i] = in[i].d; + } + + const int end = QK_K * 4 / blck_size_interleave; + + // Interleave Q6_K quants by taking 8 bytes at a time + for (int i = 0; i < end; ++i) { + int src_id = i % 8; + int src_offset = (i / 8) * blck_size_interleave; + int dst_offset = i * blck_size_interleave; + + uint64_t ql_elems; + memcpy(&ql_elems, &in[src_id].ql[src_offset], sizeof(uint64_t)); + memcpy(&out.ql[dst_offset], &ql_elems, sizeof(uint64_t)); + } + + for (int i = 0; i < 64; ++i) { + int qh_src_id = i % 8; + int qh_src_offset = (i / 8) * blck_size_interleave; + int qh_dst_offset = i * blck_size_interleave; + + uint64_t qh_elems; + memcpy(&qh_elems, &in[qh_src_id].qh[qh_src_offset], sizeof(uint64_t)); + memcpy(&out.qh[qh_dst_offset], &qh_elems, sizeof(uint64_t)); + } + + for (int i = 0; i < 128; i++) { + + // Index for selecting which Q6_K super block + int src1 = (i % 16) / 2; + // Index for selecting scale + int src2 = ((i / 16) * 2) + (i % 2); + + out.scales[i] = in[src1].scales[src2]; + } + return out; + +} + static int repack_q4_0_to_q4_0_4_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) { GGML_ASSERT(t->type == GGML_TYPE_Q4_0); GGML_ASSERT(interleave_block == 4 || interleave_block == 8); @@ -1503,6 +1555,38 @@ static int repack_q2_K_to_q2_K_8_bl(struct ggml_tensor * t, int interleave_block GGML_UNUSED(data_size); } +static int repack_q6_K_to_q6_K_8_bl(struct ggml_tensor* t, int interleave_block, const void* GGML_RESTRICT data, size_t data_size) { + GGML_ASSERT(t->type == GGML_TYPE_Q6_K); + GGML_ASSERT(interleave_block == 8); + constexpr int nrows_interleaved = 8; + + block_q6_Kx8* dst = (block_q6_Kx8*)t->data; + const block_q6_K* src = (const block_q6_K*)data; + block_q6_K dst_tmp[8]; + int nrow = ggml_nrows(t); + int nblocks = t->ne[0] / QK_K; + + GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_q6_K)); + + if (t->ne[1] % nrows_interleaved != 0 || t->ne[0] % 8 != 0) { + return -1; + } + + for (int b = 0; b < nrow; b += nrows_interleaved) { + for (int64_t x = 0; x < nblocks; x++) { + for (int i = 0; i < nrows_interleaved; i++) { + dst_tmp[i] = src[x + i * nblocks]; + } + *dst++ = make_block_q6_Kx8(dst_tmp, interleave_block); + } + src += nrows_interleaved * nblocks; + } + return 0; + + GGML_UNUSED(data_size); +} + + static int repack_q4_0_to_q4_0_8_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) { GGML_ASSERT(t->type == GGML_TYPE_Q4_0); GGML_ASSERT(interleave_block == 8); @@ -1689,6 +1773,10 @@ template <> int repack(struct ggml_tensor * t, const void * da return repack_q2_K_to_q2_K_8_bl(t, 8, data, data_size); } +template <> int repack<block_q6_K, 8, 8>(struct ggml_tensor * t, const void * data, size_t data_size) { + return repack_q6_K_to_q6_K_8_bl(t, 8, data, data_size); +} + template <> int repack<block_iq4_nl, 4, 4>(struct ggml_tensor * t, const void * data, size_t data_size) { return repack_iq4_nl_to_iq4_nl_4_bl(t, 4, data, data_size); }
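The repacking added above follows the same 8-row interleaving pattern as the existing Q4_K and Q2_K paths: make_block_q6_Kx8 walks the output in 8-byte chunks, cycling through the eight source rows before advancing within a row, and repack_q6_K_to_q6_K_8_bl applies it to every group of eight rows. A minimal scalar sketch of that byte mapping (illustrative only, not part of the patch; the helper name is made up):

#include <stdint.h>
#include <string.h>

// Illustrative sketch of the 8-byte interleave used for both ql and qh.
// Chunk j of the interleaved array holds bytes [(j/8)*8, (j/8)*8 + 7] of source row j % 8.
static void interleave_rows_8x8(const uint8_t * src_rows[8], size_t row_bytes, uint8_t * dst) {
    const size_t chunk   = 8;                     // blck_size_interleave
    const size_t nchunks = 8 * row_bytes / chunk; // 128 chunks for ql (8 x 128 B), 64 for qh (8 x 64 B)
    for (size_t j = 0; j < nchunks; ++j) {
        const size_t row = j % 8;                 // cycle through the eight source rows
        const size_t off = (j / 8) * chunk;       // then advance within each row
        memcpy(dst + j * chunk, src_rows[row] + off, chunk);
    }
}

With QK_K = 256 this gives the 128 ql iterations and 64 qh iterations seen in make_block_q6_Kx8; the scales are interleaved separately, two per row at a time, which is why the scales loop indexes src1 = (i % 16) / 2 and src2 = (i / 16) * 2 + (i % 2).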
@@ -1730,6 +1818,10 @@ template <> void gemv(int n, float * s, size_t ggml_gemv_q2_K_8x8_q8_K(n, s, bs, vx, vy, nr, nc); } +template <> void gemv<block_q6_K, 8, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) { + ggml_gemv_q6_K_8x8_q8_K(n, s, bs, vx, vy, nr, nc); +} + template <> void gemv<block_iq4_nl, 4, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) { ggml_gemv_iq4_nl_4x4_q8_0(n, s, bs, vx, vy, nr, nc); } @@ -1766,6 +1858,10 @@ template <> void gemm(int n, float * s, size_t ggml_gemm_q2_K_8x8_q8_K(n, s, bs, vx, vy, nr, nc); } +template <> void gemm<block_q6_K, 8, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) { + ggml_gemm_q6_K_8x8_q8_K(n, s, bs, vx, vy, nr, nc); +} + template <> void gemm<block_iq4_nl, 4, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) { ggml_gemm_iq4_nl_4x4_q8_0(n, s, bs, vx, vy, nr, nc); } @@ -2164,6 +2260,9 @@ static const ggml::cpu::tensor_traits * ggml_repack_get_optimal_repack_type(cons // instance for Q2 static const ggml::cpu::repack::tensor_traits<block_q2_K, 8, 8, GGML_TYPE_Q8_K> q2_K_8x8_q8_K; + // instance for Q6 + static const ggml::cpu::repack::tensor_traits<block_q6_K, 8, 8, GGML_TYPE_Q8_K> q6_K_8x8_q8_K; + // instance for IQ4 static const ggml::cpu::repack::tensor_traits<block_iq4_nl, 4, 4, GGML_TYPE_Q8_0> iq4_nl_4x4_q8_0; static const ggml::cpu::repack::tensor_traits<block_iq4_nl, 8, 8, GGML_TYPE_Q8_0> iq4_nl_8x8_q8_0; @@ -2207,6 +2306,12 @@ static const ggml::cpu::tensor_traits * ggml_repack_get_optimal_repack_type(cons return &q2_K_8x8_q8_K; } } + } else if (cur->type == GGML_TYPE_Q6_K) { + if (ggml_cpu_has_avx2()) { + if (cur->ne[1] % 8 == 0) { + return &q6_K_8x8_q8_K; + } + } } else if (cur->type == GGML_TYPE_IQ4_NL) { if (ggml_cpu_has_avx2()) { if (cur->ne[1] % 8 == 0) { diff --git a/ggml/src/ggml-cpu/repack.h b/ggml/src/ggml-cpu/repack.h index c4d928cd15..4a9c723c60 100644 --- a/ggml/src/ggml-cpu/repack.h +++ b/ggml/src/ggml-cpu/repack.h @@ -52,6 +52,14 @@ struct block_q2_Kx8 { }; static_assert(sizeof(block_q2_Kx8) == sizeof(ggml_half) * 16 + QK_K/2 + QK_K * 2, "wrong q2_K block size/padding"); +struct block_q6_Kx8 { + uint8_t ql[1024]; // quants, lower 4 bits + uint8_t qh[512]; // quants, upper 2 bits + int8_t scales[128]; // scales, quantized with 8 bits + ggml_half d[8]; +}; + +static_assert(sizeof(block_q6_Kx8) == sizeof(ggml_half)*8 + (QK_K)/2 + 24*QK_K/4, "wrong block_q6_Kx8 size/padding"); struct block_q8_Kx4 { float d[4]; // delta int8_t qs[QK_K * 4]; // quants @@ -88,6 +96,7 @@ void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo void ggml_gemv_q4_K_8x4_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); void ggml_gemv_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); void ggml_gemv_q2_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); +void ggml_gemv_q6_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); void ggml_gemv_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); void ggml_gemv_iq4_nl_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); void ggml_gemm_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); @@ -96,6 +105,7
@@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo void ggml_gemm_q4_K_8x4_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); void ggml_gemm_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); void ggml_gemm_q2_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); +void ggml_gemm_q6_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); void ggml_gemm_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); void ggml_gemm_iq4_nl_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); @@ -110,6 +120,7 @@ void ggml_gemv_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, void ggml_gemv_q4_K_8x4_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); void ggml_gemv_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); void ggml_gemv_q2_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); +void ggml_gemv_q6_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); void ggml_gemv_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); void ggml_gemv_iq4_nl_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); void ggml_gemm_q4_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); @@ -118,6 +129,7 @@ void ggml_gemm_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, void ggml_gemm_q4_K_8x4_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); void ggml_gemm_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); void ggml_gemm_q2_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); +void ggml_gemm_q6_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); void ggml_gemm_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); void ggml_gemm_iq4_nl_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); From 4630b5187e0682d444d105197b9ab6f8dba444ad Mon Sep 17 00:00:00 2001 From: Manogna-Sree Date: Mon, 11 Aug 2025 02:47:20 -0700 Subject: [PATCH 02/23] Fix for inaccuracy of GEMM Q6K --- ggml/src/ggml-cpu/arch/x86/repack.cpp | 65 +++++++++++++-------------- 1 file changed, 32 
insertions(+), 33 deletions(-) diff --git a/ggml/src/ggml-cpu/arch/x86/repack.cpp b/ggml/src/ggml-cpu/arch/x86/repack.cpp index d4a29058ba..fb82584550 100644 --- a/ggml/src/ggml-cpu/arch/x86/repack.cpp +++ b/ggml/src/ggml-cpu/arch/x86/repack.cpp @@ -6900,7 +6900,6 @@ void ggml_gemm_q6_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo const __m256i rhs_mat_2367_31 = _mm256_or_si256(_mm256_and_si256(rhs_raw_mat_2367_7, m4), rhs_hbit_2367_31); const __m256i rhs_mat_2367_71 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_2367_7, 4), m4), rhs_hbit_2367_71); - // Shuffle pattern one - right side input const __m256i rhs_mat_0145_00_sp1 = _mm256_shuffle_epi32(rhs_mat_0145_00, 136); //B00(0-3) B01(0-3) B00(0-3) B01(0-3) B04(0-3) B05(0-3) B04(0-3) B05(0-3) const __m256i rhs_mat_2367_00_sp1 = _mm256_shuffle_epi32(rhs_mat_2367_00, 136); //B02(0-3) B03(0-3) B02(0-3) B03(0-3) B06(0-3) B07(0-3) B06(0-3) B07(0-3) @@ -7094,38 +7093,38 @@ void ggml_gemm_q6_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo __m256i lhs_mat_01_71 = _mm256_permute2f128_si256(lhs_mat_0123_71, lhs_mat_0123_71, 0); __m256i lhs_mat_23_71 = _mm256_permute2f128_si256(lhs_mat_0123_71, lhs_mat_0123_71, 17); - __m256i lhs_mat_s_01_00 = _mm256_maddubs_epi16(lhs_mat_01_00, m32s); - __m256i lhs_mat_s_23_00 = _mm256_maddubs_epi16(lhs_mat_23_00, m32s); - __m256i lhs_mat_s_01_01 = _mm256_maddubs_epi16(lhs_mat_01_01, m32s); - __m256i lhs_mat_s_23_01 = _mm256_maddubs_epi16(lhs_mat_23_01, m32s); - __m256i lhs_mat_s_01_10 = _mm256_maddubs_epi16(lhs_mat_01_10, m32s); - __m256i lhs_mat_s_23_10 = _mm256_maddubs_epi16(lhs_mat_23_10, m32s); - __m256i lhs_mat_s_01_11 = _mm256_maddubs_epi16(lhs_mat_01_11, m32s); - __m256i lhs_mat_s_23_11 = _mm256_maddubs_epi16(lhs_mat_23_11, m32s); - __m256i lhs_mat_s_01_20 = _mm256_maddubs_epi16(lhs_mat_01_20, m32s); - __m256i lhs_mat_s_23_20 = _mm256_maddubs_epi16(lhs_mat_23_20, m32s); - __m256i lhs_mat_s_01_21 = _mm256_maddubs_epi16(lhs_mat_01_21, m32s); - __m256i lhs_mat_s_23_21 = _mm256_maddubs_epi16(lhs_mat_23_21, m32s); - __m256i lhs_mat_s_01_30 = _mm256_maddubs_epi16(lhs_mat_01_30, m32s); - __m256i lhs_mat_s_23_30 = _mm256_maddubs_epi16(lhs_mat_23_30, m32s); - __m256i lhs_mat_s_01_31 = _mm256_maddubs_epi16(lhs_mat_01_31, m32s); - __m256i lhs_mat_s_23_31 = _mm256_maddubs_epi16(lhs_mat_23_31, m32s); - __m256i lhs_mat_s_01_40 = _mm256_maddubs_epi16(lhs_mat_01_40, m32s); - __m256i lhs_mat_s_23_40 = _mm256_maddubs_epi16(lhs_mat_23_40, m32s); - __m256i lhs_mat_s_01_41 = _mm256_maddubs_epi16(lhs_mat_01_41, m32s); - __m256i lhs_mat_s_23_41 = _mm256_maddubs_epi16(lhs_mat_23_41, m32s); - __m256i lhs_mat_s_01_50 = _mm256_maddubs_epi16(lhs_mat_01_50, m32s); - __m256i lhs_mat_s_23_50 = _mm256_maddubs_epi16(lhs_mat_23_50, m32s); - __m256i lhs_mat_s_01_51 = _mm256_maddubs_epi16(lhs_mat_01_51, m32s); - __m256i lhs_mat_s_23_51 = _mm256_maddubs_epi16(lhs_mat_23_51, m32s); - __m256i lhs_mat_s_01_60 = _mm256_maddubs_epi16(lhs_mat_01_60, m32s); - __m256i lhs_mat_s_23_60 = _mm256_maddubs_epi16(lhs_mat_23_60, m32s); - __m256i lhs_mat_s_01_61 = _mm256_maddubs_epi16(lhs_mat_01_61, m32s); - __m256i lhs_mat_s_23_61 = _mm256_maddubs_epi16(lhs_mat_23_61, m32s); - __m256i lhs_mat_s_01_70 = _mm256_maddubs_epi16(lhs_mat_01_70, m32s); - __m256i lhs_mat_s_23_70 = _mm256_maddubs_epi16(lhs_mat_23_70, m32s); - __m256i lhs_mat_s_01_71 = _mm256_maddubs_epi16(lhs_mat_01_71, m32s); - __m256i lhs_mat_s_23_71 = _mm256_maddubs_epi16(lhs_mat_23_71, m32s); + __m256i lhs_mat_s_01_00 = _mm256_maddubs_epi16(m32s, 
lhs_mat_01_00); + __m256i lhs_mat_s_23_00 = _mm256_maddubs_epi16(m32s, lhs_mat_23_00); + __m256i lhs_mat_s_01_01 = _mm256_maddubs_epi16(m32s, lhs_mat_01_01); + __m256i lhs_mat_s_23_01 = _mm256_maddubs_epi16(m32s, lhs_mat_23_01); + __m256i lhs_mat_s_01_10 = _mm256_maddubs_epi16(m32s, lhs_mat_01_10); + __m256i lhs_mat_s_23_10 = _mm256_maddubs_epi16(m32s, lhs_mat_23_10); + __m256i lhs_mat_s_01_11 = _mm256_maddubs_epi16(m32s, lhs_mat_01_11); + __m256i lhs_mat_s_23_11 = _mm256_maddubs_epi16(m32s, lhs_mat_23_11); + __m256i lhs_mat_s_01_20 = _mm256_maddubs_epi16(m32s, lhs_mat_01_20); + __m256i lhs_mat_s_23_20 = _mm256_maddubs_epi16(m32s, lhs_mat_23_20); + __m256i lhs_mat_s_01_21 = _mm256_maddubs_epi16(m32s, lhs_mat_01_21); + __m256i lhs_mat_s_23_21 = _mm256_maddubs_epi16(m32s, lhs_mat_23_21); + __m256i lhs_mat_s_01_30 = _mm256_maddubs_epi16(m32s, lhs_mat_01_30); + __m256i lhs_mat_s_23_30 = _mm256_maddubs_epi16(m32s, lhs_mat_23_30); + __m256i lhs_mat_s_01_31 = _mm256_maddubs_epi16(m32s, lhs_mat_01_31); + __m256i lhs_mat_s_23_31 = _mm256_maddubs_epi16(m32s, lhs_mat_23_31); + __m256i lhs_mat_s_01_40 = _mm256_maddubs_epi16(m32s, lhs_mat_01_40); + __m256i lhs_mat_s_23_40 = _mm256_maddubs_epi16(m32s, lhs_mat_23_40); + __m256i lhs_mat_s_01_41 = _mm256_maddubs_epi16(m32s, lhs_mat_01_41); + __m256i lhs_mat_s_23_41 = _mm256_maddubs_epi16(m32s, lhs_mat_23_41); + __m256i lhs_mat_s_01_50 = _mm256_maddubs_epi16(m32s, lhs_mat_01_50); + __m256i lhs_mat_s_23_50 = _mm256_maddubs_epi16(m32s, lhs_mat_23_50); + __m256i lhs_mat_s_01_51 = _mm256_maddubs_epi16(m32s, lhs_mat_01_51); + __m256i lhs_mat_s_23_51 = _mm256_maddubs_epi16(m32s, lhs_mat_23_51); + __m256i lhs_mat_s_01_60 = _mm256_maddubs_epi16(m32s, lhs_mat_01_60); + __m256i lhs_mat_s_23_60 = _mm256_maddubs_epi16(m32s, lhs_mat_23_60); + __m256i lhs_mat_s_01_61 = _mm256_maddubs_epi16(m32s, lhs_mat_01_61); + __m256i lhs_mat_s_23_61 = _mm256_maddubs_epi16(m32s, lhs_mat_23_61); + __m256i lhs_mat_s_01_70 = _mm256_maddubs_epi16(m32s, lhs_mat_01_70); + __m256i lhs_mat_s_23_70 = _mm256_maddubs_epi16(m32s, lhs_mat_23_70); + __m256i lhs_mat_s_01_71 = _mm256_maddubs_epi16(m32s, lhs_mat_01_71); + __m256i lhs_mat_s_23_71 = _mm256_maddubs_epi16(m32s, lhs_mat_23_71); // Shuffle pattern one - left side input const __m256i lhs_mat_01_00_sp1 = _mm256_shuffle_epi32(lhs_mat_01_00, 160); //A00(0-3) A00(0-3) A01(0-3) A01(0-3) A00(0-3) A00(0-3) A01(0-3) A01(0-3) From 5311e5217cd9cbfa30bee5dbbf2f4009665e237d Mon Sep 17 00:00:00 2001 From: Manogna-Sree Date: Mon, 11 Aug 2025 03:55:50 -0700 Subject: [PATCH 03/23] Initial implementation of GEMM Q6_K for edge handling case --- ggml/src/ggml-cpu/arch/x86/repack.cpp | 807 ++++++++++++++++++++++++++ 1 file changed, 807 insertions(+) diff --git a/ggml/src/ggml-cpu/arch/x86/repack.cpp b/ggml/src/ggml-cpu/arch/x86/repack.cpp index fb82584550..ba16c3c0b8 100644 --- a/ggml/src/ggml-cpu/arch/x86/repack.cpp +++ b/ggml/src/ggml-cpu/arch/x86/repack.cpp @@ -7532,6 +7532,813 @@ void ggml_gemm_q6_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo } } + for (; y < nr / 4; y ++) { + const block_q8_Kx4 * a_ptr = a_ptr_start + (y * nb); + + // Take group of eight block_q6_kx8 structures at each pass of the loop and perform dot product operation + for (int64_t x = xstart; x < nc / 8; x++) { + const block_q6_Kx8 * b_ptr = b_ptr_start + (x * b_nb); + + // Master FP accumulators + __m256 acc_rows[4]; + for (int i = 0; i < 4; i++) { + acc_rows[i] = _mm256_setzero_ps(); + } + + for (int64_t b = 0; b < nb; b++) { + // Delta values - Load the eight scale 
values of block_q6_kx8 + const __m256 col_scale_f32 = GGML_F32Cx8_LOAD(b_ptr[b].d); + + // Loop to iterate over the sixteen sub blocks of a super block - eight sub blocks are processed per iteration + for (int sb = 0; sb < QK_K / 128; sb++) { + + const __m256i rhs_raw_mat_0123_0 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].ql + sb * 512)); + const __m256i rhs_raw_mat_4567_0 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].ql + 32 + sb * 512)); + const __m256i rhs_raw_mat_0123_1 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].ql + 64 + sb * 512)); + const __m256i rhs_raw_mat_4567_1 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].ql + 96 + sb * 512)); + const __m256i rhs_raw_mat_0123_2 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].ql + 128 + sb * 512)); + const __m256i rhs_raw_mat_4567_2 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].ql + 160 + sb * 512)); + const __m256i rhs_raw_mat_0123_3 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].ql + 192 + sb * 512)); + const __m256i rhs_raw_mat_4567_3 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].ql + 224 + sb * 512)); + + const __m256i rhs_raw_mat_0123_4 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].ql + 256 + sb * 512)); + const __m256i rhs_raw_mat_4567_4 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].ql + 288 + sb * 512)); + const __m256i rhs_raw_mat_0123_5 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].ql + 320 + sb * 512)); + const __m256i rhs_raw_mat_4567_5 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].ql + 352 + sb * 512)); + const __m256i rhs_raw_mat_0123_6 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].ql + 384 + sb * 512)); + const __m256i rhs_raw_mat_4567_6 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].ql + 416 + sb * 512)); + const __m256i rhs_raw_mat_0123_7 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].ql + 448 + sb * 512)); + const __m256i rhs_raw_mat_4567_7 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].ql + 480 + sb * 512)); + + const __m256i rhs_raw_hbit_0123_0 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qh + sb * 256)); + const __m256i rhs_raw_hbit_4567_0 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qh + 32 + sb * 256)); + const __m256i rhs_raw_hbit_0123_1 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qh + 64 + sb * 256)); + const __m256i rhs_raw_hbit_4567_1 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qh + 96 + sb * 256)); + const __m256i rhs_raw_hbit_0123_2 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qh + 128 + sb * 256)); + const __m256i rhs_raw_hbit_4567_2 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qh + 160 + sb * 256)); + const __m256i rhs_raw_hbit_0123_3 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qh + 192 + sb * 256)); + const __m256i rhs_raw_hbit_4567_3 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qh + 224 + sb * 256)); + + // Indices 0 through 7 (first block): + const __m256i rhs_raw_mat_0145_0 = _mm256_blend_epi32(rhs_raw_mat_0123_0, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_0, requiredOrder), 240); + const __m256i rhs_raw_mat_2367_0 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_0, requiredOrder), rhs_raw_mat_4567_0, 240); + const __m256i rhs_raw_mat_0145_1 = _mm256_blend_epi32(rhs_raw_mat_0123_1, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_1, requiredOrder), 240); + const __m256i rhs_raw_mat_2367_1 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_1, requiredOrder), rhs_raw_mat_4567_1, 240); + const __m256i rhs_raw_mat_0145_2 = _mm256_blend_epi32(rhs_raw_mat_0123_2, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_2, requiredOrder), 240); 
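+                    // Note (descriptive comment, assumptions hedged): the blend/permute pairs above regroup the
+                    // eight interleaved rows so that each 256-bit register holds rows {0,1,4,5} or {2,3,6,7}.
+                    // Below, the low 4 bits from ql are OR-ed with the high 2 bits from qh (shifted up into
+                    // bits 4-5) to rebuild the 6-bit quants as unsigned values in [0,63]; the usual Q6_K offset
+                    // of -32 is not applied at this point and appears to be accounted for separately through
+                    // the m32s maddubs terms computed from the Q8_K activations further down.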
+ const __m256i rhs_raw_mat_2367_2 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_2, requiredOrder), rhs_raw_mat_4567_2, 240); + const __m256i rhs_raw_mat_0145_3 = _mm256_blend_epi32(rhs_raw_mat_0123_3, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_3, requiredOrder), 240); + const __m256i rhs_raw_mat_2367_3 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_3, requiredOrder), rhs_raw_mat_4567_3, 240); + + // Indices 4 through 7 (second block): + const __m256i rhs_raw_mat_0145_4 = _mm256_blend_epi32(rhs_raw_mat_0123_4, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_4, requiredOrder), 240); + const __m256i rhs_raw_mat_2367_4 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_4, requiredOrder), rhs_raw_mat_4567_4, 240); + const __m256i rhs_raw_mat_0145_5 = _mm256_blend_epi32(rhs_raw_mat_0123_5, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_5, requiredOrder), 240); + const __m256i rhs_raw_mat_2367_5 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_5, requiredOrder), rhs_raw_mat_4567_5, 240); + const __m256i rhs_raw_mat_0145_6 = _mm256_blend_epi32(rhs_raw_mat_0123_6, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_6, requiredOrder), 240); + const __m256i rhs_raw_mat_2367_6 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_6, requiredOrder), rhs_raw_mat_4567_6, 240); + const __m256i rhs_raw_mat_0145_7 = _mm256_blend_epi32(rhs_raw_mat_0123_7, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_7, requiredOrder), 240); + const __m256i rhs_raw_mat_2367_7 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_7, requiredOrder), rhs_raw_mat_4567_7, 240); + + const __m256i rhs_raw_hbit_0145_0 = _mm256_blend_epi32(rhs_raw_hbit_0123_0, _mm256_permutevar8x32_epi32(rhs_raw_hbit_4567_0, requiredOrder), 240); + const __m256i rhs_raw_hbit_2367_0 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_hbit_0123_0, requiredOrder), rhs_raw_hbit_4567_0, 240); + const __m256i rhs_raw_hbit_0145_1 = _mm256_blend_epi32(rhs_raw_hbit_0123_1, _mm256_permutevar8x32_epi32(rhs_raw_hbit_4567_1, requiredOrder), 240); + const __m256i rhs_raw_hbit_2367_1 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_hbit_0123_1, requiredOrder), rhs_raw_hbit_4567_1, 240); + const __m256i rhs_raw_hbit_0145_2 = _mm256_blend_epi32(rhs_raw_hbit_0123_2, _mm256_permutevar8x32_epi32(rhs_raw_hbit_4567_2, requiredOrder), 240); + const __m256i rhs_raw_hbit_2367_2 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_hbit_0123_2, requiredOrder), rhs_raw_hbit_4567_2, 240); + const __m256i rhs_raw_hbit_0145_3 = _mm256_blend_epi32(rhs_raw_hbit_0123_3, _mm256_permutevar8x32_epi32(rhs_raw_hbit_4567_3, requiredOrder), 240); + const __m256i rhs_raw_hbit_2367_3 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_hbit_0123_3, requiredOrder), rhs_raw_hbit_4567_3, 240); + + // 2-bit -> 8-bit + // Values of the 0th,2nd,4th,6th sub blocks of eight block_q6_K structures for the sb loop + const __m256i rhs_hbit_0145_00 = _mm256_slli_epi16(_mm256_and_si256(rhs_raw_hbit_0145_0, m2), 4); //B00(0-7) B01(0-7) B04(0-7) B05(0-7) + const __m256i rhs_hbit_0145_20 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0145_0, 2), m2), 4); //B20(0-7) B21(0-7) B24(0-7) B25(0-7) + const __m256i rhs_hbit_0145_40 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0145_0, 4), m2), 4); //B40(0-7) B41(0-7) B44(0-7) B45(0-7) + const __m256i rhs_hbit_0145_60 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0145_0, 6), m2), 4); //B60(0-7) 
B61(0-7) B64(0-7) B65(0-7) + + const __m256i rhs_hbit_2367_00 = _mm256_slli_epi16(_mm256_and_si256(rhs_raw_hbit_2367_0, m2), 4); //B02(0-7) B03(0-7) B06(0-7) B07(0-7) + const __m256i rhs_hbit_2367_20 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_2367_0, 2), m2), 4); //B22(0-7) B23(0-7) B26(0-7) B27(0-7) + const __m256i rhs_hbit_2367_40 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_2367_0, 4), m2), 4); //B42(0-7) B43(0-7) B46(0-7) B47(0-7) + const __m256i rhs_hbit_2367_60 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_2367_0, 6), m2), 4); //B62(0-7) B63(0-7) B66(0-7) B67(0-7) + + const __m256i rhs_hbit_0145_01 = _mm256_slli_epi16(_mm256_and_si256(rhs_raw_hbit_0145_1, m2), 4); //B00(8-15) B01(8-15) B04(8-15) B05(8-15) + const __m256i rhs_hbit_0145_21 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0145_1, 2), m2), 4); //B20(8-15) B21(8-15) B24(8-15) B25(8-15) + const __m256i rhs_hbit_0145_41 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0145_1, 4), m2), 4); //B40(8-15) B41(8-15) B44(8-15) B45(8-15) + const __m256i rhs_hbit_0145_61 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0145_1, 6), m2), 4); //B60(8-15) B61(8-15) B64(8-15) B65(8-15) + + const __m256i rhs_hbit_2367_01 = _mm256_slli_epi16(_mm256_and_si256(rhs_raw_hbit_2367_1, m2), 4); //B02(8-15) B03(8-15) B06(8-15) B07(8-15) + const __m256i rhs_hbit_2367_21 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_2367_1, 2), m2), 4); //B22(8-15) B23(8-15) B26(8-15) B27(8-15) + const __m256i rhs_hbit_2367_41 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_2367_1, 4), m2), 4); //B42(8-15) B43(8-15) B46(8-15) B47(8-15) + const __m256i rhs_hbit_2367_61 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_2367_1, 6), m2), 4); //B62(8-15) B63(8-15) B66(8-15) B67(8-15) + + // Values of the 1st,3rd,5th,7th sub blocks of eight block_q6_K structures for the sb loop + const __m256i rhs_hbit_0145_10 = _mm256_slli_epi16(_mm256_and_si256(rhs_raw_hbit_0145_2, m2), 4); //B10(0-7) B11(0-7) B14(0-7) B15(0-7) + const __m256i rhs_hbit_0145_30 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0145_2, 2), m2), 4); //B30(0-7) B31(0-7) B34(0-7) B35(0-7) + const __m256i rhs_hbit_0145_50 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0145_2, 4), m2), 4); //B50(0-7) B51(0-7) B54(0-7) B55(0-7) + const __m256i rhs_hbit_0145_70 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0145_2, 6), m2), 4); //B70(0-7) B71(0-7) B74(0-7) B75(0-7) + + const __m256i rhs_hbit_2367_10 = _mm256_slli_epi16(_mm256_and_si256(rhs_raw_hbit_2367_2, m2), 4); //B12(0-7) B13(0-7) B16(0-7) B17(0-7) + const __m256i rhs_hbit_2367_30 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_2367_2, 2), m2), 4); //B32(0-7) B33(0-7) B36(0-7) B37(0-7) + const __m256i rhs_hbit_2367_50 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_2367_2, 4), m2), 4); //B52(0-7) B53(0-7) B56(0-7) B57(0-7) + const __m256i rhs_hbit_2367_70 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_2367_2, 6), m2), 4); //B72(0-7) B73(0-7) B76(0-7) B77(0-7) + + const __m256i rhs_hbit_0145_11 = _mm256_slli_epi16(_mm256_and_si256(rhs_raw_hbit_0145_3, m2), 4); //B10(8-15) B11(8-15) B14(8-15) B15(8-15) + const __m256i rhs_hbit_0145_31 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0145_3, 2), m2), 4); //B30(8-15) B31(8-15) B34(8-15) 
B35(8-15) + const __m256i rhs_hbit_0145_51 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0145_3, 4), m2), 4); //B50(8-15) B51(8-15) B54(8-15) B55(8-15) + const __m256i rhs_hbit_0145_71 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0145_3, 6), m2), 4); //B70(8-15) B71(8-15) B74(8-15) B75(8-15) + + const __m256i rhs_hbit_2367_11 = _mm256_slli_epi16(_mm256_and_si256(rhs_raw_hbit_2367_3, m2), 4); //B12(8-15) B13(8-15) B16(8-15) B17(8-15) + const __m256i rhs_hbit_2367_31 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_2367_3, 2), m2), 4); //B32(8-15) B33(8-15) B36(8-15) B37(8-15) + const __m256i rhs_hbit_2367_51 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_2367_3, 4), m2), 4); //B52(8-15) B53(8-15) B56(8-15) B57(8-15) + const __m256i rhs_hbit_2367_71 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_2367_3, 6), m2), 4); //B72(8-15) B73(8-15) B76(8-15) B77(8-15) + + // 0 -7, 64 - 71 + const __m256i rhs_mat_0145_00 = _mm256_or_si256(_mm256_and_si256(rhs_raw_mat_0145_0, m4), rhs_hbit_0145_00); + const __m256i rhs_mat_0145_40 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_0145_0, 4), m4), rhs_hbit_0145_40); + + const __m256i rhs_mat_2367_00 = _mm256_or_si256(_mm256_and_si256(rhs_raw_mat_2367_0, m4), rhs_hbit_2367_00); + const __m256i rhs_mat_2367_40 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_2367_0, 4), m4), rhs_hbit_2367_40); + + // 8 - 15, 72 - 79 + const __m256i rhs_mat_0145_01 = _mm256_or_si256(_mm256_and_si256(rhs_raw_mat_0145_1, m4), rhs_hbit_0145_01); + const __m256i rhs_mat_0145_41 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_0145_1, 4), m4), rhs_hbit_0145_41); + + const __m256i rhs_mat_2367_01 = _mm256_or_si256(_mm256_and_si256(rhs_raw_mat_2367_1, m4), rhs_hbit_2367_01); + const __m256i rhs_mat_2367_41 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_2367_1, 4), m4), rhs_hbit_2367_41); + + // 16 - 23, 80 - 87 + const __m256i rhs_mat_0145_10 = _mm256_or_si256(_mm256_and_si256(rhs_raw_mat_0145_2, m4), rhs_hbit_0145_10); + const __m256i rhs_mat_0145_50 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_0145_2, 4), m4), rhs_hbit_0145_50); + + const __m256i rhs_mat_2367_10 = _mm256_or_si256(_mm256_and_si256(rhs_raw_mat_2367_2, m4), rhs_hbit_2367_10); + const __m256i rhs_mat_2367_50 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_2367_2, 4), m4), rhs_hbit_2367_50); + + // 24 - 31, 88 - 95 + const __m256i rhs_mat_0145_11 = _mm256_or_si256(_mm256_and_si256(rhs_raw_mat_0145_3, m4), rhs_hbit_0145_11); + const __m256i rhs_mat_0145_51 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_0145_3, 4), m4), rhs_hbit_0145_51); + + const __m256i rhs_mat_2367_11 = _mm256_or_si256(_mm256_and_si256(rhs_raw_mat_2367_3, m4), rhs_hbit_2367_11); + const __m256i rhs_mat_2367_51 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_2367_3, 4), m4), rhs_hbit_2367_51); + + // 32 - 39, 96 - 103 + const __m256i rhs_mat_0145_20 = _mm256_or_si256(_mm256_and_si256(rhs_raw_mat_0145_4, m4), rhs_hbit_0145_20); + const __m256i rhs_mat_0145_60 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_0145_4, 4), m4), rhs_hbit_0145_60); + + const __m256i rhs_mat_2367_20 = _mm256_or_si256(_mm256_and_si256(rhs_raw_mat_2367_4, m4), rhs_hbit_2367_20); + const __m256i rhs_mat_2367_60 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_2367_4, 4), m4), rhs_hbit_2367_60); + + // 40 - 
47, 104 - 111 + const __m256i rhs_mat_0145_21 = _mm256_or_si256(_mm256_and_si256(rhs_raw_mat_0145_5, m4), rhs_hbit_0145_21); + const __m256i rhs_mat_0145_61 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_0145_5, 4), m4), rhs_hbit_0145_61); + + const __m256i rhs_mat_2367_21 = _mm256_or_si256(_mm256_and_si256(rhs_raw_mat_2367_5, m4), rhs_hbit_2367_21); + const __m256i rhs_mat_2367_61 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_2367_5, 4), m4), rhs_hbit_2367_61); + + // 48 - 55, 112 - 119 + const __m256i rhs_mat_0145_30 = _mm256_or_si256(_mm256_and_si256(rhs_raw_mat_0145_6, m4), rhs_hbit_0145_30); + const __m256i rhs_mat_0145_70 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_0145_6, 4), m4), rhs_hbit_0145_70); + + const __m256i rhs_mat_2367_30 = _mm256_or_si256(_mm256_and_si256(rhs_raw_mat_2367_6, m4), rhs_hbit_2367_30); + const __m256i rhs_mat_2367_70 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_2367_6, 4), m4), rhs_hbit_2367_70); + + // 56 - 63, 120 - 127 + const __m256i rhs_mat_0145_31 = _mm256_or_si256(_mm256_and_si256(rhs_raw_mat_0145_7, m4), rhs_hbit_0145_31); + const __m256i rhs_mat_0145_71 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_0145_7, 4), m4), rhs_hbit_0145_71); + + const __m256i rhs_mat_2367_31 = _mm256_or_si256(_mm256_and_si256(rhs_raw_mat_2367_7, m4), rhs_hbit_2367_31); + const __m256i rhs_mat_2367_71 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_2367_7, 4), m4), rhs_hbit_2367_71); + + // Shuffle pattern one - right side input + const __m256i rhs_mat_0145_00_sp1 = _mm256_shuffle_epi32(rhs_mat_0145_00, 136); //B00(0-3) B01(0-3) B00(0-3) B01(0-3) B04(0-3) B05(0-3) B04(0-3) B05(0-3) + const __m256i rhs_mat_2367_00_sp1 = _mm256_shuffle_epi32(rhs_mat_2367_00, 136); //B02(0-3) B03(0-3) B02(0-3) B03(0-3) B06(0-3) B07(0-3) B06(0-3) B07(0-3) + + const __m256i rhs_mat_0145_01_sp1 = _mm256_shuffle_epi32(rhs_mat_0145_01, 136); //B00(8-11) B01(8-11) B00(8-11) B01(8-11) B04(8-11) B05(8-11) B04(8-11) B05(8-11) + const __m256i rhs_mat_2367_01_sp1 = _mm256_shuffle_epi32(rhs_mat_2367_01, 136); //B02(8-11) B03(8-11) B02(8-11) B03(8-11) B06(8-11) B07(8-11) B06(8-11) B07(8-11) + + const __m256i rhs_mat_0145_10_sp1 = _mm256_shuffle_epi32(rhs_mat_0145_10, 136); //B10(0-3) B11(0-3) B10(0-3) B11(0-3) B14(0-3) B15(0-3) B14(0-3) B15(0-3) + const __m256i rhs_mat_2367_10_sp1 = _mm256_shuffle_epi32(rhs_mat_2367_10, 136); //B12(0-3) B13(0-3) B12(0-3) B13(0-3) B16(0-3) B17(0-3) B16(0-3) B17(0-3) + + const __m256i rhs_mat_0145_11_sp1 = _mm256_shuffle_epi32(rhs_mat_0145_11, 136); //B10(8-11) B11(8-11) B10(8-11) B11(8-11) B14(8-11) B15(8-11) B14(8-11) B15(8-11) + const __m256i rhs_mat_2367_11_sp1 = _mm256_shuffle_epi32(rhs_mat_2367_11, 136); //B12(8-11) B13(8-11) B12(8-11) B13(8-11) B16(8-11) B17(8-11) B16(8-11) B17(8-11) + + const __m256i rhs_mat_0145_20_sp1 = _mm256_shuffle_epi32(rhs_mat_0145_20, 136); //B20(0-3) B21(0-3) B20(0-3) B21(0-3) B24(0-3) B25(0-3) B24(0-3) B25(0-3) + const __m256i rhs_mat_2367_20_sp1 = _mm256_shuffle_epi32(rhs_mat_2367_20, 136); //B22(0-3) B23(0-3) B22(0-3) B23(0-3) B26(0-3) B27(0-3) B26(0-3) B27(0-3) + + const __m256i rhs_mat_0145_21_sp1 = _mm256_shuffle_epi32(rhs_mat_0145_21, 136); //B20(8-11) B21(8-11) B20(8-11) B21(8-11) B24(8-11) B25(8-11) B24(8-11) B25(8-11) + const __m256i rhs_mat_2367_21_sp1 = _mm256_shuffle_epi32(rhs_mat_2367_21, 136); //B22(8-11) B23(8-11) B22(8-11) B23(8-11) B26(8-11) B27(8-11) B26(8-11) B27(8-11) + + const __m256i rhs_mat_0145_30_sp1 = 
_mm256_shuffle_epi32(rhs_mat_0145_30, 136); //B30(0-3) B31(0-3) B30(0-3) B31(0-3) B34(0-3) B35(0-3) B34(0-3) B35(0-3) + const __m256i rhs_mat_2367_30_sp1 = _mm256_shuffle_epi32(rhs_mat_2367_30, 136); //B32(0-3) B33(0-3) B32(0-3) B33(0-3) B36(0-3) B37(0-3) B36(0-3) B37(0-3) + + const __m256i rhs_mat_0145_31_sp1 = _mm256_shuffle_epi32(rhs_mat_0145_31, 136); //B30(8-11) B31(8-11) B30(8-11) B31(8-11) B34(8-11) B35(8-11) B34(8-11) B35(8-11 + const __m256i rhs_mat_2367_31_sp1 = _mm256_shuffle_epi32(rhs_mat_2367_31, 136); //B32(8-11) B33(8-11) B32(8-11) B33(8-11) B36(8-11) B37(8-11) B36(8-11) B37(8-11) + + const __m256i rhs_mat_0145_40_sp1 = _mm256_shuffle_epi32(rhs_mat_0145_40, 136); //B40(0-3) B41(0-3) B40(0-3) B41(0-3) B44(0-3) B45(0-3) B44(0-3) B45(0-3) + const __m256i rhs_mat_2367_40_sp1 = _mm256_shuffle_epi32(rhs_mat_2367_40, 136); //B42(0-3) B43(0-3) B42(0-3) B43(0-3) B46(0-3) B47(0-3) B46(0-3) B47(0-3) + + const __m256i rhs_mat_0145_41_sp1 = _mm256_shuffle_epi32(rhs_mat_0145_41, 136); //B40(8-11) B41(8-11) B40(8-11) B41(8-11) B44(8-11) B45(8-11) B44(8-11) B45(8-11) + const __m256i rhs_mat_2367_41_sp1 = _mm256_shuffle_epi32(rhs_mat_2367_41, 136); //B42(8-11) B43(8-11) B42(8-11) B43(8-11) B46(8-11) B47(8-11) B46(8-11) B47(8-11) + + const __m256i rhs_mat_0145_50_sp1 = _mm256_shuffle_epi32(rhs_mat_0145_50, 136); //B50(0-3) B51(0-3) B50(0-3) B51(0-3) B54(0-3) B55(0-3) B54(0-3) B55(0-3) + const __m256i rhs_mat_2367_50_sp1 = _mm256_shuffle_epi32(rhs_mat_2367_50, 136); //B52(0-3) B53(0-3) B52(0-3) B53(0-3) B56(0-3) B57(0-3) B56(0-3) B57(0-3) + + const __m256i rhs_mat_0145_51_sp1 = _mm256_shuffle_epi32(rhs_mat_0145_51, 136); //B50(8-11) B51(8-11) B50(8-11) B51(8-11) B54(8-11) B55(8-11) B54(8-11) B55(8-11) + const __m256i rhs_mat_2367_51_sp1 = _mm256_shuffle_epi32(rhs_mat_2367_51, 136); //B52(8-11) B53(8-11) B52(8-11) B53(8-11) B56(8-11) B57(8-11) B56(8-11) B57(8-11) + + const __m256i rhs_mat_0145_60_sp1 = _mm256_shuffle_epi32(rhs_mat_0145_60, 136); //B60(0-3) B61(0-3) B60(0-3) B61(0-3) B64(0-3) B65(0-3) B64(0-3) B65(0-3) + const __m256i rhs_mat_2367_60_sp1 = _mm256_shuffle_epi32(rhs_mat_2367_60, 136); //B62(0-3) B63(0-3) B62(0-3) B63(0-3) B66(0-3) B67(0-3) B66(0-3) B67(0-3) + + const __m256i rhs_mat_0145_61_sp1 = _mm256_shuffle_epi32(rhs_mat_0145_61, 136); //B60(8-11) B61(8-11) B60(8-11) B61(8-11) B64(8-11) B65(8-11) B64(8-11) B65(8-11) + const __m256i rhs_mat_2367_61_sp1 = _mm256_shuffle_epi32(rhs_mat_2367_61, 136); //B62(8-11) B63(8-11) B62(8-11) B63(8-11) B66(8-11) B67(8-11) B66(8-11) B67(8-11) + + const __m256i rhs_mat_0145_70_sp1 = _mm256_shuffle_epi32(rhs_mat_0145_70, 136); //B70(0-3) B71(0-3) B70(0-3) B71(0-3) B74(0-3) B75(0-3) B74(0-3) B75(0-3) + const __m256i rhs_mat_2367_70_sp1 = _mm256_shuffle_epi32(rhs_mat_2367_70, 136); //B72(0-3) B73(0-3) B72(0-3) B73(0-3) B76(0-3) B77(0-3) B76(0-3) B77(0-3) + + const __m256i rhs_mat_0145_71_sp1 = _mm256_shuffle_epi32(rhs_mat_0145_71, 136); //B70(8-11) B71(8-11) B70(8-11) B71(8-11) B74(8-11) B75(8-11) B74(8-11) B75(8-11) + const __m256i rhs_mat_2367_71_sp1 = _mm256_shuffle_epi32(rhs_mat_2367_71, 136); //B72(8-11) B73(8-11) B72(8-11) B73(8-11) B76(8-11) B77(8-11) B76(8-11) B77(8-11) + + + // Shuffle pattern two - right side input + const __m256i rhs_mat_0145_00_sp2 = _mm256_shuffle_epi32(rhs_mat_0145_00, 221); //B00(4-7) B01(4-7) B00(4-7) B01(4-7) B04(4-7) B05(4-7) B04(4-7) B05(4-7) + const __m256i rhs_mat_2367_00_sp2 = _mm256_shuffle_epi32(rhs_mat_2367_00, 221); //B02(4-7) B03(4-7) B02(4-7) B03(4-7) B06(4-7) B07(4-7) B06(4-7) B07(4-7) + + const 
__m256i rhs_mat_0145_01_sp2 = _mm256_shuffle_epi32(rhs_mat_0145_01, 221); //B00(12-15) B01(12-15) B00(12-15) B01(12-15) B04(12-15) B05(12-15) B04(12-15) B05(12-15) + const __m256i rhs_mat_2367_01_sp2 = _mm256_shuffle_epi32(rhs_mat_2367_01, 221); //B02(12-15) B03(12-15) B02(12-15) B03(12-15) B06(12-15) B07(12-15) B06(12-15) B07(12-15) + + const __m256i rhs_mat_0145_10_sp2 = _mm256_shuffle_epi32(rhs_mat_0145_10, 221); //B10(4-7) B11(4-7) B10(4-7) B11(4-7) B14(4-7) B15(4-7) B14(4-7) B15(4-7) + const __m256i rhs_mat_2367_10_sp2 = _mm256_shuffle_epi32(rhs_mat_2367_10, 221); //B12(4-7) B13(4-7) B12(4-7) B13(4-7) B16(4-7) B17(4-7) B16(4-7) B17(4-7) + + const __m256i rhs_mat_0145_11_sp2 = _mm256_shuffle_epi32(rhs_mat_0145_11, 221); //B10(12-15) B11(12-15) B10(12-15) B11(12-15) B14(12-15) B15(12-15) B14(12-15) B15(12-15) + const __m256i rhs_mat_2367_11_sp2 = _mm256_shuffle_epi32(rhs_mat_2367_11, 221); //B12(12-15) B13(12-15) B12(12-15) B13(12-15) B16(12-15) B17(12-15) B16(12-15) B17(12-15) + + const __m256i rhs_mat_0145_20_sp2 = _mm256_shuffle_epi32(rhs_mat_0145_20, 221); //B20(4-7) B21(4-7) B20(4-7) B21(4-7) B24(4-7) B25(4-7) B24(4-7) B25(4-7) + const __m256i rhs_mat_2367_20_sp2 = _mm256_shuffle_epi32(rhs_mat_2367_20, 221); //B22(4-7) B23(4-7) B22(4-7) B23(4-7) B26(4-7) B27(4-7) B26(4-7) B27(4-7) + + const __m256i rhs_mat_0145_21_sp2 = _mm256_shuffle_epi32(rhs_mat_0145_21, 221); //B20(12-15) B21(12-15) B20(12-15) B21(12-15) B24(12-15) B25(12-15) B24(12-15) B25(12-15) + const __m256i rhs_mat_2367_21_sp2 = _mm256_shuffle_epi32(rhs_mat_2367_21, 221); //B22(12-15) B23(12-15) B22(12-15) B23(12-15) B26(12-15) B27(12-15) B26(12-15) B27(12-15) + + const __m256i rhs_mat_0145_30_sp2 = _mm256_shuffle_epi32(rhs_mat_0145_30, 221); //B30(4-7) B31(4-7) B30(4-7) B31(4-7) B34(4-7) B35(4-7) B34(4-7) B35(4-7) + const __m256i rhs_mat_2367_30_sp2 = _mm256_shuffle_epi32(rhs_mat_2367_30, 221); //B32(4-7) B33(4-7) B32(4-7) B33(4-7) B36(4-7) B37(4-7) B36(4-7) B37(4-7) + + const __m256i rhs_mat_0145_31_sp2 = _mm256_shuffle_epi32(rhs_mat_0145_31, 221); //B30(12-15) B31(12-15) B30(12-15) B31(12-15) B34(12-15) B35(12-15) B34(12-15) B35(12-15) + const __m256i rhs_mat_2367_31_sp2 = _mm256_shuffle_epi32(rhs_mat_2367_31, 221); //B32(12-15) B33(12-15) B32(12-15) B33(12-15) B36(12-15) B37(12-15) B36(12-15) B37(12-15) + + const __m256i rhs_mat_0145_40_sp2 = _mm256_shuffle_epi32(rhs_mat_0145_40, 221); //B40(4-7) B41(4-7) B40(4-7) B41(4-7) B44(4-7) B45(4-7) B44(4-7) B45(4-7) + const __m256i rhs_mat_2367_40_sp2 = _mm256_shuffle_epi32(rhs_mat_2367_40, 221); //B42(4-7) B43(4-7) B42(4-7) B43(4-7) B46(4-7) B47(4-7) B46(4-7) B47(4-7) + + const __m256i rhs_mat_0145_41_sp2 = _mm256_shuffle_epi32(rhs_mat_0145_41, 221); //B40(12-15) B41(12-15) B40(12-15) B41(12-15) B44(12-15) B45(12-15) B44(12-15) B45(12-15) + const __m256i rhs_mat_2367_41_sp2 = _mm256_shuffle_epi32(rhs_mat_2367_41, 221); //B42(12-15) B43(12-15) B42(12-15) B43(12-15) B46(12-15) B47(12-15) B46(12-15) B47(12-15) + + const __m256i rhs_mat_0145_50_sp2 = _mm256_shuffle_epi32(rhs_mat_0145_50, 221); //B50(4-7) B51(4-7) B50(4-7) B51(4-7) B54(4-7) B55(4-7) B54(4-7) B55(4-7) + const __m256i rhs_mat_2367_50_sp2 = _mm256_shuffle_epi32(rhs_mat_2367_50, 221); //B52(4-7) B53(4-7) B52(4-7) B53(4-7) B56(4-7) B57(4-7) B56(4-7) B57(4-7) + + const __m256i rhs_mat_0145_51_sp2 = _mm256_shuffle_epi32(rhs_mat_0145_51, 221); //B50(12-15) B51(12-15) B50(12-15) B51(12-15) B54(12-15) B55(12-15) B54(12-15) B55(12-15) + const __m256i rhs_mat_2367_51_sp2 = _mm256_shuffle_epi32(rhs_mat_2367_51, 221); 
//B52(12-15) B53(12-15) B52(12-15) B53(12-15) B56(12-15) B57(12-15) B56(12-15) B57(12-15) + + const __m256i rhs_mat_0145_60_sp2 = _mm256_shuffle_epi32(rhs_mat_0145_60, 221); //B60(4-7) B61(4-7) B60(4-7) B61(4-7) B64(4-7) B65(4-7) B64(4-7) B65(4-7) + const __m256i rhs_mat_2367_60_sp2 = _mm256_shuffle_epi32(rhs_mat_2367_60, 221); //B62(4-7) B63(4-7) B62(4-7) B63(4-7) B66(4-7) B67(4-7) B66(4-7) B67(4-7) + + const __m256i rhs_mat_0145_61_sp2 = _mm256_shuffle_epi32(rhs_mat_0145_61, 221); //B60(12-15) B61(12-15) B60(12-15) B61(12-15) B64(12-15) B65(12-15) B64(12-15) B65(12-15) + const __m256i rhs_mat_2367_61_sp2 = _mm256_shuffle_epi32(rhs_mat_2367_61, 221); //B62(12-15) B63(12-15) B62(12-15) B63(12-15) B66(12-15) B67(12-15) B66(12-15) B67(12-15) + + const __m256i rhs_mat_0145_70_sp2 = _mm256_shuffle_epi32(rhs_mat_0145_70, 221); //B70(4-7) B71(4-7) B70(4-7) B71(4-7) B74(4-7) B75(4-7) B74(4-7) B75(4-7) + const __m256i rhs_mat_2367_70_sp2 = _mm256_shuffle_epi32(rhs_mat_2367_70, 221); //B72(4-7) B73(4-7) B72(4-7) B73(4-7) B76(4-7) B77(4-7) B76(4-7) B77(4-7) + + const __m256i rhs_mat_0145_71_sp2 = _mm256_shuffle_epi32(rhs_mat_0145_71, 221); //B70(12-15) B71(12-15) B70(12-15) B71(12-15) B74(12-15) B75(12-15) B74(12-15) B75(12-15) + const __m256i rhs_mat_2367_71_sp2 = _mm256_shuffle_epi32(rhs_mat_2367_71, 221); //B72(12-15) B73(12-15) B72(12-15) B73(12-15) B76(12-15) B77(12-15) B76(12-15) B77(12-15) + + //Scales of corresponding sub blocks from different Q6_K structures are stored together + //s00 s01 s10 s11 s20 s21 ...... s70 s71 + // Combine mins and scales for sub-blocks: 0-1, 2-3, 4-5, 6-7 in the sb loop + const __m128i scales_01 = _mm_loadu_si128((const __m128i *)(b_ptr[b].scales + sb * 64)); + const __m128i scales_23 = _mm_loadu_si128((const __m128i *)(b_ptr[b].scales + 16 + sb * 64)); + const __m128i scales_45 = _mm_loadu_si128((const __m128i *)(b_ptr[b].scales + 32 + sb * 64)); + const __m128i scales_67 = _mm_loadu_si128((const __m128i *)(b_ptr[b].scales + 48 + sb * 64)); + + const __m256i scales_0 = _mm256_cvtepi8_epi16(_mm_shuffle_epi8(scales_01, scalesmask1_sse)); + const __m256i scales_1 = _mm256_cvtepi8_epi16(_mm_shuffle_epi8(scales_01, scalesmask2_sse)); + const __m256i scales_2 = _mm256_cvtepi8_epi16(_mm_shuffle_epi8(scales_23, scalesmask1_sse)); + const __m256i scales_3 = _mm256_cvtepi8_epi16(_mm_shuffle_epi8(scales_23, scalesmask2_sse)); + const __m256i scales_4 = _mm256_cvtepi8_epi16(_mm_shuffle_epi8(scales_45, scalesmask1_sse)); + const __m256i scales_5 = _mm256_cvtepi8_epi16(_mm_shuffle_epi8(scales_45, scalesmask2_sse)); + const __m256i scales_6 = _mm256_cvtepi8_epi16(_mm_shuffle_epi8(scales_67, scalesmask1_sse)); + const __m256i scales_7 = _mm256_cvtepi8_epi16(_mm_shuffle_epi8(scales_67, scalesmask2_sse)); + + const __m256i scale_0145_0 = _mm256_shuffle_epi32(scales_0, 68); + const __m256i scale_2367_0 = _mm256_shuffle_epi32(scales_0, 238); + + const __m256i scale_0145_1 = _mm256_shuffle_epi32(scales_1, 68); + const __m256i scale_2367_1 = _mm256_shuffle_epi32(scales_1, 238); + + const __m256i scale_0145_2 = _mm256_shuffle_epi32(scales_2, 68); + const __m256i scale_2367_2 = _mm256_shuffle_epi32(scales_2, 238); + + const __m256i scale_0145_3 = _mm256_shuffle_epi32(scales_3, 68); + const __m256i scale_2367_3 = _mm256_shuffle_epi32(scales_3, 238); + + const __m256i scale_0145_4 = _mm256_shuffle_epi32(scales_4, 68); + const __m256i scale_2367_4 = _mm256_shuffle_epi32(scales_4, 238); + + const __m256i scale_0145_5 = _mm256_shuffle_epi32(scales_5, 68); + const __m256i scale_2367_5 = 
_mm256_shuffle_epi32(scales_5, 238); + + const __m256i scale_0145_6 = _mm256_shuffle_epi32(scales_6, 68); + const __m256i scale_2367_6 = _mm256_shuffle_epi32(scales_6, 238); + + const __m256i scale_0145_7 = _mm256_shuffle_epi32(scales_7, 68); + const __m256i scale_2367_7 = _mm256_shuffle_epi32(scales_7, 238); + + // Load the four block_q8_k quantized values interleaved with each other in chunks of eight bytes - A0,A1,A2,A3 + // Loaded as set of 128 bit vectors and repeated into a 256 bit vector + __m256i lhs_mat_0123_00 = _mm256_loadu_si256((const __m256i * )((a_ptr[b].qs + 512 * sb))); + __m256i lhs_mat_01_00 = _mm256_permute2f128_si256(lhs_mat_0123_00, lhs_mat_0123_00, 0); + __m256i lhs_mat_23_00 = _mm256_permute2f128_si256(lhs_mat_0123_00, lhs_mat_0123_00, 17); + __m256i lhs_mat_0123_01 = _mm256_loadu_si256((const __m256i * )((a_ptr[b].qs + 32 + 512 * sb))); + __m256i lhs_mat_01_01 = _mm256_permute2f128_si256(lhs_mat_0123_01, lhs_mat_0123_01, 0); + __m256i lhs_mat_23_01 = _mm256_permute2f128_si256(lhs_mat_0123_01, lhs_mat_0123_01, 17); + __m256i lhs_mat_0123_10 = _mm256_loadu_si256((const __m256i * )((a_ptr[b].qs + 64 + 512 * sb))); + __m256i lhs_mat_01_10 = _mm256_permute2f128_si256(lhs_mat_0123_10, lhs_mat_0123_10, 0); + __m256i lhs_mat_23_10 = _mm256_permute2f128_si256(lhs_mat_0123_10, lhs_mat_0123_10, 17); + __m256i lhs_mat_0123_11 = _mm256_loadu_si256((const __m256i * )((a_ptr[b].qs + 96 + 512 * sb))); + __m256i lhs_mat_01_11 = _mm256_permute2f128_si256(lhs_mat_0123_11, lhs_mat_0123_11, 0); + __m256i lhs_mat_23_11 = _mm256_permute2f128_si256(lhs_mat_0123_11, lhs_mat_0123_11, 17); + __m256i lhs_mat_0123_20 = _mm256_loadu_si256((const __m256i * )((a_ptr[b].qs + 128 + 512 * sb))); + __m256i lhs_mat_01_20 = _mm256_permute2f128_si256(lhs_mat_0123_20, lhs_mat_0123_20, 0); + __m256i lhs_mat_23_20 = _mm256_permute2f128_si256(lhs_mat_0123_20, lhs_mat_0123_20, 17); + __m256i lhs_mat_0123_21 = _mm256_loadu_si256((const __m256i * )((a_ptr[b].qs + 160 + 512 * sb))); + __m256i lhs_mat_01_21 = _mm256_permute2f128_si256(lhs_mat_0123_21, lhs_mat_0123_21, 0); + __m256i lhs_mat_23_21 = _mm256_permute2f128_si256(lhs_mat_0123_21, lhs_mat_0123_21, 17); + __m256i lhs_mat_0123_30 = _mm256_loadu_si256((const __m256i * )((a_ptr[b].qs + 192 + 512 * sb))); + __m256i lhs_mat_01_30 = _mm256_permute2f128_si256(lhs_mat_0123_30, lhs_mat_0123_30, 0); + __m256i lhs_mat_23_30 = _mm256_permute2f128_si256(lhs_mat_0123_30, lhs_mat_0123_30, 17); + __m256i lhs_mat_0123_31 = _mm256_loadu_si256((const __m256i * )((a_ptr[b].qs + 224 + 512 * sb))); + __m256i lhs_mat_01_31 = _mm256_permute2f128_si256(lhs_mat_0123_31, lhs_mat_0123_31, 0); + __m256i lhs_mat_23_31 = _mm256_permute2f128_si256(lhs_mat_0123_31, lhs_mat_0123_31, 17); + + __m256i lhs_mat_0123_40 = _mm256_loadu_si256((const __m256i * )((a_ptr[b].qs + 256 + 512 * sb))); + __m256i lhs_mat_01_40 = _mm256_permute2f128_si256(lhs_mat_0123_40, lhs_mat_0123_40, 0); + __m256i lhs_mat_23_40 = _mm256_permute2f128_si256(lhs_mat_0123_40, lhs_mat_0123_40, 17); + __m256i lhs_mat_0123_41 = _mm256_loadu_si256((const __m256i * )((a_ptr[b].qs + 288 + 512 * sb))); + __m256i lhs_mat_01_41 = _mm256_permute2f128_si256(lhs_mat_0123_41, lhs_mat_0123_41, 0); + __m256i lhs_mat_23_41 = _mm256_permute2f128_si256(lhs_mat_0123_41, lhs_mat_0123_41, 17); + __m256i lhs_mat_0123_50 = _mm256_loadu_si256((const __m256i * )((a_ptr[b].qs + 320 + 512 * sb))); + __m256i lhs_mat_01_50 = _mm256_permute2f128_si256(lhs_mat_0123_50, lhs_mat_0123_50, 0); + __m256i lhs_mat_23_50 = 
_mm256_permute2f128_si256(lhs_mat_0123_50, lhs_mat_0123_50, 17); + __m256i lhs_mat_0123_51 = _mm256_loadu_si256((const __m256i * )((a_ptr[b].qs + 352 + 512 * sb))); + __m256i lhs_mat_01_51 = _mm256_permute2f128_si256(lhs_mat_0123_51, lhs_mat_0123_51, 0); + __m256i lhs_mat_23_51 = _mm256_permute2f128_si256(lhs_mat_0123_51, lhs_mat_0123_51, 17); + __m256i lhs_mat_0123_60 = _mm256_loadu_si256((const __m256i * )((a_ptr[b].qs + 384 + 512 * sb))); + __m256i lhs_mat_01_60 = _mm256_permute2f128_si256(lhs_mat_0123_60, lhs_mat_0123_60, 0); + __m256i lhs_mat_23_60 = _mm256_permute2f128_si256(lhs_mat_0123_60, lhs_mat_0123_60, 17); + __m256i lhs_mat_0123_61 = _mm256_loadu_si256((const __m256i * )((a_ptr[b].qs + 416 + 512 * sb))); + __m256i lhs_mat_01_61 = _mm256_permute2f128_si256(lhs_mat_0123_61, lhs_mat_0123_61, 0); + __m256i lhs_mat_23_61 = _mm256_permute2f128_si256(lhs_mat_0123_61, lhs_mat_0123_61, 17); + __m256i lhs_mat_0123_70 = _mm256_loadu_si256((const __m256i * )((a_ptr[b].qs + 448 + 512 * sb))); + __m256i lhs_mat_01_70 = _mm256_permute2f128_si256(lhs_mat_0123_70, lhs_mat_0123_70, 0); + __m256i lhs_mat_23_70 = _mm256_permute2f128_si256(lhs_mat_0123_70, lhs_mat_0123_70, 17); + __m256i lhs_mat_0123_71 = _mm256_loadu_si256((const __m256i * )((a_ptr[b].qs + 480 + 512 * sb))); + __m256i lhs_mat_01_71 = _mm256_permute2f128_si256(lhs_mat_0123_71, lhs_mat_0123_71, 0); + __m256i lhs_mat_23_71 = _mm256_permute2f128_si256(lhs_mat_0123_71, lhs_mat_0123_71, 17); + + + __m256i lhs_mat_s_01_00 = _mm256_maddubs_epi16(m32s, lhs_mat_01_00); + __m256i lhs_mat_s_23_00 = _mm256_maddubs_epi16(m32s, lhs_mat_23_00); + __m256i lhs_mat_s_01_01 = _mm256_maddubs_epi16(m32s, lhs_mat_01_01); + __m256i lhs_mat_s_23_01 = _mm256_maddubs_epi16(m32s, lhs_mat_23_01); + __m256i lhs_mat_s_01_10 = _mm256_maddubs_epi16(m32s, lhs_mat_01_10); + __m256i lhs_mat_s_23_10 = _mm256_maddubs_epi16(m32s, lhs_mat_23_10); + __m256i lhs_mat_s_01_11 = _mm256_maddubs_epi16(m32s, lhs_mat_01_11); + __m256i lhs_mat_s_23_11 = _mm256_maddubs_epi16(m32s, lhs_mat_23_11); + __m256i lhs_mat_s_01_20 = _mm256_maddubs_epi16(m32s, lhs_mat_01_20); + __m256i lhs_mat_s_23_20 = _mm256_maddubs_epi16(m32s, lhs_mat_23_20); + __m256i lhs_mat_s_01_21 = _mm256_maddubs_epi16(m32s, lhs_mat_01_21); + __m256i lhs_mat_s_23_21 = _mm256_maddubs_epi16(m32s, lhs_mat_23_21); + __m256i lhs_mat_s_01_30 = _mm256_maddubs_epi16(m32s, lhs_mat_01_30); + __m256i lhs_mat_s_23_30 = _mm256_maddubs_epi16(m32s, lhs_mat_23_30); + __m256i lhs_mat_s_01_31 = _mm256_maddubs_epi16(m32s, lhs_mat_01_31); + __m256i lhs_mat_s_23_31 = _mm256_maddubs_epi16(m32s, lhs_mat_23_31); + __m256i lhs_mat_s_01_40 = _mm256_maddubs_epi16(m32s, lhs_mat_01_40); + __m256i lhs_mat_s_23_40 = _mm256_maddubs_epi16(m32s, lhs_mat_23_40); + __m256i lhs_mat_s_01_41 = _mm256_maddubs_epi16(m32s, lhs_mat_01_41); + __m256i lhs_mat_s_23_41 = _mm256_maddubs_epi16(m32s, lhs_mat_23_41); + __m256i lhs_mat_s_01_50 = _mm256_maddubs_epi16(m32s, lhs_mat_01_50); + __m256i lhs_mat_s_23_50 = _mm256_maddubs_epi16(m32s, lhs_mat_23_50); + __m256i lhs_mat_s_01_51 = _mm256_maddubs_epi16(m32s, lhs_mat_01_51); + __m256i lhs_mat_s_23_51 = _mm256_maddubs_epi16(m32s, lhs_mat_23_51); + __m256i lhs_mat_s_01_60 = _mm256_maddubs_epi16(m32s, lhs_mat_01_60); + __m256i lhs_mat_s_23_60 = _mm256_maddubs_epi16(m32s, lhs_mat_23_60); + __m256i lhs_mat_s_01_61 = _mm256_maddubs_epi16(m32s, lhs_mat_01_61); + __m256i lhs_mat_s_23_61 = _mm256_maddubs_epi16(m32s, lhs_mat_23_61); + __m256i lhs_mat_s_01_70 = _mm256_maddubs_epi16(m32s, lhs_mat_01_70); + __m256i 
lhs_mat_s_23_70 = _mm256_maddubs_epi16(m32s, lhs_mat_23_70); + __m256i lhs_mat_s_01_71 = _mm256_maddubs_epi16(m32s, lhs_mat_01_71); + __m256i lhs_mat_s_23_71 = _mm256_maddubs_epi16(m32s, lhs_mat_23_71); + + // Shuffle pattern one - left side input + const __m256i lhs_mat_01_00_sp1 = _mm256_shuffle_epi32(lhs_mat_01_00, 160); //A00(0-3) A00(0-3) A01(0-3) A01(0-3) A00(0-3) A00(0-3) A01(0-3) A01(0-3) + const __m256i lhs_mat_23_00_sp1 = _mm256_shuffle_epi32(lhs_mat_23_00, 160); //A02(0-3) A03(0-3) A02(0-3) A03(0-3) A02(0-3) A03(0-3) A02(0-3) A03(0-3) + + const __m256i lhs_mat_01_01_sp1 = _mm256_shuffle_epi32(lhs_mat_01_01, 160); //A00(8-11) A00(8-11) A01(8-11) A01(8-11) A00(8-11) A00(8-11) A01(8-11) A01(8-11) + const __m256i lhs_mat_23_01_sp1 = _mm256_shuffle_epi32(lhs_mat_23_01, 160); //A02(8-11) A03(8-11) A02(8-11) A03(8-11) A02(8-11) A03(8-11) A02(8-11) A03(8-11) + + const __m256i lhs_mat_01_10_sp1 = _mm256_shuffle_epi32(lhs_mat_01_10, 160); //A10(0-3) A10(0-3) A11(0-3) A11(0-3) A10(0-3) A10(0-3) A11(0-3) A11(0-3) + const __m256i lhs_mat_23_10_sp1 = _mm256_shuffle_epi32(lhs_mat_23_10, 160); //A12(0-3) A13(0-3) A12(0-3) A13(0-3) A12(0-3) A13(0-3) A12(0-3) A13(0-3) + + const __m256i lhs_mat_01_11_sp1 = _mm256_shuffle_epi32(lhs_mat_01_11, 160); //A10(8-11) A10(8-11) A11(8-11) A11(8-11) A10(8-11) A10(8-11) A11(8-11) A11(8-11) + const __m256i lhs_mat_23_11_sp1 = _mm256_shuffle_epi32(lhs_mat_23_11, 160); //A12(8-11) A13(8-11) A12(8-11) A13(8-11) A12(8-11) A13(8-11) A12(8-11) A13(8-11) + + const __m256i lhs_mat_01_20_sp1 = _mm256_shuffle_epi32(lhs_mat_01_20, 160); //A20(0-3) A20(0-3) A21(0-3) A21(0-3) A20(0-3) A20(0-3) A21(0-3) A21(0-3) + const __m256i lhs_mat_23_20_sp1 = _mm256_shuffle_epi32(lhs_mat_23_20, 160); //A22(0-3) A23(0-3) A22(0-3) A23(0-3) A22(0-3) A23(0-3) A22(0-3) A23(0-3) + + const __m256i lhs_mat_01_21_sp1 = _mm256_shuffle_epi32(lhs_mat_01_21, 160); //A20(8-11) A20(8-11) A21(8-11) A21(8-11) A20(8-11) A20(8-11) A21(8-11) A21(8-11) + const __m256i lhs_mat_23_21_sp1 = _mm256_shuffle_epi32(lhs_mat_23_21, 160); //A22(8-11) A23(8-11) A22(8-11) A23(8-11) A22(8-11) A23(8-11) A22(8-11) A23(8-11) + + const __m256i lhs_mat_01_30_sp1 = _mm256_shuffle_epi32(lhs_mat_01_30, 160); //A30(0-3) A30(0-3) A31(0-3) A31(0-3) A30(0-3) A30(0-3) A31(0-3) A31(0-3) + const __m256i lhs_mat_23_30_sp1 = _mm256_shuffle_epi32(lhs_mat_23_30, 160); //A32(0-3) A33(0-3) A32(0-3) A33(0-3) A32(0-3) A33(0-3) A32(0-3) A33(0-3) + + const __m256i lhs_mat_01_31_sp1 = _mm256_shuffle_epi32(lhs_mat_01_31, 160); //A30(8-11) A30(8-11) A31(8-11) A31(8-11) A30(8-11) A30(8-11) A31(8-11) A31(8-11) + const __m256i lhs_mat_23_31_sp1 = _mm256_shuffle_epi32(lhs_mat_23_31, 160); //A32(8-11) A33(8-11) A32(8-11) A33(8-11) A32(8-11) A33(8-11) A32(8-11) A33(8-11) + + const __m256i lhs_mat_01_40_sp1 = _mm256_shuffle_epi32(lhs_mat_01_40, 160); //A40(0-3) A40(0-3) A41(0-3) A41(0-3) A40(0-3) A40(0-3) A41(0-3) A41(0-3) + const __m256i lhs_mat_23_40_sp1 = _mm256_shuffle_epi32(lhs_mat_23_40, 160); //A42(0-3) A43(0-3) A42(0-3) A43(0-3) A42(0-3) A43(0-3) A42(0-3) A43(0-3) + + const __m256i lhs_mat_01_41_sp1 = _mm256_shuffle_epi32(lhs_mat_01_41, 160); //A40(8-11) A40(8-11) A41(8-11) A41(8-11) A40(8-11) A40(8-11) A41(8-11) A41(8-11) + const __m256i lhs_mat_23_41_sp1 = _mm256_shuffle_epi32(lhs_mat_23_41, 160); //A42(8-11) A43(8-11) A42(8-11) A43(8-11) A42(8-11) A43(8-11) A42(8-11) A43(8-11) + + const __m256i lhs_mat_01_50_sp1 = _mm256_shuffle_epi32(lhs_mat_01_50, 160); //A50(0-3) A50(0-3) A51(0-3) A51(0-3) A50(0-3) A50(0-3) A51(0-3) A51(0-3) + const 
__m256i lhs_mat_23_50_sp1 = _mm256_shuffle_epi32(lhs_mat_23_50, 160); //A52(0-3) A53(0-3) A52(0-3) A53(0-3) A52(0-3) A53(0-3) A52(0-3) A53(0-3) + + const __m256i lhs_mat_01_51_sp1 = _mm256_shuffle_epi32(lhs_mat_01_51, 160); //A50(8-11) A50(8-11) A51(8-11) A51(8-11) A50(8-11) A50(8-11) A51(8-11) A51(8-11) + const __m256i lhs_mat_23_51_sp1 = _mm256_shuffle_epi32(lhs_mat_23_51, 160); //A52(8-11) A53(8-11) A52(8-11) A53(8-11) A52(8-11) A53(8-11) A52(8-11) A53(8-11) + + const __m256i lhs_mat_01_60_sp1 = _mm256_shuffle_epi32(lhs_mat_01_60, 160); //A60(0-3) A60(0-3) A61(0-3) A61(0-3) A60(0-3) A60(0-3) A61(0-3) A61(0-3) + const __m256i lhs_mat_23_60_sp1 = _mm256_shuffle_epi32(lhs_mat_23_60, 160); //A62(0-3) A63(0-3) A62(0-3) A63(0-3) A62(0-3) A63(0-3) A62(0-3) A63(0-3) + + const __m256i lhs_mat_01_61_sp1 = _mm256_shuffle_epi32(lhs_mat_01_61, 160); //A60(8-11) A60(8-11) A61(8-11) A61(8-11) A60(8-11) A60(8-11) A61(8-11) A61(8-11) + const __m256i lhs_mat_23_61_sp1 = _mm256_shuffle_epi32(lhs_mat_23_61, 160); //A62(8-11) A63(8-11) A62(8-11) A63(8-11) A62(8-11) A63(8-11) A62(8-11) A63(8-11) + + const __m256i lhs_mat_01_70_sp1 = _mm256_shuffle_epi32(lhs_mat_01_70, 160); //A70(0-3) A70(0-3) A71(0-3) A71(0-3) A70(0-3) A70(0-3) A71(0-3) A71(0-3) + const __m256i lhs_mat_23_70_sp1 = _mm256_shuffle_epi32(lhs_mat_23_70, 160); //A72(0-3) A73(0-3) A72(0-3) A73(0-3) A72(0-3) A73(0-3) A72(0-3) A73(0-3) + + const __m256i lhs_mat_01_71_sp1 = _mm256_shuffle_epi32(lhs_mat_01_71, 160); //A70(8-11) A70(8-11) A71(8-11) A71(8-11) A70(8-11) A70(8-11) A71(8-11) A71(8-11) + const __m256i lhs_mat_23_71_sp1 = _mm256_shuffle_epi32(lhs_mat_23_71, 160); //A72(8-11) A73(8-11) A72(8-11) A73(8-11) A72(8-11) A73(8-11) A72(8-11) A73(8-11) + + // Shuffle pattern two- left side input + const __m256i lhs_mat_01_00_sp2 = _mm256_shuffle_epi32(lhs_mat_01_00, 245); //A00(4-7) A00(4-7) A01(4-7) A01(4-7) A00(4-7) A00(4-7) A01(4-7) A01(4-7) + const __m256i lhs_mat_23_00_sp2 = _mm256_shuffle_epi32(lhs_mat_23_00, 245); //A02(4-7) A03(4-7) A02(4-7) A03(4-7) A02(4-7) A03(4-7) A02(4-7) A03(4-7) + + const __m256i lhs_mat_01_01_sp2 = _mm256_shuffle_epi32(lhs_mat_01_01, 245); //A00(12-15) A00(12-15) A01(12-15) A01(12-15) A00(12-15) A00(12-15) A01(12-15) A01(12-15) + const __m256i lhs_mat_23_01_sp2 = _mm256_shuffle_epi32(lhs_mat_23_01, 245); //A02(12-15) A03(12-15) A02(12-15) A03(12-15) A02(12-15) A03(12-15) A02(12-15) A03(12-15) + + const __m256i lhs_mat_01_10_sp2 = _mm256_shuffle_epi32(lhs_mat_01_10, 245); //A10(4-7) A10(4-7) A11(4-7) A11(4-7) A10(4-7) A10(4-7) A11(4-7) A11(4-7) + const __m256i lhs_mat_23_10_sp2 = _mm256_shuffle_epi32(lhs_mat_23_10, 245); //A12(4-7) A13(4-7) A12(4-7) A13(4-7) A12(4-7) A13(4-7) A12(4-7) A13(4-7) + + const __m256i lhs_mat_01_11_sp2 = _mm256_shuffle_epi32(lhs_mat_01_11, 245); //A10(12-15) A10(12-15) A11(12-15) A11(12-15) A10(12-15) A10(12-15) A11(12-15) A11(12-15) + const __m256i lhs_mat_23_11_sp2 = _mm256_shuffle_epi32(lhs_mat_23_11, 245); //A12(12-15) A13(12-15) A12(12-15) A13(12-15) A12(12-15) A13(12-15) A12(12-15) A13(12-15) + + const __m256i lhs_mat_01_20_sp2 = _mm256_shuffle_epi32(lhs_mat_01_20, 245); //A20(4-7) A20(4-7) A21(4-7) A21(4-7) A20(4-7) A20(4-7) A21(4-7) A21(4-7) + const __m256i lhs_mat_23_20_sp2 = _mm256_shuffle_epi32(lhs_mat_23_20, 245); //A22(4-7) A23(4-7) A22(4-7) A23(4-7) A22(4-7) A23(4-7) A22(4-7) A23(4-7) + + const __m256i lhs_mat_01_21_sp2 = _mm256_shuffle_epi32(lhs_mat_01_21, 245); //A20(12-15) A20(12-15) A21(12-15) A21(12-15) A20(12-15) A20(12-15) A21(12-15) A21(12-15) + const __m256i 
lhs_mat_23_21_sp2 = _mm256_shuffle_epi32(lhs_mat_23_21, 245); //A22(12-15) A23(12-15) A22(12-15) A23(12-15) A22(12-15) A23(12-15) A22(12-15) A23(12-15) + + const __m256i lhs_mat_01_30_sp2 = _mm256_shuffle_epi32(lhs_mat_01_30, 245); //A30(4-7) A30(4-7) A31(4-7) A31(4-7) A30(4-7) A30(4-7) A31(4-7) A31(4-7) + const __m256i lhs_mat_23_30_sp2 = _mm256_shuffle_epi32(lhs_mat_23_30, 245); //A32(4-7) A33(4-7) A32(4-7) A33(4-7) A32(4-7) A33(4-7) A32(4-7) A33(4-7) + + const __m256i lhs_mat_01_31_sp2 = _mm256_shuffle_epi32(lhs_mat_01_31, 245); //A30(12-15) A30(12-15) A31(12-15) A31(12-15) A30(12-15) A30(12-15) A31(12-15) A31(12-15) + const __m256i lhs_mat_23_31_sp2 = _mm256_shuffle_epi32(lhs_mat_23_31, 245); //A32(12-15) A33(12-15) A32(12-15) A33(12-15) A32(12-15) A33(12-15) A32(12-15) A33(12-15) + + const __m256i lhs_mat_01_40_sp2 = _mm256_shuffle_epi32(lhs_mat_01_40, 245); //A40(4-7) A40(4-7) A41(4-7) A41(4-7) A40(4-7) A40(4-7) A41(4-7) A41(4-7) + const __m256i lhs_mat_23_40_sp2 = _mm256_shuffle_epi32(lhs_mat_23_40, 245); //A42(4-7) A43(4-7) A42(4-7) A43(4-7) A42(4-7) A43(4-7) A42(4-7) A43(4-7) + + const __m256i lhs_mat_01_41_sp2 = _mm256_shuffle_epi32(lhs_mat_01_41, 245); //A40(12-15) A40(12-15) A41(12-15) A41(12-15) A40(12-15) A40(12-15) A41(12-15) A41(12-15) + const __m256i lhs_mat_23_41_sp2 = _mm256_shuffle_epi32(lhs_mat_23_41, 245); //A42(12-15) A43(12-15) A42(12-15) A43(12-15) A42(12-15) A43(12-15) A42(12-15) A43(12-15) + + const __m256i lhs_mat_01_50_sp2 = _mm256_shuffle_epi32(lhs_mat_01_50, 245); //A50(4-7) A50(4-7) A51(4-7) A51(4-7) A50(4-7) A50(4-7) A51(4-7) A51(4-7) + const __m256i lhs_mat_23_50_sp2 = _mm256_shuffle_epi32(lhs_mat_23_50, 245); //A52(4-7) A53(4-7) A52(4-7) A53(4-7) A52(4-7) A53(4-7) A52(4-7) A53(4-7) + + const __m256i lhs_mat_01_51_sp2 = _mm256_shuffle_epi32(lhs_mat_01_51, 245); //A50(12-15) A50(12-15) A51(12-15) A51(12-15) A50(12-15) A50(12-15) A51(12-15) A51(12-15) + const __m256i lhs_mat_23_51_sp2 = _mm256_shuffle_epi32(lhs_mat_23_51, 245); //A52(12-15) A53(12-15) A52(12-15) A53(12-15) A52(12-15) A53(12-15) A52(12-15) A53(12-15) + + const __m256i lhs_mat_01_60_sp2 = _mm256_shuffle_epi32(lhs_mat_01_60, 245); //A60(4-7) A60(4-7) A61(4-7) A61(4-7) A60(4-7) A60(4-7) A61(4-7) A61(4-7) + const __m256i lhs_mat_23_60_sp2 = _mm256_shuffle_epi32(lhs_mat_23_60, 245); //A62(4-7) A63(4-7) A62(4-7) A63(4-7) A62(4-7) A63(4-7) A62(4-7) A63(4-7) + + const __m256i lhs_mat_01_61_sp2 = _mm256_shuffle_epi32(lhs_mat_01_61, 245); //A60(12-15) A60(12-15) A61(12-15) A61(12-15) A60(12-15) A60(12-15) A61(12-15) A61(12-15) + const __m256i lhs_mat_23_61_sp2 = _mm256_shuffle_epi32(lhs_mat_23_61, 245); //A62(12-15) A63(12-15) A62(12-15) A63(12-15) A62(12-15) A63(12-15) A62(12-15) A63(12-15) + + const __m256i lhs_mat_01_70_sp2 = _mm256_shuffle_epi32(lhs_mat_01_70, 245); //A70(4-7) A70(4-7) A71(4-7) A71(4-7) A70(4-7) A70(4-7) A71(4-7) A71(4-7) + const __m256i lhs_mat_23_70_sp2 = _mm256_shuffle_epi32(lhs_mat_23_70, 245); //A72(4-7) A73(4-7) A72(4-7) A73(4-7) A72(4-7) A73(4-7) A72(4-7) A73(4-7) + + const __m256i lhs_mat_01_71_sp2 = _mm256_shuffle_epi32(lhs_mat_01_71, 245); //A70(12-15) A70(12-15) A71(12-15) A71(12-15) A70(12-15) A70(12-15) A71(12-15) A71(12-15) + const __m256i lhs_mat_23_71_sp2 = _mm256_shuffle_epi32(lhs_mat_23_71, 245); //A72(12-15) A73(12-15) A72(12-15) A73(12-15) A72(12-15) A73(12-15) A72(12-15) A73(12-15) + + // Shuffle pattern one - left side input + const __m256i lhs_mat_s_01_00_sp1 = _mm256_shuffle_epi32(lhs_mat_s_01_00, 160); //A00(0-3) A00(0-3) A01(0-3) A01(0-3) A00(0-3) A00(0-3) 
A01(0-3) A01(0-3) + const __m256i lhs_mat_s_23_00_sp1 = _mm256_shuffle_epi32(lhs_mat_s_23_00, 160); //A02(0-3) A03(0-3) A02(0-3) A03(0-3) A02(0-3) A03(0-3) A02(0-3) A03(0-3) + + const __m256i lhs_mat_s_01_01_sp1 = _mm256_shuffle_epi32(lhs_mat_s_01_01, 160); //A00(8-11) A00(8-11) A01(8-11) A01(8-11) A00(8-11) A00(8-11) A01(8-11) A01(8-11) + const __m256i lhs_mat_s_23_01_sp1 = _mm256_shuffle_epi32(lhs_mat_s_23_01, 160); //A02(8-11) A03(8-11) A02(8-11) A03(8-11) A02(8-11) A03(8-11) A02(8-11) A03(8-11) + + const __m256i lhs_mat_s_01_10_sp1 = _mm256_shuffle_epi32(lhs_mat_s_01_10, 160); //A10(0-3) A10(0-3) A11(0-3) A11(0-3) A10(0-3) A10(0-3) A11(0-3) A11(0-3) + const __m256i lhs_mat_s_23_10_sp1 = _mm256_shuffle_epi32(lhs_mat_s_23_10, 160); //A12(0-3) A13(0-3) A12(0-3) A13(0-3) A12(0-3) A13(0-3) A12(0-3) A13(0-3) + + const __m256i lhs_mat_s_01_11_sp1 = _mm256_shuffle_epi32(lhs_mat_s_01_11, 160); //A10(8-11) A10(8-11) A11(8-11) A11(8-11) A10(8-11) A10(8-11) A11(8-11) A11(8-11) + const __m256i lhs_mat_s_23_11_sp1 = _mm256_shuffle_epi32(lhs_mat_s_23_11, 160); //A12(8-11) A13(8-11) A12(8-11) A13(8-11) A12(8-11) A13(8-11) A12(8-11) A13(8-11) + + const __m256i lhs_mat_s_01_20_sp1 = _mm256_shuffle_epi32(lhs_mat_s_01_20, 160); //A20(0-3) A20(0-3) A21(0-3) A21(0-3) A20(0-3) A20(0-3) A21(0-3) A21(0-3) + const __m256i lhs_mat_s_23_20_sp1 = _mm256_shuffle_epi32(lhs_mat_s_23_20, 160); //A22(0-3) A23(0-3) A22(0-3) A23(0-3) A22(0-3) A23(0-3) A22(0-3) A23(0-3) + + const __m256i lhs_mat_s_01_21_sp1 = _mm256_shuffle_epi32(lhs_mat_s_01_21, 160); //A20(8-11) A20(8-11) A21(8-11) A21(8-11) A20(8-11) A20(8-11) A21(8-11) A21(8-11) + const __m256i lhs_mat_s_23_21_sp1 = _mm256_shuffle_epi32(lhs_mat_s_23_21, 160); //A22(8-11) A23(8-11) A22(8-11) A23(8-11) A22(8-11) A23(8-11) A22(8-11) A23(8-11) + + const __m256i lhs_mat_s_01_30_sp1 = _mm256_shuffle_epi32(lhs_mat_s_01_30, 160); //A30(0-3) A30(0-3) A31(0-3) A31(0-3) A30(0-3) A30(0-3) A31(0-3) A31(0-3) + const __m256i lhs_mat_s_23_30_sp1 = _mm256_shuffle_epi32(lhs_mat_s_23_30, 160); //A32(0-3) A33(0-3) A32(0-3) A33(0-3) A32(0-3) A33(0-3) A32(0-3) A33(0-3) + + const __m256i lhs_mat_s_01_31_sp1 = _mm256_shuffle_epi32(lhs_mat_s_01_31, 160); //A30(8-11) A30(8-11) A31(8-11) A31(8-11) A30(8-11) A30(8-11) A31(8-11) A31(8-11) + const __m256i lhs_mat_s_23_31_sp1 = _mm256_shuffle_epi32(lhs_mat_s_23_31, 160); //A32(8-11) A33(8-11) A32(8-11) A33(8-11) A32(8-11) A33(8-11) A32(8-11) A33(8-11) + + const __m256i lhs_mat_s_01_40_sp1 = _mm256_shuffle_epi32(lhs_mat_s_01_40, 160); //A40(0-3) A40(0-3) A41(0-3) A41(0-3) A40(0-3) A40(0-3) A41(0-3) A41(0-3) + const __m256i lhs_mat_s_23_40_sp1 = _mm256_shuffle_epi32(lhs_mat_s_23_40, 160); //A42(0-3) A43(0-3) A42(0-3) A43(0-3) A42(0-3) A43(0-3) A42(0-3) A43(0-3) + + const __m256i lhs_mat_s_01_41_sp1 = _mm256_shuffle_epi32(lhs_mat_s_01_41, 160); //A40(8-11) A40(8-11) A41(8-11) A41(8-11) A40(8-11) A40(8-11) A41(8-11) A41(8-11) + const __m256i lhs_mat_s_23_41_sp1 = _mm256_shuffle_epi32(lhs_mat_s_23_41, 160); //A42(8-11) A43(8-11) A42(8-11) A43(8-11) A42(8-11) A43(8-11) A42(8-11) A43(8-11) + + const __m256i lhs_mat_s_01_50_sp1 = _mm256_shuffle_epi32(lhs_mat_s_01_50, 160); //A50(0-3) A50(0-3) A51(0-3) A51(0-3) A50(0-3) A50(0-3) A51(0-3) A51(0-3) + const __m256i lhs_mat_s_23_50_sp1 = _mm256_shuffle_epi32(lhs_mat_s_23_50, 160); //A52(0-3) A53(0-3) A52(0-3) A53(0-3) A52(0-3) A53(0-3) A52(0-3) A53(0-3) + + const __m256i lhs_mat_s_01_51_sp1 = _mm256_shuffle_epi32(lhs_mat_s_01_51, 160); //A50(8-11) A50(8-11) A51(8-11) A51(8-11) A50(8-11) A50(8-11) A51(8-11) 
A51(8-11) + const __m256i lhs_mat_s_23_51_sp1 = _mm256_shuffle_epi32(lhs_mat_s_23_51, 160); //A52(8-11) A53(8-11) A52(8-11) A53(8-11) A52(8-11) A53(8-11) A52(8-11) A53(8-11) + + const __m256i lhs_mat_s_01_60_sp1 = _mm256_shuffle_epi32(lhs_mat_s_01_60, 160); //A60(0-3) A60(0-3) A61(0-3) A61(0-3) A60(0-3) A60(0-3) A61(0-3) A61(0-3) + const __m256i lhs_mat_s_23_60_sp1 = _mm256_shuffle_epi32(lhs_mat_s_23_60, 160); //A62(0-3) A63(0-3) A62(0-3) A63(0-3) A62(0-3) A63(0-3) A62(0-3) A63(0-3) + + const __m256i lhs_mat_s_01_61_sp1 = _mm256_shuffle_epi32(lhs_mat_s_01_61, 160); //A60(8-11) A60(8-11) A61(8-11) A61(8-11) A60(8-11) A60(8-11) A61(8-11) A61(8-11) + const __m256i lhs_mat_s_23_61_sp1 = _mm256_shuffle_epi32(lhs_mat_s_23_61, 160); //A62(8-11) A63(8-11) A62(8-11) A63(8-11) A62(8-11) A63(8-11) A62(8-11) A63(8-11) + + const __m256i lhs_mat_s_01_70_sp1 = _mm256_shuffle_epi32(lhs_mat_s_01_70, 160); //A70(0-3) A70(0-3) A71(0-3) A71(0-3) A70(0-3) A70(0-3) A71(0-3) A71(0-3) + const __m256i lhs_mat_s_23_70_sp1 = _mm256_shuffle_epi32(lhs_mat_s_23_70, 160); //A72(0-3) A73(0-3) A72(0-3) A73(0-3) A72(0-3) A73(0-3) A72(0-3) A73(0-3) + + const __m256i lhs_mat_s_01_71_sp1 = _mm256_shuffle_epi32(lhs_mat_s_01_71, 160); //A70(8-11) A70(8-11) A71(8-11) A71(8-11) A70(8-11) A70(8-11) A71(8-11) A71(8-11) + const __m256i lhs_mat_s_23_71_sp1 = _mm256_shuffle_epi32(lhs_mat_s_23_71, 160); //A72(8-11) A73(8-11) A72(8-11) A73(8-11) A72(8-11) A73(8-11) A72(8-11) A73(8-11) + + // Shuffle pattern two- left side input + const __m256i lhs_mat_s_01_00_sp2 = _mm256_shuffle_epi32(lhs_mat_s_01_00, 245); //A00(4-7) A00(4-7) A01(4-7) A01(4-7) A00(4-7) A00(4-7) A01(4-7) A01(4-7) + const __m256i lhs_mat_s_23_00_sp2 = _mm256_shuffle_epi32(lhs_mat_s_23_00, 245); //A02(4-7) A03(4-7) A02(4-7) A03(4-7) A02(4-7) A03(4-7) A02(4-7) A03(4-7) + + const __m256i lhs_mat_s_01_01_sp2 = _mm256_shuffle_epi32(lhs_mat_s_01_01, 245); //A00(12-15) A00(12-15) A01(12-15) A01(12-15) A00(12-15) A00(12-15) A01(12-15) A01(12-15) + const __m256i lhs_mat_s_23_01_sp2 = _mm256_shuffle_epi32(lhs_mat_s_23_01, 245); //A02(12-15) A03(12-15) A02(12-15) A03(12-15) A02(12-15) A03(12-15) A02(12-15) A03(12-15) + + const __m256i lhs_mat_s_01_10_sp2 = _mm256_shuffle_epi32(lhs_mat_s_01_10, 245); //A10(4-7) A10(4-7) A11(4-7) A11(4-7) A10(4-7) A10(4-7) A11(4-7) A11(4-7) + const __m256i lhs_mat_s_23_10_sp2 = _mm256_shuffle_epi32(lhs_mat_s_23_10, 245); //A12(4-7) A13(4-7) A12(4-7) A13(4-7) A12(4-7) A13(4-7) A12(4-7) A13(4-7) + + const __m256i lhs_mat_s_01_11_sp2 = _mm256_shuffle_epi32(lhs_mat_s_01_11, 245); //A10(12-15) A10(12-15) A11(12-15) A11(12-15) A10(12-15) A10(12-15) A11(12-15) A11(12-15) + const __m256i lhs_mat_s_23_11_sp2 = _mm256_shuffle_epi32(lhs_mat_s_23_11, 245); //A12(12-15) A13(12-15) A12(12-15) A13(12-15) A12(12-15) A13(12-15) A12(12-15) A13(12-15) + + const __m256i lhs_mat_s_01_20_sp2 = _mm256_shuffle_epi32(lhs_mat_s_01_20, 245); //A20(4-7) A20(4-7) A21(4-7) A21(4-7) A20(4-7) A20(4-7) A21(4-7) A21(4-7) + const __m256i lhs_mat_s_23_20_sp2 = _mm256_shuffle_epi32(lhs_mat_s_23_20, 245); //A22(4-7) A23(4-7) A22(4-7) A23(4-7) A22(4-7) A23(4-7) A22(4-7) A23(4-7) + + const __m256i lhs_mat_s_01_21_sp2 = _mm256_shuffle_epi32(lhs_mat_s_01_21, 245); //A20(12-15) A20(12-15) A21(12-15) A21(12-15) A20(12-15) A20(12-15) A21(12-15) A21(12-15) + const __m256i lhs_mat_s_23_21_sp2 = _mm256_shuffle_epi32(lhs_mat_s_23_21, 245); //A22(12-15) A23(12-15) A22(12-15) A23(12-15) A22(12-15) A23(12-15) A22(12-15) A23(12-15) + + const __m256i lhs_mat_s_01_30_sp2 = 
_mm256_shuffle_epi32(lhs_mat_s_01_30, 245); //A30(4-7) A30(4-7) A31(4-7) A31(4-7) A30(4-7) A30(4-7) A31(4-7) A31(4-7) + const __m256i lhs_mat_s_23_30_sp2 = _mm256_shuffle_epi32(lhs_mat_s_23_30, 245); //A32(4-7) A33(4-7) A32(4-7) A33(4-7) A32(4-7) A33(4-7) A32(4-7) A33(4-7) + + const __m256i lhs_mat_s_01_31_sp2 = _mm256_shuffle_epi32(lhs_mat_s_01_31, 245); //A30(12-15) A30(12-15) A31(12-15) A31(12-15) A30(12-15) A30(12-15) A31(12-15) A31(12-15) + const __m256i lhs_mat_s_23_31_sp2 = _mm256_shuffle_epi32(lhs_mat_s_23_31, 245); //A32(12-15) A33(12-15) A32(12-15) A33(12-15) A32(12-15) A33(12-15) A32(12-15) A33(12-15) + + const __m256i lhs_mat_s_01_40_sp2 = _mm256_shuffle_epi32(lhs_mat_s_01_40, 245); //A40(4-7) A40(4-7) A41(4-7) A41(4-7) A40(4-7) A40(4-7) A41(4-7) A41(4-7) + const __m256i lhs_mat_s_23_40_sp2 = _mm256_shuffle_epi32(lhs_mat_s_23_40, 245); //A42(4-7) A43(4-7) A42(4-7) A43(4-7) A42(4-7) A43(4-7) A42(4-7) A43(4-7) + + const __m256i lhs_mat_s_01_41_sp2 = _mm256_shuffle_epi32(lhs_mat_s_01_41, 245); //A40(12-15) A40(12-15) A41(12-15) A41(12-15) A40(12-15) A40(12-15) A41(12-15) A41(12-15) + const __m256i lhs_mat_s_23_41_sp2 = _mm256_shuffle_epi32(lhs_mat_s_23_41, 245); //A42(12-15) A43(12-15) A42(12-15) A43(12-15) A42(12-15) A43(12-15) A42(12-15) A43(12-15) + + const __m256i lhs_mat_s_01_50_sp2 = _mm256_shuffle_epi32(lhs_mat_s_01_50, 245); //A50(4-7) A50(4-7) A51(4-7) A51(4-7) A50(4-7) A50(4-7) A51(4-7) A51(4-7) + const __m256i lhs_mat_s_23_50_sp2 = _mm256_shuffle_epi32(lhs_mat_s_23_50, 245); //A52(4-7) A53(4-7) A52(4-7) A53(4-7) A52(4-7) A53(4-7) A52(4-7) A53(4-7) + + const __m256i lhs_mat_s_01_51_sp2 = _mm256_shuffle_epi32(lhs_mat_s_01_51, 245); //A50(12-15) A50(12-15) A51(12-15) A51(12-15) A50(12-15) A50(12-15) A51(12-15) A51(12-15) + const __m256i lhs_mat_s_23_51_sp2 = _mm256_shuffle_epi32(lhs_mat_s_23_51, 245); //A52(12-15) A53(12-15) A52(12-15) A53(12-15) A52(12-15) A53(12-15) A52(12-15) A53(12-15) + + const __m256i lhs_mat_s_01_60_sp2 = _mm256_shuffle_epi32(lhs_mat_s_01_60, 245); //A60(4-7) A60(4-7) A61(4-7) A61(4-7) A60(4-7) A60(4-7) A61(4-7) A61(4-7) + const __m256i lhs_mat_s_23_60_sp2 = _mm256_shuffle_epi32(lhs_mat_s_23_60, 245); //A62(4-7) A63(4-7) A62(4-7) A63(4-7) A62(4-7) A63(4-7) A62(4-7) A63(4-7) + + const __m256i lhs_mat_s_01_61_sp2 = _mm256_shuffle_epi32(lhs_mat_s_01_61, 245); //A60(12-15) A60(12-15) A61(12-15) A61(12-15) A60(12-15) A60(12-15) A61(12-15) A61(12-15) + const __m256i lhs_mat_s_23_61_sp2 = _mm256_shuffle_epi32(lhs_mat_s_23_61, 245); //A62(12-15) A63(12-15) A62(12-15) A63(12-15) A62(12-15) A63(12-15) A62(12-15) A63(12-15) + + const __m256i lhs_mat_s_01_70_sp2 = _mm256_shuffle_epi32(lhs_mat_s_01_70, 245); //A70(4-7) A70(4-7) A71(4-7) A71(4-7) A70(4-7) A70(4-7) A71(4-7) A71(4-7) + const __m256i lhs_mat_s_23_70_sp2 = _mm256_shuffle_epi32(lhs_mat_s_23_70, 245); //A72(4-7) A73(4-7) A72(4-7) A73(4-7) A72(4-7) A73(4-7) A72(4-7) A73(4-7) + + const __m256i lhs_mat_s_01_71_sp2 = _mm256_shuffle_epi32(lhs_mat_s_01_71, 245); //A70(12-15) A70(12-15) A71(12-15) A71(12-15) A70(12-15) A70(12-15) A71(12-15) A71(12-15) + const __m256i lhs_mat_s_23_71_sp2 = _mm256_shuffle_epi32(lhs_mat_s_23_71, 245); //A72(12-15) A73(12-15) A72(12-15) A73(12-15) A72(12-15) A73(12-15) A72(12-15) A73(12-15) + + // The values arranged in shuffle patterns are operated with dot product operation within 32 bit lane i.e corresponding bytes and multiplied and added into 32 bit integers within 32 bit lane + __m256i iacc_mat_00_0_sp1 = 
_mm256_add_epi16(_mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_0145_00_sp1, lhs_mat_01_00_sp1), lhs_mat_s_01_00_sp1), _mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_0145_01_sp1, lhs_mat_01_01_sp1), lhs_mat_s_01_01_sp1)); + __m256i iacc_mat_01_0_sp1 = _mm256_add_epi16(_mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_2367_00_sp1, lhs_mat_01_00_sp1), lhs_mat_s_01_00_sp1), _mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_2367_01_sp1, lhs_mat_01_01_sp1), lhs_mat_s_01_01_sp1)); + + __m256i iacc_mat_10_0_sp1 = _mm256_add_epi16(_mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_0145_00_sp1, lhs_mat_23_00_sp1), lhs_mat_s_23_00_sp1), _mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_0145_01_sp1, lhs_mat_23_01_sp1), lhs_mat_s_23_01_sp1)); + __m256i iacc_mat_11_0_sp1 = _mm256_add_epi16(_mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_2367_00_sp1, lhs_mat_23_00_sp1), lhs_mat_s_23_00_sp1), _mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_2367_01_sp1, lhs_mat_23_01_sp1), lhs_mat_s_23_01_sp1)); + + __m256i iacc_mat_00_1_sp1 = _mm256_add_epi16(_mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_0145_10_sp1, lhs_mat_01_10_sp1), lhs_mat_s_01_10_sp1), _mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_0145_11_sp1, lhs_mat_01_11_sp1), lhs_mat_s_01_11_sp1)); + __m256i iacc_mat_01_1_sp1 = _mm256_add_epi16(_mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_2367_10_sp1, lhs_mat_01_10_sp1), lhs_mat_s_01_10_sp1), _mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_2367_11_sp1, lhs_mat_01_11_sp1), lhs_mat_s_01_11_sp1)); + + __m256i iacc_mat_10_1_sp1 = _mm256_add_epi16(_mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_0145_10_sp1, lhs_mat_23_10_sp1), lhs_mat_s_23_10_sp1), _mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_0145_11_sp1, lhs_mat_23_11_sp1), lhs_mat_s_23_11_sp1)); + __m256i iacc_mat_11_1_sp1 = _mm256_add_epi16(_mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_2367_10_sp1, lhs_mat_23_10_sp1), lhs_mat_s_23_10_sp1), _mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_2367_11_sp1, lhs_mat_23_11_sp1), lhs_mat_s_23_11_sp1)); + + __m256i iacc_mat_00_2_sp1 = _mm256_add_epi16(_mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_0145_20_sp1, lhs_mat_01_20_sp1), lhs_mat_s_01_20_sp1), _mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_0145_21_sp1, lhs_mat_01_21_sp1), lhs_mat_s_01_21_sp1)); + __m256i iacc_mat_01_2_sp1 = _mm256_add_epi16(_mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_2367_20_sp1, lhs_mat_01_20_sp1), lhs_mat_s_01_20_sp1), _mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_2367_21_sp1, lhs_mat_01_21_sp1), lhs_mat_s_01_21_sp1)); + + __m256i iacc_mat_10_2_sp1 = _mm256_add_epi16(_mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_0145_20_sp1, lhs_mat_23_20_sp1), lhs_mat_s_23_20_sp1), _mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_0145_21_sp1, lhs_mat_23_21_sp1), lhs_mat_s_23_21_sp1)); + __m256i iacc_mat_11_2_sp1 = _mm256_add_epi16(_mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_2367_20_sp1, lhs_mat_23_20_sp1), lhs_mat_s_23_20_sp1), _mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_2367_21_sp1, lhs_mat_23_21_sp1), lhs_mat_s_23_21_sp1)); + + __m256i iacc_mat_00_3_sp1 = _mm256_add_epi16(_mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_0145_30_sp1, lhs_mat_01_30_sp1), lhs_mat_s_01_30_sp1), _mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_0145_31_sp1, lhs_mat_01_31_sp1), lhs_mat_s_01_31_sp1)); + __m256i iacc_mat_01_3_sp1 = _mm256_add_epi16(_mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_2367_30_sp1, lhs_mat_01_30_sp1), lhs_mat_s_01_30_sp1), _mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_2367_31_sp1, lhs_mat_01_31_sp1), lhs_mat_s_01_31_sp1)); + + __m256i iacc_mat_10_3_sp1 = 
_mm256_add_epi16(_mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_0145_30_sp1, lhs_mat_23_30_sp1), lhs_mat_s_23_30_sp1), _mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_0145_31_sp1, lhs_mat_23_31_sp1), lhs_mat_s_23_31_sp1)); + __m256i iacc_mat_11_3_sp1 = _mm256_add_epi16(_mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_2367_30_sp1, lhs_mat_23_30_sp1), lhs_mat_s_23_30_sp1), _mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_2367_31_sp1, lhs_mat_23_31_sp1), lhs_mat_s_23_31_sp1)); + + __m256i iacc_mat_00_4_sp1 = _mm256_add_epi16(_mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_0145_40_sp1, lhs_mat_01_40_sp1), lhs_mat_s_01_40_sp1), _mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_0145_41_sp1, lhs_mat_01_41_sp1), lhs_mat_s_01_41_sp1)); + __m256i iacc_mat_01_4_sp1 = _mm256_add_epi16(_mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_2367_40_sp1, lhs_mat_01_40_sp1), lhs_mat_s_01_40_sp1), _mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_2367_41_sp1, lhs_mat_01_41_sp1), lhs_mat_s_01_41_sp1)); + + __m256i iacc_mat_10_4_sp1 = _mm256_add_epi16(_mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_0145_40_sp1, lhs_mat_23_40_sp1), lhs_mat_s_23_40_sp1), _mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_0145_41_sp1, lhs_mat_23_41_sp1), lhs_mat_s_23_41_sp1)); + __m256i iacc_mat_11_4_sp1 = _mm256_add_epi16(_mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_2367_40_sp1, lhs_mat_23_40_sp1), lhs_mat_s_23_40_sp1), _mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_2367_41_sp1, lhs_mat_23_41_sp1), lhs_mat_s_23_41_sp1)); + + __m256i iacc_mat_00_5_sp1 = _mm256_add_epi16(_mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_0145_50_sp1, lhs_mat_01_50_sp1), lhs_mat_s_01_50_sp1), _mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_0145_51_sp1, lhs_mat_01_51_sp1), lhs_mat_s_01_51_sp1)); + __m256i iacc_mat_01_5_sp1 = _mm256_add_epi16(_mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_2367_50_sp1, lhs_mat_01_50_sp1), lhs_mat_s_01_50_sp1), _mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_2367_51_sp1, lhs_mat_01_51_sp1), lhs_mat_s_01_51_sp1)); + + __m256i iacc_mat_10_5_sp1 = _mm256_add_epi16(_mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_0145_50_sp1, lhs_mat_23_50_sp1), lhs_mat_s_23_50_sp1), _mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_0145_51_sp1, lhs_mat_23_51_sp1), lhs_mat_s_23_51_sp1)); + __m256i iacc_mat_11_5_sp1 = _mm256_add_epi16(_mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_2367_50_sp1, lhs_mat_23_50_sp1), lhs_mat_s_23_50_sp1), _mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_2367_51_sp1, lhs_mat_23_51_sp1), lhs_mat_s_23_51_sp1)); + + __m256i iacc_mat_00_6_sp1 = _mm256_add_epi16(_mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_0145_60_sp1, lhs_mat_01_60_sp1), lhs_mat_s_01_60_sp1), _mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_0145_61_sp1, lhs_mat_01_61_sp1), lhs_mat_s_01_61_sp1)); + __m256i iacc_mat_01_6_sp1 = _mm256_add_epi16(_mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_2367_60_sp1, lhs_mat_01_60_sp1), lhs_mat_s_01_60_sp1), _mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_2367_61_sp1, lhs_mat_01_61_sp1), lhs_mat_s_01_61_sp1)); + + __m256i iacc_mat_10_6_sp1 = _mm256_add_epi16(_mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_0145_60_sp1, lhs_mat_23_60_sp1), lhs_mat_s_23_60_sp1), _mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_0145_61_sp1, lhs_mat_23_61_sp1), lhs_mat_s_23_61_sp1)); + __m256i iacc_mat_11_6_sp1 = _mm256_add_epi16(_mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_2367_60_sp1, lhs_mat_23_60_sp1), lhs_mat_s_23_60_sp1), _mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_2367_61_sp1, lhs_mat_23_61_sp1), lhs_mat_s_23_61_sp1)); + + __m256i iacc_mat_00_7_sp1 = 
_mm256_add_epi16(_mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_0145_70_sp1, lhs_mat_01_70_sp1), lhs_mat_s_01_70_sp1), _mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_0145_71_sp1, lhs_mat_01_71_sp1), lhs_mat_s_01_71_sp1)); + __m256i iacc_mat_01_7_sp1 = _mm256_add_epi16(_mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_2367_70_sp1, lhs_mat_01_70_sp1), lhs_mat_s_01_70_sp1), _mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_2367_71_sp1, lhs_mat_01_71_sp1), lhs_mat_s_01_71_sp1)); + + __m256i iacc_mat_10_7_sp1 = _mm256_add_epi16(_mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_0145_70_sp1, lhs_mat_23_70_sp1), lhs_mat_s_23_70_sp1), _mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_0145_71_sp1, lhs_mat_23_71_sp1), lhs_mat_s_23_71_sp1)); + __m256i iacc_mat_11_7_sp1 = _mm256_add_epi16(_mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_2367_70_sp1, lhs_mat_23_70_sp1), lhs_mat_s_23_70_sp1), _mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_2367_71_sp1, lhs_mat_23_71_sp1), lhs_mat_s_23_71_sp1)); + + __m256i iacc_mat_00_0_sp2 = _mm256_add_epi16(_mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_0145_00_sp2, lhs_mat_01_00_sp2), lhs_mat_s_01_00_sp2), _mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_0145_01_sp2, lhs_mat_01_01_sp2), lhs_mat_s_01_01_sp2)); + __m256i iacc_mat_01_0_sp2 = _mm256_add_epi16(_mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_2367_00_sp2, lhs_mat_01_00_sp2), lhs_mat_s_01_00_sp2), _mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_2367_01_sp2, lhs_mat_01_01_sp2), lhs_mat_s_01_01_sp2)); + + __m256i iacc_mat_10_0_sp2 = _mm256_add_epi16(_mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_0145_00_sp2, lhs_mat_23_00_sp2), lhs_mat_s_23_00_sp2), _mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_0145_01_sp2, lhs_mat_23_01_sp2), lhs_mat_s_23_01_sp2)); + __m256i iacc_mat_11_0_sp2 = _mm256_add_epi16(_mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_2367_00_sp2, lhs_mat_23_00_sp2), lhs_mat_s_23_00_sp2), _mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_2367_01_sp2, lhs_mat_23_01_sp2), lhs_mat_s_23_01_sp2)); + + __m256i iacc_mat_00_1_sp2 = _mm256_add_epi16(_mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_0145_10_sp2, lhs_mat_01_10_sp2), lhs_mat_s_01_10_sp2), _mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_0145_11_sp2, lhs_mat_01_11_sp2), lhs_mat_s_01_11_sp2)); + __m256i iacc_mat_01_1_sp2 = _mm256_add_epi16(_mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_2367_10_sp2, lhs_mat_01_10_sp2), lhs_mat_s_01_10_sp2), _mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_2367_11_sp2, lhs_mat_01_11_sp2), lhs_mat_s_01_11_sp2)); + + __m256i iacc_mat_10_1_sp2 = _mm256_add_epi16(_mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_0145_10_sp2, lhs_mat_23_10_sp2), lhs_mat_s_23_10_sp2), _mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_0145_11_sp2, lhs_mat_23_11_sp2), lhs_mat_s_23_11_sp2)); + __m256i iacc_mat_11_1_sp2 = _mm256_add_epi16(_mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_2367_10_sp2, lhs_mat_23_10_sp2), lhs_mat_s_23_10_sp2), _mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_2367_11_sp2, lhs_mat_23_11_sp2), lhs_mat_s_23_11_sp2)); + + __m256i iacc_mat_00_2_sp2 = _mm256_add_epi16(_mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_0145_20_sp2, lhs_mat_01_20_sp2), lhs_mat_s_01_20_sp2), _mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_0145_21_sp2, lhs_mat_01_21_sp2), lhs_mat_s_01_21_sp2)); + __m256i iacc_mat_01_2_sp2 = _mm256_add_epi16(_mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_2367_20_sp2, lhs_mat_01_20_sp2), lhs_mat_s_01_20_sp2), _mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_2367_21_sp2, lhs_mat_01_21_sp2), lhs_mat_s_01_21_sp2)); + + __m256i iacc_mat_10_2_sp2 = 
_mm256_add_epi16(_mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_0145_20_sp2, lhs_mat_23_20_sp2), lhs_mat_s_23_20_sp2), _mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_0145_21_sp2, lhs_mat_23_21_sp2), lhs_mat_s_23_21_sp2)); + __m256i iacc_mat_11_2_sp2 = _mm256_add_epi16(_mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_2367_20_sp2, lhs_mat_23_20_sp2), lhs_mat_s_23_20_sp2), _mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_2367_21_sp2, lhs_mat_23_21_sp2), lhs_mat_s_23_21_sp2)); + + __m256i iacc_mat_00_3_sp2 = _mm256_add_epi16(_mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_0145_30_sp2, lhs_mat_01_30_sp2), lhs_mat_s_01_30_sp2), _mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_0145_31_sp2, lhs_mat_01_31_sp2), lhs_mat_s_01_31_sp2)); + __m256i iacc_mat_01_3_sp2 = _mm256_add_epi16(_mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_2367_30_sp2, lhs_mat_01_30_sp2), lhs_mat_s_01_30_sp2), _mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_2367_31_sp2, lhs_mat_01_31_sp2), lhs_mat_s_01_31_sp2)); + + __m256i iacc_mat_10_3_sp2 = _mm256_add_epi16(_mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_0145_30_sp2, lhs_mat_23_30_sp2), lhs_mat_s_23_30_sp2), _mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_0145_31_sp2, lhs_mat_23_31_sp2), lhs_mat_s_23_31_sp2)); + __m256i iacc_mat_11_3_sp2 = _mm256_add_epi16(_mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_2367_30_sp2, lhs_mat_23_30_sp2), lhs_mat_s_23_30_sp2), _mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_2367_31_sp2, lhs_mat_23_31_sp2), lhs_mat_s_23_31_sp2)); + + __m256i iacc_mat_00_4_sp2 = _mm256_add_epi16(_mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_0145_40_sp2, lhs_mat_01_40_sp2), lhs_mat_s_01_40_sp2), _mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_0145_41_sp2, lhs_mat_01_41_sp2), lhs_mat_s_01_41_sp2)); + __m256i iacc_mat_01_4_sp2 = _mm256_add_epi16(_mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_2367_40_sp2, lhs_mat_01_40_sp2), lhs_mat_s_01_40_sp2), _mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_2367_41_sp2, lhs_mat_01_41_sp2), lhs_mat_s_01_41_sp2)); + + __m256i iacc_mat_10_4_sp2 = _mm256_add_epi16(_mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_0145_40_sp2, lhs_mat_23_40_sp2), lhs_mat_s_23_40_sp2), _mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_0145_41_sp2, lhs_mat_23_41_sp2), lhs_mat_s_23_41_sp2)); + __m256i iacc_mat_11_4_sp2 = _mm256_add_epi16(_mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_2367_40_sp2, lhs_mat_23_40_sp2), lhs_mat_s_23_40_sp2), _mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_2367_41_sp2, lhs_mat_23_41_sp2), lhs_mat_s_23_41_sp2)); + + __m256i iacc_mat_00_5_sp2 = _mm256_add_epi16(_mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_0145_50_sp2, lhs_mat_01_50_sp2), lhs_mat_s_01_50_sp2), _mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_0145_51_sp2, lhs_mat_01_51_sp2), lhs_mat_s_01_51_sp2)); + __m256i iacc_mat_01_5_sp2 = _mm256_add_epi16(_mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_2367_50_sp2, lhs_mat_01_50_sp2), lhs_mat_s_01_50_sp2), _mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_2367_51_sp2, lhs_mat_01_51_sp2), lhs_mat_s_01_51_sp2)); + + __m256i iacc_mat_10_5_sp2 = _mm256_add_epi16(_mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_0145_50_sp2, lhs_mat_23_50_sp2), lhs_mat_s_23_50_sp2), _mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_0145_51_sp2, lhs_mat_23_51_sp2), lhs_mat_s_23_51_sp2)); + __m256i iacc_mat_11_5_sp2 = _mm256_add_epi16(_mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_2367_50_sp2, lhs_mat_23_50_sp2), lhs_mat_s_23_50_sp2), _mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_2367_51_sp2, lhs_mat_23_51_sp2), lhs_mat_s_23_51_sp2)); + + __m256i iacc_mat_00_6_sp2 = 
_mm256_add_epi16(_mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_0145_60_sp2, lhs_mat_01_60_sp2), lhs_mat_s_01_60_sp2), _mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_0145_61_sp2, lhs_mat_01_61_sp2), lhs_mat_s_01_61_sp2)); + __m256i iacc_mat_01_6_sp2 = _mm256_add_epi16(_mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_2367_60_sp2, lhs_mat_01_60_sp2), lhs_mat_s_01_60_sp2), _mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_2367_61_sp2, lhs_mat_01_61_sp2), lhs_mat_s_01_61_sp2)); + + __m256i iacc_mat_10_6_sp2 = _mm256_add_epi16(_mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_0145_60_sp2, lhs_mat_23_60_sp2), lhs_mat_s_23_60_sp2), _mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_0145_61_sp2, lhs_mat_23_61_sp2), lhs_mat_s_23_61_sp2)); + __m256i iacc_mat_11_6_sp2 = _mm256_add_epi16(_mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_2367_60_sp2, lhs_mat_23_60_sp2), lhs_mat_s_23_60_sp2), _mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_2367_61_sp2, lhs_mat_23_61_sp2), lhs_mat_s_23_61_sp2)); + + __m256i iacc_mat_00_7_sp2 = _mm256_add_epi16(_mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_0145_70_sp2, lhs_mat_01_70_sp2), lhs_mat_s_01_70_sp2), _mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_0145_71_sp2, lhs_mat_01_71_sp2), lhs_mat_s_01_71_sp2)); + __m256i iacc_mat_01_7_sp2 = _mm256_add_epi16(_mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_2367_70_sp2, lhs_mat_01_70_sp2), lhs_mat_s_01_70_sp2), _mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_2367_71_sp2, lhs_mat_01_71_sp2), lhs_mat_s_01_71_sp2)); + + __m256i iacc_mat_10_7_sp2 = _mm256_add_epi16(_mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_0145_70_sp2, lhs_mat_23_70_sp2), lhs_mat_s_23_70_sp2), _mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_0145_71_sp2, lhs_mat_23_71_sp2), lhs_mat_s_23_71_sp2)); + __m256i iacc_mat_11_7_sp2 = _mm256_add_epi16(_mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_2367_70_sp2, lhs_mat_23_70_sp2), lhs_mat_s_23_70_sp2), _mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_2367_71_sp2, lhs_mat_23_71_sp2), lhs_mat_s_23_71_sp2)); + + // Combine results from both shuffle patterns for each output block + __m256i iacc_mat_00_0 = _mm256_add_epi16(iacc_mat_00_0_sp1, iacc_mat_00_0_sp2); + __m256i iacc_mat_01_0 = _mm256_add_epi16(iacc_mat_01_0_sp1, iacc_mat_01_0_sp2); + __m256i iacc_mat_10_0 = _mm256_add_epi16(iacc_mat_10_0_sp1, iacc_mat_10_0_sp2); + __m256i iacc_mat_11_0 = _mm256_add_epi16(iacc_mat_11_0_sp1, iacc_mat_11_0_sp2); + + __m256i iacc_mat_00_1 = _mm256_add_epi16(iacc_mat_00_1_sp1, iacc_mat_00_1_sp2); + __m256i iacc_mat_01_1 = _mm256_add_epi16(iacc_mat_01_1_sp1, iacc_mat_01_1_sp2); + __m256i iacc_mat_10_1 = _mm256_add_epi16(iacc_mat_10_1_sp1, iacc_mat_10_1_sp2); + __m256i iacc_mat_11_1 = _mm256_add_epi16(iacc_mat_11_1_sp1, iacc_mat_11_1_sp2); + + __m256i iacc_mat_00_2 = _mm256_add_epi16(iacc_mat_00_2_sp1, iacc_mat_00_2_sp2); + __m256i iacc_mat_01_2 = _mm256_add_epi16(iacc_mat_01_2_sp1, iacc_mat_01_2_sp2); + __m256i iacc_mat_10_2 = _mm256_add_epi16(iacc_mat_10_2_sp1, iacc_mat_10_2_sp2); + __m256i iacc_mat_11_2 = _mm256_add_epi16(iacc_mat_11_2_sp1, iacc_mat_11_2_sp2); + + __m256i iacc_mat_00_3 = _mm256_add_epi16(iacc_mat_00_3_sp1, iacc_mat_00_3_sp2); + __m256i iacc_mat_01_3 = _mm256_add_epi16(iacc_mat_01_3_sp1, iacc_mat_01_3_sp2); + __m256i iacc_mat_10_3 = _mm256_add_epi16(iacc_mat_10_3_sp1, iacc_mat_10_3_sp2); + __m256i iacc_mat_11_3 = _mm256_add_epi16(iacc_mat_11_3_sp1, iacc_mat_11_3_sp2); + + __m256i iacc_mat_00_4 = _mm256_add_epi16(iacc_mat_00_4_sp1, iacc_mat_00_4_sp2); + __m256i iacc_mat_01_4 = _mm256_add_epi16(iacc_mat_01_4_sp1, iacc_mat_01_4_sp2); + __m256i 
iacc_mat_10_4 = _mm256_add_epi16(iacc_mat_10_4_sp1, iacc_mat_10_4_sp2); + __m256i iacc_mat_11_4 = _mm256_add_epi16(iacc_mat_11_4_sp1, iacc_mat_11_4_sp2); + + __m256i iacc_mat_00_5 = _mm256_add_epi16(iacc_mat_00_5_sp1, iacc_mat_00_5_sp2); + __m256i iacc_mat_01_5 = _mm256_add_epi16(iacc_mat_01_5_sp1, iacc_mat_01_5_sp2); + __m256i iacc_mat_10_5 = _mm256_add_epi16(iacc_mat_10_5_sp1, iacc_mat_10_5_sp2); + __m256i iacc_mat_11_5 = _mm256_add_epi16(iacc_mat_11_5_sp1, iacc_mat_11_5_sp2); + + __m256i iacc_mat_00_6 = _mm256_add_epi16(iacc_mat_00_6_sp1, iacc_mat_00_6_sp2); + __m256i iacc_mat_01_6 = _mm256_add_epi16(iacc_mat_01_6_sp1, iacc_mat_01_6_sp2); + __m256i iacc_mat_10_6 = _mm256_add_epi16(iacc_mat_10_6_sp1, iacc_mat_10_6_sp2); + __m256i iacc_mat_11_6 = _mm256_add_epi16(iacc_mat_11_6_sp1, iacc_mat_11_6_sp2); + + __m256i iacc_mat_00_7 = _mm256_add_epi16(iacc_mat_00_7_sp1, iacc_mat_00_7_sp2); + __m256i iacc_mat_01_7 = _mm256_add_epi16(iacc_mat_01_7_sp1, iacc_mat_01_7_sp2); + __m256i iacc_mat_10_7 = _mm256_add_epi16(iacc_mat_10_7_sp1, iacc_mat_10_7_sp2); + __m256i iacc_mat_11_7 = _mm256_add_epi16(iacc_mat_11_7_sp1, iacc_mat_11_7_sp2); + + // The combined dot products are multiplied with the corresponding sub block scales and accumulated into 32 bit integers + iacc_mat_00_0 = _mm256_madd_epi16(iacc_mat_00_0, scale_0145_0); + iacc_mat_01_0 = _mm256_madd_epi16(iacc_mat_01_0, scale_2367_0); + iacc_mat_10_0 = _mm256_madd_epi16(iacc_mat_10_0, scale_0145_0); + iacc_mat_11_0 = _mm256_madd_epi16(iacc_mat_11_0, scale_2367_0); + + iacc_mat_00_1 = _mm256_madd_epi16(iacc_mat_00_1, scale_0145_1); + iacc_mat_01_1 = _mm256_madd_epi16(iacc_mat_01_1, scale_2367_1); + iacc_mat_10_1 = _mm256_madd_epi16(iacc_mat_10_1, scale_0145_1); + iacc_mat_11_1 = _mm256_madd_epi16(iacc_mat_11_1, scale_2367_1); + + iacc_mat_00_2 = _mm256_madd_epi16(iacc_mat_00_2, scale_0145_2); + iacc_mat_01_2 = _mm256_madd_epi16(iacc_mat_01_2, scale_2367_2); + iacc_mat_10_2 = _mm256_madd_epi16(iacc_mat_10_2, scale_0145_2); + iacc_mat_11_2 = _mm256_madd_epi16(iacc_mat_11_2, scale_2367_2); + + iacc_mat_00_3 = _mm256_madd_epi16(iacc_mat_00_3, scale_0145_3); + iacc_mat_01_3 = _mm256_madd_epi16(iacc_mat_01_3, scale_2367_3); + iacc_mat_10_3 = _mm256_madd_epi16(iacc_mat_10_3, scale_0145_3); + iacc_mat_11_3 = _mm256_madd_epi16(iacc_mat_11_3, scale_2367_3); + + iacc_mat_00_4 = _mm256_madd_epi16(iacc_mat_00_4, scale_0145_4); + iacc_mat_01_4 = _mm256_madd_epi16(iacc_mat_01_4, scale_2367_4); + iacc_mat_10_4 = _mm256_madd_epi16(iacc_mat_10_4, scale_0145_4); + iacc_mat_11_4 = _mm256_madd_epi16(iacc_mat_11_4, scale_2367_4); + + iacc_mat_00_5 = _mm256_madd_epi16(iacc_mat_00_5, scale_0145_5); + iacc_mat_01_5 = _mm256_madd_epi16(iacc_mat_01_5, scale_2367_5); + iacc_mat_10_5 = _mm256_madd_epi16(iacc_mat_10_5, scale_0145_5); + iacc_mat_11_5 = _mm256_madd_epi16(iacc_mat_11_5, scale_2367_5); + + iacc_mat_00_6 = _mm256_madd_epi16(iacc_mat_00_6, scale_0145_6); + iacc_mat_01_6 = _mm256_madd_epi16(iacc_mat_01_6, scale_2367_6); + iacc_mat_10_6 = _mm256_madd_epi16(iacc_mat_10_6, scale_0145_6); + iacc_mat_11_6 = _mm256_madd_epi16(iacc_mat_11_6, scale_2367_6); + + iacc_mat_00_7 = _mm256_madd_epi16(iacc_mat_00_7, scale_0145_7); + iacc_mat_01_7 = _mm256_madd_epi16(iacc_mat_01_7, scale_2367_7); + iacc_mat_10_7 = _mm256_madd_epi16(iacc_mat_10_7, scale_0145_7); + iacc_mat_11_7 = _mm256_madd_epi16(iacc_mat_11_7, scale_2367_7); + + __m256i iacc_mat_00 = _mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(iacc_mat_00_0, iacc_mat_00_1), _mm256_add_epi32(iacc_mat_00_2, iacc_mat_00_3)), 
_mm256_add_epi32(_mm256_add_epi32(iacc_mat_00_4, iacc_mat_00_5), _mm256_add_epi32(iacc_mat_00_6, iacc_mat_00_7))); + __m256i iacc_mat_01 = _mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(iacc_mat_01_0, iacc_mat_01_1), _mm256_add_epi32(iacc_mat_01_2, iacc_mat_01_3)), _mm256_add_epi32(_mm256_add_epi32(iacc_mat_01_4, iacc_mat_01_5), _mm256_add_epi32(iacc_mat_01_6, iacc_mat_01_7))); + __m256i iacc_mat_10 = _mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(iacc_mat_10_0, iacc_mat_10_1), _mm256_add_epi32(iacc_mat_10_2, iacc_mat_10_3)), _mm256_add_epi32(_mm256_add_epi32(iacc_mat_10_4, iacc_mat_10_5), _mm256_add_epi32(iacc_mat_10_6, iacc_mat_10_7))); + __m256i iacc_mat_11 = _mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(iacc_mat_11_0, iacc_mat_11_1), _mm256_add_epi32(iacc_mat_11_2, iacc_mat_11_3)), _mm256_add_epi32(_mm256_add_epi32(iacc_mat_11_4, iacc_mat_11_5), _mm256_add_epi32(iacc_mat_11_6, iacc_mat_11_7))); + + // Straighten out to make 4 row vectors + __m256i iacc_row_0 = _mm256_blend_epi32(iacc_mat_00, _mm256_shuffle_epi32(iacc_mat_01, 78), 204); + __m256i iacc_row_1 = _mm256_blend_epi32(_mm256_shuffle_epi32(iacc_mat_00, 78), iacc_mat_01, 204); + __m256i iacc_row_2 = _mm256_blend_epi32(iacc_mat_10, _mm256_shuffle_epi32(iacc_mat_11, 78), 204); + __m256i iacc_row_3 = _mm256_blend_epi32(_mm256_shuffle_epi32(iacc_mat_10, 78), iacc_mat_11, 204); + + // Load the scale values for all the 4 Q8_K blocks and repeat them across lanes + const __m128 row_scale_f32_sse = _mm_load_ps(a_ptr[b].d); + const __m256 row_scale_f32 = _mm256_set_m128(row_scale_f32_sse, row_scale_f32_sse); + + // Multiply with appropriate scales and accumulate into the output rows below + acc_rows[0] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_0), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 0)), acc_rows[0]); + acc_rows[1] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_1), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 85)), acc_rows[1]); + acc_rows[2] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_2), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 170)), acc_rows[2]); + acc_rows[3] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_3), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 255)), acc_rows[3]); + + } + + } + // Store the accumulated values + for (int i = 0; i < 4; i++) { + _mm256_storeu_ps((float * )(s + ((y * 4 + i) * bs + x * 8)), acc_rows[i]); + } + } + } + #else ggml_gemm_q6_K_8x8_q8_K_generic(n, s, bs, vx, vy, nr, nc); From 684c4cad9e3997158ef89ff207b7b7e170e0bc88 Mon Sep 17 00:00:00 2001 From: Manogna-Sree Date: Mon, 11 Aug 2025 05:34:35 -0700 Subject: [PATCH 04/23] Avx512 implementation of GEMM Q6K --- ggml/src/ggml-cpu/arch/x86/repack.cpp | 983 +++++++++++++++++++++++++- 1 file changed, 975 insertions(+), 8 deletions(-) diff --git a/ggml/src/ggml-cpu/arch/x86/repack.cpp b/ggml/src/ggml-cpu/arch/x86/repack.cpp index ba16c3c0b8..9a9722f097 100644 --- a/ggml/src/ggml-cpu/arch/x86/repack.cpp +++ b/ggml/src/ggml-cpu/arch/x86/repack.cpp @@ -2193,14 +2193,14 @@ void ggml_gemv_q6_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo lhs_vec_6 = _mm256_permute2f128_si256(lhs_vec_6, lhs_vec_6, 0); lhs_vec_7 = _mm256_permute2f128_si256(lhs_vec_7, lhs_vec_7, 0); - __m256i lhs_vec_s_0 = _mm256_maddubs_epi16(lhs_vec_0, m32s); - __m256i lhs_vec_s_1 = _mm256_maddubs_epi16(lhs_vec_1, m32s); - __m256i lhs_vec_s_2 = _mm256_maddubs_epi16(lhs_vec_2, m32s); - __m256i lhs_vec_s_3 = 
_mm256_maddubs_epi16(lhs_vec_3, m32s); - __m256i lhs_vec_s_4 = _mm256_maddubs_epi16(lhs_vec_4, m32s); - __m256i lhs_vec_s_5 = _mm256_maddubs_epi16(lhs_vec_5, m32s); - __m256i lhs_vec_s_6 = _mm256_maddubs_epi16(lhs_vec_6, m32s); - __m256i lhs_vec_s_7 = _mm256_maddubs_epi16(lhs_vec_7, m32s); + __m256i lhs_vec_s_0 = _mm256_maddubs_epi16(m32s, lhs_vec_0); + __m256i lhs_vec_s_1 = _mm256_maddubs_epi16(m32s, lhs_vec_1); + __m256i lhs_vec_s_2 = _mm256_maddubs_epi16(m32s, lhs_vec_2); + __m256i lhs_vec_s_3 = _mm256_maddubs_epi16(m32s, lhs_vec_3); + __m256i lhs_vec_s_4 = _mm256_maddubs_epi16(m32s, lhs_vec_4); + __m256i lhs_vec_s_5 = _mm256_maddubs_epi16(m32s, lhs_vec_5); + __m256i lhs_vec_s_6 = _mm256_maddubs_epi16(m32s, lhs_vec_6); + __m256i lhs_vec_s_7 = _mm256_maddubs_epi16(m32s, lhs_vec_7); __m256i iacc_0 = _mm256_setzero_si256(); __m256i iacc_1 = _mm256_setzero_si256(); @@ -2295,6 +2295,7 @@ void ggml_gemv_q6_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo // Accumulated output values permuted so as to be stored in appropriate order post accumulation acc_row = _mm256_permutevar8x32_ps(acc_row, finalpermutemask); _mm256_storeu_ps(s + (y * nr + x * 8), acc_row); + } } #else @@ -6719,7 +6720,973 @@ void ggml_gemm_q6_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo scalesmask1 = _mm256_permute2f128_si256(scalesmask1, scalesmask1, 0); __m256i scalesmask2 = _mm256_castsi128_si256(scalesmask2_sse); scalesmask2 = _mm256_permute2f128_si256(scalesmask2, scalesmask2, 0); +/* +#ifdef __AVX512F__ + int anc = nc - nc % 16; // Used to align nc with boundary of 16 + const __m512i m4_expanded = _mm512_set1_epi8(0xF); + const __m512i m2_expanded = _mm512_set1_epi8(3); + const __m512i m32s_expanded = _mm512_set1_epi8(32); + for (; y < anr / 4; y += 4){ + + const block_q8_Kx4 * a_ptrs[4]; + + a_ptrs[0] = a_ptr_start + (y * nb); + for (int i = 0; i < 3; ++i) { + a_ptrs[i + 1] = a_ptrs[i] + nb; + } + // Take group of eight block_q6_kx8 structures at each pass of the loop and perform dot product operation + for (int64_t x = xstart; x < anc / 8; x += 2) { + + const block_q6_Kx8 * b_ptr_0 = b_ptr_start + ((x) * b_nb); + const block_q6_Kx8 * b_ptr_1 = b_ptr_start + ((x + 1) * b_nb); + + // Master FP accumulators + __m512 acc_rows[16]; + for (int i = 0; i < 16; i++) { + acc_rows[i] = _mm512_setzero_ps(); + } + + // For super block + for (int64_t b = 0; b < nb; b++) { + // Delta values - Load the sixteen scale values from two block_q6_Kx8 structures + const __m512 col_scale_f32 = GGML_F32Cx8x2_LOAD(b_ptr_0[b].d, b_ptr_1[b].d); + + for (int sb = 0; sb < QK_K / 128; sb++) { + const __m256i rhs_raw_mat_0123_0 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].ql + sb * 512)); + const __m256i rhs_raw_mat_4567_0 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].ql + 32 + sb * 512)); + const __m256i rhs_raw_mat_0123_1 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].ql + 64 + sb * 512)); + const __m256i rhs_raw_mat_4567_1 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].ql + 96 + sb * 512)); + const __m256i rhs_raw_mat_0123_2 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].ql + 128 + sb * 512)); + const __m256i rhs_raw_mat_4567_2 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].ql + 160 + sb * 512)); + const __m256i rhs_raw_mat_0123_3 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].ql + 192 + sb * 512)); + const __m256i rhs_raw_mat_4567_3 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].ql + 224 + sb * 512)); + + const __m256i rhs_raw_mat_0123_4 = _mm256_loadu_si256((const __m256i 
*)(b_ptr_0[b].ql + 256 + sb * 512)); + const __m256i rhs_raw_mat_4567_4 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].ql + 288 + sb * 512)); + const __m256i rhs_raw_mat_0123_5 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].ql + 320 + sb * 512)); + const __m256i rhs_raw_mat_4567_5 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].ql + 352 + sb * 512)); + const __m256i rhs_raw_mat_0123_6 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].ql + 384 + sb * 512)); + const __m256i rhs_raw_mat_4567_6 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].ql + 416 + sb * 512)); + const __m256i rhs_raw_mat_0123_7 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].ql + 448 + sb * 512)); + const __m256i rhs_raw_mat_4567_7 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].ql + 480 + sb * 512)); + + const __m256i rhs_raw_mat_89AB_0 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].ql + sb * 512)); + const __m256i rhs_raw_mat_CDEF_0 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].ql + 32 + sb * 512)); + const __m256i rhs_raw_mat_89AB_1 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].ql + 64 + sb * 512)); + const __m256i rhs_raw_mat_CDEF_1 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].ql + 96 + sb * 512)); + const __m256i rhs_raw_mat_89AB_2 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].ql + 128 + sb * 512)); + const __m256i rhs_raw_mat_CDEF_2 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].ql + 160 + sb * 512)); + const __m256i rhs_raw_mat_89AB_3 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].ql + 192 + sb * 512)); + const __m256i rhs_raw_mat_CDEF_3 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].ql + 224 + sb * 512)); + + const __m256i rhs_raw_mat_89AB_4 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].ql + 256 + sb * 512)); + const __m256i rhs_raw_mat_CDEF_4 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].ql + 288 + sb * 512)); + const __m256i rhs_raw_mat_89AB_5 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].ql + 320 + sb * 512)); + const __m256i rhs_raw_mat_CDEF_5 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].ql + 352 + sb * 512)); + const __m256i rhs_raw_mat_89AB_6 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].ql + 384 + sb * 512)); + const __m256i rhs_raw_mat_CDEF_6 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].ql + 416 + sb * 512)); + const __m256i rhs_raw_mat_89AB_7 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].ql + 448 + sb * 512)); + const __m256i rhs_raw_mat_CDEF_7 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].ql + 480 + sb * 512)); + + const __m256i rhs_raw_hbit_0123_0 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].qh + sb * 256)); + const __m256i rhs_raw_hbit_4567_0 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].qh + 32 + sb * 256)); + const __m256i rhs_raw_hbit_0123_1 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].qh + 64 + sb * 256)); + const __m256i rhs_raw_hbit_4567_1 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].qh + 96 + sb * 256)); + const __m256i rhs_raw_hbit_0123_2 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].qh + 128 + sb * 256)); + const __m256i rhs_raw_hbit_4567_2 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].qh + 160 + sb * 256)); + const __m256i rhs_raw_hbit_0123_3 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].qh + 192 + sb * 256)); + const __m256i rhs_raw_hbit_4567_3 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].qh + 224 + sb * 256)); + + const __m256i rhs_raw_hbit_89AB_0 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].qh + sb * 256)); + const __m256i rhs_raw_hbit_CDEF_0 = _mm256_loadu_si256((const 
__m256i *)(b_ptr_1[b].qh + 32 + sb * 256)); + const __m256i rhs_raw_hbit_89AB_1 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].qh + 64 + sb * 256)); + const __m256i rhs_raw_hbit_CDEF_1 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].qh + 96 + sb * 256)); + const __m256i rhs_raw_hbit_89AB_2 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].qh + 128 + sb * 256)); + const __m256i rhs_raw_hbit_CDEF_2 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].qh + 160 + sb * 256)); + const __m256i rhs_raw_hbit_89AB_3 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].qh + 192 + sb * 256)); + const __m256i rhs_raw_hbit_CDEF_3 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].qh + 224 + sb * 256)); + + // Indices 0 through 7 (first block): + const __m256i rhs_raw_mat_0145_0 = _mm256_blend_epi32(rhs_raw_mat_0123_0, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_0, requiredOrder), 240); + const __m256i rhs_raw_mat_2367_0 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_0, requiredOrder), rhs_raw_mat_4567_0, 240); + const __m256i rhs_raw_mat_0145_1 = _mm256_blend_epi32(rhs_raw_mat_0123_1, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_1, requiredOrder), 240); + const __m256i rhs_raw_mat_2367_1 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_1, requiredOrder), rhs_raw_mat_4567_1, 240); + const __m256i rhs_raw_mat_0145_2 = _mm256_blend_epi32(rhs_raw_mat_0123_2, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_2, requiredOrder), 240); + const __m256i rhs_raw_mat_2367_2 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_2, requiredOrder), rhs_raw_mat_4567_2, 240); + const __m256i rhs_raw_mat_0145_3 = _mm256_blend_epi32(rhs_raw_mat_0123_3, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_3, requiredOrder), 240); + const __m256i rhs_raw_mat_2367_3 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_3, requiredOrder), rhs_raw_mat_4567_3, 240); + + // Indices 4 through 7 (second block): + const __m256i rhs_raw_mat_0145_4 = _mm256_blend_epi32(rhs_raw_mat_0123_4, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_4, requiredOrder), 240); + const __m256i rhs_raw_mat_2367_4 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_4, requiredOrder), rhs_raw_mat_4567_4, 240); + const __m256i rhs_raw_mat_0145_5 = _mm256_blend_epi32(rhs_raw_mat_0123_5, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_5, requiredOrder), 240); + const __m256i rhs_raw_mat_2367_5 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_5, requiredOrder), rhs_raw_mat_4567_5, 240); + const __m256i rhs_raw_mat_0145_6 = _mm256_blend_epi32(rhs_raw_mat_0123_6, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_6, requiredOrder), 240); + const __m256i rhs_raw_mat_2367_6 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_6, requiredOrder), rhs_raw_mat_4567_6, 240); + const __m256i rhs_raw_mat_0145_7 = _mm256_blend_epi32(rhs_raw_mat_0123_7, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_7, requiredOrder), 240); + const __m256i rhs_raw_mat_2367_7 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_7, requiredOrder), rhs_raw_mat_4567_7, 240); + + // Indices 8 through F (first block): + const __m256i rhs_raw_mat_89CD_0 = _mm256_blend_epi32(rhs_raw_mat_89AB_0, _mm256_permutevar8x32_epi32(rhs_raw_mat_CDEF_0, requiredOrder), 240); + const __m256i rhs_raw_mat_ABEF_0 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_89AB_0, requiredOrder), rhs_raw_mat_CDEF_0, 240); + const __m256i rhs_raw_mat_89CD_1 = _mm256_blend_epi32(rhs_raw_mat_89AB_1, 
_mm256_permutevar8x32_epi32(rhs_raw_mat_CDEF_1, requiredOrder), 240); + const __m256i rhs_raw_mat_ABEF_1 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_89AB_1, requiredOrder), rhs_raw_mat_CDEF_1, 240); + const __m256i rhs_raw_mat_89CD_2 = _mm256_blend_epi32(rhs_raw_mat_89AB_2, _mm256_permutevar8x32_epi32(rhs_raw_mat_CDEF_2, requiredOrder), 240); + const __m256i rhs_raw_mat_ABEF_2 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_89AB_2, requiredOrder), rhs_raw_mat_CDEF_2, 240); + const __m256i rhs_raw_mat_89CD_3 = _mm256_blend_epi32(rhs_raw_mat_89AB_3, _mm256_permutevar8x32_epi32(rhs_raw_mat_CDEF_3, requiredOrder), 240); + const __m256i rhs_raw_mat_ABEF_3 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_89AB_3, requiredOrder), rhs_raw_mat_CDEF_3, 240); + + // Indices 8 through F (second block): + const __m256i rhs_raw_mat_89CD_4 = _mm256_blend_epi32(rhs_raw_mat_89AB_4, _mm256_permutevar8x32_epi32(rhs_raw_mat_CDEF_4, requiredOrder), 240); + const __m256i rhs_raw_mat_ABEF_4 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_89AB_4, requiredOrder), rhs_raw_mat_CDEF_4, 240); + const __m256i rhs_raw_mat_89CD_5 = _mm256_blend_epi32(rhs_raw_mat_89AB_5, _mm256_permutevar8x32_epi32(rhs_raw_mat_CDEF_5, requiredOrder), 240); + const __m256i rhs_raw_mat_ABEF_5 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_89AB_5, requiredOrder), rhs_raw_mat_CDEF_5, 240); + const __m256i rhs_raw_mat_89CD_6 = _mm256_blend_epi32(rhs_raw_mat_89AB_6, _mm256_permutevar8x32_epi32(rhs_raw_mat_CDEF_6, requiredOrder), 240); + const __m256i rhs_raw_mat_ABEF_6 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_89AB_6, requiredOrder), rhs_raw_mat_CDEF_6, 240); + const __m256i rhs_raw_mat_89CD_7 = _mm256_blend_epi32(rhs_raw_mat_89AB_7, _mm256_permutevar8x32_epi32(rhs_raw_mat_CDEF_7, requiredOrder), 240); + const __m256i rhs_raw_mat_ABEF_7 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_89AB_7, requiredOrder), rhs_raw_mat_CDEF_7, 240); + + const __m256i rhs_raw_hbit_0145_0 = _mm256_blend_epi32(rhs_raw_hbit_0123_0, _mm256_permutevar8x32_epi32(rhs_raw_hbit_4567_0, requiredOrder), 240); + const __m256i rhs_raw_hbit_2367_0 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_hbit_0123_0, requiredOrder), rhs_raw_hbit_4567_0, 240); + const __m256i rhs_raw_hbit_0145_1 = _mm256_blend_epi32(rhs_raw_hbit_0123_1, _mm256_permutevar8x32_epi32(rhs_raw_hbit_4567_1, requiredOrder), 240); + const __m256i rhs_raw_hbit_2367_1 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_hbit_0123_1, requiredOrder), rhs_raw_hbit_4567_1, 240); + const __m256i rhs_raw_hbit_0145_2 = _mm256_blend_epi32(rhs_raw_hbit_0123_2, _mm256_permutevar8x32_epi32(rhs_raw_hbit_4567_2, requiredOrder), 240); + const __m256i rhs_raw_hbit_2367_2 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_hbit_0123_2, requiredOrder), rhs_raw_hbit_4567_2, 240); + const __m256i rhs_raw_hbit_0145_3 = _mm256_blend_epi32(rhs_raw_hbit_0123_3, _mm256_permutevar8x32_epi32(rhs_raw_hbit_4567_3, requiredOrder), 240); + const __m256i rhs_raw_hbit_2367_3 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_hbit_0123_3, requiredOrder), rhs_raw_hbit_4567_3, 240); + + const __m256i rhs_raw_hbit_89CD_0 = _mm256_blend_epi32(rhs_raw_hbit_89AB_0, _mm256_permutevar8x32_epi32(rhs_raw_hbit_CDEF_0, requiredOrder), 240); + const __m256i rhs_raw_hbit_ABEF_0 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_hbit_89AB_0, requiredOrder), rhs_raw_hbit_CDEF_0, 240); + const __m256i rhs_raw_hbit_89CD_1 = 
_mm256_blend_epi32(rhs_raw_hbit_89AB_1, _mm256_permutevar8x32_epi32(rhs_raw_hbit_CDEF_1, requiredOrder), 240); + const __m256i rhs_raw_hbit_ABEF_1 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_hbit_89AB_1, requiredOrder), rhs_raw_hbit_CDEF_1, 240); + const __m256i rhs_raw_hbit_89CD_2 = _mm256_blend_epi32(rhs_raw_hbit_89AB_2, _mm256_permutevar8x32_epi32(rhs_raw_hbit_CDEF_2, requiredOrder), 240); + const __m256i rhs_raw_hbit_ABEF_2 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_hbit_89AB_2, requiredOrder), rhs_raw_hbit_CDEF_2, 240); + const __m256i rhs_raw_hbit_89CD_3 = _mm256_blend_epi32(rhs_raw_hbit_89AB_3, _mm256_permutevar8x32_epi32(rhs_raw_hbit_CDEF_3, requiredOrder), 240); + const __m256i rhs_raw_hbit_ABEF_3 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_hbit_89AB_3, requiredOrder), rhs_raw_hbit_CDEF_3, 240); + + const __m512i rhs_raw_mat_014589CD_0 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_mat_0145_0), rhs_raw_mat_89CD_0, 1); + const __m512i rhs_raw_mat_2367ABEF_0 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_mat_2367_0), rhs_raw_mat_ABEF_0, 1); + const __m512i rhs_raw_mat_014589CD_1 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_mat_0145_1), rhs_raw_mat_89CD_1, 1); + const __m512i rhs_raw_mat_2367ABEF_1 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_mat_2367_1), rhs_raw_mat_ABEF_1, 1); + + const __m512i rhs_raw_mat_014589CD_2 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_mat_0145_2), rhs_raw_mat_89CD_2, 1); + const __m512i rhs_raw_mat_2367ABEF_2 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_mat_2367_2), rhs_raw_mat_ABEF_2, 1); + const __m512i rhs_raw_mat_014589CD_3 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_mat_0145_3), rhs_raw_mat_89CD_3, 1); + const __m512i rhs_raw_mat_2367ABEF_3 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_mat_2367_3), rhs_raw_mat_ABEF_3, 1); + + const __m512i rhs_raw_mat_014589CD_4 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_mat_0145_4), rhs_raw_mat_89CD_4, 1); + const __m512i rhs_raw_mat_2367ABEF_4 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_mat_2367_4), rhs_raw_mat_ABEF_4, 1); + const __m512i rhs_raw_mat_014589CD_5 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_mat_0145_5), rhs_raw_mat_89CD_5, 1); + const __m512i rhs_raw_mat_2367ABEF_5 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_mat_2367_5), rhs_raw_mat_ABEF_5, 1); + + const __m512i rhs_raw_mat_014589CD_6 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_mat_0145_6), rhs_raw_mat_89CD_6, 1); + const __m512i rhs_raw_mat_2367ABEF_6 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_mat_2367_6), rhs_raw_mat_ABEF_6, 1); + const __m512i rhs_raw_mat_014589CD_7 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_mat_0145_7), rhs_raw_mat_89CD_7, 1); + const __m512i rhs_raw_mat_2367ABEF_7 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_mat_2367_7), rhs_raw_mat_ABEF_7, 1); + + const __m512i rhs_raw_hbit_014589CD_0 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_hbit_0145_0), rhs_raw_hbit_89CD_0, 1); + const __m512i rhs_raw_hbit_2367ABEF_0 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_hbit_2367_0), rhs_raw_hbit_ABEF_0, 1); + const __m512i rhs_raw_hbit_014589CD_1 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_hbit_0145_1), rhs_raw_hbit_89CD_1, 1); + const __m512i rhs_raw_hbit_2367ABEF_1 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_hbit_2367_1), rhs_raw_hbit_ABEF_1, 1); + + const __m512i rhs_raw_hbit_014589CD_2 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_hbit_0145_2), 
rhs_raw_hbit_89CD_2, 1); + const __m512i rhs_raw_hbit_2367ABEF_2 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_hbit_2367_2), rhs_raw_hbit_ABEF_2, 1); + const __m512i rhs_raw_hbit_014589CD_3 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_hbit_0145_3), rhs_raw_hbit_89CD_3, 1); + const __m512i rhs_raw_hbit_2367ABEF_3 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_hbit_2367_3), rhs_raw_hbit_ABEF_3, 1); + + // 2-bit -> 8-bit + // Values of the 0th,2nd,4th,6th sub blocks of eight block_q6_K structures for the sb loop + const __m512i rhs_hbit_014589CD_00 = _mm512_slli_epi16(_mm512_and_si512(rhs_raw_hbit_014589CD_0, m2_expanded), 4); //B00(0-7) B01(0-7) B04(0-7) B05(0-7) + const __m512i rhs_hbit_014589CD_20 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_014589CD_0, 2), m2_expanded), 4); //B20(0-7) B21(0-7) B24(0-7) B25(0-7) + const __m512i rhs_hbit_014589CD_40 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_014589CD_0, 4), m2_expanded), 4); //B40(0-7) B41(0-7) B44(0-7) B45(0-7) + const __m512i rhs_hbit_014589CD_60 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_014589CD_0, 6), m2_expanded), 4); //B60(0-7) B61(0-7) B64(0-7) B65(0-7) + + const __m512i rhs_hbit_2367ABEF_00 = _mm512_slli_epi16(_mm512_and_si512(rhs_raw_hbit_2367ABEF_0, m2_expanded), 4); //B02(0-7) B03(0-7) B06(0-7) B07(0-7) + const __m512i rhs_hbit_2367ABEF_20 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_2367ABEF_0, 2), m2_expanded), 4); //B22(0-7) B23(0-7) B26(0-7) B27(0-7) + const __m512i rhs_hbit_2367ABEF_40 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_2367ABEF_0, 4), m2_expanded), 4); //B42(0-7) B43(0-7) B46(0-7) B47(0-7) + const __m512i rhs_hbit_2367ABEF_60 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_2367ABEF_0, 6), m2_expanded), 4); //B62(0-7) B63(0-7) B66(0-7) B67(0-7) + + const __m512i rhs_hbit_014589CD_01 = _mm512_slli_epi16(_mm512_and_si512(rhs_raw_hbit_014589CD_1, m2_expanded), 4); //B00(8-15) B01(8-15) B04(8-15) B05(8-15) + const __m512i rhs_hbit_014589CD_21 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_014589CD_1, 2), m2_expanded), 4); //B20(8-15) B21(8-15) B24(8-15) B25(8-15) + const __m512i rhs_hbit_014589CD_41 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_014589CD_1, 4), m2_expanded), 4); //B40(8-15) B41(8-15) B44(8-15) B45(8-15) + const __m512i rhs_hbit_014589CD_61 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_014589CD_1, 6), m2_expanded), 4); //B60(8-15) B61(8-15) B64(8-15) B65(8-15) + + const __m512i rhs_hbit_2367ABEF_01 = _mm512_slli_epi16(_mm512_and_si512(rhs_raw_hbit_2367ABEF_1, m2_expanded), 4); //B02(8-15) B03(8-15) B06(8-15) B07(8-15) + const __m512i rhs_hbit_2367ABEF_21 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_2367ABEF_1, 2), m2_expanded), 4); //B22(8-15) B23(8-15) B26(8-15) B27(8-15) + const __m512i rhs_hbit_2367ABEF_41 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_2367ABEF_1, 4), m2_expanded), 4); //B42(8-15) B43(8-15) B46(8-15) B47(8-15) + const __m512i rhs_hbit_2367ABEF_61 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_2367ABEF_1, 6), m2_expanded), 4); //B62(8-15) B63(8-15) B66(8-15) B67(8-15) + + // Values of the 1st,3rd,5th,7th sub blocks of eight block_q6_K structures for the sb loop + const __m512i rhs_hbit_014589CD_10 = _mm512_slli_epi16(_mm512_and_si512(rhs_raw_hbit_014589CD_2, m2_expanded), 4); //B10(0-7) B11(0-7) 
B14(0-7) B15(0-7) + const __m512i rhs_hbit_014589CD_30 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_014589CD_2, 2), m2_expanded), 4); //B30(0-7) B31(0-7) B34(0-7) B35(0-7) + const __m512i rhs_hbit_014589CD_50 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_014589CD_2, 4), m2_expanded), 4); //B50(0-7) B51(0-7) B54(0-7) B55(0-7) + const __m512i rhs_hbit_014589CD_70 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_014589CD_2, 6), m2_expanded), 4); //B70(0-7) B71(0-7) B74(0-7) B75(0-7) + + const __m512i rhs_hbit_2367ABEF_10 = _mm512_slli_epi16(_mm512_and_si512(rhs_raw_hbit_2367ABEF_2, m2_expanded), 4); //B12(0-7) B13(0-7) B16(0-7) B17(0-7) + const __m512i rhs_hbit_2367ABEF_30 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_2367ABEF_2, 2), m2_expanded), 4); //B32(0-7) B33(0-7) B36(0-7) B37(0-7) + const __m512i rhs_hbit_2367ABEF_50 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_2367ABEF_2, 4), m2_expanded), 4); //B52(0-7) B53(0-7) B56(0-7) B57(0-7) + const __m512i rhs_hbit_2367ABEF_70 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_2367ABEF_2, 6), m2_expanded), 4); //B72(0-7) B73(0-7) B76(0-7) B77(0-7) + + const __m512i rhs_hbit_014589CD_11 = _mm512_slli_epi16(_mm512_and_si512(rhs_raw_hbit_014589CD_3, m2_expanded), 4); //B10(8-15) B11(8-15) B14(8-15) B15(8-15) + const __m512i rhs_hbit_014589CD_31 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_014589CD_3, 2), m2_expanded), 4); //B30(8-15) B31(8-15) B34(8-15) B35(8-15) + const __m512i rhs_hbit_014589CD_51 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_014589CD_3, 4), m2_expanded), 4); //B50(8-15) B51(8-15) B54(8-15) B55(8-15) + const __m512i rhs_hbit_014589CD_71 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_014589CD_3, 6), m2_expanded), 4); //B70(8-15) B71(8-15) B74(8-15) B75(8-15) + + const __m512i rhs_hbit_2367ABEF_11 = _mm512_slli_epi16(_mm512_and_si512(rhs_raw_hbit_2367ABEF_3, m2_expanded), 4); //B12(8-15) B13(8-15) B16(8-15) B17(8-15) + const __m512i rhs_hbit_2367ABEF_31 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_2367ABEF_3, 2), m2_expanded), 4); //B32(8-15) B33(8-15) B36(8-15) B37(8-15) + const __m512i rhs_hbit_2367ABEF_51 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_2367ABEF_3, 4), m2_expanded), 4); //B52(8-15) B53(8-15) B56(8-15) B57(8-15) + const __m512i rhs_hbit_2367ABEF_71 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_2367ABEF_3, 6), m2_expanded), 4); //B72(8-15) B73(8-15) B76(8-15) B77(8-15) + + // 0 -7, 64 - 71 + const __m512i rhs_mat_014589CD_00 = _mm512_or_si512(_mm512_and_si512(rhs_raw_mat_014589CD_0, m4_expanded), rhs_hbit_014589CD_00); + const __m512i rhs_mat_014589CD_40 = _mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_014589CD_0, 4), m4_expanded), rhs_hbit_014589CD_40); + + const __m512i rhs_mat_2367ABEF_00 = _mm512_or_si512(_mm512_and_si512(rhs_raw_mat_2367ABEF_0, m4_expanded), rhs_hbit_2367ABEF_00); + const __m512i rhs_mat_2367ABEF_40 = _mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_2367ABEF_0, 4), m4_expanded), rhs_hbit_2367ABEF_40); + + // 8 - 15, 72 - 79 + const __m512i rhs_mat_014589CD_01 = _mm512_or_si512(_mm512_and_si512(rhs_raw_mat_014589CD_1, m4_expanded), rhs_hbit_014589CD_01); + const __m512i rhs_mat_014589CD_41 = _mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_014589CD_1, 4), m4_expanded), rhs_hbit_014589CD_41); + + 
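For reference, the AND/shift/OR sequence above rebuilds each 6-bit Q6_K weight from its low nibble (stored in ql) and its two high bits (stored in qh). A minimal scalar sketch of the same reconstruction follows, assuming the standard Q6_K layout; the helper name is illustrative only, and the -32 bias is not applied here but later, through the lhs_mat_s_* terms built with m32s_expanded.

    #include <cstdint>

    // Rebuild one Q6_K weight: bits 0-3 come from the low nibble of ql,
    // bits 4-5 from two bits of qh shifted into position (the << 4 above).
    static inline uint8_t q6k_weight(uint8_t ql, uint8_t qh, int shift) {
        const uint8_t lo = ql & 0x0F;                               // 4-bit part
        const uint8_t hi = (uint8_t)(((qh >> shift) & 0x03) << 4);  // 2-bit part
        return (uint8_t)(lo | hi); // biased value in [0, 63]; dequantization uses (q - 32) * scale
    }
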
const __m512i rhs_mat_2367ABEF_01 = _mm512_or_si512(_mm512_and_si512(rhs_raw_mat_2367ABEF_1, m4_expanded), rhs_hbit_2367ABEF_01); + const __m512i rhs_mat_2367ABEF_41 = _mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_2367ABEF_1, 4), m4_expanded), rhs_hbit_2367ABEF_41); + + // 16 - 23, 80 - 87 + const __m512i rhs_mat_014589CD_10 = _mm512_or_si512(_mm512_and_si512(rhs_raw_mat_014589CD_2, m4_expanded), rhs_hbit_014589CD_10); + const __m512i rhs_mat_014589CD_50 = _mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_014589CD_2, 4), m4_expanded), rhs_hbit_014589CD_50); + + const __m512i rhs_mat_2367ABEF_10 = _mm512_or_si512(_mm512_and_si512(rhs_raw_mat_2367ABEF_2, m4_expanded), rhs_hbit_2367ABEF_10); + const __m512i rhs_mat_2367ABEF_50 = _mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_2367ABEF_2, 4), m4_expanded), rhs_hbit_2367ABEF_50); + + // 24 - 31, 88 - 95 + const __m512i rhs_mat_014589CD_11 = _mm512_or_si512(_mm512_and_si512(rhs_raw_mat_014589CD_3, m4_expanded), rhs_hbit_014589CD_11); + const __m512i rhs_mat_014589CD_51 = _mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_014589CD_3, 4), m4_expanded), rhs_hbit_014589CD_51); + + const __m512i rhs_mat_2367ABEF_11 = _mm512_or_si512(_mm512_and_si512(rhs_raw_mat_2367ABEF_3, m4_expanded), rhs_hbit_2367ABEF_11); + const __m512i rhs_mat_2367ABEF_51 = _mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_2367ABEF_3, 4), m4_expanded), rhs_hbit_2367ABEF_51); + + // 32 - 39, 96 - 103 + const __m512i rhs_mat_014589CD_20 = _mm512_or_si512(_mm512_and_si512(rhs_raw_mat_014589CD_4, m4_expanded), rhs_hbit_014589CD_20); + const __m512i rhs_mat_014589CD_60 = _mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_014589CD_4, 4), m4_expanded), rhs_hbit_014589CD_60); + + const __m512i rhs_mat_2367ABEF_20 = _mm512_or_si512(_mm512_and_si512(rhs_raw_mat_2367ABEF_4, m4_expanded), rhs_hbit_2367ABEF_20); + const __m512i rhs_mat_2367ABEF_60 = _mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_2367ABEF_4, 4), m4_expanded), rhs_hbit_2367ABEF_60); + + // 40 - 47, 104 - 111 + const __m512i rhs_mat_014589CD_21 = _mm512_or_si512(_mm512_and_si512(rhs_raw_mat_014589CD_5, m4_expanded), rhs_hbit_014589CD_21); + const __m512i rhs_mat_014589CD_61 = _mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_014589CD_5, 4), m4_expanded), rhs_hbit_014589CD_61); + + const __m512i rhs_mat_2367ABEF_21 = _mm512_or_si512(_mm512_and_si512(rhs_raw_mat_2367ABEF_5, m4_expanded), rhs_hbit_2367ABEF_21); + const __m512i rhs_mat_2367ABEF_61 = _mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_2367ABEF_5, 4), m4_expanded), rhs_hbit_2367ABEF_61); + + // 48 - 55, 112 - 119 + const __m512i rhs_mat_014589CD_30 = _mm512_or_si512(_mm512_and_si512(rhs_raw_mat_014589CD_6, m4_expanded), rhs_hbit_014589CD_30); + const __m512i rhs_mat_014589CD_70 = _mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_014589CD_6, 4), m4_expanded), rhs_hbit_014589CD_70); + + const __m512i rhs_mat_2367ABEF_30 = _mm512_or_si512(_mm512_and_si512(rhs_raw_mat_2367ABEF_6, m4_expanded), rhs_hbit_2367ABEF_30); + const __m512i rhs_mat_2367ABEF_70 = _mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_2367ABEF_6, 4), m4_expanded), rhs_hbit_2367ABEF_70); + + // 56 - 63, 120 - 127 + const __m512i rhs_mat_014589CD_31 = _mm512_or_si512(_mm512_and_si512(rhs_raw_mat_014589CD_7, m4_expanded), rhs_hbit_014589CD_31); + const __m512i rhs_mat_014589CD_71 = _mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_014589CD_7, 4), 
m4_expanded), rhs_hbit_014589CD_71); + + const __m512i rhs_mat_2367ABEF_31 = _mm512_or_si512(_mm512_and_si512(rhs_raw_mat_2367ABEF_7, m4_expanded), rhs_hbit_2367ABEF_31); + const __m512i rhs_mat_2367ABEF_71 = _mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_2367ABEF_7, 4), m4_expanded), rhs_hbit_2367ABEF_71); + + // Shuffle pattern one - right side input + const __m512i rhs_mat_014589CD_00_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_00, (_MM_PERM_ENUM)136); //B00(0-3) B01(0-3) B00(0-3) B01(0-3) B04(0-3) B05(0-3) B04(0-3) B05(0-3) + const __m512i rhs_mat_2367ABEF_00_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_00, (_MM_PERM_ENUM)136); //B02(0-3) B03(0-3) B02(0-3) B03(0-3) B06(0-3) B07(0-3) B06(0-3) B07(0-3) + + const __m512i rhs_mat_014589CD_01_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_01, (_MM_PERM_ENUM)136); //B00(8-11) B01(8-11) B00(8-11) B01(8-11) B04(8-11) B05(8-11) B04(8-11) B05(8-11) + const __m512i rhs_mat_2367ABEF_01_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_01, (_MM_PERM_ENUM)136); //B02(8-11) B03(8-11) B02(8-11) B03(8-11) B06(8-11) B07(8-11) B06(8-11) B07(8-11) + + const __m512i rhs_mat_014589CD_10_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_10, (_MM_PERM_ENUM)136); //B10(0-3) B11(0-3) B10(0-3) B11(0-3) B14(0-3) B15(0-3) B14(0-3) B15(0-3) + const __m512i rhs_mat_2367ABEF_10_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_10, (_MM_PERM_ENUM)136); //B12(0-3) B13(0-3) B12(0-3) B13(0-3) B16(0-3) B17(0-3) B16(0-3) B17(0-3) + + const __m512i rhs_mat_014589CD_11_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_11, (_MM_PERM_ENUM)136); //B10(8-11) B11(8-11) B10(8-11) B11(8-11) B14(8-11) B15(8-11) B14(8-11) B15(8-11) + const __m512i rhs_mat_2367ABEF_11_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_11, (_MM_PERM_ENUM)136); //B12(8-11) B13(8-11) B12(8-11) B13(8-11) B16(8-11) B17(8-11) B16(8-11) B17(8-11) + + const __m512i rhs_mat_014589CD_20_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_20, (_MM_PERM_ENUM)136); //B20(0-3) B21(0-3) B20(0-3) B21(0-3) B24(0-3) B25(0-3) B24(0-3) B25(0-3) + const __m512i rhs_mat_2367ABEF_20_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_20, (_MM_PERM_ENUM)136); //B22(0-3) B23(0-3) B22(0-3) B23(0-3) B26(0-3) B27(0-3) B26(0-3) B27(0-3) + + const __m512i rhs_mat_014589CD_21_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_21, (_MM_PERM_ENUM)136); //B20(8-11) B21(8-11) B20(8-11) B21(8-11) B24(8-11) B25(8-11) B24(8-11) B25(8-11) + const __m512i rhs_mat_2367ABEF_21_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_21, (_MM_PERM_ENUM)136); //B22(8-11) B23(8-11) B22(8-11) B23(8-11) B26(8-11) B27(8-11) B26(8-11) B27(8-11) + + const __m512i rhs_mat_014589CD_30_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_30, (_MM_PERM_ENUM)136); //B30(0-3) B31(0-3) B30(0-3) B31(0-3) B34(0-3) B35(0-3) B34(0-3) B35(0-3) + const __m512i rhs_mat_2367ABEF_30_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_30, (_MM_PERM_ENUM)136); //B32(0-3) B33(0-3) B32(0-3) B33(0-3) B36(0-3) B37(0-3) B36(0-3) B37(0-3) + + const __m512i rhs_mat_014589CD_31_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_31, (_MM_PERM_ENUM)136); //B30(8-11) B31(8-11) B30(8-11) B31(8-11) B34(8-11) B35(8-11) B34(8-11) B35(8-11 + const __m512i rhs_mat_2367ABEF_31_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_31, (_MM_PERM_ENUM)136); //B32(8-11) B33(8-11) B32(8-11) B33(8-11) B36(8-11) B37(8-11) B36(8-11) B37(8-11) + + const __m512i rhs_mat_014589CD_40_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_40, (_MM_PERM_ENUM)136); //B40(0-3) B41(0-3) B40(0-3) B41(0-3) B44(0-3) B45(0-3) B44(0-3) B45(0-3) + const __m512i rhs_mat_2367ABEF_40_sp1 = 
_mm512_shuffle_epi32(rhs_mat_2367ABEF_40, (_MM_PERM_ENUM)136); //B42(0-3) B43(0-3) B42(0-3) B43(0-3) B46(0-3) B47(0-3) B46(0-3) B47(0-3) + + const __m512i rhs_mat_014589CD_41_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_41, (_MM_PERM_ENUM)136); //B40(8-11) B41(8-11) B40(8-11) B41(8-11) B44(8-11) B45(8-11) B44(8-11) B45(8-11) + const __m512i rhs_mat_2367ABEF_41_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_41, (_MM_PERM_ENUM)136); //B42(8-11) B43(8-11) B42(8-11) B43(8-11) B46(8-11) B47(8-11) B46(8-11) B47(8-11) + + const __m512i rhs_mat_014589CD_50_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_50, (_MM_PERM_ENUM)136); //B50(0-3) B51(0-3) B50(0-3) B51(0-3) B54(0-3) B55(0-3) B54(0-3) B55(0-3) + const __m512i rhs_mat_2367ABEF_50_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_50, (_MM_PERM_ENUM)136); //B52(0-3) B53(0-3) B52(0-3) B53(0-3) B56(0-3) B57(0-3) B56(0-3) B57(0-3) + + const __m512i rhs_mat_014589CD_51_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_51, (_MM_PERM_ENUM)136); //B50(8-11) B51(8-11) B50(8-11) B51(8-11) B54(8-11) B55(8-11) B54(8-11) B55(8-11) + const __m512i rhs_mat_2367ABEF_51_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_51, (_MM_PERM_ENUM)136); //B52(8-11) B53(8-11) B52(8-11) B53(8-11) B56(8-11) B57(8-11) B56(8-11) B57(8-11) + + const __m512i rhs_mat_014589CD_60_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_60, (_MM_PERM_ENUM)136); //B60(0-3) B61(0-3) B60(0-3) B61(0-3) B64(0-3) B65(0-3) B64(0-3) B65(0-3) + const __m512i rhs_mat_2367ABEF_60_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_60, (_MM_PERM_ENUM)136); //B62(0-3) B63(0-3) B62(0-3) B63(0-3) B66(0-3) B67(0-3) B66(0-3) B67(0-3) + + const __m512i rhs_mat_014589CD_61_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_61, (_MM_PERM_ENUM)136); //B60(8-11) B61(8-11) B60(8-11) B61(8-11) B64(8-11) B65(8-11) B64(8-11) B65(8-11) + const __m512i rhs_mat_2367ABEF_61_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_61, (_MM_PERM_ENUM)136); //B62(8-11) B63(8-11) B62(8-11) B63(8-11) B66(8-11) B67(8-11) B66(8-11) B67(8-11) + + const __m512i rhs_mat_014589CD_70_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_70, (_MM_PERM_ENUM)136); //B70(0-3) B71(0-3) B70(0-3) B71(0-3) B74(0-3) B75(0-3) B74(0-3) B75(0-3) + const __m512i rhs_mat_2367ABEF_70_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_70, (_MM_PERM_ENUM)136); //B72(0-3) B73(0-3) B72(0-3) B73(0-3) B76(0-3) B77(0-3) B76(0-3) B77(0-3) + + const __m512i rhs_mat_014589CD_71_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_71, (_MM_PERM_ENUM)136); //B70(8-11) B71(8-11) B70(8-11) B71(8-11) B74(8-11) B75(8-11) B74(8-11) B75(8-11) + const __m512i rhs_mat_2367ABEF_71_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_71, (_MM_PERM_ENUM)136); //B72(8-11) B73(8-11) B72(8-11) B73(8-11) B76(8-11) B77(8-11) B76(8-11) B77(8-11) + + + // Shuffle pattern two - right side input + const __m512i rhs_mat_014589CD_00_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_00, (_MM_PERM_ENUM)221); //B00(4-7) B01(4-7) B00(4-7) B01(4-7) B04(4-7) B05(4-7) B04(4-7) B05(4-7) + const __m512i rhs_mat_2367ABEF_00_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_00, (_MM_PERM_ENUM)221); //B02(4-7) B03(4-7) B02(4-7) B03(4-7) B06(4-7) B07(4-7) B06(4-7) B07(4-7) + + const __m512i rhs_mat_014589CD_01_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_01, (_MM_PERM_ENUM)221); //B00(12-15) B01(12-15) B00(12-15) B01(12-15) B04(12-15) B05(12-15) B04(12-15) B05(12-15) + const __m512i rhs_mat_2367ABEF_01_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_01, (_MM_PERM_ENUM)221); //B02(12-15) B03(12-15) B02(12-15) B03(12-15) B06(12-15) B07(12-15) B06(12-15) B07(12-15) + + const __m512i 
rhs_mat_014589CD_10_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_10, (_MM_PERM_ENUM)221); //B10(4-7) B11(4-7) B10(4-7) B11(4-7) B14(4-7) B15(4-7) B14(4-7) B15(4-7) + const __m512i rhs_mat_2367ABEF_10_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_10, (_MM_PERM_ENUM)221); //B12(4-7) B13(4-7) B12(4-7) B13(4-7) B16(4-7) B17(4-7) B16(4-7) B17(4-7) + + const __m512i rhs_mat_014589CD_11_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_11, (_MM_PERM_ENUM)221); //B10(12-15) B11(12-15) B10(12-15) B11(12-15) B14(12-15) B15(12-15) B14(12-15) B15(12-15) + const __m512i rhs_mat_2367ABEF_11_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_11, (_MM_PERM_ENUM)221); //B12(12-15) B13(12-15) B12(12-15) B13(12-15) B16(12-15) B17(12-15) B16(12-15) B17(12-15) + + const __m512i rhs_mat_014589CD_20_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_20, (_MM_PERM_ENUM)221); //B20(4-7) B21(4-7) B20(4-7) B21(4-7) B24(4-7) B25(4-7) B24(4-7) B25(4-7) + const __m512i rhs_mat_2367ABEF_20_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_20, (_MM_PERM_ENUM)221); //B22(4-7) B23(4-7) B22(4-7) B23(4-7) B26(4-7) B27(4-7) B26(4-7) B27(4-7) + + const __m512i rhs_mat_014589CD_21_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_21, (_MM_PERM_ENUM)221); //B20(12-15) B21(12-15) B20(12-15) B21(12-15) B24(12-15) B25(12-15) B24(12-15) B25(12-15) + const __m512i rhs_mat_2367ABEF_21_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_21, (_MM_PERM_ENUM)221); //B22(12-15) B23(12-15) B22(12-15) B23(12-15) B26(12-15) B27(12-15) B26(12-15) B27(12-15) + + const __m512i rhs_mat_014589CD_30_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_30, (_MM_PERM_ENUM)221); //B30(4-7) B31(4-7) B30(4-7) B31(4-7) B34(4-7) B35(4-7) B34(4-7) B35(4-7) + const __m512i rhs_mat_2367ABEF_30_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_30, (_MM_PERM_ENUM)221); //B32(4-7) B33(4-7) B32(4-7) B33(4-7) B36(4-7) B37(4-7) B36(4-7) B37(4-7) + + const __m512i rhs_mat_014589CD_31_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_31, (_MM_PERM_ENUM)221); //B30(12-15) B31(12-15) B30(12-15) B31(12-15) B34(12-15) B35(12-15) B34(12-15) B35(12-15) + const __m512i rhs_mat_2367ABEF_31_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_31, (_MM_PERM_ENUM)221); //B32(12-15) B33(12-15) B32(12-15) B33(12-15) B36(12-15) B37(12-15) B36(12-15) B37(12-15) + + const __m512i rhs_mat_014589CD_40_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_40, (_MM_PERM_ENUM)221); //B40(4-7) B41(4-7) B40(4-7) B41(4-7) B44(4-7) B45(4-7) B44(4-7) B45(4-7) + const __m512i rhs_mat_2367ABEF_40_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_40, (_MM_PERM_ENUM)221); //B42(4-7) B43(4-7) B42(4-7) B43(4-7) B46(4-7) B47(4-7) B46(4-7) B47(4-7) + + const __m512i rhs_mat_014589CD_41_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_41, (_MM_PERM_ENUM)221); //B40(12-15) B41(12-15) B40(12-15) B41(12-15) B44(12-15) B45(12-15) B44(12-15) B45(12-15) + const __m512i rhs_mat_2367ABEF_41_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_41, (_MM_PERM_ENUM)221); //B42(12-15) B43(12-15) B42(12-15) B43(12-15) B46(12-15) B47(12-15) B46(12-15) B47(12-15) + + const __m512i rhs_mat_014589CD_50_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_50, (_MM_PERM_ENUM)221); //B50(4-7) B51(4-7) B50(4-7) B51(4-7) B54(4-7) B55(4-7) B54(4-7) B55(4-7) + const __m512i rhs_mat_2367ABEF_50_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_50, (_MM_PERM_ENUM)221); //B52(4-7) B53(4-7) B52(4-7) B53(4-7) B56(4-7) B57(4-7) B56(4-7) B57(4-7) + + const __m512i rhs_mat_014589CD_51_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_51, (_MM_PERM_ENUM)221); //B50(12-15) B51(12-15) B50(12-15) B51(12-15) B54(12-15) B55(12-15) B54(12-15) B55(12-15) + const 
__m512i rhs_mat_2367ABEF_51_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_51, (_MM_PERM_ENUM)221); //B52(12-15) B53(12-15) B52(12-15) B53(12-15) B56(12-15) B57(12-15) B56(12-15) B57(12-15) + + const __m512i rhs_mat_014589CD_60_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_60, (_MM_PERM_ENUM)221); //B60(4-7) B61(4-7) B60(4-7) B61(4-7) B64(4-7) B65(4-7) B64(4-7) B65(4-7) + const __m512i rhs_mat_2367ABEF_60_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_60, (_MM_PERM_ENUM)221); //B62(4-7) B63(4-7) B62(4-7) B63(4-7) B66(4-7) B67(4-7) B66(4-7) B67(4-7) + + const __m512i rhs_mat_014589CD_61_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_61, (_MM_PERM_ENUM)221); //B60(12-15) B61(12-15) B60(12-15) B61(12-15) B64(12-15) B65(12-15) B64(12-15) B65(12-15) + const __m512i rhs_mat_2367ABEF_61_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_61, (_MM_PERM_ENUM)221); //B62(12-15) B63(12-15) B62(12-15) B63(12-15) B66(12-15) B67(12-15) B66(12-15) B67(12-15) + + const __m512i rhs_mat_014589CD_70_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_70, (_MM_PERM_ENUM)221); //B70(4-7) B71(4-7) B70(4-7) B71(4-7) B74(4-7) B75(4-7) B74(4-7) B75(4-7) + const __m512i rhs_mat_2367ABEF_70_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_70, (_MM_PERM_ENUM)221); //B72(4-7) B73(4-7) B72(4-7) B73(4-7) B76(4-7) B77(4-7) B76(4-7) B77(4-7) + + const __m512i rhs_mat_014589CD_71_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_71, (_MM_PERM_ENUM)221); //B70(12-15) B71(12-15) B70(12-15) B71(12-15) B74(12-15) B75(12-15) B74(12-15) B75(12-15) + const __m512i rhs_mat_2367ABEF_71_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_71, (_MM_PERM_ENUM)221); //B72(12-15) B73(12-15) B72(12-15) B73(12-15) B76(12-15) B77(12-15) B76(12-15) B77(12-15) + + //Scales of corresponding sub blocks from different Q6_K structures are stored together + //s00 s01 s10 s11 s20 s21 ...... 
s70 s71
+                    // Load the scales for sub-blocks: 0-1, 2-3, 4-5, 6-7 in the sb loop
+                    const __m128i scales_01_0 = _mm_loadu_si128((const __m128i *)(b_ptr_0[b].scales + sb * 64));
+                    const __m128i scales_23_0 = _mm_loadu_si128((const __m128i *)(b_ptr_0[b].scales + 16 + sb * 64));
+                    const __m128i scales_45_0 = _mm_loadu_si128((const __m128i *)(b_ptr_0[b].scales + 32 + sb * 64));
+                    const __m128i scales_67_0 = _mm_loadu_si128((const __m128i *)(b_ptr_0[b].scales + 48 + sb * 64));
+
+                    const __m128i scales_01_1 = _mm_loadu_si128((const __m128i *)(b_ptr_1[b].scales + sb * 64));
+                    const __m128i scales_23_1 = _mm_loadu_si128((const __m128i *)(b_ptr_1[b].scales + 16 + sb * 64));
+                    const __m128i scales_45_1 = _mm_loadu_si128((const __m128i *)(b_ptr_1[b].scales + 32 + sb * 64));
+                    const __m128i scales_67_1 = _mm_loadu_si128((const __m128i *)(b_ptr_1[b].scales + 48 + sb * 64));
+
+                    // Combine the scales for sub-blocks: 0-1, 2-3, 4-5, 6-7 in the sb loop
+                    const __m256i scales_01 = _mm256_insertf128_si256(_mm256_castsi128_si256(scales_01_0), scales_01_1, 1);
+                    const __m256i scales_23 = _mm256_insertf128_si256(_mm256_castsi128_si256(scales_23_0), scales_23_1, 1);
+                    const __m256i scales_45 = _mm256_insertf128_si256(_mm256_castsi128_si256(scales_45_0), scales_45_1, 1);
+                    const __m256i scales_67 = _mm256_insertf128_si256(_mm256_castsi128_si256(scales_67_0), scales_67_1, 1);
+
+                    const __m512i scales_0 = _mm512_cvtepi8_epi16(_mm256_shuffle_epi8(scales_01,scalesmask1));
+                    const __m512i scales_1 = _mm512_cvtepi8_epi16(_mm256_shuffle_epi8(scales_01,scalesmask2));
+                    const __m512i scales_2 = _mm512_cvtepi8_epi16(_mm256_shuffle_epi8(scales_23,scalesmask1));
+                    const __m512i scales_3 = _mm512_cvtepi8_epi16(_mm256_shuffle_epi8(scales_23,scalesmask2));
+                    const __m512i scales_4 = _mm512_cvtepi8_epi16(_mm256_shuffle_epi8(scales_45,scalesmask1));
+                    const __m512i scales_5 = _mm512_cvtepi8_epi16(_mm256_shuffle_epi8(scales_45,scalesmask2));
+                    const __m512i scales_6 = _mm512_cvtepi8_epi16(_mm256_shuffle_epi8(scales_67,scalesmask1));
+                    const __m512i scales_7 = _mm512_cvtepi8_epi16(_mm256_shuffle_epi8(scales_67,scalesmask2));
+
+                    const __m512i scale_014589CD_0 = _mm512_shuffle_epi32(scales_0, (_MM_PERM_ENUM)68);
+                    const __m512i scale_2367ABEF_0 = _mm512_shuffle_epi32(scales_0, (_MM_PERM_ENUM)238);
+
+                    const __m512i scale_014589CD_1 = _mm512_shuffle_epi32(scales_1, (_MM_PERM_ENUM)68);
+                    const __m512i scale_2367ABEF_1 = _mm512_shuffle_epi32(scales_1, (_MM_PERM_ENUM)238);
+
+                    const __m512i scale_014589CD_2 = _mm512_shuffle_epi32(scales_2, (_MM_PERM_ENUM)68);
+                    const __m512i scale_2367ABEF_2 = _mm512_shuffle_epi32(scales_2, (_MM_PERM_ENUM)238);
+
+                    const __m512i scale_014589CD_3 = _mm512_shuffle_epi32(scales_3, (_MM_PERM_ENUM)68);
+                    const __m512i scale_2367ABEF_3 = _mm512_shuffle_epi32(scales_3, (_MM_PERM_ENUM)238);
+
+                    const __m512i scale_014589CD_4 = _mm512_shuffle_epi32(scales_4, (_MM_PERM_ENUM)68);
+                    const __m512i scale_2367ABEF_4 = _mm512_shuffle_epi32(scales_4, (_MM_PERM_ENUM)238);
+
+                    const __m512i scale_014589CD_5 = _mm512_shuffle_epi32(scales_5, (_MM_PERM_ENUM)68);
+                    const __m512i scale_2367ABEF_5 = _mm512_shuffle_epi32(scales_5, (_MM_PERM_ENUM)238);
+
+                    const __m512i scale_014589CD_6 = _mm512_shuffle_epi32(scales_6, (_MM_PERM_ENUM)68);
+                    const __m512i scale_2367ABEF_6 = _mm512_shuffle_epi32(scales_6, (_MM_PERM_ENUM)238);
+
+                    const __m512i scale_014589CD_7 = _mm512_shuffle_epi32(scales_7, (_MM_PERM_ENUM)68);
+                    const __m512i scale_2367ABEF_7 = _mm512_shuffle_epi32(scales_7, (_MM_PERM_ENUM)238);
+
+
+                    for (int rp = 0;
rp < 4; rp++) { + // Load the four block_q8_k quantized values interleaved with each other in chunks of eight bytes - A0,A1,A2,A3 + // Loaded as set of 128 bit vectors and repeated and stored into a 256 bit vector before again repeating into 512 bit vector + __m256i lhs_mat_ymm_0123_00 = _mm256_loadu_si256((const __m256i * )((a_ptrs[rp][b].qs + 512 * sb))); + __m256i lhs_mat_ymm_01_00 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_00, lhs_mat_ymm_0123_00, 0); + __m256i lhs_mat_ymm_23_00 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_00, lhs_mat_ymm_0123_00, 17); + __m256i lhs_mat_ymm_0123_01 = _mm256_loadu_si256((const __m256i * )((a_ptrs[rp][b].qs + 32 + 512 * sb))); + __m256i lhs_mat_ymm_01_01 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_01, lhs_mat_ymm_0123_01, 0); + __m256i lhs_mat_ymm_23_01 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_01, lhs_mat_ymm_0123_01, 17); + __m256i lhs_mat_ymm_0123_10 = _mm256_loadu_si256((const __m256i * )((a_ptrs[rp][b].qs + 64 + 512 * sb))); + __m256i lhs_mat_ymm_01_10 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_10, lhs_mat_ymm_0123_10, 0); + __m256i lhs_mat_ymm_23_10 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_10, lhs_mat_ymm_0123_10, 17); + __m256i lhs_mat_ymm_0123_11 = _mm256_loadu_si256((const __m256i * )((a_ptrs[rp][b].qs + 96 + 512 * sb))); + __m256i lhs_mat_ymm_01_11 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_11, lhs_mat_ymm_0123_11, 0); + __m256i lhs_mat_ymm_23_11 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_11, lhs_mat_ymm_0123_11, 17); + __m256i lhs_mat_ymm_0123_20 = _mm256_loadu_si256((const __m256i * )((a_ptrs[rp][b].qs + 128 + 512 * sb))); + __m256i lhs_mat_ymm_01_20 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_20, lhs_mat_ymm_0123_20, 0); + __m256i lhs_mat_ymm_23_20 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_20, lhs_mat_ymm_0123_20, 17); + __m256i lhs_mat_ymm_0123_21 = _mm256_loadu_si256((const __m256i * )((a_ptrs[rp][b].qs + 160 + 512 * sb))); + __m256i lhs_mat_ymm_01_21 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_21, lhs_mat_ymm_0123_21, 0); + __m256i lhs_mat_ymm_23_21 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_21, lhs_mat_ymm_0123_21, 17); + __m256i lhs_mat_ymm_0123_30 = _mm256_loadu_si256((const __m256i * )((a_ptrs[rp][b].qs + 192 + 512 * sb))); + __m256i lhs_mat_ymm_01_30 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_30, lhs_mat_ymm_0123_30, 0); + __m256i lhs_mat_ymm_23_30 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_30, lhs_mat_ymm_0123_30, 17); + __m256i lhs_mat_ymm_0123_31 = _mm256_loadu_si256((const __m256i * )((a_ptrs[rp][b].qs + 224 + 512 * sb))); + __m256i lhs_mat_ymm_01_31 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_31, lhs_mat_ymm_0123_31, 0); + __m256i lhs_mat_ymm_23_31 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_31, lhs_mat_ymm_0123_31, 17); + + __m256i lhs_mat_ymm_0123_40 = _mm256_loadu_si256((const __m256i * )((a_ptrs[rp][b].qs + 256 + 512 * sb))); + __m256i lhs_mat_ymm_01_40 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_40, lhs_mat_ymm_0123_40, 0); + __m256i lhs_mat_ymm_23_40 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_40, lhs_mat_ymm_0123_40, 17); + __m256i lhs_mat_ymm_0123_41 = _mm256_loadu_si256((const __m256i * )((a_ptrs[rp][b].qs + 288 + 512 * sb))); + __m256i lhs_mat_ymm_01_41 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_41, lhs_mat_ymm_0123_41, 0); + __m256i lhs_mat_ymm_23_41 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_41, lhs_mat_ymm_0123_41, 17); + __m256i lhs_mat_ymm_0123_50 = _mm256_loadu_si256((const __m256i * )((a_ptrs[rp][b].qs + 320 + 512 * sb))); + __m256i lhs_mat_ymm_01_50 = 
_mm256_permute2f128_si256(lhs_mat_ymm_0123_50, lhs_mat_ymm_0123_50, 0); + __m256i lhs_mat_ymm_23_50 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_50, lhs_mat_ymm_0123_50, 17); + __m256i lhs_mat_ymm_0123_51 = _mm256_loadu_si256((const __m256i * )((a_ptrs[rp][b].qs + 352 + 512 * sb))); + __m256i lhs_mat_ymm_01_51 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_51, lhs_mat_ymm_0123_51, 0); + __m256i lhs_mat_ymm_23_51 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_51, lhs_mat_ymm_0123_51, 17); + __m256i lhs_mat_ymm_0123_60 = _mm256_loadu_si256((const __m256i * )((a_ptrs[rp][b].qs + 384 + 512 * sb))); + __m256i lhs_mat_ymm_01_60 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_60, lhs_mat_ymm_0123_60, 0); + __m256i lhs_mat_ymm_23_60 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_60, lhs_mat_ymm_0123_60, 17); + __m256i lhs_mat_ymm_0123_61 = _mm256_loadu_si256((const __m256i * )((a_ptrs[rp][b].qs + 416 + 512 * sb))); + __m256i lhs_mat_ymm_01_61 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_61, lhs_mat_ymm_0123_61, 0); + __m256i lhs_mat_ymm_23_61 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_61, lhs_mat_ymm_0123_61, 17); + __m256i lhs_mat_ymm_0123_70 = _mm256_loadu_si256((const __m256i * )((a_ptrs[rp][b].qs + 448 + 512 * sb))); + __m256i lhs_mat_ymm_01_70 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_70, lhs_mat_ymm_0123_70, 0); + __m256i lhs_mat_ymm_23_70 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_70, lhs_mat_ymm_0123_70, 17); + __m256i lhs_mat_ymm_0123_71 = _mm256_loadu_si256((const __m256i * )((a_ptrs[rp][b].qs + 480 + 512 * sb))); + __m256i lhs_mat_ymm_01_71 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_71, lhs_mat_ymm_0123_71, 0); + __m256i lhs_mat_ymm_23_71 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_71, lhs_mat_ymm_0123_71, 17); + + + __m512i lhs_mat_01_00 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_01_00), lhs_mat_ymm_01_00, 1); + __m512i lhs_mat_23_00 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_23_00), lhs_mat_ymm_23_00, 1); + __m512i lhs_mat_01_01 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_01_01), lhs_mat_ymm_01_01, 1); + __m512i lhs_mat_23_01 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_23_01), lhs_mat_ymm_23_01, 1); + + __m512i lhs_mat_01_10 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_01_10), lhs_mat_ymm_01_10, 1); + __m512i lhs_mat_23_10 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_23_10), lhs_mat_ymm_23_10, 1); + __m512i lhs_mat_01_11 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_01_11), lhs_mat_ymm_01_11, 1); + __m512i lhs_mat_23_11 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_23_11), lhs_mat_ymm_23_11, 1); + + __m512i lhs_mat_01_20 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_01_20), lhs_mat_ymm_01_20, 1); + __m512i lhs_mat_23_20 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_23_20), lhs_mat_ymm_23_20, 1); + __m512i lhs_mat_01_21 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_01_21), lhs_mat_ymm_01_21, 1); + __m512i lhs_mat_23_21 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_23_21), lhs_mat_ymm_23_21, 1); + + __m512i lhs_mat_01_30 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_01_30), lhs_mat_ymm_01_30, 1); + __m512i lhs_mat_23_30 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_23_30), lhs_mat_ymm_23_30, 1); + __m512i lhs_mat_01_31 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_01_31), lhs_mat_ymm_01_31, 1); + __m512i lhs_mat_23_31 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_23_31), lhs_mat_ymm_23_31, 1); + + __m512i 
lhs_mat_01_40 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_01_40), lhs_mat_ymm_01_40, 1); + __m512i lhs_mat_23_40 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_23_40), lhs_mat_ymm_23_40, 1); + __m512i lhs_mat_01_41 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_01_41), lhs_mat_ymm_01_41, 1); + __m512i lhs_mat_23_41 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_23_41), lhs_mat_ymm_23_41, 1); + + __m512i lhs_mat_01_50 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_01_50), lhs_mat_ymm_01_50, 1); + __m512i lhs_mat_23_50 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_23_50), lhs_mat_ymm_23_50, 1); + __m512i lhs_mat_01_51 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_01_51), lhs_mat_ymm_01_51, 1); + __m512i lhs_mat_23_51 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_23_51), lhs_mat_ymm_23_51, 1); + + __m512i lhs_mat_01_60 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_01_60), lhs_mat_ymm_01_60, 1); + __m512i lhs_mat_23_60 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_23_60), lhs_mat_ymm_23_60, 1); + __m512i lhs_mat_01_61 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_01_61), lhs_mat_ymm_01_61, 1); + __m512i lhs_mat_23_61 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_23_61), lhs_mat_ymm_23_61, 1); + + __m512i lhs_mat_01_70 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_01_70), lhs_mat_ymm_01_70, 1); + __m512i lhs_mat_23_70 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_23_70), lhs_mat_ymm_23_70, 1); + __m512i lhs_mat_01_71 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_01_71), lhs_mat_ymm_01_71, 1); + __m512i lhs_mat_23_71 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_23_71), lhs_mat_ymm_23_71, 1); + + __m512i lhs_mat_s_01_00 = _mm512_maddubs_epi16(m32s_expanded, lhs_mat_01_00); + __m512i lhs_mat_s_23_00 = _mm512_maddubs_epi16(m32s_expanded, lhs_mat_23_00); + __m512i lhs_mat_s_01_01 = _mm512_maddubs_epi16(m32s_expanded, lhs_mat_01_01); + __m512i lhs_mat_s_23_01 = _mm512_maddubs_epi16(m32s_expanded, lhs_mat_23_01); + __m512i lhs_mat_s_01_10 = _mm512_maddubs_epi16(m32s_expanded, lhs_mat_01_10); + __m512i lhs_mat_s_23_10 = _mm512_maddubs_epi16(m32s_expanded, lhs_mat_23_10); + __m512i lhs_mat_s_01_11 = _mm512_maddubs_epi16(m32s_expanded, lhs_mat_01_11); + __m512i lhs_mat_s_23_11 = _mm512_maddubs_epi16(m32s_expanded, lhs_mat_23_11); + __m512i lhs_mat_s_01_20 = _mm512_maddubs_epi16(m32s_expanded, lhs_mat_01_20); + __m512i lhs_mat_s_23_20 = _mm512_maddubs_epi16(m32s_expanded, lhs_mat_23_20); + __m512i lhs_mat_s_01_21 = _mm512_maddubs_epi16(m32s_expanded, lhs_mat_01_21); + __m512i lhs_mat_s_23_21 = _mm512_maddubs_epi16(m32s_expanded, lhs_mat_23_21); + __m512i lhs_mat_s_01_30 = _mm512_maddubs_epi16(m32s_expanded, lhs_mat_01_30); + __m512i lhs_mat_s_23_30 = _mm512_maddubs_epi16(m32s_expanded, lhs_mat_23_30); + __m512i lhs_mat_s_01_31 = _mm512_maddubs_epi16(m32s_expanded, lhs_mat_01_31); + __m512i lhs_mat_s_23_31 = _mm512_maddubs_epi16(m32s_expanded, lhs_mat_23_31); + __m512i lhs_mat_s_01_40 = _mm512_maddubs_epi16(m32s_expanded, lhs_mat_01_40); + __m512i lhs_mat_s_23_40 = _mm512_maddubs_epi16(m32s_expanded, lhs_mat_23_40); + __m512i lhs_mat_s_01_41 = _mm512_maddubs_epi16(m32s_expanded, lhs_mat_01_41); + __m512i lhs_mat_s_23_41 = _mm512_maddubs_epi16(m32s_expanded, lhs_mat_23_41); + __m512i lhs_mat_s_01_50 = _mm512_maddubs_epi16(m32s_expanded, lhs_mat_01_50); + __m512i lhs_mat_s_23_50 = _mm512_maddubs_epi16(m32s_expanded, lhs_mat_23_50); + __m512i 
lhs_mat_s_01_51 = _mm512_maddubs_epi16(m32s_expanded, lhs_mat_01_51); + __m512i lhs_mat_s_23_51 = _mm512_maddubs_epi16(m32s_expanded, lhs_mat_23_51); + __m512i lhs_mat_s_01_60 = _mm512_maddubs_epi16(m32s_expanded, lhs_mat_01_60); + __m512i lhs_mat_s_23_60 = _mm512_maddubs_epi16(m32s_expanded, lhs_mat_23_60); + __m512i lhs_mat_s_01_61 = _mm512_maddubs_epi16(m32s_expanded, lhs_mat_01_61); + __m512i lhs_mat_s_23_61 = _mm512_maddubs_epi16(m32s_expanded, lhs_mat_23_61); + __m512i lhs_mat_s_01_70 = _mm512_maddubs_epi16(m32s_expanded, lhs_mat_01_70); + __m512i lhs_mat_s_23_70 = _mm512_maddubs_epi16(m32s_expanded, lhs_mat_23_70); + __m512i lhs_mat_s_01_71 = _mm512_maddubs_epi16(m32s_expanded, lhs_mat_01_71); + __m512i lhs_mat_s_23_71 = _mm512_maddubs_epi16(m32s_expanded, lhs_mat_23_71); + + // Shuffle pattern one - left side input + const __m512i lhs_mat_01_00_sp1 = _mm512_shuffle_epi32(lhs_mat_01_00, (_MM_PERM_ENUM)160); //A00(0-3) A00(0-3) A01(0-3) A01(0-3) A00(0-3) A00(0-3) A01(0-3) A01(0-3) + const __m512i lhs_mat_23_00_sp1 = _mm512_shuffle_epi32(lhs_mat_23_00, (_MM_PERM_ENUM)160); //A02(0-3) A03(0-3) A02(0-3) A03(0-3) A02(0-3) A03(0-3) A02(0-3) A03(0-3) + + const __m512i lhs_mat_01_01_sp1 = _mm512_shuffle_epi32(lhs_mat_01_01, (_MM_PERM_ENUM)160); //A00(8-11) A00(8-11) A01(8-11) A01(8-11) A00(8-11) A00(8-11) A01(8-11) A01(8-11) + const __m512i lhs_mat_23_01_sp1 = _mm512_shuffle_epi32(lhs_mat_23_01, (_MM_PERM_ENUM)160); //A02(8-11) A03(8-11) A02(8-11) A03(8-11) A02(8-11) A03(8-11) A02(8-11) A03(8-11) + + const __m512i lhs_mat_01_10_sp1 = _mm512_shuffle_epi32(lhs_mat_01_10, (_MM_PERM_ENUM)160); //A10(0-3) A10(0-3) A11(0-3) A11(0-3) A10(0-3) A10(0-3) A11(0-3) A11(0-3) + const __m512i lhs_mat_23_10_sp1 = _mm512_shuffle_epi32(lhs_mat_23_10, (_MM_PERM_ENUM)160); //A12(0-3) A13(0-3) A12(0-3) A13(0-3) A12(0-3) A13(0-3) A12(0-3) A13(0-3) + + const __m512i lhs_mat_01_11_sp1 = _mm512_shuffle_epi32(lhs_mat_01_11, (_MM_PERM_ENUM)160); //A10(8-11) A10(8-11) A11(8-11) A11(8-11) A10(8-11) A10(8-11) A11(8-11) A11(8-11) + const __m512i lhs_mat_23_11_sp1 = _mm512_shuffle_epi32(lhs_mat_23_11, (_MM_PERM_ENUM)160); //A12(8-11) A13(8-11) A12(8-11) A13(8-11) A12(8-11) A13(8-11) A12(8-11) A13(8-11) + + const __m512i lhs_mat_01_20_sp1 = _mm512_shuffle_epi32(lhs_mat_01_20, (_MM_PERM_ENUM)160); //A20(0-3) A20(0-3) A21(0-3) A21(0-3) A20(0-3) A20(0-3) A21(0-3) A21(0-3) + const __m512i lhs_mat_23_20_sp1 = _mm512_shuffle_epi32(lhs_mat_23_20, (_MM_PERM_ENUM)160); //A22(0-3) A23(0-3) A22(0-3) A23(0-3) A22(0-3) A23(0-3) A22(0-3) A23(0-3) + + const __m512i lhs_mat_01_21_sp1 = _mm512_shuffle_epi32(lhs_mat_01_21, (_MM_PERM_ENUM)160); //A20(8-11) A20(8-11) A21(8-11) A21(8-11) A20(8-11) A20(8-11) A21(8-11) A21(8-11) + const __m512i lhs_mat_23_21_sp1 = _mm512_shuffle_epi32(lhs_mat_23_21, (_MM_PERM_ENUM)160); //A22(8-11) A23(8-11) A22(8-11) A23(8-11) A22(8-11) A23(8-11) A22(8-11) A23(8-11) + + const __m512i lhs_mat_01_30_sp1 = _mm512_shuffle_epi32(lhs_mat_01_30, (_MM_PERM_ENUM)160); //A30(0-3) A30(0-3) A31(0-3) A31(0-3) A30(0-3) A30(0-3) A31(0-3) A31(0-3) + const __m512i lhs_mat_23_30_sp1 = _mm512_shuffle_epi32(lhs_mat_23_30, (_MM_PERM_ENUM)160); //A32(0-3) A33(0-3) A32(0-3) A33(0-3) A32(0-3) A33(0-3) A32(0-3) A33(0-3) + + const __m512i lhs_mat_01_31_sp1 = _mm512_shuffle_epi32(lhs_mat_01_31, (_MM_PERM_ENUM)160); //A30(8-11) A30(8-11) A31(8-11) A31(8-11) A30(8-11) A30(8-11) A31(8-11) A31(8-11) + const __m512i lhs_mat_23_31_sp1 = _mm512_shuffle_epi32(lhs_mat_23_31, (_MM_PERM_ENUM)160); //A32(8-11) A33(8-11) A32(8-11) A33(8-11) A32(8-11) 
A33(8-11) A32(8-11) A33(8-11) + + const __m512i lhs_mat_01_40_sp1 = _mm512_shuffle_epi32(lhs_mat_01_40, (_MM_PERM_ENUM)160); //A40(0-3) A40(0-3) A41(0-3) A41(0-3) A40(0-3) A40(0-3) A41(0-3) A41(0-3) + const __m512i lhs_mat_23_40_sp1 = _mm512_shuffle_epi32(lhs_mat_23_40, (_MM_PERM_ENUM)160); //A42(0-3) A43(0-3) A42(0-3) A43(0-3) A42(0-3) A43(0-3) A42(0-3) A43(0-3) + + const __m512i lhs_mat_01_41_sp1 = _mm512_shuffle_epi32(lhs_mat_01_41, (_MM_PERM_ENUM)160); //A40(8-11) A40(8-11) A41(8-11) A41(8-11) A40(8-11) A40(8-11) A41(8-11) A41(8-11) + const __m512i lhs_mat_23_41_sp1 = _mm512_shuffle_epi32(lhs_mat_23_41, (_MM_PERM_ENUM)160); //A42(8-11) A43(8-11) A42(8-11) A43(8-11) A42(8-11) A43(8-11) A42(8-11) A43(8-11) + + const __m512i lhs_mat_01_50_sp1 = _mm512_shuffle_epi32(lhs_mat_01_50, (_MM_PERM_ENUM)160); //A50(0-3) A50(0-3) A51(0-3) A51(0-3) A50(0-3) A50(0-3) A51(0-3) A51(0-3) + const __m512i lhs_mat_23_50_sp1 = _mm512_shuffle_epi32(lhs_mat_23_50, (_MM_PERM_ENUM)160); //A52(0-3) A53(0-3) A52(0-3) A53(0-3) A52(0-3) A53(0-3) A52(0-3) A53(0-3) + + const __m512i lhs_mat_01_51_sp1 = _mm512_shuffle_epi32(lhs_mat_01_51, (_MM_PERM_ENUM)160); //A50(8-11) A50(8-11) A51(8-11) A51(8-11) A50(8-11) A50(8-11) A51(8-11) A51(8-11) + const __m512i lhs_mat_23_51_sp1 = _mm512_shuffle_epi32(lhs_mat_23_51, (_MM_PERM_ENUM)160); //A52(8-11) A53(8-11) A52(8-11) A53(8-11) A52(8-11) A53(8-11) A52(8-11) A53(8-11) + + const __m512i lhs_mat_01_60_sp1 = _mm512_shuffle_epi32(lhs_mat_01_60, (_MM_PERM_ENUM)160); //A60(0-3) A60(0-3) A61(0-3) A61(0-3) A60(0-3) A60(0-3) A61(0-3) A61(0-3) + const __m512i lhs_mat_23_60_sp1 = _mm512_shuffle_epi32(lhs_mat_23_60, (_MM_PERM_ENUM)160); //A62(0-3) A63(0-3) A62(0-3) A63(0-3) A62(0-3) A63(0-3) A62(0-3) A63(0-3) + + const __m512i lhs_mat_01_61_sp1 = _mm512_shuffle_epi32(lhs_mat_01_61, (_MM_PERM_ENUM)160); //A60(8-11) A60(8-11) A61(8-11) A61(8-11) A60(8-11) A60(8-11) A61(8-11) A61(8-11) + const __m512i lhs_mat_23_61_sp1 = _mm512_shuffle_epi32(lhs_mat_23_61, (_MM_PERM_ENUM)160); //A62(8-11) A63(8-11) A62(8-11) A63(8-11) A62(8-11) A63(8-11) A62(8-11) A63(8-11) + + const __m512i lhs_mat_01_70_sp1 = _mm512_shuffle_epi32(lhs_mat_01_70, (_MM_PERM_ENUM)160); //A70(0-3) A70(0-3) A71(0-3) A71(0-3) A70(0-3) A70(0-3) A71(0-3) A71(0-3) + const __m512i lhs_mat_23_70_sp1 = _mm512_shuffle_epi32(lhs_mat_23_70, (_MM_PERM_ENUM)160); //A72(0-3) A73(0-3) A72(0-3) A73(0-3) A72(0-3) A73(0-3) A72(0-3) A73(0-3) + + const __m512i lhs_mat_01_71_sp1 = _mm512_shuffle_epi32(lhs_mat_01_71, (_MM_PERM_ENUM)160); //A70(8-11) A70(8-11) A71(8-11) A71(8-11) A70(8-11) A70(8-11) A71(8-11) A71(8-11) + const __m512i lhs_mat_23_71_sp1 = _mm512_shuffle_epi32(lhs_mat_23_71, (_MM_PERM_ENUM)160); //A72(8-11) A73(8-11) A72(8-11) A73(8-11) A72(8-11) A73(8-11) A72(8-11) A73(8-11) + + // Shuffle pattern two- left side input + const __m512i lhs_mat_01_00_sp2 = _mm512_shuffle_epi32(lhs_mat_01_00, (_MM_PERM_ENUM)245); //A00(4-7) A00(4-7) A01(4-7) A01(4-7) A00(4-7) A00(4-7) A01(4-7) A01(4-7) + const __m512i lhs_mat_23_00_sp2 = _mm512_shuffle_epi32(lhs_mat_23_00, (_MM_PERM_ENUM)245); //A02(4-7) A03(4-7) A02(4-7) A03(4-7) A02(4-7) A03(4-7) A02(4-7) A03(4-7) + + const __m512i lhs_mat_01_01_sp2 = _mm512_shuffle_epi32(lhs_mat_01_01, (_MM_PERM_ENUM)245); //A00(12-15) A00(12-15) A01(12-15) A01(12-15) A00(12-15) A00(12-15) A01(12-15) A01(12-15) + const __m512i lhs_mat_23_01_sp2 = _mm512_shuffle_epi32(lhs_mat_23_01, (_MM_PERM_ENUM)245); //A02(12-15) A03(12-15) A02(12-15) A03(12-15) A02(12-15) A03(12-15) A02(12-15) A03(12-15) + + const __m512i 
lhs_mat_01_10_sp2 = _mm512_shuffle_epi32(lhs_mat_01_10, (_MM_PERM_ENUM)245); //A10(4-7) A10(4-7) A11(4-7) A11(4-7) A10(4-7) A10(4-7) A11(4-7) A11(4-7) + const __m512i lhs_mat_23_10_sp2 = _mm512_shuffle_epi32(lhs_mat_23_10, (_MM_PERM_ENUM)245); //A12(4-7) A13(4-7) A12(4-7) A13(4-7) A12(4-7) A13(4-7) A12(4-7) A13(4-7) + + const __m512i lhs_mat_01_11_sp2 = _mm512_shuffle_epi32(lhs_mat_01_11, (_MM_PERM_ENUM)245); //A10(12-15) A10(12-15) A11(12-15) A11(12-15) A10(12-15) A10(12-15) A11(12-15) A11(12-15) + const __m512i lhs_mat_23_11_sp2 = _mm512_shuffle_epi32(lhs_mat_23_11, (_MM_PERM_ENUM)245); //A12(12-15) A13(12-15) A12(12-15) A13(12-15) A12(12-15) A13(12-15) A12(12-15) A13(12-15) + + const __m512i lhs_mat_01_20_sp2 = _mm512_shuffle_epi32(lhs_mat_01_20, (_MM_PERM_ENUM)245); //A20(4-7) A20(4-7) A21(4-7) A21(4-7) A20(4-7) A20(4-7) A21(4-7) A21(4-7) + const __m512i lhs_mat_23_20_sp2 = _mm512_shuffle_epi32(lhs_mat_23_20, (_MM_PERM_ENUM)245); //A22(4-7) A23(4-7) A22(4-7) A23(4-7) A22(4-7) A23(4-7) A22(4-7) A23(4-7) + + const __m512i lhs_mat_01_21_sp2 = _mm512_shuffle_epi32(lhs_mat_01_21, (_MM_PERM_ENUM)245); //A20(12-15) A20(12-15) A21(12-15) A21(12-15) A20(12-15) A20(12-15) A21(12-15) A21(12-15) + const __m512i lhs_mat_23_21_sp2 = _mm512_shuffle_epi32(lhs_mat_23_21, (_MM_PERM_ENUM)245); //A22(12-15) A23(12-15) A22(12-15) A23(12-15) A22(12-15) A23(12-15) A22(12-15) A23(12-15) + + const __m512i lhs_mat_01_30_sp2 = _mm512_shuffle_epi32(lhs_mat_01_30, (_MM_PERM_ENUM)245); //A30(4-7) A30(4-7) A31(4-7) A31(4-7) A30(4-7) A30(4-7) A31(4-7) A31(4-7) + const __m512i lhs_mat_23_30_sp2 = _mm512_shuffle_epi32(lhs_mat_23_30, (_MM_PERM_ENUM)245); //A32(4-7) A33(4-7) A32(4-7) A33(4-7) A32(4-7) A33(4-7) A32(4-7) A33(4-7) + + const __m512i lhs_mat_01_31_sp2 = _mm512_shuffle_epi32(lhs_mat_01_31, (_MM_PERM_ENUM)245); //A30(12-15) A30(12-15) A31(12-15) A31(12-15) A30(12-15) A30(12-15) A31(12-15) A31(12-15) + const __m512i lhs_mat_23_31_sp2 = _mm512_shuffle_epi32(lhs_mat_23_31, (_MM_PERM_ENUM)245); //A32(12-15) A33(12-15) A32(12-15) A33(12-15) A32(12-15) A33(12-15) A32(12-15) A33(12-15) + + const __m512i lhs_mat_01_40_sp2 = _mm512_shuffle_epi32(lhs_mat_01_40, (_MM_PERM_ENUM)245); //A40(4-7) A40(4-7) A41(4-7) A41(4-7) A40(4-7) A40(4-7) A41(4-7) A41(4-7) + const __m512i lhs_mat_23_40_sp2 = _mm512_shuffle_epi32(lhs_mat_23_40, (_MM_PERM_ENUM)245); //A42(4-7) A43(4-7) A42(4-7) A43(4-7) A42(4-7) A43(4-7) A42(4-7) A43(4-7) + + const __m512i lhs_mat_01_41_sp2 = _mm512_shuffle_epi32(lhs_mat_01_41, (_MM_PERM_ENUM)245); //A40(12-15) A40(12-15) A41(12-15) A41(12-15) A40(12-15) A40(12-15) A41(12-15) A41(12-15) + const __m512i lhs_mat_23_41_sp2 = _mm512_shuffle_epi32(lhs_mat_23_41, (_MM_PERM_ENUM)245); //A42(12-15) A43(12-15) A42(12-15) A43(12-15) A42(12-15) A43(12-15) A42(12-15) A43(12-15) + + const __m512i lhs_mat_01_50_sp2 = _mm512_shuffle_epi32(lhs_mat_01_50, (_MM_PERM_ENUM)245); //A50(4-7) A50(4-7) A51(4-7) A51(4-7) A50(4-7) A50(4-7) A51(4-7) A51(4-7) + const __m512i lhs_mat_23_50_sp2 = _mm512_shuffle_epi32(lhs_mat_23_50, (_MM_PERM_ENUM)245); //A52(4-7) A53(4-7) A52(4-7) A53(4-7) A52(4-7) A53(4-7) A52(4-7) A53(4-7) + + const __m512i lhs_mat_01_51_sp2 = _mm512_shuffle_epi32(lhs_mat_01_51, (_MM_PERM_ENUM)245); //A50(12-15) A50(12-15) A51(12-15) A51(12-15) A50(12-15) A50(12-15) A51(12-15) A51(12-15) + const __m512i lhs_mat_23_51_sp2 = _mm512_shuffle_epi32(lhs_mat_23_51, (_MM_PERM_ENUM)245); //A52(12-15) A53(12-15) A52(12-15) A53(12-15) A52(12-15) A53(12-15) A52(12-15) A53(12-15) + + const __m512i lhs_mat_01_60_sp2 = 
_mm512_shuffle_epi32(lhs_mat_01_60, (_MM_PERM_ENUM)245); //A60(4-7) A60(4-7) A61(4-7) A61(4-7) A60(4-7) A60(4-7) A61(4-7) A61(4-7) + const __m512i lhs_mat_23_60_sp2 = _mm512_shuffle_epi32(lhs_mat_23_60, (_MM_PERM_ENUM)245); //A62(4-7) A63(4-7) A62(4-7) A63(4-7) A62(4-7) A63(4-7) A62(4-7) A63(4-7) + + const __m512i lhs_mat_01_61_sp2 = _mm512_shuffle_epi32(lhs_mat_01_61, (_MM_PERM_ENUM)245); //A60(12-15) A60(12-15) A61(12-15) A61(12-15) A60(12-15) A60(12-15) A61(12-15) A61(12-15) + const __m512i lhs_mat_23_61_sp2 = _mm512_shuffle_epi32(lhs_mat_23_61, (_MM_PERM_ENUM)245); //A62(12-15) A63(12-15) A62(12-15) A63(12-15) A62(12-15) A63(12-15) A62(12-15) A63(12-15) + + const __m512i lhs_mat_01_70_sp2 = _mm512_shuffle_epi32(lhs_mat_01_70, (_MM_PERM_ENUM)245); //A70(4-7) A70(4-7) A71(4-7) A71(4-7) A70(4-7) A70(4-7) A71(4-7) A71(4-7) + const __m512i lhs_mat_23_70_sp2 = _mm512_shuffle_epi32(lhs_mat_23_70, (_MM_PERM_ENUM)245); //A72(4-7) A73(4-7) A72(4-7) A73(4-7) A72(4-7) A73(4-7) A72(4-7) A73(4-7) + + const __m512i lhs_mat_01_71_sp2 = _mm512_shuffle_epi32(lhs_mat_01_71, (_MM_PERM_ENUM)245); //A70(12-15) A70(12-15) A71(12-15) A71(12-15) A70(12-15) A70(12-15) A71(12-15) A71(12-15) + const __m512i lhs_mat_23_71_sp2 = _mm512_shuffle_epi32(lhs_mat_23_71, (_MM_PERM_ENUM)245); //A72(12-15) A73(12-15) A72(12-15) A73(12-15) A72(12-15) A73(12-15) A72(12-15) A73(12-15) + + // Shuffle pattern one - left side input + const __m512i lhs_mat_s_01_00_sp1 = _mm512_shuffle_epi32(lhs_mat_s_01_00, (_MM_PERM_ENUM)160); //A00(0-3) A00(0-3) A01(0-3) A01(0-3) A00(0-3) A00(0-3) A01(0-3) A01(0-3) + const __m512i lhs_mat_s_23_00_sp1 = _mm512_shuffle_epi32(lhs_mat_s_23_00, (_MM_PERM_ENUM)160); //A02(0-3) A03(0-3) A02(0-3) A03(0-3) A02(0-3) A03(0-3) A02(0-3) A03(0-3) + + const __m512i lhs_mat_s_01_01_sp1 = _mm512_shuffle_epi32(lhs_mat_s_01_01, (_MM_PERM_ENUM)160); //A00(8-11) A00(8-11) A01(8-11) A01(8-11) A00(8-11) A00(8-11) A01(8-11) A01(8-11) + const __m512i lhs_mat_s_23_01_sp1 = _mm512_shuffle_epi32(lhs_mat_s_23_01, (_MM_PERM_ENUM)160); //A02(8-11) A03(8-11) A02(8-11) A03(8-11) A02(8-11) A03(8-11) A02(8-11) A03(8-11) + + const __m512i lhs_mat_s_01_10_sp1 = _mm512_shuffle_epi32(lhs_mat_s_01_10, (_MM_PERM_ENUM)160); //A10(0-3) A10(0-3) A11(0-3) A11(0-3) A10(0-3) A10(0-3) A11(0-3) A11(0-3) + const __m512i lhs_mat_s_23_10_sp1 = _mm512_shuffle_epi32(lhs_mat_s_23_10, (_MM_PERM_ENUM)160); //A12(0-3) A13(0-3) A12(0-3) A13(0-3) A12(0-3) A13(0-3) A12(0-3) A13(0-3) + + const __m512i lhs_mat_s_01_11_sp1 = _mm512_shuffle_epi32(lhs_mat_s_01_11, (_MM_PERM_ENUM)160); //A10(8-11) A10(8-11) A11(8-11) A11(8-11) A10(8-11) A10(8-11) A11(8-11) A11(8-11) + const __m512i lhs_mat_s_23_11_sp1 = _mm512_shuffle_epi32(lhs_mat_s_23_11, (_MM_PERM_ENUM)160); //A12(8-11) A13(8-11) A12(8-11) A13(8-11) A12(8-11) A13(8-11) A12(8-11) A13(8-11) + + const __m512i lhs_mat_s_01_20_sp1 = _mm512_shuffle_epi32(lhs_mat_s_01_20, (_MM_PERM_ENUM)160); //A20(0-3) A20(0-3) A21(0-3) A21(0-3) A20(0-3) A20(0-3) A21(0-3) A21(0-3) + const __m512i lhs_mat_s_23_20_sp1 = _mm512_shuffle_epi32(lhs_mat_s_23_20, (_MM_PERM_ENUM)160); //A22(0-3) A23(0-3) A22(0-3) A23(0-3) A22(0-3) A23(0-3) A22(0-3) A23(0-3) + + const __m512i lhs_mat_s_01_21_sp1 = _mm512_shuffle_epi32(lhs_mat_s_01_21, (_MM_PERM_ENUM)160); //A20(8-11) A20(8-11) A21(8-11) A21(8-11) A20(8-11) A20(8-11) A21(8-11) A21(8-11) + const __m512i lhs_mat_s_23_21_sp1 = _mm512_shuffle_epi32(lhs_mat_s_23_21, (_MM_PERM_ENUM)160); //A22(8-11) A23(8-11) A22(8-11) A23(8-11) A22(8-11) A23(8-11) A22(8-11) A23(8-11) + + const __m512i 
lhs_mat_s_01_30_sp1 = _mm512_shuffle_epi32(lhs_mat_s_01_30, (_MM_PERM_ENUM)160); //A30(0-3) A30(0-3) A31(0-3) A31(0-3) A30(0-3) A30(0-3) A31(0-3) A31(0-3) + const __m512i lhs_mat_s_23_30_sp1 = _mm512_shuffle_epi32(lhs_mat_s_23_30, (_MM_PERM_ENUM)160); //A32(0-3) A33(0-3) A32(0-3) A33(0-3) A32(0-3) A33(0-3) A32(0-3) A33(0-3) + + const __m512i lhs_mat_s_01_31_sp1 = _mm512_shuffle_epi32(lhs_mat_s_01_31, (_MM_PERM_ENUM)160); //A30(8-11) A30(8-11) A31(8-11) A31(8-11) A30(8-11) A30(8-11) A31(8-11) A31(8-11) + const __m512i lhs_mat_s_23_31_sp1 = _mm512_shuffle_epi32(lhs_mat_s_23_31, (_MM_PERM_ENUM)160); //A32(8-11) A33(8-11) A32(8-11) A33(8-11) A32(8-11) A33(8-11) A32(8-11) A33(8-11) + + const __m512i lhs_mat_s_01_40_sp1 = _mm512_shuffle_epi32(lhs_mat_s_01_40, (_MM_PERM_ENUM)160); //A40(0-3) A40(0-3) A41(0-3) A41(0-3) A40(0-3) A40(0-3) A41(0-3) A41(0-3) + const __m512i lhs_mat_s_23_40_sp1 = _mm512_shuffle_epi32(lhs_mat_s_23_40, (_MM_PERM_ENUM)160); //A42(0-3) A43(0-3) A42(0-3) A43(0-3) A42(0-3) A43(0-3) A42(0-3) A43(0-3) + + const __m512i lhs_mat_s_01_41_sp1 = _mm512_shuffle_epi32(lhs_mat_s_01_41, (_MM_PERM_ENUM)160); //A40(8-11) A40(8-11) A41(8-11) A41(8-11) A40(8-11) A40(8-11) A41(8-11) A41(8-11) + const __m512i lhs_mat_s_23_41_sp1 = _mm512_shuffle_epi32(lhs_mat_s_23_41, (_MM_PERM_ENUM)160); //A42(8-11) A43(8-11) A42(8-11) A43(8-11) A42(8-11) A43(8-11) A42(8-11) A43(8-11) + + const __m512i lhs_mat_s_01_50_sp1 = _mm512_shuffle_epi32(lhs_mat_s_01_50, (_MM_PERM_ENUM)160); //A50(0-3) A50(0-3) A51(0-3) A51(0-3) A50(0-3) A50(0-3) A51(0-3) A51(0-3) + const __m512i lhs_mat_s_23_50_sp1 = _mm512_shuffle_epi32(lhs_mat_s_23_50, (_MM_PERM_ENUM)160); //A52(0-3) A53(0-3) A52(0-3) A53(0-3) A52(0-3) A53(0-3) A52(0-3) A53(0-3) + + const __m512i lhs_mat_s_01_51_sp1 = _mm512_shuffle_epi32(lhs_mat_s_01_51, (_MM_PERM_ENUM)160); //A50(8-11) A50(8-11) A51(8-11) A51(8-11) A50(8-11) A50(8-11) A51(8-11) A51(8-11) + const __m512i lhs_mat_s_23_51_sp1 = _mm512_shuffle_epi32(lhs_mat_s_23_51, (_MM_PERM_ENUM)160); //A52(8-11) A53(8-11) A52(8-11) A53(8-11) A52(8-11) A53(8-11) A52(8-11) A53(8-11) + + const __m512i lhs_mat_s_01_60_sp1 = _mm512_shuffle_epi32(lhs_mat_s_01_60, (_MM_PERM_ENUM)160); //A60(0-3) A60(0-3) A61(0-3) A61(0-3) A60(0-3) A60(0-3) A61(0-3) A61(0-3) + const __m512i lhs_mat_s_23_60_sp1 = _mm512_shuffle_epi32(lhs_mat_s_23_60, (_MM_PERM_ENUM)160); //A62(0-3) A63(0-3) A62(0-3) A63(0-3) A62(0-3) A63(0-3) A62(0-3) A63(0-3) + + const __m512i lhs_mat_s_01_61_sp1 = _mm512_shuffle_epi32(lhs_mat_s_01_61, (_MM_PERM_ENUM)160); //A60(8-11) A60(8-11) A61(8-11) A61(8-11) A60(8-11) A60(8-11) A61(8-11) A61(8-11) + const __m512i lhs_mat_s_23_61_sp1 = _mm512_shuffle_epi32(lhs_mat_s_23_61, (_MM_PERM_ENUM)160); //A62(8-11) A63(8-11) A62(8-11) A63(8-11) A62(8-11) A63(8-11) A62(8-11) A63(8-11) + + const __m512i lhs_mat_s_01_70_sp1 = _mm512_shuffle_epi32(lhs_mat_s_01_70, (_MM_PERM_ENUM)160); //A70(0-3) A70(0-3) A71(0-3) A71(0-3) A70(0-3) A70(0-3) A71(0-3) A71(0-3) + const __m512i lhs_mat_s_23_70_sp1 = _mm512_shuffle_epi32(lhs_mat_s_23_70, (_MM_PERM_ENUM)160); //A72(0-3) A73(0-3) A72(0-3) A73(0-3) A72(0-3) A73(0-3) A72(0-3) A73(0-3) + + const __m512i lhs_mat_s_01_71_sp1 = _mm512_shuffle_epi32(lhs_mat_s_01_71, (_MM_PERM_ENUM)160); //A70(8-11) A70(8-11) A71(8-11) A71(8-11) A70(8-11) A70(8-11) A71(8-11) A71(8-11) + const __m512i lhs_mat_s_23_71_sp1 = _mm512_shuffle_epi32(lhs_mat_s_23_71, (_MM_PERM_ENUM)160); //A72(8-11) A73(8-11) A72(8-11) A73(8-11) A72(8-11) A73(8-11) A72(8-11) A73(8-11) + + // Shuffle pattern two- left side input + const 
__m512i lhs_mat_s_01_00_sp2 = _mm512_shuffle_epi32(lhs_mat_s_01_00, (_MM_PERM_ENUM)245); //A00(4-7) A00(4-7) A01(4-7) A01(4-7) A00(4-7) A00(4-7) A01(4-7) A01(4-7) + const __m512i lhs_mat_s_23_00_sp2 = _mm512_shuffle_epi32(lhs_mat_s_23_00, (_MM_PERM_ENUM)245); //A02(4-7) A03(4-7) A02(4-7) A03(4-7) A02(4-7) A03(4-7) A02(4-7) A03(4-7) + + const __m512i lhs_mat_s_01_01_sp2 = _mm512_shuffle_epi32(lhs_mat_s_01_01, (_MM_PERM_ENUM)245); //A00(12-15) A00(12-15) A01(12-15) A01(12-15) A00(12-15) A00(12-15) A01(12-15) A01(12-15) + const __m512i lhs_mat_s_23_01_sp2 = _mm512_shuffle_epi32(lhs_mat_s_23_01, (_MM_PERM_ENUM)245); //A02(12-15) A03(12-15) A02(12-15) A03(12-15) A02(12-15) A03(12-15) A02(12-15) A03(12-15) + + const __m512i lhs_mat_s_01_10_sp2 = _mm512_shuffle_epi32(lhs_mat_s_01_10, (_MM_PERM_ENUM)245); //A10(4-7) A10(4-7) A11(4-7) A11(4-7) A10(4-7) A10(4-7) A11(4-7) A11(4-7) + const __m512i lhs_mat_s_23_10_sp2 = _mm512_shuffle_epi32(lhs_mat_s_23_10, (_MM_PERM_ENUM)245); //A12(4-7) A13(4-7) A12(4-7) A13(4-7) A12(4-7) A13(4-7) A12(4-7) A13(4-7) + + const __m512i lhs_mat_s_01_11_sp2 = _mm512_shuffle_epi32(lhs_mat_s_01_11, (_MM_PERM_ENUM)245); //A10(12-15) A10(12-15) A11(12-15) A11(12-15) A10(12-15) A10(12-15) A11(12-15) A11(12-15) + const __m512i lhs_mat_s_23_11_sp2 = _mm512_shuffle_epi32(lhs_mat_s_23_11, (_MM_PERM_ENUM)245); //A12(12-15) A13(12-15) A12(12-15) A13(12-15) A12(12-15) A13(12-15) A12(12-15) A13(12-15) + + const __m512i lhs_mat_s_01_20_sp2 = _mm512_shuffle_epi32(lhs_mat_s_01_20, (_MM_PERM_ENUM)245); //A20(4-7) A20(4-7) A21(4-7) A21(4-7) A20(4-7) A20(4-7) A21(4-7) A21(4-7) + const __m512i lhs_mat_s_23_20_sp2 = _mm512_shuffle_epi32(lhs_mat_s_23_20, (_MM_PERM_ENUM)245); //A22(4-7) A23(4-7) A22(4-7) A23(4-7) A22(4-7) A23(4-7) A22(4-7) A23(4-7) + + const __m512i lhs_mat_s_01_21_sp2 = _mm512_shuffle_epi32(lhs_mat_s_01_21, (_MM_PERM_ENUM)245); //A20(12-15) A20(12-15) A21(12-15) A21(12-15) A20(12-15) A20(12-15) A21(12-15) A21(12-15) + const __m512i lhs_mat_s_23_21_sp2 = _mm512_shuffle_epi32(lhs_mat_s_23_21, (_MM_PERM_ENUM)245); //A22(12-15) A23(12-15) A22(12-15) A23(12-15) A22(12-15) A23(12-15) A22(12-15) A23(12-15) + + const __m512i lhs_mat_s_01_30_sp2 = _mm512_shuffle_epi32(lhs_mat_s_01_30, (_MM_PERM_ENUM)245); //A30(4-7) A30(4-7) A31(4-7) A31(4-7) A30(4-7) A30(4-7) A31(4-7) A31(4-7) + const __m512i lhs_mat_s_23_30_sp2 = _mm512_shuffle_epi32(lhs_mat_s_23_30, (_MM_PERM_ENUM)245); //A32(4-7) A33(4-7) A32(4-7) A33(4-7) A32(4-7) A33(4-7) A32(4-7) A33(4-7) + + const __m512i lhs_mat_s_01_31_sp2 = _mm512_shuffle_epi32(lhs_mat_s_01_31, (_MM_PERM_ENUM)245); //A30(12-15) A30(12-15) A31(12-15) A31(12-15) A30(12-15) A30(12-15) A31(12-15) A31(12-15) + const __m512i lhs_mat_s_23_31_sp2 = _mm512_shuffle_epi32(lhs_mat_s_23_31, (_MM_PERM_ENUM)245); //A32(12-15) A33(12-15) A32(12-15) A33(12-15) A32(12-15) A33(12-15) A32(12-15) A33(12-15) + + const __m512i lhs_mat_s_01_40_sp2 = _mm512_shuffle_epi32(lhs_mat_s_01_40, (_MM_PERM_ENUM)245); //A40(4-7) A40(4-7) A41(4-7) A41(4-7) A40(4-7) A40(4-7) A41(4-7) A41(4-7) + const __m512i lhs_mat_s_23_40_sp2 = _mm512_shuffle_epi32(lhs_mat_s_23_40, (_MM_PERM_ENUM)245); //A42(4-7) A43(4-7) A42(4-7) A43(4-7) A42(4-7) A43(4-7) A42(4-7) A43(4-7) + + const __m512i lhs_mat_s_01_41_sp2 = _mm512_shuffle_epi32(lhs_mat_s_01_41, (_MM_PERM_ENUM)245); //A40(12-15) A40(12-15) A41(12-15) A41(12-15) A40(12-15) A40(12-15) A41(12-15) A41(12-15) + const __m512i lhs_mat_s_23_41_sp2 = _mm512_shuffle_epi32(lhs_mat_s_23_41, (_MM_PERM_ENUM)245); //A42(12-15) A43(12-15) A42(12-15) A43(12-15) 
A42(12-15) A43(12-15) A42(12-15) A43(12-15) + + const __m512i lhs_mat_s_01_50_sp2 = _mm512_shuffle_epi32(lhs_mat_s_01_50, (_MM_PERM_ENUM)245); //A50(4-7) A50(4-7) A51(4-7) A51(4-7) A50(4-7) A50(4-7) A51(4-7) A51(4-7) + const __m512i lhs_mat_s_23_50_sp2 = _mm512_shuffle_epi32(lhs_mat_s_23_50, (_MM_PERM_ENUM)245); //A52(4-7) A53(4-7) A52(4-7) A53(4-7) A52(4-7) A53(4-7) A52(4-7) A53(4-7) + + const __m512i lhs_mat_s_01_51_sp2 = _mm512_shuffle_epi32(lhs_mat_s_01_51, (_MM_PERM_ENUM)245); //A50(12-15) A50(12-15) A51(12-15) A51(12-15) A50(12-15) A50(12-15) A51(12-15) A51(12-15) + const __m512i lhs_mat_s_23_51_sp2 = _mm512_shuffle_epi32(lhs_mat_s_23_51, (_MM_PERM_ENUM)245); //A52(12-15) A53(12-15) A52(12-15) A53(12-15) A52(12-15) A53(12-15) A52(12-15) A53(12-15) + + const __m512i lhs_mat_s_01_60_sp2 = _mm512_shuffle_epi32(lhs_mat_s_01_60, (_MM_PERM_ENUM)245); //A60(4-7) A60(4-7) A61(4-7) A61(4-7) A60(4-7) A60(4-7) A61(4-7) A61(4-7) + const __m512i lhs_mat_s_23_60_sp2 = _mm512_shuffle_epi32(lhs_mat_s_23_60, (_MM_PERM_ENUM)245); //A62(4-7) A63(4-7) A62(4-7) A63(4-7) A62(4-7) A63(4-7) A62(4-7) A63(4-7) + + const __m512i lhs_mat_s_01_61_sp2 = _mm512_shuffle_epi32(lhs_mat_s_01_61, (_MM_PERM_ENUM)245); //A60(12-15) A60(12-15) A61(12-15) A61(12-15) A60(12-15) A60(12-15) A61(12-15) A61(12-15) + const __m512i lhs_mat_s_23_61_sp2 = _mm512_shuffle_epi32(lhs_mat_s_23_61, (_MM_PERM_ENUM)245); //A62(12-15) A63(12-15) A62(12-15) A63(12-15) A62(12-15) A63(12-15) A62(12-15) A63(12-15) + + const __m512i lhs_mat_s_01_70_sp2 = _mm512_shuffle_epi32(lhs_mat_s_01_70, (_MM_PERM_ENUM)245); //A70(4-7) A70(4-7) A71(4-7) A71(4-7) A70(4-7) A70(4-7) A71(4-7) A71(4-7) + const __m512i lhs_mat_s_23_70_sp2 = _mm512_shuffle_epi32(lhs_mat_s_23_70, (_MM_PERM_ENUM)245); //A72(4-7) A73(4-7) A72(4-7) A73(4-7) A72(4-7) A73(4-7) A72(4-7) A73(4-7) + + const __m512i lhs_mat_s_01_71_sp2 = _mm512_shuffle_epi32(lhs_mat_s_01_71, (_MM_PERM_ENUM)245); //A70(12-15) A70(12-15) A71(12-15) A71(12-15) A70(12-15) A70(12-15) A71(12-15) A71(12-15) + const __m512i lhs_mat_s_23_71_sp2 = _mm512_shuffle_epi32(lhs_mat_s_23_71, (_MM_PERM_ENUM)245); //A72(12-15) A73(12-15) A72(12-15) A73(12-15) A72(12-15) A73(12-15) A72(12-15) A73(12-15) + + // The values arranged in shuffle patterns are operated with dot product operation within 32 bit lane i.e corresponding bytes and multiplied and added into 32 bit integers within 32 bit lane + __m512i iacc_mat_00_0_sp1 = _mm512_add_epi16(_mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_00_sp1, lhs_mat_01_00_sp1), lhs_mat_s_01_00_sp1), _mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_01_sp1, lhs_mat_01_01_sp1), lhs_mat_s_01_01_sp1)); + __m512i iacc_mat_01_0_sp1 = _mm512_add_epi16(_mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_00_sp1, lhs_mat_01_00_sp1), lhs_mat_s_01_00_sp1), _mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_01_sp1, lhs_mat_01_01_sp1), lhs_mat_s_01_01_sp1)); + + __m512i iacc_mat_10_0_sp1 = _mm512_add_epi16(_mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_00_sp1, lhs_mat_23_00_sp1), lhs_mat_s_23_00_sp1), _mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_01_sp1, lhs_mat_23_01_sp1), lhs_mat_s_23_01_sp1)); + __m512i iacc_mat_11_0_sp1 = _mm512_add_epi16(_mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_00_sp1, lhs_mat_23_00_sp1), lhs_mat_s_23_00_sp1), _mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_01_sp1, lhs_mat_23_01_sp1), lhs_mat_s_23_01_sp1)); + + __m512i iacc_mat_00_1_sp1 = _mm512_add_epi16(_mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_10_sp1, 
lhs_mat_01_10_sp1), lhs_mat_s_01_10_sp1), _mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_11_sp1, lhs_mat_01_11_sp1), lhs_mat_s_01_11_sp1)); + __m512i iacc_mat_01_1_sp1 = _mm512_add_epi16(_mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_10_sp1, lhs_mat_01_10_sp1), lhs_mat_s_01_10_sp1), _mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_11_sp1, lhs_mat_01_11_sp1), lhs_mat_s_01_11_sp1)); + + __m512i iacc_mat_10_1_sp1 = _mm512_add_epi16(_mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_10_sp1, lhs_mat_23_10_sp1), lhs_mat_s_23_10_sp1), _mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_11_sp1, lhs_mat_23_11_sp1), lhs_mat_s_23_11_sp1)); + __m512i iacc_mat_11_1_sp1 = _mm512_add_epi16(_mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_10_sp1, lhs_mat_23_10_sp1), lhs_mat_s_23_10_sp1), _mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_11_sp1, lhs_mat_23_11_sp1), lhs_mat_s_23_11_sp1)); + + __m512i iacc_mat_00_2_sp1 = _mm512_add_epi16(_mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_20_sp1, lhs_mat_01_20_sp1), lhs_mat_s_01_20_sp1), _mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_21_sp1, lhs_mat_01_21_sp1), lhs_mat_s_01_21_sp1)); + __m512i iacc_mat_01_2_sp1 = _mm512_add_epi16(_mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_20_sp1, lhs_mat_01_20_sp1), lhs_mat_s_01_20_sp1), _mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_21_sp1, lhs_mat_01_21_sp1), lhs_mat_s_01_21_sp1)); + + __m512i iacc_mat_10_2_sp1 = _mm512_add_epi16(_mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_20_sp1, lhs_mat_23_20_sp1), lhs_mat_s_23_20_sp1), _mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_21_sp1, lhs_mat_23_21_sp1), lhs_mat_s_23_21_sp1)); + __m512i iacc_mat_11_2_sp1 = _mm512_add_epi16(_mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_20_sp1, lhs_mat_23_20_sp1), lhs_mat_s_23_20_sp1), _mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_21_sp1, lhs_mat_23_21_sp1), lhs_mat_s_23_21_sp1)); + + __m512i iacc_mat_00_3_sp1 = _mm512_add_epi16(_mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_30_sp1, lhs_mat_01_30_sp1), lhs_mat_s_01_30_sp1), _mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_31_sp1, lhs_mat_01_31_sp1), lhs_mat_s_01_31_sp1)); + __m512i iacc_mat_01_3_sp1 = _mm512_add_epi16(_mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_30_sp1, lhs_mat_01_30_sp1), lhs_mat_s_01_30_sp1), _mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_31_sp1, lhs_mat_01_31_sp1), lhs_mat_s_01_31_sp1)); + + __m512i iacc_mat_10_3_sp1 = _mm512_add_epi16(_mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_30_sp1, lhs_mat_23_30_sp1), lhs_mat_s_23_30_sp1), _mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_31_sp1, lhs_mat_23_31_sp1), lhs_mat_s_23_31_sp1)); + __m512i iacc_mat_11_3_sp1 = _mm512_add_epi16(_mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_30_sp1, lhs_mat_23_30_sp1), lhs_mat_s_23_30_sp1), _mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_31_sp1, lhs_mat_23_31_sp1), lhs_mat_s_23_31_sp1)); + + __m512i iacc_mat_00_4_sp1 = _mm512_add_epi16(_mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_40_sp1, lhs_mat_01_40_sp1), lhs_mat_s_01_40_sp1), _mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_41_sp1, lhs_mat_01_41_sp1), lhs_mat_s_01_41_sp1)); + __m512i iacc_mat_01_4_sp1 = _mm512_add_epi16(_mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_40_sp1, lhs_mat_01_40_sp1), lhs_mat_s_01_40_sp1), _mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_41_sp1, lhs_mat_01_41_sp1), lhs_mat_s_01_41_sp1)); + + __m512i iacc_mat_10_4_sp1 = 
_mm512_add_epi16(_mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_40_sp1, lhs_mat_23_40_sp1), lhs_mat_s_23_40_sp1), _mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_41_sp1, lhs_mat_23_41_sp1), lhs_mat_s_23_41_sp1)); + __m512i iacc_mat_11_4_sp1 = _mm512_add_epi16(_mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_40_sp1, lhs_mat_23_40_sp1), lhs_mat_s_23_40_sp1), _mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_41_sp1, lhs_mat_23_41_sp1), lhs_mat_s_23_41_sp1)); + + __m512i iacc_mat_00_5_sp1 = _mm512_add_epi16(_mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_50_sp1, lhs_mat_01_50_sp1), lhs_mat_s_01_50_sp1), _mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_51_sp1, lhs_mat_01_51_sp1), lhs_mat_s_01_51_sp1)); + __m512i iacc_mat_01_5_sp1 = _mm512_add_epi16(_mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_50_sp1, lhs_mat_01_50_sp1), lhs_mat_s_01_50_sp1), _mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_51_sp1, lhs_mat_01_51_sp1), lhs_mat_s_01_51_sp1)); + + __m512i iacc_mat_10_5_sp1 = _mm512_add_epi16(_mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_50_sp1, lhs_mat_23_50_sp1), lhs_mat_s_23_50_sp1), _mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_51_sp1, lhs_mat_23_51_sp1), lhs_mat_s_23_51_sp1)); + __m512i iacc_mat_11_5_sp1 = _mm512_add_epi16(_mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_50_sp1, lhs_mat_23_50_sp1), lhs_mat_s_23_50_sp1), _mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_51_sp1, lhs_mat_23_51_sp1), lhs_mat_s_23_51_sp1)); + + __m512i iacc_mat_00_6_sp1 = _mm512_add_epi16(_mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_60_sp1, lhs_mat_01_60_sp1), lhs_mat_s_01_60_sp1), _mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_61_sp1, lhs_mat_01_61_sp1), lhs_mat_s_01_61_sp1)); + __m512i iacc_mat_01_6_sp1 = _mm512_add_epi16(_mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_60_sp1, lhs_mat_01_60_sp1), lhs_mat_s_01_60_sp1), _mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_61_sp1, lhs_mat_01_61_sp1), lhs_mat_s_01_61_sp1)); + + __m512i iacc_mat_10_6_sp1 = _mm512_add_epi16(_mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_60_sp1, lhs_mat_23_60_sp1), lhs_mat_s_23_60_sp1), _mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_61_sp1, lhs_mat_23_61_sp1), lhs_mat_s_23_61_sp1)); + __m512i iacc_mat_11_6_sp1 = _mm512_add_epi16(_mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_60_sp1, lhs_mat_23_60_sp1), lhs_mat_s_23_60_sp1), _mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_61_sp1, lhs_mat_23_61_sp1), lhs_mat_s_23_61_sp1)); + + __m512i iacc_mat_00_7_sp1 = _mm512_add_epi16(_mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_70_sp1, lhs_mat_01_70_sp1), lhs_mat_s_01_70_sp1), _mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_71_sp1, lhs_mat_01_71_sp1), lhs_mat_s_01_71_sp1)); + __m512i iacc_mat_01_7_sp1 = _mm512_add_epi16(_mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_70_sp1, lhs_mat_01_70_sp1), lhs_mat_s_01_70_sp1), _mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_71_sp1, lhs_mat_01_71_sp1), lhs_mat_s_01_71_sp1)); + + __m512i iacc_mat_10_7_sp1 = _mm512_add_epi16(_mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_70_sp1, lhs_mat_23_70_sp1), lhs_mat_s_23_70_sp1), _mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_71_sp1, lhs_mat_23_71_sp1), lhs_mat_s_23_71_sp1)); + __m512i iacc_mat_11_7_sp1 = _mm512_add_epi16(_mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_70_sp1, lhs_mat_23_70_sp1), lhs_mat_s_23_70_sp1), _mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_71_sp1, 
lhs_mat_23_71_sp1), lhs_mat_s_23_71_sp1)); + + __m512i iacc_mat_00_0_sp2 = _mm512_add_epi16(_mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_00_sp2, lhs_mat_01_00_sp2), lhs_mat_s_01_00_sp2), _mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_01_sp2, lhs_mat_01_01_sp2), lhs_mat_s_01_01_sp2)); + __m512i iacc_mat_01_0_sp2 = _mm512_add_epi16(_mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_00_sp2, lhs_mat_01_00_sp2), lhs_mat_s_01_00_sp2), _mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_01_sp2, lhs_mat_01_01_sp2), lhs_mat_s_01_01_sp2)); + + __m512i iacc_mat_10_0_sp2 = _mm512_add_epi16(_mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_00_sp2, lhs_mat_23_00_sp2), lhs_mat_s_23_00_sp2), _mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_01_sp2, lhs_mat_23_01_sp2), lhs_mat_s_23_01_sp2)); + __m512i iacc_mat_11_0_sp2 = _mm512_add_epi16(_mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_00_sp2, lhs_mat_23_00_sp2), lhs_mat_s_23_00_sp2), _mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_01_sp2, lhs_mat_23_01_sp2), lhs_mat_s_23_01_sp2)); + + __m512i iacc_mat_00_1_sp2 = _mm512_add_epi16(_mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_10_sp2, lhs_mat_01_10_sp2), lhs_mat_s_01_10_sp2), _mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_11_sp2, lhs_mat_01_11_sp2), lhs_mat_s_01_11_sp2)); + __m512i iacc_mat_01_1_sp2 = _mm512_add_epi16(_mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_10_sp2, lhs_mat_01_10_sp2), lhs_mat_s_01_10_sp2), _mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_11_sp2, lhs_mat_01_11_sp2), lhs_mat_s_01_11_sp2)); + + __m512i iacc_mat_10_1_sp2 = _mm512_add_epi16(_mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_10_sp2, lhs_mat_23_10_sp2), lhs_mat_s_23_10_sp2), _mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_11_sp2, lhs_mat_23_11_sp2), lhs_mat_s_23_11_sp2)); + __m512i iacc_mat_11_1_sp2 = _mm512_add_epi16(_mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_10_sp2, lhs_mat_23_10_sp2), lhs_mat_s_23_10_sp2), _mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_11_sp2, lhs_mat_23_11_sp2), lhs_mat_s_23_11_sp2)); + + __m512i iacc_mat_00_2_sp2 = _mm512_add_epi16(_mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_20_sp2, lhs_mat_01_20_sp2), lhs_mat_s_01_20_sp2), _mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_21_sp2, lhs_mat_01_21_sp2), lhs_mat_s_01_21_sp2)); + __m512i iacc_mat_01_2_sp2 = _mm512_add_epi16(_mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_20_sp2, lhs_mat_01_20_sp2), lhs_mat_s_01_20_sp2), _mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_21_sp2, lhs_mat_01_21_sp2), lhs_mat_s_01_21_sp2)); + + __m512i iacc_mat_10_2_sp2 = _mm512_add_epi16(_mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_20_sp2, lhs_mat_23_20_sp2), lhs_mat_s_23_20_sp2), _mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_21_sp2, lhs_mat_23_21_sp2), lhs_mat_s_23_21_sp2)); + __m512i iacc_mat_11_2_sp2 = _mm512_add_epi16(_mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_20_sp2, lhs_mat_23_20_sp2), lhs_mat_s_23_20_sp2), _mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_21_sp2, lhs_mat_23_21_sp2), lhs_mat_s_23_21_sp2)); + + __m512i iacc_mat_00_3_sp2 = _mm512_add_epi16(_mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_30_sp2, lhs_mat_01_30_sp2), lhs_mat_s_01_30_sp2), _mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_31_sp2, lhs_mat_01_31_sp2), lhs_mat_s_01_31_sp2)); + __m512i iacc_mat_01_3_sp2 = _mm512_add_epi16(_mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_30_sp2, lhs_mat_01_30_sp2), 
lhs_mat_s_01_30_sp2), _mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_31_sp2, lhs_mat_01_31_sp2), lhs_mat_s_01_31_sp2)); + + __m512i iacc_mat_10_3_sp2 = _mm512_add_epi16(_mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_30_sp2, lhs_mat_23_30_sp2), lhs_mat_s_23_30_sp2), _mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_31_sp2, lhs_mat_23_31_sp2), lhs_mat_s_23_31_sp2)); + __m512i iacc_mat_11_3_sp2 = _mm512_add_epi16(_mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_30_sp2, lhs_mat_23_30_sp2), lhs_mat_s_23_30_sp2), _mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_31_sp2, lhs_mat_23_31_sp2), lhs_mat_s_23_31_sp2)); + + __m512i iacc_mat_00_4_sp2 = _mm512_add_epi16(_mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_40_sp2, lhs_mat_01_40_sp2), lhs_mat_s_01_40_sp2), _mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_41_sp2, lhs_mat_01_41_sp2), lhs_mat_s_01_41_sp2)); + __m512i iacc_mat_01_4_sp2 = _mm512_add_epi16(_mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_40_sp2, lhs_mat_01_40_sp2), lhs_mat_s_01_40_sp2), _mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_41_sp2, lhs_mat_01_41_sp2), lhs_mat_s_01_41_sp2)); + + __m512i iacc_mat_10_4_sp2 = _mm512_add_epi16(_mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_40_sp2, lhs_mat_23_40_sp2), lhs_mat_s_23_40_sp2), _mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_41_sp2, lhs_mat_23_41_sp2), lhs_mat_s_23_41_sp2)); + __m512i iacc_mat_11_4_sp2 = _mm512_add_epi16(_mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_40_sp2, lhs_mat_23_40_sp2), lhs_mat_s_23_40_sp2), _mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_41_sp2, lhs_mat_23_41_sp2), lhs_mat_s_23_41_sp2)); + + __m512i iacc_mat_00_5_sp2 = _mm512_add_epi16(_mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_50_sp2, lhs_mat_01_50_sp2), lhs_mat_s_01_50_sp2), _mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_51_sp2, lhs_mat_01_51_sp2), lhs_mat_s_01_51_sp2)); + __m512i iacc_mat_01_5_sp2 = _mm512_add_epi16(_mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_50_sp2, lhs_mat_01_50_sp2), lhs_mat_s_01_50_sp2), _mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_51_sp2, lhs_mat_01_51_sp2), lhs_mat_s_01_51_sp2)); + + __m512i iacc_mat_10_5_sp2 = _mm512_add_epi16(_mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_50_sp2, lhs_mat_23_50_sp2), lhs_mat_s_23_50_sp2), _mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_51_sp2, lhs_mat_23_51_sp2), lhs_mat_s_23_51_sp2)); + __m512i iacc_mat_11_5_sp2 = _mm512_add_epi16(_mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_50_sp2, lhs_mat_23_50_sp2), lhs_mat_s_23_50_sp2), _mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_51_sp2, lhs_mat_23_51_sp2), lhs_mat_s_23_51_sp2)); + + __m512i iacc_mat_00_6_sp2 = _mm512_add_epi16(_mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_60_sp2, lhs_mat_01_60_sp2), lhs_mat_s_01_60_sp2), _mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_61_sp2, lhs_mat_01_61_sp2), lhs_mat_s_01_61_sp2)); + __m512i iacc_mat_01_6_sp2 = _mm512_add_epi16(_mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_60_sp2, lhs_mat_01_60_sp2), lhs_mat_s_01_60_sp2), _mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_61_sp2, lhs_mat_01_61_sp2), lhs_mat_s_01_61_sp2)); + + __m512i iacc_mat_10_6_sp2 = _mm512_add_epi16(_mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_60_sp2, lhs_mat_23_60_sp2), lhs_mat_s_23_60_sp2), _mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_61_sp2, lhs_mat_23_61_sp2), lhs_mat_s_23_61_sp2)); + __m512i iacc_mat_11_6_sp2 = 
_mm512_add_epi16(_mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_60_sp2, lhs_mat_23_60_sp2), lhs_mat_s_23_60_sp2), _mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_61_sp2, lhs_mat_23_61_sp2), lhs_mat_s_23_61_sp2)); + + __m512i iacc_mat_00_7_sp2 = _mm512_add_epi16(_mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_70_sp2, lhs_mat_01_70_sp2), lhs_mat_s_01_70_sp2), _mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_71_sp2, lhs_mat_01_71_sp2), lhs_mat_s_01_71_sp2)); + __m512i iacc_mat_01_7_sp2 = _mm512_add_epi16(_mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_70_sp2, lhs_mat_01_70_sp2), lhs_mat_s_01_70_sp2), _mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_71_sp2, lhs_mat_01_71_sp2), lhs_mat_s_01_71_sp2)); + + __m512i iacc_mat_10_7_sp2 = _mm512_add_epi16(_mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_70_sp2, lhs_mat_23_70_sp2), lhs_mat_s_23_70_sp2), _mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_71_sp2, lhs_mat_23_71_sp2), lhs_mat_s_23_71_sp2)); + __m512i iacc_mat_11_7_sp2 = _mm512_add_epi16(_mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_70_sp2, lhs_mat_23_70_sp2), lhs_mat_s_23_70_sp2), _mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_71_sp2, lhs_mat_23_71_sp2), lhs_mat_s_23_71_sp2)); + + // Combine results from both shuffle patterns for each output block + __m512i iacc_mat_00_0 = _mm512_add_epi16(iacc_mat_00_0_sp1, iacc_mat_00_0_sp2); + __m512i iacc_mat_01_0 = _mm512_add_epi16(iacc_mat_01_0_sp1, iacc_mat_01_0_sp2); + __m512i iacc_mat_10_0 = _mm512_add_epi16(iacc_mat_10_0_sp1, iacc_mat_10_0_sp2); + __m512i iacc_mat_11_0 = _mm512_add_epi16(iacc_mat_11_0_sp1, iacc_mat_11_0_sp2); + + __m512i iacc_mat_00_1 = _mm512_add_epi16(iacc_mat_00_1_sp1, iacc_mat_00_1_sp2); + __m512i iacc_mat_01_1 = _mm512_add_epi16(iacc_mat_01_1_sp1, iacc_mat_01_1_sp2); + __m512i iacc_mat_10_1 = _mm512_add_epi16(iacc_mat_10_1_sp1, iacc_mat_10_1_sp2); + __m512i iacc_mat_11_1 = _mm512_add_epi16(iacc_mat_11_1_sp1, iacc_mat_11_1_sp2); + + __m512i iacc_mat_00_2 = _mm512_add_epi16(iacc_mat_00_2_sp1, iacc_mat_00_2_sp2); + __m512i iacc_mat_01_2 = _mm512_add_epi16(iacc_mat_01_2_sp1, iacc_mat_01_2_sp2); + __m512i iacc_mat_10_2 = _mm512_add_epi16(iacc_mat_10_2_sp1, iacc_mat_10_2_sp2); + __m512i iacc_mat_11_2 = _mm512_add_epi16(iacc_mat_11_2_sp1, iacc_mat_11_2_sp2); + + __m512i iacc_mat_00_3 = _mm512_add_epi16(iacc_mat_00_3_sp1, iacc_mat_00_3_sp2); + __m512i iacc_mat_01_3 = _mm512_add_epi16(iacc_mat_01_3_sp1, iacc_mat_01_3_sp2); + __m512i iacc_mat_10_3 = _mm512_add_epi16(iacc_mat_10_3_sp1, iacc_mat_10_3_sp2); + __m512i iacc_mat_11_3 = _mm512_add_epi16(iacc_mat_11_3_sp1, iacc_mat_11_3_sp2); + + __m512i iacc_mat_00_4 = _mm512_add_epi16(iacc_mat_00_4_sp1, iacc_mat_00_4_sp2); + __m512i iacc_mat_01_4 = _mm512_add_epi16(iacc_mat_01_4_sp1, iacc_mat_01_4_sp2); + __m512i iacc_mat_10_4 = _mm512_add_epi16(iacc_mat_10_4_sp1, iacc_mat_10_4_sp2); + __m512i iacc_mat_11_4 = _mm512_add_epi16(iacc_mat_11_4_sp1, iacc_mat_11_4_sp2); + + __m512i iacc_mat_00_5 = _mm512_add_epi16(iacc_mat_00_5_sp1, iacc_mat_00_5_sp2); + __m512i iacc_mat_01_5 = _mm512_add_epi16(iacc_mat_01_5_sp1, iacc_mat_01_5_sp2); + __m512i iacc_mat_10_5 = _mm512_add_epi16(iacc_mat_10_5_sp1, iacc_mat_10_5_sp2); + __m512i iacc_mat_11_5 = _mm512_add_epi16(iacc_mat_11_5_sp1, iacc_mat_11_5_sp2); + + __m512i iacc_mat_00_6 = _mm512_add_epi16(iacc_mat_00_6_sp1, iacc_mat_00_6_sp2); + __m512i iacc_mat_01_6 = _mm512_add_epi16(iacc_mat_01_6_sp1, iacc_mat_01_6_sp2); + __m512i iacc_mat_10_6 = _mm512_add_epi16(iacc_mat_10_6_sp1, 
iacc_mat_10_6_sp2); + __m512i iacc_mat_11_6 = _mm512_add_epi16(iacc_mat_11_6_sp1, iacc_mat_11_6_sp2); + + __m512i iacc_mat_00_7 = _mm512_add_epi16(iacc_mat_00_7_sp1, iacc_mat_00_7_sp2); + __m512i iacc_mat_01_7 = _mm512_add_epi16(iacc_mat_01_7_sp1, iacc_mat_01_7_sp2); + __m512i iacc_mat_10_7 = _mm512_add_epi16(iacc_mat_10_7_sp1, iacc_mat_10_7_sp2); + __m512i iacc_mat_11_7 = _mm512_add_epi16(iacc_mat_11_7_sp1, iacc_mat_11_7_sp2); + + // Output of both shuffle patterns are added in order to sum dot product outputs of all 32 values in block + iacc_mat_00_0 = _mm512_madd_epi16(iacc_mat_00_0, scale_014589CD_0); + iacc_mat_01_0 = _mm512_madd_epi16(iacc_mat_01_0, scale_2367ABEF_0); + iacc_mat_10_0 = _mm512_madd_epi16(iacc_mat_10_0, scale_014589CD_0); + iacc_mat_11_0 = _mm512_madd_epi16(iacc_mat_11_0, scale_2367ABEF_0); + + iacc_mat_00_1 = _mm512_madd_epi16(iacc_mat_00_1, scale_014589CD_1); + iacc_mat_01_1 = _mm512_madd_epi16(iacc_mat_01_1, scale_2367ABEF_1); + iacc_mat_10_1 = _mm512_madd_epi16(iacc_mat_10_1, scale_014589CD_1); + iacc_mat_11_1 = _mm512_madd_epi16(iacc_mat_11_1, scale_2367ABEF_1); + + iacc_mat_00_2 = _mm512_madd_epi16(iacc_mat_00_2, scale_014589CD_2); + iacc_mat_01_2 = _mm512_madd_epi16(iacc_mat_01_2, scale_2367ABEF_2); + iacc_mat_10_2 = _mm512_madd_epi16(iacc_mat_10_2, scale_014589CD_2); + iacc_mat_11_2 = _mm512_madd_epi16(iacc_mat_11_2, scale_2367ABEF_2); + + iacc_mat_00_3 = _mm512_madd_epi16(iacc_mat_00_3, scale_014589CD_3); + iacc_mat_01_3 = _mm512_madd_epi16(iacc_mat_01_3, scale_2367ABEF_3); + iacc_mat_10_3 = _mm512_madd_epi16(iacc_mat_10_3, scale_014589CD_3); + iacc_mat_11_3 = _mm512_madd_epi16(iacc_mat_11_3, scale_2367ABEF_3); + + iacc_mat_00_4 = _mm512_madd_epi16(iacc_mat_00_4, scale_014589CD_4); + iacc_mat_01_4 = _mm512_madd_epi16(iacc_mat_01_4, scale_2367ABEF_4); + iacc_mat_10_4 = _mm512_madd_epi16(iacc_mat_10_4, scale_014589CD_4); + iacc_mat_11_4 = _mm512_madd_epi16(iacc_mat_11_4, scale_2367ABEF_4); + + iacc_mat_00_5 = _mm512_madd_epi16(iacc_mat_00_5, scale_014589CD_5); + iacc_mat_01_5 = _mm512_madd_epi16(iacc_mat_01_5, scale_2367ABEF_5); + iacc_mat_10_5 = _mm512_madd_epi16(iacc_mat_10_5, scale_014589CD_5); + iacc_mat_11_5 = _mm512_madd_epi16(iacc_mat_11_5, scale_2367ABEF_5); + + iacc_mat_00_6 = _mm512_madd_epi16(iacc_mat_00_6, scale_014589CD_6); + iacc_mat_01_6 = _mm512_madd_epi16(iacc_mat_01_6, scale_2367ABEF_6); + iacc_mat_10_6 = _mm512_madd_epi16(iacc_mat_10_6, scale_014589CD_6); + iacc_mat_11_6 = _mm512_madd_epi16(iacc_mat_11_6, scale_2367ABEF_6); + + iacc_mat_00_7 = _mm512_madd_epi16(iacc_mat_00_7, scale_014589CD_7); + iacc_mat_01_7 = _mm512_madd_epi16(iacc_mat_01_7, scale_2367ABEF_7); + iacc_mat_10_7 = _mm512_madd_epi16(iacc_mat_10_7, scale_014589CD_7); + iacc_mat_11_7 = _mm512_madd_epi16(iacc_mat_11_7, scale_2367ABEF_7); + + + __m512i iacc_mat_00 = _mm512_add_epi32(_mm512_add_epi32(_mm512_add_epi32(iacc_mat_00_0, iacc_mat_00_1), _mm512_add_epi32(iacc_mat_00_2, iacc_mat_00_3)), _mm512_add_epi32(_mm512_add_epi32(iacc_mat_00_4, iacc_mat_00_5), _mm512_add_epi32(iacc_mat_00_6, iacc_mat_00_7))); + __m512i iacc_mat_01 = _mm512_add_epi32(_mm512_add_epi32(_mm512_add_epi32(iacc_mat_01_0, iacc_mat_01_1), _mm512_add_epi32(iacc_mat_01_2, iacc_mat_01_3)), _mm512_add_epi32(_mm512_add_epi32(iacc_mat_01_4, iacc_mat_01_5), _mm512_add_epi32(iacc_mat_01_6, iacc_mat_01_7))); + __m512i iacc_mat_10 = _mm512_add_epi32(_mm512_add_epi32(_mm512_add_epi32(iacc_mat_10_0, iacc_mat_10_1), _mm512_add_epi32(iacc_mat_10_2, iacc_mat_10_3)), _mm512_add_epi32(_mm512_add_epi32(iacc_mat_10_4, 
iacc_mat_10_5), _mm512_add_epi32(iacc_mat_10_6, iacc_mat_10_7))); + __m512i iacc_mat_11 = _mm512_add_epi32(_mm512_add_epi32(_mm512_add_epi32(iacc_mat_11_0, iacc_mat_11_1), _mm512_add_epi32(iacc_mat_11_2, iacc_mat_11_3)), _mm512_add_epi32(_mm512_add_epi32(iacc_mat_11_4, iacc_mat_11_5), _mm512_add_epi32(iacc_mat_11_6, iacc_mat_11_7))); + + // Straighten out to make 4 row vectors + __m512i iacc_row_0 = _mm512_mask_blend_epi32(0xCCCC, iacc_mat_00, _mm512_shuffle_epi32(iacc_mat_01, (_MM_PERM_ENUM)78)); + __m512i iacc_row_1 = _mm512_mask_blend_epi32(0xCCCC, _mm512_shuffle_epi32(iacc_mat_00, (_MM_PERM_ENUM)78), iacc_mat_01); + __m512i iacc_row_2 = _mm512_mask_blend_epi32(0xCCCC, iacc_mat_10, _mm512_shuffle_epi32(iacc_mat_11, (_MM_PERM_ENUM)78)); + __m512i iacc_row_3 = _mm512_mask_blend_epi32(0xCCCC, _mm512_shuffle_epi32(iacc_mat_10, (_MM_PERM_ENUM)78), iacc_mat_11); + + // Load the scale (d) values for all the 4 Q8_K blocks and repeat them across lanes + const __m128 row_scale_f32_sse = _mm_load_ps(a_ptrs[rp][b].d); + const __m256 row_scale_f32_ymm = _mm256_set_m128(row_scale_f32_sse, row_scale_f32_sse); + const __m512 row_scale_f32 = _mm512_insertf32x8(_mm512_castps256_ps512(row_scale_f32_ymm), row_scale_f32_ymm, 1); + + // Multiply with the appropriate scales and accumulate below + acc_rows[rp * 4] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_0), _mm512_mul_ps(col_scale_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 0)), acc_rows[rp * 4]); + acc_rows[rp * 4 + 1] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_1), _mm512_mul_ps(col_scale_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 85)), acc_rows[rp * 4 + 1]); + acc_rows[rp * 4 + 2] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_2), _mm512_mul_ps(col_scale_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 170)), acc_rows[rp * 4 + 2]); + acc_rows[rp * 4 + 3] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_3), _mm512_mul_ps(col_scale_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 255)), acc_rows[rp * 4 + 3]); + } + } + } + + // Store the accumulated values + for (int i = 0; i < 16; i++) { + _mm512_storeu_ps((float * )(s + ((y * 4 + i) * bs + x * 8)), acc_rows[i]); + printf("Index = %d",((y * 4 + i) * bs + x * 8)); + printf("\n"); + printf("sub_ps values = "); + print_m512f(acc_rows[i]); + } + } + } + exit(0); +#endif +*/ for (; y < anr / 4; y += 4){ const block_q8_Kx4 * a_ptrs[4]; From ed662687cfb236940ca2d78303d3ec5a857c3b0b Mon Sep 17 00:00:00 2001 From: Manogna-Sree Date: Mon, 11 Aug 2025 06:38:17 -0700 Subject: [PATCH 05/23] Avx512 implementation of GEMM Q6K for edge handling case --- ggml/src/ggml-cpu/arch/x86/repack.cpp | 974 +++++++++++++++++++++++++- 1 file changed, 966 insertions(+), 8 deletions(-) diff --git a/ggml/src/ggml-cpu/arch/x86/repack.cpp b/ggml/src/ggml-cpu/arch/x86/repack.cpp index 9a9722f097..68fecfdbb5 100644 --- a/ggml/src/ggml-cpu/arch/x86/repack.cpp +++ b/ggml/src/ggml-cpu/arch/x86/repack.cpp @@ -24,6 +24,14 @@ #define UNUSED GGML_UNUSED +void print_m512f(const __m512 vec) { + const float *values = (const float*)&vec; + for (int i = 0; i < 16; i++) { + printf("%f ", values[i]); + } + printf("\n"); +} + #if defined(__AVX__) #if defined(__F16C__) #if defined(__AVX512F__) @@ -6720,7 +6728,7 @@ void ggml_gemm_q6_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo scalesmask1 = _mm256_permute2f128_si256(scalesmask1, scalesmask1, 0); __m256i scalesmask2 = _mm256_castsi128_si256(scalesmask2_sse); scalesmask2 = _mm256_permute2f128_si256(scalesmask2, scalesmask2, 0); -/* + #ifdef
__AVX512F__ int anc = nc - nc % 16; // Used to align nc with boundary of 16 const __m512i m4_expanded = _mm512_set1_epi8(0xF); @@ -6736,7 +6744,7 @@ void ggml_gemm_q6_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo a_ptrs[i + 1] = a_ptrs[i] + nb; } // Take group of eight block_q6_kx8 structures at each pass of the loop and perform dot product operation - for (int64_t x = xstart; x < anc / 8; x += 2) { + for (int64_t x = 0; x < anc / 8; x += 2) { const block_q6_Kx8 * b_ptr_0 = b_ptr_start + ((x) * b_nb); const block_q6_Kx8 * b_ptr_1 = b_ptr_start + ((x + 1) * b_nb); @@ -7677,16 +7685,966 @@ void ggml_gemm_q6_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo // Store the accumulated values for (int i = 0; i < 16; i++) { _mm512_storeu_ps((float * )(s + ((y * 4 + i) * bs + x * 8)), acc_rows[i]); - printf("Index = %d",((y * 4 + i) * bs + x * 8)); - printf("\n"); - printf("sub_ps values = "); - print_m512f(acc_rows[i]); + } } } - exit(0); + + for (; y < nr / 4; y ++){ + + const block_q8_Kx4 * a_ptr = a_ptr_start + (y * nb); + + // Take group of eight block_q6_kx8 structures at each pass of the loop and perform dot product operation + for (int64_t x = 0; x < anc / 8; x += 2) { + + const block_q6_Kx8 * b_ptr_0 = b_ptr_start + ((x) * b_nb); + const block_q6_Kx8 * b_ptr_1 = b_ptr_start + ((x + 1) * b_nb); + + // Master FP accumulators + __m512 acc_rows[4]; + for (int i = 0; i < 4; i++) { + acc_rows[i] = _mm512_setzero_ps(); + } + + // For super block + for (int64_t b = 0; b < nb; b++) { + // Delta values - Load the sixteen scale values from two block_q6_kx8 structures + const __m512 col_scale_f32 = GGML_F32Cx8x2_LOAD(b_ptr_0[b].d, b_ptr_1[b].d); + + for (int sb = 0; sb < QK_K / 128; sb++) { + const __m256i rhs_raw_mat_0123_0 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].ql + sb * 512)); + const __m256i rhs_raw_mat_4567_0 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].ql + 32 + sb * 512)); + const __m256i rhs_raw_mat_0123_1 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].ql + 64 + sb * 512)); + const __m256i rhs_raw_mat_4567_1 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].ql + 96 + sb * 512)); + const __m256i rhs_raw_mat_0123_2 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].ql + 128 + sb * 512)); + const __m256i rhs_raw_mat_4567_2 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].ql + 160 + sb * 512)); + const __m256i rhs_raw_mat_0123_3 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].ql + 192 + sb * 512)); + const __m256i rhs_raw_mat_4567_3 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].ql + 224 + sb * 512)); + + const __m256i rhs_raw_mat_0123_4 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].ql + 256 + sb * 512)); + const __m256i rhs_raw_mat_4567_4 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].ql + 288 + sb * 512)); + const __m256i rhs_raw_mat_0123_5 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].ql + 320 + sb * 512)); + const __m256i rhs_raw_mat_4567_5 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].ql + 352 + sb * 512)); + const __m256i rhs_raw_mat_0123_6 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].ql + 384 + sb * 512)); + const __m256i rhs_raw_mat_4567_6 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].ql + 416 + sb * 512)); + const __m256i rhs_raw_mat_0123_7 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].ql + 448 + sb * 512)); + const __m256i rhs_raw_mat_4567_7 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].ql + 480 + sb * 512)); + + const __m256i rhs_raw_mat_89AB_0 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].ql + sb
* 512)); + const __m256i rhs_raw_mat_CDEF_0 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].ql + 32 + sb * 512)); + const __m256i rhs_raw_mat_89AB_1 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].ql + 64 + sb * 512)); + const __m256i rhs_raw_mat_CDEF_1 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].ql + 96 + sb * 512)); + const __m256i rhs_raw_mat_89AB_2 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].ql + 128 + sb * 512)); + const __m256i rhs_raw_mat_CDEF_2 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].ql + 160 + sb * 512)); + const __m256i rhs_raw_mat_89AB_3 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].ql + 192 + sb * 512)); + const __m256i rhs_raw_mat_CDEF_3 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].ql + 224 + sb * 512)); + + const __m256i rhs_raw_mat_89AB_4 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].ql + 256 + sb * 512)); + const __m256i rhs_raw_mat_CDEF_4 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].ql + 288 + sb * 512)); + const __m256i rhs_raw_mat_89AB_5 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].ql + 320 + sb * 512)); + const __m256i rhs_raw_mat_CDEF_5 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].ql + 352 + sb * 512)); + const __m256i rhs_raw_mat_89AB_6 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].ql + 384 + sb * 512)); + const __m256i rhs_raw_mat_CDEF_6 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].ql + 416 + sb * 512)); + const __m256i rhs_raw_mat_89AB_7 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].ql + 448 + sb * 512)); + const __m256i rhs_raw_mat_CDEF_7 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].ql + 480 + sb * 512)); + + const __m256i rhs_raw_hbit_0123_0 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].qh + sb * 256)); + const __m256i rhs_raw_hbit_4567_0 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].qh + 32 + sb * 256)); + const __m256i rhs_raw_hbit_0123_1 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].qh + 64 + sb * 256)); + const __m256i rhs_raw_hbit_4567_1 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].qh + 96 + sb * 256)); + const __m256i rhs_raw_hbit_0123_2 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].qh + 128 + sb * 256)); + const __m256i rhs_raw_hbit_4567_2 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].qh + 160 + sb * 256)); + const __m256i rhs_raw_hbit_0123_3 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].qh + 192 + sb * 256)); + const __m256i rhs_raw_hbit_4567_3 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].qh + 224 + sb * 256)); + + const __m256i rhs_raw_hbit_89AB_0 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].qh + sb * 256)); + const __m256i rhs_raw_hbit_CDEF_0 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].qh + 32 + sb * 256)); + const __m256i rhs_raw_hbit_89AB_1 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].qh + 64 + sb * 256)); + const __m256i rhs_raw_hbit_CDEF_1 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].qh + 96 + sb * 256)); + const __m256i rhs_raw_hbit_89AB_2 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].qh + 128 + sb * 256)); + const __m256i rhs_raw_hbit_CDEF_2 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].qh + 160 + sb * 256)); + const __m256i rhs_raw_hbit_89AB_3 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].qh + 192 + sb * 256)); + const __m256i rhs_raw_hbit_CDEF_3 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].qh + 224 + sb * 256)); + + // Indices 0 through 7 (first block): + const __m256i rhs_raw_mat_0145_0 = _mm256_blend_epi32(rhs_raw_mat_0123_0, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_0, requiredOrder), 240); + const 
__m256i rhs_raw_mat_2367_0 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_0, requiredOrder), rhs_raw_mat_4567_0, 240); + const __m256i rhs_raw_mat_0145_1 = _mm256_blend_epi32(rhs_raw_mat_0123_1, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_1, requiredOrder), 240); + const __m256i rhs_raw_mat_2367_1 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_1, requiredOrder), rhs_raw_mat_4567_1, 240); + const __m256i rhs_raw_mat_0145_2 = _mm256_blend_epi32(rhs_raw_mat_0123_2, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_2, requiredOrder), 240); + const __m256i rhs_raw_mat_2367_2 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_2, requiredOrder), rhs_raw_mat_4567_2, 240); + const __m256i rhs_raw_mat_0145_3 = _mm256_blend_epi32(rhs_raw_mat_0123_3, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_3, requiredOrder), 240); + const __m256i rhs_raw_mat_2367_3 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_3, requiredOrder), rhs_raw_mat_4567_3, 240); + + // Indices 4 through 7 (second block): + const __m256i rhs_raw_mat_0145_4 = _mm256_blend_epi32(rhs_raw_mat_0123_4, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_4, requiredOrder), 240); + const __m256i rhs_raw_mat_2367_4 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_4, requiredOrder), rhs_raw_mat_4567_4, 240); + const __m256i rhs_raw_mat_0145_5 = _mm256_blend_epi32(rhs_raw_mat_0123_5, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_5, requiredOrder), 240); + const __m256i rhs_raw_mat_2367_5 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_5, requiredOrder), rhs_raw_mat_4567_5, 240); + const __m256i rhs_raw_mat_0145_6 = _mm256_blend_epi32(rhs_raw_mat_0123_6, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_6, requiredOrder), 240); + const __m256i rhs_raw_mat_2367_6 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_6, requiredOrder), rhs_raw_mat_4567_6, 240); + const __m256i rhs_raw_mat_0145_7 = _mm256_blend_epi32(rhs_raw_mat_0123_7, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_7, requiredOrder), 240); + const __m256i rhs_raw_mat_2367_7 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_7, requiredOrder), rhs_raw_mat_4567_7, 240); + + // Indices 8 through F (first block): + const __m256i rhs_raw_mat_89CD_0 = _mm256_blend_epi32(rhs_raw_mat_89AB_0, _mm256_permutevar8x32_epi32(rhs_raw_mat_CDEF_0, requiredOrder), 240); + const __m256i rhs_raw_mat_ABEF_0 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_89AB_0, requiredOrder), rhs_raw_mat_CDEF_0, 240); + const __m256i rhs_raw_mat_89CD_1 = _mm256_blend_epi32(rhs_raw_mat_89AB_1, _mm256_permutevar8x32_epi32(rhs_raw_mat_CDEF_1, requiredOrder), 240); + const __m256i rhs_raw_mat_ABEF_1 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_89AB_1, requiredOrder), rhs_raw_mat_CDEF_1, 240); + const __m256i rhs_raw_mat_89CD_2 = _mm256_blend_epi32(rhs_raw_mat_89AB_2, _mm256_permutevar8x32_epi32(rhs_raw_mat_CDEF_2, requiredOrder), 240); + const __m256i rhs_raw_mat_ABEF_2 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_89AB_2, requiredOrder), rhs_raw_mat_CDEF_2, 240); + const __m256i rhs_raw_mat_89CD_3 = _mm256_blend_epi32(rhs_raw_mat_89AB_3, _mm256_permutevar8x32_epi32(rhs_raw_mat_CDEF_3, requiredOrder), 240); + const __m256i rhs_raw_mat_ABEF_3 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_89AB_3, requiredOrder), rhs_raw_mat_CDEF_3, 240); + + // Indices 8 through F (second block): + const __m256i rhs_raw_mat_89CD_4 = _mm256_blend_epi32(rhs_raw_mat_89AB_4, 
_mm256_permutevar8x32_epi32(rhs_raw_mat_CDEF_4, requiredOrder), 240); + const __m256i rhs_raw_mat_ABEF_4 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_89AB_4, requiredOrder), rhs_raw_mat_CDEF_4, 240); + const __m256i rhs_raw_mat_89CD_5 = _mm256_blend_epi32(rhs_raw_mat_89AB_5, _mm256_permutevar8x32_epi32(rhs_raw_mat_CDEF_5, requiredOrder), 240); + const __m256i rhs_raw_mat_ABEF_5 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_89AB_5, requiredOrder), rhs_raw_mat_CDEF_5, 240); + const __m256i rhs_raw_mat_89CD_6 = _mm256_blend_epi32(rhs_raw_mat_89AB_6, _mm256_permutevar8x32_epi32(rhs_raw_mat_CDEF_6, requiredOrder), 240); + const __m256i rhs_raw_mat_ABEF_6 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_89AB_6, requiredOrder), rhs_raw_mat_CDEF_6, 240); + const __m256i rhs_raw_mat_89CD_7 = _mm256_blend_epi32(rhs_raw_mat_89AB_7, _mm256_permutevar8x32_epi32(rhs_raw_mat_CDEF_7, requiredOrder), 240); + const __m256i rhs_raw_mat_ABEF_7 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_89AB_7, requiredOrder), rhs_raw_mat_CDEF_7, 240); + + const __m256i rhs_raw_hbit_0145_0 = _mm256_blend_epi32(rhs_raw_hbit_0123_0, _mm256_permutevar8x32_epi32(rhs_raw_hbit_4567_0, requiredOrder), 240); + const __m256i rhs_raw_hbit_2367_0 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_hbit_0123_0, requiredOrder), rhs_raw_hbit_4567_0, 240); + const __m256i rhs_raw_hbit_0145_1 = _mm256_blend_epi32(rhs_raw_hbit_0123_1, _mm256_permutevar8x32_epi32(rhs_raw_hbit_4567_1, requiredOrder), 240); + const __m256i rhs_raw_hbit_2367_1 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_hbit_0123_1, requiredOrder), rhs_raw_hbit_4567_1, 240); + const __m256i rhs_raw_hbit_0145_2 = _mm256_blend_epi32(rhs_raw_hbit_0123_2, _mm256_permutevar8x32_epi32(rhs_raw_hbit_4567_2, requiredOrder), 240); + const __m256i rhs_raw_hbit_2367_2 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_hbit_0123_2, requiredOrder), rhs_raw_hbit_4567_2, 240); + const __m256i rhs_raw_hbit_0145_3 = _mm256_blend_epi32(rhs_raw_hbit_0123_3, _mm256_permutevar8x32_epi32(rhs_raw_hbit_4567_3, requiredOrder), 240); + const __m256i rhs_raw_hbit_2367_3 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_hbit_0123_3, requiredOrder), rhs_raw_hbit_4567_3, 240); + + const __m256i rhs_raw_hbit_89CD_0 = _mm256_blend_epi32(rhs_raw_hbit_89AB_0, _mm256_permutevar8x32_epi32(rhs_raw_hbit_CDEF_0, requiredOrder), 240); + const __m256i rhs_raw_hbit_ABEF_0 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_hbit_89AB_0, requiredOrder), rhs_raw_hbit_CDEF_0, 240); + const __m256i rhs_raw_hbit_89CD_1 = _mm256_blend_epi32(rhs_raw_hbit_89AB_1, _mm256_permutevar8x32_epi32(rhs_raw_hbit_CDEF_1, requiredOrder), 240); + const __m256i rhs_raw_hbit_ABEF_1 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_hbit_89AB_1, requiredOrder), rhs_raw_hbit_CDEF_1, 240); + const __m256i rhs_raw_hbit_89CD_2 = _mm256_blend_epi32(rhs_raw_hbit_89AB_2, _mm256_permutevar8x32_epi32(rhs_raw_hbit_CDEF_2, requiredOrder), 240); + const __m256i rhs_raw_hbit_ABEF_2 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_hbit_89AB_2, requiredOrder), rhs_raw_hbit_CDEF_2, 240); + const __m256i rhs_raw_hbit_89CD_3 = _mm256_blend_epi32(rhs_raw_hbit_89AB_3, _mm256_permutevar8x32_epi32(rhs_raw_hbit_CDEF_3, requiredOrder), 240); + const __m256i rhs_raw_hbit_ABEF_3 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_hbit_89AB_3, requiredOrder), rhs_raw_hbit_CDEF_3, 240); + + const __m512i rhs_raw_mat_014589CD_0 = 
_mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_mat_0145_0), rhs_raw_mat_89CD_0, 1); + const __m512i rhs_raw_mat_2367ABEF_0 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_mat_2367_0), rhs_raw_mat_ABEF_0, 1); + const __m512i rhs_raw_mat_014589CD_1 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_mat_0145_1), rhs_raw_mat_89CD_1, 1); + const __m512i rhs_raw_mat_2367ABEF_1 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_mat_2367_1), rhs_raw_mat_ABEF_1, 1); + + const __m512i rhs_raw_mat_014589CD_2 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_mat_0145_2), rhs_raw_mat_89CD_2, 1); + const __m512i rhs_raw_mat_2367ABEF_2 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_mat_2367_2), rhs_raw_mat_ABEF_2, 1); + const __m512i rhs_raw_mat_014589CD_3 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_mat_0145_3), rhs_raw_mat_89CD_3, 1); + const __m512i rhs_raw_mat_2367ABEF_3 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_mat_2367_3), rhs_raw_mat_ABEF_3, 1); + + const __m512i rhs_raw_mat_014589CD_4 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_mat_0145_4), rhs_raw_mat_89CD_4, 1); + const __m512i rhs_raw_mat_2367ABEF_4 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_mat_2367_4), rhs_raw_mat_ABEF_4, 1); + const __m512i rhs_raw_mat_014589CD_5 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_mat_0145_5), rhs_raw_mat_89CD_5, 1); + const __m512i rhs_raw_mat_2367ABEF_5 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_mat_2367_5), rhs_raw_mat_ABEF_5, 1); + + const __m512i rhs_raw_mat_014589CD_6 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_mat_0145_6), rhs_raw_mat_89CD_6, 1); + const __m512i rhs_raw_mat_2367ABEF_6 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_mat_2367_6), rhs_raw_mat_ABEF_6, 1); + const __m512i rhs_raw_mat_014589CD_7 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_mat_0145_7), rhs_raw_mat_89CD_7, 1); + const __m512i rhs_raw_mat_2367ABEF_7 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_mat_2367_7), rhs_raw_mat_ABEF_7, 1); + + const __m512i rhs_raw_hbit_014589CD_0 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_hbit_0145_0), rhs_raw_hbit_89CD_0, 1); + const __m512i rhs_raw_hbit_2367ABEF_0 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_hbit_2367_0), rhs_raw_hbit_ABEF_0, 1); + const __m512i rhs_raw_hbit_014589CD_1 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_hbit_0145_1), rhs_raw_hbit_89CD_1, 1); + const __m512i rhs_raw_hbit_2367ABEF_1 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_hbit_2367_1), rhs_raw_hbit_ABEF_1, 1); + + const __m512i rhs_raw_hbit_014589CD_2 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_hbit_0145_2), rhs_raw_hbit_89CD_2, 1); + const __m512i rhs_raw_hbit_2367ABEF_2 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_hbit_2367_2), rhs_raw_hbit_ABEF_2, 1); + const __m512i rhs_raw_hbit_014589CD_3 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_hbit_0145_3), rhs_raw_hbit_89CD_3, 1); + const __m512i rhs_raw_hbit_2367ABEF_3 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_hbit_2367_3), rhs_raw_hbit_ABEF_3, 1); + + // 2-bit -> 8-bit + // Values of the 0th,2nd,4th,6th sub blocks of eight block_q6_K structures for the sb loop + const __m512i rhs_hbit_014589CD_00 = _mm512_slli_epi16(_mm512_and_si512(rhs_raw_hbit_014589CD_0, m2_expanded), 4); //B00(0-7) B01(0-7) B04(0-7) B05(0-7) + const __m512i rhs_hbit_014589CD_20 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_014589CD_0, 2), m2_expanded), 4); //B20(0-7) B21(0-7) B24(0-7) B25(0-7) + const __m512i rhs_hbit_014589CD_40 = 
_mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_014589CD_0, 4), m2_expanded), 4); //B40(0-7) B41(0-7) B44(0-7) B45(0-7) + const __m512i rhs_hbit_014589CD_60 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_014589CD_0, 6), m2_expanded), 4); //B60(0-7) B61(0-7) B64(0-7) B65(0-7) + + const __m512i rhs_hbit_2367ABEF_00 = _mm512_slli_epi16(_mm512_and_si512(rhs_raw_hbit_2367ABEF_0, m2_expanded), 4); //B02(0-7) B03(0-7) B06(0-7) B07(0-7) + const __m512i rhs_hbit_2367ABEF_20 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_2367ABEF_0, 2), m2_expanded), 4); //B22(0-7) B23(0-7) B26(0-7) B27(0-7) + const __m512i rhs_hbit_2367ABEF_40 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_2367ABEF_0, 4), m2_expanded), 4); //B42(0-7) B43(0-7) B46(0-7) B47(0-7) + const __m512i rhs_hbit_2367ABEF_60 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_2367ABEF_0, 6), m2_expanded), 4); //B62(0-7) B63(0-7) B66(0-7) B67(0-7) + + const __m512i rhs_hbit_014589CD_01 = _mm512_slli_epi16(_mm512_and_si512(rhs_raw_hbit_014589CD_1, m2_expanded), 4); //B00(8-15) B01(8-15) B04(8-15) B05(8-15) + const __m512i rhs_hbit_014589CD_21 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_014589CD_1, 2), m2_expanded), 4); //B20(8-15) B21(8-15) B24(8-15) B25(8-15) + const __m512i rhs_hbit_014589CD_41 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_014589CD_1, 4), m2_expanded), 4); //B40(8-15) B41(8-15) B44(8-15) B45(8-15) + const __m512i rhs_hbit_014589CD_61 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_014589CD_1, 6), m2_expanded), 4); //B60(8-15) B61(8-15) B64(8-15) B65(8-15) + + const __m512i rhs_hbit_2367ABEF_01 = _mm512_slli_epi16(_mm512_and_si512(rhs_raw_hbit_2367ABEF_1, m2_expanded), 4); //B02(8-15) B03(8-15) B06(8-15) B07(8-15) + const __m512i rhs_hbit_2367ABEF_21 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_2367ABEF_1, 2), m2_expanded), 4); //B22(8-15) B23(8-15) B26(8-15) B27(8-15) + const __m512i rhs_hbit_2367ABEF_41 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_2367ABEF_1, 4), m2_expanded), 4); //B42(8-15) B43(8-15) B46(8-15) B47(8-15) + const __m512i rhs_hbit_2367ABEF_61 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_2367ABEF_1, 6), m2_expanded), 4); //B62(8-15) B63(8-15) B66(8-15) B67(8-15) + + // Values of the 1st,3rd,5th,7th sub blocks of eight block_q6_K structures for the sb loop + const __m512i rhs_hbit_014589CD_10 = _mm512_slli_epi16(_mm512_and_si512(rhs_raw_hbit_014589CD_2, m2_expanded), 4); //B10(0-7) B11(0-7) B14(0-7) B15(0-7) + const __m512i rhs_hbit_014589CD_30 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_014589CD_2, 2), m2_expanded), 4); //B30(0-7) B31(0-7) B34(0-7) B35(0-7) + const __m512i rhs_hbit_014589CD_50 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_014589CD_2, 4), m2_expanded), 4); //B50(0-7) B51(0-7) B54(0-7) B55(0-7) + const __m512i rhs_hbit_014589CD_70 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_014589CD_2, 6), m2_expanded), 4); //B70(0-7) B71(0-7) B74(0-7) B75(0-7) + + const __m512i rhs_hbit_2367ABEF_10 = _mm512_slli_epi16(_mm512_and_si512(rhs_raw_hbit_2367ABEF_2, m2_expanded), 4); //B12(0-7) B13(0-7) B16(0-7) B17(0-7) + const __m512i rhs_hbit_2367ABEF_30 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_2367ABEF_2, 2), m2_expanded), 4); //B32(0-7) B33(0-7) B36(0-7) B37(0-7) + const __m512i 
rhs_hbit_2367ABEF_50 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_2367ABEF_2, 4), m2_expanded), 4); //B52(0-7) B53(0-7) B56(0-7) B57(0-7) + const __m512i rhs_hbit_2367ABEF_70 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_2367ABEF_2, 6), m2_expanded), 4); //B72(0-7) B73(0-7) B76(0-7) B77(0-7) + + const __m512i rhs_hbit_014589CD_11 = _mm512_slli_epi16(_mm512_and_si512(rhs_raw_hbit_014589CD_3, m2_expanded), 4); //B10(8-15) B11(8-15) B14(8-15) B15(8-15) + const __m512i rhs_hbit_014589CD_31 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_014589CD_3, 2), m2_expanded), 4); //B30(8-15) B31(8-15) B34(8-15) B35(8-15) + const __m512i rhs_hbit_014589CD_51 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_014589CD_3, 4), m2_expanded), 4); //B50(8-15) B51(8-15) B54(8-15) B55(8-15) + const __m512i rhs_hbit_014589CD_71 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_014589CD_3, 6), m2_expanded), 4); //B70(8-15) B71(8-15) B74(8-15) B75(8-15) + + const __m512i rhs_hbit_2367ABEF_11 = _mm512_slli_epi16(_mm512_and_si512(rhs_raw_hbit_2367ABEF_3, m2_expanded), 4); //B12(8-15) B13(8-15) B16(8-15) B17(8-15) + const __m512i rhs_hbit_2367ABEF_31 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_2367ABEF_3, 2), m2_expanded), 4); //B32(8-15) B33(8-15) B36(8-15) B37(8-15) + const __m512i rhs_hbit_2367ABEF_51 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_2367ABEF_3, 4), m2_expanded), 4); //B52(8-15) B53(8-15) B56(8-15) B57(8-15) + const __m512i rhs_hbit_2367ABEF_71 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_2367ABEF_3, 6), m2_expanded), 4); //B72(8-15) B73(8-15) B76(8-15) B77(8-15) + + // 0 -7, 64 - 71 + const __m512i rhs_mat_014589CD_00 = _mm512_or_si512(_mm512_and_si512(rhs_raw_mat_014589CD_0, m4_expanded), rhs_hbit_014589CD_00); + const __m512i rhs_mat_014589CD_40 = _mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_014589CD_0, 4), m4_expanded), rhs_hbit_014589CD_40); + + const __m512i rhs_mat_2367ABEF_00 = _mm512_or_si512(_mm512_and_si512(rhs_raw_mat_2367ABEF_0, m4_expanded), rhs_hbit_2367ABEF_00); + const __m512i rhs_mat_2367ABEF_40 = _mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_2367ABEF_0, 4), m4_expanded), rhs_hbit_2367ABEF_40); + + // 8 - 15, 72 - 79 + const __m512i rhs_mat_014589CD_01 = _mm512_or_si512(_mm512_and_si512(rhs_raw_mat_014589CD_1, m4_expanded), rhs_hbit_014589CD_01); + const __m512i rhs_mat_014589CD_41 = _mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_014589CD_1, 4), m4_expanded), rhs_hbit_014589CD_41); + + const __m512i rhs_mat_2367ABEF_01 = _mm512_or_si512(_mm512_and_si512(rhs_raw_mat_2367ABEF_1, m4_expanded), rhs_hbit_2367ABEF_01); + const __m512i rhs_mat_2367ABEF_41 = _mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_2367ABEF_1, 4), m4_expanded), rhs_hbit_2367ABEF_41); + + // 16 - 23, 80 - 87 + const __m512i rhs_mat_014589CD_10 = _mm512_or_si512(_mm512_and_si512(rhs_raw_mat_014589CD_2, m4_expanded), rhs_hbit_014589CD_10); + const __m512i rhs_mat_014589CD_50 = _mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_014589CD_2, 4), m4_expanded), rhs_hbit_014589CD_50); + + const __m512i rhs_mat_2367ABEF_10 = _mm512_or_si512(_mm512_and_si512(rhs_raw_mat_2367ABEF_2, m4_expanded), rhs_hbit_2367ABEF_10); + const __m512i rhs_mat_2367ABEF_50 = _mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_2367ABEF_2, 4), m4_expanded), rhs_hbit_2367ABEF_50); + + // 24 - 31, 88 
- 95 + const __m512i rhs_mat_014589CD_11 = _mm512_or_si512(_mm512_and_si512(rhs_raw_mat_014589CD_3, m4_expanded), rhs_hbit_014589CD_11); + const __m512i rhs_mat_014589CD_51 = _mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_014589CD_3, 4), m4_expanded), rhs_hbit_014589CD_51); + + const __m512i rhs_mat_2367ABEF_11 = _mm512_or_si512(_mm512_and_si512(rhs_raw_mat_2367ABEF_3, m4_expanded), rhs_hbit_2367ABEF_11); + const __m512i rhs_mat_2367ABEF_51 = _mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_2367ABEF_3, 4), m4_expanded), rhs_hbit_2367ABEF_51); + + // 32 - 39, 96 - 103 + const __m512i rhs_mat_014589CD_20 = _mm512_or_si512(_mm512_and_si512(rhs_raw_mat_014589CD_4, m4_expanded), rhs_hbit_014589CD_20); + const __m512i rhs_mat_014589CD_60 = _mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_014589CD_4, 4), m4_expanded), rhs_hbit_014589CD_60); + + const __m512i rhs_mat_2367ABEF_20 = _mm512_or_si512(_mm512_and_si512(rhs_raw_mat_2367ABEF_4, m4_expanded), rhs_hbit_2367ABEF_20); + const __m512i rhs_mat_2367ABEF_60 = _mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_2367ABEF_4, 4), m4_expanded), rhs_hbit_2367ABEF_60); + + // 40 - 47, 104 - 111 + const __m512i rhs_mat_014589CD_21 = _mm512_or_si512(_mm512_and_si512(rhs_raw_mat_014589CD_5, m4_expanded), rhs_hbit_014589CD_21); + const __m512i rhs_mat_014589CD_61 = _mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_014589CD_5, 4), m4_expanded), rhs_hbit_014589CD_61); + + const __m512i rhs_mat_2367ABEF_21 = _mm512_or_si512(_mm512_and_si512(rhs_raw_mat_2367ABEF_5, m4_expanded), rhs_hbit_2367ABEF_21); + const __m512i rhs_mat_2367ABEF_61 = _mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_2367ABEF_5, 4), m4_expanded), rhs_hbit_2367ABEF_61); + + // 48 - 55, 112 - 119 + const __m512i rhs_mat_014589CD_30 = _mm512_or_si512(_mm512_and_si512(rhs_raw_mat_014589CD_6, m4_expanded), rhs_hbit_014589CD_30); + const __m512i rhs_mat_014589CD_70 = _mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_014589CD_6, 4), m4_expanded), rhs_hbit_014589CD_70); + + const __m512i rhs_mat_2367ABEF_30 = _mm512_or_si512(_mm512_and_si512(rhs_raw_mat_2367ABEF_6, m4_expanded), rhs_hbit_2367ABEF_30); + const __m512i rhs_mat_2367ABEF_70 = _mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_2367ABEF_6, 4), m4_expanded), rhs_hbit_2367ABEF_70); + + // 56 - 63, 120 - 127 + const __m512i rhs_mat_014589CD_31 = _mm512_or_si512(_mm512_and_si512(rhs_raw_mat_014589CD_7, m4_expanded), rhs_hbit_014589CD_31); + const __m512i rhs_mat_014589CD_71 = _mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_014589CD_7, 4), m4_expanded), rhs_hbit_014589CD_71); + + const __m512i rhs_mat_2367ABEF_31 = _mm512_or_si512(_mm512_and_si512(rhs_raw_mat_2367ABEF_7, m4_expanded), rhs_hbit_2367ABEF_31); + const __m512i rhs_mat_2367ABEF_71 = _mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_2367ABEF_7, 4), m4_expanded), rhs_hbit_2367ABEF_71); + + // Shuffle pattern one - right side input + const __m512i rhs_mat_014589CD_00_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_00, (_MM_PERM_ENUM)136); //B00(0-3) B01(0-3) B00(0-3) B01(0-3) B04(0-3) B05(0-3) B04(0-3) B05(0-3) + const __m512i rhs_mat_2367ABEF_00_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_00, (_MM_PERM_ENUM)136); //B02(0-3) B03(0-3) B02(0-3) B03(0-3) B06(0-3) B07(0-3) B06(0-3) B07(0-3) + + const __m512i rhs_mat_014589CD_01_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_01, (_MM_PERM_ENUM)136); //B00(8-11) B01(8-11) B00(8-11) B01(8-11) B04(8-11) B05(8-11) 
B04(8-11) B05(8-11) + const __m512i rhs_mat_2367ABEF_01_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_01, (_MM_PERM_ENUM)136); //B02(8-11) B03(8-11) B02(8-11) B03(8-11) B06(8-11) B07(8-11) B06(8-11) B07(8-11) + + const __m512i rhs_mat_014589CD_10_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_10, (_MM_PERM_ENUM)136); //B10(0-3) B11(0-3) B10(0-3) B11(0-3) B14(0-3) B15(0-3) B14(0-3) B15(0-3) + const __m512i rhs_mat_2367ABEF_10_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_10, (_MM_PERM_ENUM)136); //B12(0-3) B13(0-3) B12(0-3) B13(0-3) B16(0-3) B17(0-3) B16(0-3) B17(0-3) + + const __m512i rhs_mat_014589CD_11_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_11, (_MM_PERM_ENUM)136); //B10(8-11) B11(8-11) B10(8-11) B11(8-11) B14(8-11) B15(8-11) B14(8-11) B15(8-11) + const __m512i rhs_mat_2367ABEF_11_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_11, (_MM_PERM_ENUM)136); //B12(8-11) B13(8-11) B12(8-11) B13(8-11) B16(8-11) B17(8-11) B16(8-11) B17(8-11) + + const __m512i rhs_mat_014589CD_20_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_20, (_MM_PERM_ENUM)136); //B20(0-3) B21(0-3) B20(0-3) B21(0-3) B24(0-3) B25(0-3) B24(0-3) B25(0-3) + const __m512i rhs_mat_2367ABEF_20_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_20, (_MM_PERM_ENUM)136); //B22(0-3) B23(0-3) B22(0-3) B23(0-3) B26(0-3) B27(0-3) B26(0-3) B27(0-3) + + const __m512i rhs_mat_014589CD_21_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_21, (_MM_PERM_ENUM)136); //B20(8-11) B21(8-11) B20(8-11) B21(8-11) B24(8-11) B25(8-11) B24(8-11) B25(8-11) + const __m512i rhs_mat_2367ABEF_21_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_21, (_MM_PERM_ENUM)136); //B22(8-11) B23(8-11) B22(8-11) B23(8-11) B26(8-11) B27(8-11) B26(8-11) B27(8-11) + + const __m512i rhs_mat_014589CD_30_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_30, (_MM_PERM_ENUM)136); //B30(0-3) B31(0-3) B30(0-3) B31(0-3) B34(0-3) B35(0-3) B34(0-3) B35(0-3) + const __m512i rhs_mat_2367ABEF_30_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_30, (_MM_PERM_ENUM)136); //B32(0-3) B33(0-3) B32(0-3) B33(0-3) B36(0-3) B37(0-3) B36(0-3) B37(0-3) + + const __m512i rhs_mat_014589CD_31_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_31, (_MM_PERM_ENUM)136); //B30(8-11) B31(8-11) B30(8-11) B31(8-11) B34(8-11) B35(8-11) B34(8-11) B35(8-11 + const __m512i rhs_mat_2367ABEF_31_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_31, (_MM_PERM_ENUM)136); //B32(8-11) B33(8-11) B32(8-11) B33(8-11) B36(8-11) B37(8-11) B36(8-11) B37(8-11) + + const __m512i rhs_mat_014589CD_40_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_40, (_MM_PERM_ENUM)136); //B40(0-3) B41(0-3) B40(0-3) B41(0-3) B44(0-3) B45(0-3) B44(0-3) B45(0-3) + const __m512i rhs_mat_2367ABEF_40_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_40, (_MM_PERM_ENUM)136); //B42(0-3) B43(0-3) B42(0-3) B43(0-3) B46(0-3) B47(0-3) B46(0-3) B47(0-3) + + const __m512i rhs_mat_014589CD_41_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_41, (_MM_PERM_ENUM)136); //B40(8-11) B41(8-11) B40(8-11) B41(8-11) B44(8-11) B45(8-11) B44(8-11) B45(8-11) + const __m512i rhs_mat_2367ABEF_41_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_41, (_MM_PERM_ENUM)136); //B42(8-11) B43(8-11) B42(8-11) B43(8-11) B46(8-11) B47(8-11) B46(8-11) B47(8-11) + + const __m512i rhs_mat_014589CD_50_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_50, (_MM_PERM_ENUM)136); //B50(0-3) B51(0-3) B50(0-3) B51(0-3) B54(0-3) B55(0-3) B54(0-3) B55(0-3) + const __m512i rhs_mat_2367ABEF_50_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_50, (_MM_PERM_ENUM)136); //B52(0-3) B53(0-3) B52(0-3) B53(0-3) B56(0-3) B57(0-3) B56(0-3) B57(0-3) + + const __m512i rhs_mat_014589CD_51_sp1 = 
_mm512_shuffle_epi32(rhs_mat_014589CD_51, (_MM_PERM_ENUM)136); //B50(8-11) B51(8-11) B50(8-11) B51(8-11) B54(8-11) B55(8-11) B54(8-11) B55(8-11) + const __m512i rhs_mat_2367ABEF_51_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_51, (_MM_PERM_ENUM)136); //B52(8-11) B53(8-11) B52(8-11) B53(8-11) B56(8-11) B57(8-11) B56(8-11) B57(8-11) + + const __m512i rhs_mat_014589CD_60_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_60, (_MM_PERM_ENUM)136); //B60(0-3) B61(0-3) B60(0-3) B61(0-3) B64(0-3) B65(0-3) B64(0-3) B65(0-3) + const __m512i rhs_mat_2367ABEF_60_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_60, (_MM_PERM_ENUM)136); //B62(0-3) B63(0-3) B62(0-3) B63(0-3) B66(0-3) B67(0-3) B66(0-3) B67(0-3) + + const __m512i rhs_mat_014589CD_61_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_61, (_MM_PERM_ENUM)136); //B60(8-11) B61(8-11) B60(8-11) B61(8-11) B64(8-11) B65(8-11) B64(8-11) B65(8-11) + const __m512i rhs_mat_2367ABEF_61_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_61, (_MM_PERM_ENUM)136); //B62(8-11) B63(8-11) B62(8-11) B63(8-11) B66(8-11) B67(8-11) B66(8-11) B67(8-11) + + const __m512i rhs_mat_014589CD_70_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_70, (_MM_PERM_ENUM)136); //B70(0-3) B71(0-3) B70(0-3) B71(0-3) B74(0-3) B75(0-3) B74(0-3) B75(0-3) + const __m512i rhs_mat_2367ABEF_70_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_70, (_MM_PERM_ENUM)136); //B72(0-3) B73(0-3) B72(0-3) B73(0-3) B76(0-3) B77(0-3) B76(0-3) B77(0-3) + + const __m512i rhs_mat_014589CD_71_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_71, (_MM_PERM_ENUM)136); //B70(8-11) B71(8-11) B70(8-11) B71(8-11) B74(8-11) B75(8-11) B74(8-11) B75(8-11) + const __m512i rhs_mat_2367ABEF_71_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_71, (_MM_PERM_ENUM)136); //B72(8-11) B73(8-11) B72(8-11) B73(8-11) B76(8-11) B77(8-11) B76(8-11) B77(8-11) + + + // Shuffle pattern two - right side input + const __m512i rhs_mat_014589CD_00_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_00, (_MM_PERM_ENUM)221); //B00(4-7) B01(4-7) B00(4-7) B01(4-7) B04(4-7) B05(4-7) B04(4-7) B05(4-7) + const __m512i rhs_mat_2367ABEF_00_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_00, (_MM_PERM_ENUM)221); //B02(4-7) B03(4-7) B02(4-7) B03(4-7) B06(4-7) B07(4-7) B06(4-7) B07(4-7) + + const __m512i rhs_mat_014589CD_01_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_01, (_MM_PERM_ENUM)221); //B00(12-15) B01(12-15) B00(12-15) B01(12-15) B04(12-15) B05(12-15) B04(12-15) B05(12-15) + const __m512i rhs_mat_2367ABEF_01_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_01, (_MM_PERM_ENUM)221); //B02(12-15) B03(12-15) B02(12-15) B03(12-15) B06(12-15) B07(12-15) B06(12-15) B07(12-15) + + const __m512i rhs_mat_014589CD_10_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_10, (_MM_PERM_ENUM)221); //B10(4-7) B11(4-7) B10(4-7) B11(4-7) B14(4-7) B15(4-7) B14(4-7) B15(4-7) + const __m512i rhs_mat_2367ABEF_10_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_10, (_MM_PERM_ENUM)221); //B12(4-7) B13(4-7) B12(4-7) B13(4-7) B16(4-7) B17(4-7) B16(4-7) B17(4-7) + + const __m512i rhs_mat_014589CD_11_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_11, (_MM_PERM_ENUM)221); //B10(12-15) B11(12-15) B10(12-15) B11(12-15) B14(12-15) B15(12-15) B14(12-15) B15(12-15) + const __m512i rhs_mat_2367ABEF_11_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_11, (_MM_PERM_ENUM)221); //B12(12-15) B13(12-15) B12(12-15) B13(12-15) B16(12-15) B17(12-15) B16(12-15) B17(12-15) + + const __m512i rhs_mat_014589CD_20_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_20, (_MM_PERM_ENUM)221); //B20(4-7) B21(4-7) B20(4-7) B21(4-7) B24(4-7) B25(4-7) B24(4-7) B25(4-7) + const __m512i 
rhs_mat_2367ABEF_20_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_20, (_MM_PERM_ENUM)221); //B22(4-7) B23(4-7) B22(4-7) B23(4-7) B26(4-7) B27(4-7) B26(4-7) B27(4-7) + + const __m512i rhs_mat_014589CD_21_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_21, (_MM_PERM_ENUM)221); //B20(12-15) B21(12-15) B20(12-15) B21(12-15) B24(12-15) B25(12-15) B24(12-15) B25(12-15) + const __m512i rhs_mat_2367ABEF_21_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_21, (_MM_PERM_ENUM)221); //B22(12-15) B23(12-15) B22(12-15) B23(12-15) B26(12-15) B27(12-15) B26(12-15) B27(12-15) + + const __m512i rhs_mat_014589CD_30_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_30, (_MM_PERM_ENUM)221); //B30(4-7) B31(4-7) B30(4-7) B31(4-7) B34(4-7) B35(4-7) B34(4-7) B35(4-7) + const __m512i rhs_mat_2367ABEF_30_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_30, (_MM_PERM_ENUM)221); //B32(4-7) B33(4-7) B32(4-7) B33(4-7) B36(4-7) B37(4-7) B36(4-7) B37(4-7) + + const __m512i rhs_mat_014589CD_31_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_31, (_MM_PERM_ENUM)221); //B30(12-15) B31(12-15) B30(12-15) B31(12-15) B34(12-15) B35(12-15) B34(12-15) B35(12-15) + const __m512i rhs_mat_2367ABEF_31_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_31, (_MM_PERM_ENUM)221); //B32(12-15) B33(12-15) B32(12-15) B33(12-15) B36(12-15) B37(12-15) B36(12-15) B37(12-15) + + const __m512i rhs_mat_014589CD_40_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_40, (_MM_PERM_ENUM)221); //B40(4-7) B41(4-7) B40(4-7) B41(4-7) B44(4-7) B45(4-7) B44(4-7) B45(4-7) + const __m512i rhs_mat_2367ABEF_40_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_40, (_MM_PERM_ENUM)221); //B42(4-7) B43(4-7) B42(4-7) B43(4-7) B46(4-7) B47(4-7) B46(4-7) B47(4-7) + + const __m512i rhs_mat_014589CD_41_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_41, (_MM_PERM_ENUM)221); //B40(12-15) B41(12-15) B40(12-15) B41(12-15) B44(12-15) B45(12-15) B44(12-15) B45(12-15) + const __m512i rhs_mat_2367ABEF_41_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_41, (_MM_PERM_ENUM)221); //B42(12-15) B43(12-15) B42(12-15) B43(12-15) B46(12-15) B47(12-15) B46(12-15) B47(12-15) + + const __m512i rhs_mat_014589CD_50_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_50, (_MM_PERM_ENUM)221); //B50(4-7) B51(4-7) B50(4-7) B51(4-7) B54(4-7) B55(4-7) B54(4-7) B55(4-7) + const __m512i rhs_mat_2367ABEF_50_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_50, (_MM_PERM_ENUM)221); //B52(4-7) B53(4-7) B52(4-7) B53(4-7) B56(4-7) B57(4-7) B56(4-7) B57(4-7) + + const __m512i rhs_mat_014589CD_51_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_51, (_MM_PERM_ENUM)221); //B50(12-15) B51(12-15) B50(12-15) B51(12-15) B54(12-15) B55(12-15) B54(12-15) B55(12-15) + const __m512i rhs_mat_2367ABEF_51_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_51, (_MM_PERM_ENUM)221); //B52(12-15) B53(12-15) B52(12-15) B53(12-15) B56(12-15) B57(12-15) B56(12-15) B57(12-15) + + const __m512i rhs_mat_014589CD_60_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_60, (_MM_PERM_ENUM)221); //B60(4-7) B61(4-7) B60(4-7) B61(4-7) B64(4-7) B65(4-7) B64(4-7) B65(4-7) + const __m512i rhs_mat_2367ABEF_60_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_60, (_MM_PERM_ENUM)221); //B62(4-7) B63(4-7) B62(4-7) B63(4-7) B66(4-7) B67(4-7) B66(4-7) B67(4-7) + + const __m512i rhs_mat_014589CD_61_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_61, (_MM_PERM_ENUM)221); //B60(12-15) B61(12-15) B60(12-15) B61(12-15) B64(12-15) B65(12-15) B64(12-15) B65(12-15) + const __m512i rhs_mat_2367ABEF_61_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_61, (_MM_PERM_ENUM)221); //B62(12-15) B63(12-15) B62(12-15) B63(12-15) B66(12-15) B67(12-15) B66(12-15) 
B67(12-15)
+
+ const __m512i rhs_mat_014589CD_70_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_70, (_MM_PERM_ENUM)221); //B70(4-7) B71(4-7) B70(4-7) B71(4-7) B74(4-7) B75(4-7) B74(4-7) B75(4-7)
+ const __m512i rhs_mat_2367ABEF_70_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_70, (_MM_PERM_ENUM)221); //B72(4-7) B73(4-7) B72(4-7) B73(4-7) B76(4-7) B77(4-7) B76(4-7) B77(4-7)
+
+ const __m512i rhs_mat_014589CD_71_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_71, (_MM_PERM_ENUM)221); //B70(12-15) B71(12-15) B70(12-15) B71(12-15) B74(12-15) B75(12-15) B74(12-15) B75(12-15)
+ const __m512i rhs_mat_2367ABEF_71_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_71, (_MM_PERM_ENUM)221); //B72(12-15) B73(12-15) B72(12-15) B73(12-15) B76(12-15) B77(12-15) B76(12-15) B77(12-15)
+
+ //Scales of corresponding sub blocks from different Q6_K structures are stored together
+ //s00 s01 s10 s11 s20 s21 ...... s70 s71
+ // Load the scales for sub-blocks 0-1, 2-3, 4-5, 6-7 in the sb loop from both sets of interleaved Q6_K blocks
+ const __m128i scales_01_0 = _mm_loadu_si128((const __m128i *)(b_ptr_0[b].scales + sb * 64));
+ const __m128i scales_23_0 = _mm_loadu_si128((const __m128i *)(b_ptr_0[b].scales + 16 + sb * 64));
+ const __m128i scales_45_0 = _mm_loadu_si128((const __m128i *)(b_ptr_0[b].scales + 32 + sb * 64));
+ const __m128i scales_67_0 = _mm_loadu_si128((const __m128i *)(b_ptr_0[b].scales + 48 + sb * 64));
+
+ const __m128i scales_01_1 = _mm_loadu_si128((const __m128i *)(b_ptr_1[b].scales + sb * 64));
+ const __m128i scales_23_1 = _mm_loadu_si128((const __m128i *)(b_ptr_1[b].scales + 16 + sb * 64));
+ const __m128i scales_45_1 = _mm_loadu_si128((const __m128i *)(b_ptr_1[b].scales + 32 + sb * 64));
+ const __m128i scales_67_1 = _mm_loadu_si128((const __m128i *)(b_ptr_1[b].scales + 48 + sb * 64));
+
+ // Combine the scales of both interleaved block sets for sub-blocks: 0-1, 2-3, 4-5, 6-7 in the sb loop
+ const __m256i scales_01 = _mm256_insertf128_si256(_mm256_castsi128_si256(scales_01_0), scales_01_1, 1);
+ const __m256i scales_23 = _mm256_insertf128_si256(_mm256_castsi128_si256(scales_23_0), scales_23_1, 1);
+ const __m256i scales_45 = _mm256_insertf128_si256(_mm256_castsi128_si256(scales_45_0), scales_45_1, 1);
+ const __m256i scales_67 = _mm256_insertf128_si256(_mm256_castsi128_si256(scales_67_0), scales_67_1, 1);
+
+ const __m512i scales_0 = _mm512_cvtepi8_epi16(_mm256_shuffle_epi8(scales_01,scalesmask1));
+ const __m512i scales_1 = _mm512_cvtepi8_epi16(_mm256_shuffle_epi8(scales_01,scalesmask2));
+ const __m512i scales_2 = _mm512_cvtepi8_epi16(_mm256_shuffle_epi8(scales_23,scalesmask1));
+ const __m512i scales_3 = _mm512_cvtepi8_epi16(_mm256_shuffle_epi8(scales_23,scalesmask2));
+ const __m512i scales_4 = _mm512_cvtepi8_epi16(_mm256_shuffle_epi8(scales_45,scalesmask1));
+ const __m512i scales_5 = _mm512_cvtepi8_epi16(_mm256_shuffle_epi8(scales_45,scalesmask2));
+ const __m512i scales_6 = _mm512_cvtepi8_epi16(_mm256_shuffle_epi8(scales_67,scalesmask1));
+ const __m512i scales_7 = _mm512_cvtepi8_epi16(_mm256_shuffle_epi8(scales_67,scalesmask2));
+
+ const __m512i scale_014589CD_0 = _mm512_shuffle_epi32(scales_0, (_MM_PERM_ENUM)68);
+ const __m512i scale_2367ABEF_0 = _mm512_shuffle_epi32(scales_0, (_MM_PERM_ENUM)238);
+
+ const __m512i scale_014589CD_1 = _mm512_shuffle_epi32(scales_1, (_MM_PERM_ENUM)68);
+ const __m512i scale_2367ABEF_1 = _mm512_shuffle_epi32(scales_1, (_MM_PERM_ENUM)238);
+
+ const __m512i scale_014589CD_2 = _mm512_shuffle_epi32(scales_2, (_MM_PERM_ENUM)68);
+ const __m512i scale_2367ABEF_2 = _mm512_shuffle_epi32(scales_2, (_MM_PERM_ENUM)238);
+
+ const
__m512i scale_014589CD_3 = _mm512_shuffle_epi32(scales_3, (_MM_PERM_ENUM)68); + const __m512i scale_2367ABEF_3 = _mm512_shuffle_epi32(scales_3, (_MM_PERM_ENUM)238); + + const __m512i scale_014589CD_4 = _mm512_shuffle_epi32(scales_4, (_MM_PERM_ENUM)68); + const __m512i scale_2367ABEF_4 = _mm512_shuffle_epi32(scales_4, (_MM_PERM_ENUM)238); + + const __m512i scale_014589CD_5 = _mm512_shuffle_epi32(scales_5, (_MM_PERM_ENUM)68); + const __m512i scale_2367ABEF_5 = _mm512_shuffle_epi32(scales_5, (_MM_PERM_ENUM)238); + + const __m512i scale_014589CD_6 = _mm512_shuffle_epi32(scales_6, (_MM_PERM_ENUM)68); + const __m512i scale_2367ABEF_6 = _mm512_shuffle_epi32(scales_6, (_MM_PERM_ENUM)238); + + const __m512i scale_014589CD_7 = _mm512_shuffle_epi32(scales_7, (_MM_PERM_ENUM)68); + const __m512i scale_2367ABEF_7 = _mm512_shuffle_epi32(scales_7, (_MM_PERM_ENUM)238); + + + // Load the four block_q8_k quantized values interleaved with each other in chunks of eight bytes - A0,A1,A2,A3 + // Loaded as set of 128 bit vectors and repeated and stored into a 256 bit vector before again repeating into 512 bit vector + __m256i lhs_mat_ymm_0123_00 = _mm256_loadu_si256((const __m256i * )((a_ptr[b].qs + 512 * sb))); + __m256i lhs_mat_ymm_01_00 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_00, lhs_mat_ymm_0123_00, 0); + __m256i lhs_mat_ymm_23_00 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_00, lhs_mat_ymm_0123_00, 17); + __m256i lhs_mat_ymm_0123_01 = _mm256_loadu_si256((const __m256i * )((a_ptr[b].qs + 32 + 512 * sb))); + __m256i lhs_mat_ymm_01_01 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_01, lhs_mat_ymm_0123_01, 0); + __m256i lhs_mat_ymm_23_01 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_01, lhs_mat_ymm_0123_01, 17); + __m256i lhs_mat_ymm_0123_10 = _mm256_loadu_si256((const __m256i * )((a_ptr[b].qs + 64 + 512 * sb))); + __m256i lhs_mat_ymm_01_10 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_10, lhs_mat_ymm_0123_10, 0); + __m256i lhs_mat_ymm_23_10 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_10, lhs_mat_ymm_0123_10, 17); + __m256i lhs_mat_ymm_0123_11 = _mm256_loadu_si256((const __m256i * )((a_ptr[b].qs + 96 + 512 * sb))); + __m256i lhs_mat_ymm_01_11 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_11, lhs_mat_ymm_0123_11, 0); + __m256i lhs_mat_ymm_23_11 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_11, lhs_mat_ymm_0123_11, 17); + __m256i lhs_mat_ymm_0123_20 = _mm256_loadu_si256((const __m256i * )((a_ptr[b].qs + 128 + 512 * sb))); + __m256i lhs_mat_ymm_01_20 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_20, lhs_mat_ymm_0123_20, 0); + __m256i lhs_mat_ymm_23_20 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_20, lhs_mat_ymm_0123_20, 17); + __m256i lhs_mat_ymm_0123_21 = _mm256_loadu_si256((const __m256i * )((a_ptr[b].qs + 160 + 512 * sb))); + __m256i lhs_mat_ymm_01_21 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_21, lhs_mat_ymm_0123_21, 0); + __m256i lhs_mat_ymm_23_21 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_21, lhs_mat_ymm_0123_21, 17); + __m256i lhs_mat_ymm_0123_30 = _mm256_loadu_si256((const __m256i * )((a_ptr[b].qs + 192 + 512 * sb))); + __m256i lhs_mat_ymm_01_30 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_30, lhs_mat_ymm_0123_30, 0); + __m256i lhs_mat_ymm_23_30 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_30, lhs_mat_ymm_0123_30, 17); + __m256i lhs_mat_ymm_0123_31 = _mm256_loadu_si256((const __m256i * )((a_ptr[b].qs + 224 + 512 * sb))); + __m256i lhs_mat_ymm_01_31 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_31, lhs_mat_ymm_0123_31, 0); + __m256i lhs_mat_ymm_23_31 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_31, 
lhs_mat_ymm_0123_31, 17); + + __m256i lhs_mat_ymm_0123_40 = _mm256_loadu_si256((const __m256i * )((a_ptr[b].qs + 256 + 512 * sb))); + __m256i lhs_mat_ymm_01_40 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_40, lhs_mat_ymm_0123_40, 0); + __m256i lhs_mat_ymm_23_40 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_40, lhs_mat_ymm_0123_40, 17); + __m256i lhs_mat_ymm_0123_41 = _mm256_loadu_si256((const __m256i * )((a_ptr[b].qs + 288 + 512 * sb))); + __m256i lhs_mat_ymm_01_41 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_41, lhs_mat_ymm_0123_41, 0); + __m256i lhs_mat_ymm_23_41 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_41, lhs_mat_ymm_0123_41, 17); + __m256i lhs_mat_ymm_0123_50 = _mm256_loadu_si256((const __m256i * )((a_ptr[b].qs + 320 + 512 * sb))); + __m256i lhs_mat_ymm_01_50 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_50, lhs_mat_ymm_0123_50, 0); + __m256i lhs_mat_ymm_23_50 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_50, lhs_mat_ymm_0123_50, 17); + __m256i lhs_mat_ymm_0123_51 = _mm256_loadu_si256((const __m256i * )((a_ptr[b].qs + 352 + 512 * sb))); + __m256i lhs_mat_ymm_01_51 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_51, lhs_mat_ymm_0123_51, 0); + __m256i lhs_mat_ymm_23_51 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_51, lhs_mat_ymm_0123_51, 17); + __m256i lhs_mat_ymm_0123_60 = _mm256_loadu_si256((const __m256i * )((a_ptr[b].qs + 384 + 512 * sb))); + __m256i lhs_mat_ymm_01_60 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_60, lhs_mat_ymm_0123_60, 0); + __m256i lhs_mat_ymm_23_60 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_60, lhs_mat_ymm_0123_60, 17); + __m256i lhs_mat_ymm_0123_61 = _mm256_loadu_si256((const __m256i * )((a_ptr[b].qs + 416 + 512 * sb))); + __m256i lhs_mat_ymm_01_61 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_61, lhs_mat_ymm_0123_61, 0); + __m256i lhs_mat_ymm_23_61 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_61, lhs_mat_ymm_0123_61, 17); + __m256i lhs_mat_ymm_0123_70 = _mm256_loadu_si256((const __m256i * )((a_ptr[b].qs + 448 + 512 * sb))); + __m256i lhs_mat_ymm_01_70 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_70, lhs_mat_ymm_0123_70, 0); + __m256i lhs_mat_ymm_23_70 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_70, lhs_mat_ymm_0123_70, 17); + __m256i lhs_mat_ymm_0123_71 = _mm256_loadu_si256((const __m256i * )((a_ptr[b].qs + 480 + 512 * sb))); + __m256i lhs_mat_ymm_01_71 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_71, lhs_mat_ymm_0123_71, 0); + __m256i lhs_mat_ymm_23_71 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_71, lhs_mat_ymm_0123_71, 17); + + + __m512i lhs_mat_01_00 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_01_00), lhs_mat_ymm_01_00, 1); + __m512i lhs_mat_23_00 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_23_00), lhs_mat_ymm_23_00, 1); + __m512i lhs_mat_01_01 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_01_01), lhs_mat_ymm_01_01, 1); + __m512i lhs_mat_23_01 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_23_01), lhs_mat_ymm_23_01, 1); + + __m512i lhs_mat_01_10 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_01_10), lhs_mat_ymm_01_10, 1); + __m512i lhs_mat_23_10 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_23_10), lhs_mat_ymm_23_10, 1); + __m512i lhs_mat_01_11 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_01_11), lhs_mat_ymm_01_11, 1); + __m512i lhs_mat_23_11 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_23_11), lhs_mat_ymm_23_11, 1); + + __m512i lhs_mat_01_20 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_01_20), lhs_mat_ymm_01_20, 1); + __m512i lhs_mat_23_20 = 
_mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_23_20), lhs_mat_ymm_23_20, 1); + __m512i lhs_mat_01_21 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_01_21), lhs_mat_ymm_01_21, 1); + __m512i lhs_mat_23_21 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_23_21), lhs_mat_ymm_23_21, 1); + + __m512i lhs_mat_01_30 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_01_30), lhs_mat_ymm_01_30, 1); + __m512i lhs_mat_23_30 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_23_30), lhs_mat_ymm_23_30, 1); + __m512i lhs_mat_01_31 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_01_31), lhs_mat_ymm_01_31, 1); + __m512i lhs_mat_23_31 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_23_31), lhs_mat_ymm_23_31, 1); + + __m512i lhs_mat_01_40 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_01_40), lhs_mat_ymm_01_40, 1); + __m512i lhs_mat_23_40 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_23_40), lhs_mat_ymm_23_40, 1); + __m512i lhs_mat_01_41 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_01_41), lhs_mat_ymm_01_41, 1); + __m512i lhs_mat_23_41 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_23_41), lhs_mat_ymm_23_41, 1); + + __m512i lhs_mat_01_50 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_01_50), lhs_mat_ymm_01_50, 1); + __m512i lhs_mat_23_50 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_23_50), lhs_mat_ymm_23_50, 1); + __m512i lhs_mat_01_51 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_01_51), lhs_mat_ymm_01_51, 1); + __m512i lhs_mat_23_51 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_23_51), lhs_mat_ymm_23_51, 1); + + __m512i lhs_mat_01_60 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_01_60), lhs_mat_ymm_01_60, 1); + __m512i lhs_mat_23_60 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_23_60), lhs_mat_ymm_23_60, 1); + __m512i lhs_mat_01_61 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_01_61), lhs_mat_ymm_01_61, 1); + __m512i lhs_mat_23_61 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_23_61), lhs_mat_ymm_23_61, 1); + + __m512i lhs_mat_01_70 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_01_70), lhs_mat_ymm_01_70, 1); + __m512i lhs_mat_23_70 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_23_70), lhs_mat_ymm_23_70, 1); + __m512i lhs_mat_01_71 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_01_71), lhs_mat_ymm_01_71, 1); + __m512i lhs_mat_23_71 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_23_71), lhs_mat_ymm_23_71, 1); + + __m512i lhs_mat_s_01_00 = _mm512_maddubs_epi16(m32s_expanded, lhs_mat_01_00); + __m512i lhs_mat_s_23_00 = _mm512_maddubs_epi16(m32s_expanded, lhs_mat_23_00); + __m512i lhs_mat_s_01_01 = _mm512_maddubs_epi16(m32s_expanded, lhs_mat_01_01); + __m512i lhs_mat_s_23_01 = _mm512_maddubs_epi16(m32s_expanded, lhs_mat_23_01); + __m512i lhs_mat_s_01_10 = _mm512_maddubs_epi16(m32s_expanded, lhs_mat_01_10); + __m512i lhs_mat_s_23_10 = _mm512_maddubs_epi16(m32s_expanded, lhs_mat_23_10); + __m512i lhs_mat_s_01_11 = _mm512_maddubs_epi16(m32s_expanded, lhs_mat_01_11); + __m512i lhs_mat_s_23_11 = _mm512_maddubs_epi16(m32s_expanded, lhs_mat_23_11); + __m512i lhs_mat_s_01_20 = _mm512_maddubs_epi16(m32s_expanded, lhs_mat_01_20); + __m512i lhs_mat_s_23_20 = _mm512_maddubs_epi16(m32s_expanded, lhs_mat_23_20); + __m512i lhs_mat_s_01_21 = _mm512_maddubs_epi16(m32s_expanded, lhs_mat_01_21); + __m512i lhs_mat_s_23_21 = _mm512_maddubs_epi16(m32s_expanded, lhs_mat_23_21); + __m512i lhs_mat_s_01_30 = 
_mm512_maddubs_epi16(m32s_expanded, lhs_mat_01_30); + __m512i lhs_mat_s_23_30 = _mm512_maddubs_epi16(m32s_expanded, lhs_mat_23_30); + __m512i lhs_mat_s_01_31 = _mm512_maddubs_epi16(m32s_expanded, lhs_mat_01_31); + __m512i lhs_mat_s_23_31 = _mm512_maddubs_epi16(m32s_expanded, lhs_mat_23_31); + __m512i lhs_mat_s_01_40 = _mm512_maddubs_epi16(m32s_expanded, lhs_mat_01_40); + __m512i lhs_mat_s_23_40 = _mm512_maddubs_epi16(m32s_expanded, lhs_mat_23_40); + __m512i lhs_mat_s_01_41 = _mm512_maddubs_epi16(m32s_expanded, lhs_mat_01_41); + __m512i lhs_mat_s_23_41 = _mm512_maddubs_epi16(m32s_expanded, lhs_mat_23_41); + __m512i lhs_mat_s_01_50 = _mm512_maddubs_epi16(m32s_expanded, lhs_mat_01_50); + __m512i lhs_mat_s_23_50 = _mm512_maddubs_epi16(m32s_expanded, lhs_mat_23_50); + __m512i lhs_mat_s_01_51 = _mm512_maddubs_epi16(m32s_expanded, lhs_mat_01_51); + __m512i lhs_mat_s_23_51 = _mm512_maddubs_epi16(m32s_expanded, lhs_mat_23_51); + __m512i lhs_mat_s_01_60 = _mm512_maddubs_epi16(m32s_expanded, lhs_mat_01_60); + __m512i lhs_mat_s_23_60 = _mm512_maddubs_epi16(m32s_expanded, lhs_mat_23_60); + __m512i lhs_mat_s_01_61 = _mm512_maddubs_epi16(m32s_expanded, lhs_mat_01_61); + __m512i lhs_mat_s_23_61 = _mm512_maddubs_epi16(m32s_expanded, lhs_mat_23_61); + __m512i lhs_mat_s_01_70 = _mm512_maddubs_epi16(m32s_expanded, lhs_mat_01_70); + __m512i lhs_mat_s_23_70 = _mm512_maddubs_epi16(m32s_expanded, lhs_mat_23_70); + __m512i lhs_mat_s_01_71 = _mm512_maddubs_epi16(m32s_expanded, lhs_mat_01_71); + __m512i lhs_mat_s_23_71 = _mm512_maddubs_epi16(m32s_expanded, lhs_mat_23_71); + + // Shuffle pattern one - left side input + const __m512i lhs_mat_01_00_sp1 = _mm512_shuffle_epi32(lhs_mat_01_00, (_MM_PERM_ENUM)160); //A00(0-3) A00(0-3) A01(0-3) A01(0-3) A00(0-3) A00(0-3) A01(0-3) A01(0-3) + const __m512i lhs_mat_23_00_sp1 = _mm512_shuffle_epi32(lhs_mat_23_00, (_MM_PERM_ENUM)160); //A02(0-3) A03(0-3) A02(0-3) A03(0-3) A02(0-3) A03(0-3) A02(0-3) A03(0-3) + + const __m512i lhs_mat_01_01_sp1 = _mm512_shuffle_epi32(lhs_mat_01_01, (_MM_PERM_ENUM)160); //A00(8-11) A00(8-11) A01(8-11) A01(8-11) A00(8-11) A00(8-11) A01(8-11) A01(8-11) + const __m512i lhs_mat_23_01_sp1 = _mm512_shuffle_epi32(lhs_mat_23_01, (_MM_PERM_ENUM)160); //A02(8-11) A03(8-11) A02(8-11) A03(8-11) A02(8-11) A03(8-11) A02(8-11) A03(8-11) + + const __m512i lhs_mat_01_10_sp1 = _mm512_shuffle_epi32(lhs_mat_01_10, (_MM_PERM_ENUM)160); //A10(0-3) A10(0-3) A11(0-3) A11(0-3) A10(0-3) A10(0-3) A11(0-3) A11(0-3) + const __m512i lhs_mat_23_10_sp1 = _mm512_shuffle_epi32(lhs_mat_23_10, (_MM_PERM_ENUM)160); //A12(0-3) A13(0-3) A12(0-3) A13(0-3) A12(0-3) A13(0-3) A12(0-3) A13(0-3) + + const __m512i lhs_mat_01_11_sp1 = _mm512_shuffle_epi32(lhs_mat_01_11, (_MM_PERM_ENUM)160); //A10(8-11) A10(8-11) A11(8-11) A11(8-11) A10(8-11) A10(8-11) A11(8-11) A11(8-11) + const __m512i lhs_mat_23_11_sp1 = _mm512_shuffle_epi32(lhs_mat_23_11, (_MM_PERM_ENUM)160); //A12(8-11) A13(8-11) A12(8-11) A13(8-11) A12(8-11) A13(8-11) A12(8-11) A13(8-11) + + const __m512i lhs_mat_01_20_sp1 = _mm512_shuffle_epi32(lhs_mat_01_20, (_MM_PERM_ENUM)160); //A20(0-3) A20(0-3) A21(0-3) A21(0-3) A20(0-3) A20(0-3) A21(0-3) A21(0-3) + const __m512i lhs_mat_23_20_sp1 = _mm512_shuffle_epi32(lhs_mat_23_20, (_MM_PERM_ENUM)160); //A22(0-3) A23(0-3) A22(0-3) A23(0-3) A22(0-3) A23(0-3) A22(0-3) A23(0-3) + + const __m512i lhs_mat_01_21_sp1 = _mm512_shuffle_epi32(lhs_mat_01_21, (_MM_PERM_ENUM)160); //A20(8-11) A20(8-11) A21(8-11) A21(8-11) A20(8-11) A20(8-11) A21(8-11) A21(8-11) + const __m512i lhs_mat_23_21_sp1 = 
_mm512_shuffle_epi32(lhs_mat_23_21, (_MM_PERM_ENUM)160); //A22(8-11) A23(8-11) A22(8-11) A23(8-11) A22(8-11) A23(8-11) A22(8-11) A23(8-11) + + const __m512i lhs_mat_01_30_sp1 = _mm512_shuffle_epi32(lhs_mat_01_30, (_MM_PERM_ENUM)160); //A30(0-3) A30(0-3) A31(0-3) A31(0-3) A30(0-3) A30(0-3) A31(0-3) A31(0-3) + const __m512i lhs_mat_23_30_sp1 = _mm512_shuffle_epi32(lhs_mat_23_30, (_MM_PERM_ENUM)160); //A32(0-3) A33(0-3) A32(0-3) A33(0-3) A32(0-3) A33(0-3) A32(0-3) A33(0-3) + + const __m512i lhs_mat_01_31_sp1 = _mm512_shuffle_epi32(lhs_mat_01_31, (_MM_PERM_ENUM)160); //A30(8-11) A30(8-11) A31(8-11) A31(8-11) A30(8-11) A30(8-11) A31(8-11) A31(8-11) + const __m512i lhs_mat_23_31_sp1 = _mm512_shuffle_epi32(lhs_mat_23_31, (_MM_PERM_ENUM)160); //A32(8-11) A33(8-11) A32(8-11) A33(8-11) A32(8-11) A33(8-11) A32(8-11) A33(8-11) + + const __m512i lhs_mat_01_40_sp1 = _mm512_shuffle_epi32(lhs_mat_01_40, (_MM_PERM_ENUM)160); //A40(0-3) A40(0-3) A41(0-3) A41(0-3) A40(0-3) A40(0-3) A41(0-3) A41(0-3) + const __m512i lhs_mat_23_40_sp1 = _mm512_shuffle_epi32(lhs_mat_23_40, (_MM_PERM_ENUM)160); //A42(0-3) A43(0-3) A42(0-3) A43(0-3) A42(0-3) A43(0-3) A42(0-3) A43(0-3) + + const __m512i lhs_mat_01_41_sp1 = _mm512_shuffle_epi32(lhs_mat_01_41, (_MM_PERM_ENUM)160); //A40(8-11) A40(8-11) A41(8-11) A41(8-11) A40(8-11) A40(8-11) A41(8-11) A41(8-11) + const __m512i lhs_mat_23_41_sp1 = _mm512_shuffle_epi32(lhs_mat_23_41, (_MM_PERM_ENUM)160); //A42(8-11) A43(8-11) A42(8-11) A43(8-11) A42(8-11) A43(8-11) A42(8-11) A43(8-11) + + const __m512i lhs_mat_01_50_sp1 = _mm512_shuffle_epi32(lhs_mat_01_50, (_MM_PERM_ENUM)160); //A50(0-3) A50(0-3) A51(0-3) A51(0-3) A50(0-3) A50(0-3) A51(0-3) A51(0-3) + const __m512i lhs_mat_23_50_sp1 = _mm512_shuffle_epi32(lhs_mat_23_50, (_MM_PERM_ENUM)160); //A52(0-3) A53(0-3) A52(0-3) A53(0-3) A52(0-3) A53(0-3) A52(0-3) A53(0-3) + + const __m512i lhs_mat_01_51_sp1 = _mm512_shuffle_epi32(lhs_mat_01_51, (_MM_PERM_ENUM)160); //A50(8-11) A50(8-11) A51(8-11) A51(8-11) A50(8-11) A50(8-11) A51(8-11) A51(8-11) + const __m512i lhs_mat_23_51_sp1 = _mm512_shuffle_epi32(lhs_mat_23_51, (_MM_PERM_ENUM)160); //A52(8-11) A53(8-11) A52(8-11) A53(8-11) A52(8-11) A53(8-11) A52(8-11) A53(8-11) + + const __m512i lhs_mat_01_60_sp1 = _mm512_shuffle_epi32(lhs_mat_01_60, (_MM_PERM_ENUM)160); //A60(0-3) A60(0-3) A61(0-3) A61(0-3) A60(0-3) A60(0-3) A61(0-3) A61(0-3) + const __m512i lhs_mat_23_60_sp1 = _mm512_shuffle_epi32(lhs_mat_23_60, (_MM_PERM_ENUM)160); //A62(0-3) A63(0-3) A62(0-3) A63(0-3) A62(0-3) A63(0-3) A62(0-3) A63(0-3) + + const __m512i lhs_mat_01_61_sp1 = _mm512_shuffle_epi32(lhs_mat_01_61, (_MM_PERM_ENUM)160); //A60(8-11) A60(8-11) A61(8-11) A61(8-11) A60(8-11) A60(8-11) A61(8-11) A61(8-11) + const __m512i lhs_mat_23_61_sp1 = _mm512_shuffle_epi32(lhs_mat_23_61, (_MM_PERM_ENUM)160); //A62(8-11) A63(8-11) A62(8-11) A63(8-11) A62(8-11) A63(8-11) A62(8-11) A63(8-11) + + const __m512i lhs_mat_01_70_sp1 = _mm512_shuffle_epi32(lhs_mat_01_70, (_MM_PERM_ENUM)160); //A70(0-3) A70(0-3) A71(0-3) A71(0-3) A70(0-3) A70(0-3) A71(0-3) A71(0-3) + const __m512i lhs_mat_23_70_sp1 = _mm512_shuffle_epi32(lhs_mat_23_70, (_MM_PERM_ENUM)160); //A72(0-3) A73(0-3) A72(0-3) A73(0-3) A72(0-3) A73(0-3) A72(0-3) A73(0-3) + + const __m512i lhs_mat_01_71_sp1 = _mm512_shuffle_epi32(lhs_mat_01_71, (_MM_PERM_ENUM)160); //A70(8-11) A70(8-11) A71(8-11) A71(8-11) A70(8-11) A70(8-11) A71(8-11) A71(8-11) + const __m512i lhs_mat_23_71_sp1 = _mm512_shuffle_epi32(lhs_mat_23_71, (_MM_PERM_ENUM)160); //A72(8-11) A73(8-11) A72(8-11) A73(8-11) A72(8-11) 
A73(8-11) A72(8-11) A73(8-11) + + // Shuffle pattern two- left side input + const __m512i lhs_mat_01_00_sp2 = _mm512_shuffle_epi32(lhs_mat_01_00, (_MM_PERM_ENUM)245); //A00(4-7) A00(4-7) A01(4-7) A01(4-7) A00(4-7) A00(4-7) A01(4-7) A01(4-7) + const __m512i lhs_mat_23_00_sp2 = _mm512_shuffle_epi32(lhs_mat_23_00, (_MM_PERM_ENUM)245); //A02(4-7) A03(4-7) A02(4-7) A03(4-7) A02(4-7) A03(4-7) A02(4-7) A03(4-7) + + const __m512i lhs_mat_01_01_sp2 = _mm512_shuffle_epi32(lhs_mat_01_01, (_MM_PERM_ENUM)245); //A00(12-15) A00(12-15) A01(12-15) A01(12-15) A00(12-15) A00(12-15) A01(12-15) A01(12-15) + const __m512i lhs_mat_23_01_sp2 = _mm512_shuffle_epi32(lhs_mat_23_01, (_MM_PERM_ENUM)245); //A02(12-15) A03(12-15) A02(12-15) A03(12-15) A02(12-15) A03(12-15) A02(12-15) A03(12-15) + + const __m512i lhs_mat_01_10_sp2 = _mm512_shuffle_epi32(lhs_mat_01_10, (_MM_PERM_ENUM)245); //A10(4-7) A10(4-7) A11(4-7) A11(4-7) A10(4-7) A10(4-7) A11(4-7) A11(4-7) + const __m512i lhs_mat_23_10_sp2 = _mm512_shuffle_epi32(lhs_mat_23_10, (_MM_PERM_ENUM)245); //A12(4-7) A13(4-7) A12(4-7) A13(4-7) A12(4-7) A13(4-7) A12(4-7) A13(4-7) + + const __m512i lhs_mat_01_11_sp2 = _mm512_shuffle_epi32(lhs_mat_01_11, (_MM_PERM_ENUM)245); //A10(12-15) A10(12-15) A11(12-15) A11(12-15) A10(12-15) A10(12-15) A11(12-15) A11(12-15) + const __m512i lhs_mat_23_11_sp2 = _mm512_shuffle_epi32(lhs_mat_23_11, (_MM_PERM_ENUM)245); //A12(12-15) A13(12-15) A12(12-15) A13(12-15) A12(12-15) A13(12-15) A12(12-15) A13(12-15) + + const __m512i lhs_mat_01_20_sp2 = _mm512_shuffle_epi32(lhs_mat_01_20, (_MM_PERM_ENUM)245); //A20(4-7) A20(4-7) A21(4-7) A21(4-7) A20(4-7) A20(4-7) A21(4-7) A21(4-7) + const __m512i lhs_mat_23_20_sp2 = _mm512_shuffle_epi32(lhs_mat_23_20, (_MM_PERM_ENUM)245); //A22(4-7) A23(4-7) A22(4-7) A23(4-7) A22(4-7) A23(4-7) A22(4-7) A23(4-7) + + const __m512i lhs_mat_01_21_sp2 = _mm512_shuffle_epi32(lhs_mat_01_21, (_MM_PERM_ENUM)245); //A20(12-15) A20(12-15) A21(12-15) A21(12-15) A20(12-15) A20(12-15) A21(12-15) A21(12-15) + const __m512i lhs_mat_23_21_sp2 = _mm512_shuffle_epi32(lhs_mat_23_21, (_MM_PERM_ENUM)245); //A22(12-15) A23(12-15) A22(12-15) A23(12-15) A22(12-15) A23(12-15) A22(12-15) A23(12-15) + + const __m512i lhs_mat_01_30_sp2 = _mm512_shuffle_epi32(lhs_mat_01_30, (_MM_PERM_ENUM)245); //A30(4-7) A30(4-7) A31(4-7) A31(4-7) A30(4-7) A30(4-7) A31(4-7) A31(4-7) + const __m512i lhs_mat_23_30_sp2 = _mm512_shuffle_epi32(lhs_mat_23_30, (_MM_PERM_ENUM)245); //A32(4-7) A33(4-7) A32(4-7) A33(4-7) A32(4-7) A33(4-7) A32(4-7) A33(4-7) + + const __m512i lhs_mat_01_31_sp2 = _mm512_shuffle_epi32(lhs_mat_01_31, (_MM_PERM_ENUM)245); //A30(12-15) A30(12-15) A31(12-15) A31(12-15) A30(12-15) A30(12-15) A31(12-15) A31(12-15) + const __m512i lhs_mat_23_31_sp2 = _mm512_shuffle_epi32(lhs_mat_23_31, (_MM_PERM_ENUM)245); //A32(12-15) A33(12-15) A32(12-15) A33(12-15) A32(12-15) A33(12-15) A32(12-15) A33(12-15) + + const __m512i lhs_mat_01_40_sp2 = _mm512_shuffle_epi32(lhs_mat_01_40, (_MM_PERM_ENUM)245); //A40(4-7) A40(4-7) A41(4-7) A41(4-7) A40(4-7) A40(4-7) A41(4-7) A41(4-7) + const __m512i lhs_mat_23_40_sp2 = _mm512_shuffle_epi32(lhs_mat_23_40, (_MM_PERM_ENUM)245); //A42(4-7) A43(4-7) A42(4-7) A43(4-7) A42(4-7) A43(4-7) A42(4-7) A43(4-7) + + const __m512i lhs_mat_01_41_sp2 = _mm512_shuffle_epi32(lhs_mat_01_41, (_MM_PERM_ENUM)245); //A40(12-15) A40(12-15) A41(12-15) A41(12-15) A40(12-15) A40(12-15) A41(12-15) A41(12-15) + const __m512i lhs_mat_23_41_sp2 = _mm512_shuffle_epi32(lhs_mat_23_41, (_MM_PERM_ENUM)245); //A42(12-15) A43(12-15) A42(12-15) A43(12-15) 
A42(12-15) A43(12-15) A42(12-15) A43(12-15) + + const __m512i lhs_mat_01_50_sp2 = _mm512_shuffle_epi32(lhs_mat_01_50, (_MM_PERM_ENUM)245); //A50(4-7) A50(4-7) A51(4-7) A51(4-7) A50(4-7) A50(4-7) A51(4-7) A51(4-7) + const __m512i lhs_mat_23_50_sp2 = _mm512_shuffle_epi32(lhs_mat_23_50, (_MM_PERM_ENUM)245); //A52(4-7) A53(4-7) A52(4-7) A53(4-7) A52(4-7) A53(4-7) A52(4-7) A53(4-7) + + const __m512i lhs_mat_01_51_sp2 = _mm512_shuffle_epi32(lhs_mat_01_51, (_MM_PERM_ENUM)245); //A50(12-15) A50(12-15) A51(12-15) A51(12-15) A50(12-15) A50(12-15) A51(12-15) A51(12-15) + const __m512i lhs_mat_23_51_sp2 = _mm512_shuffle_epi32(lhs_mat_23_51, (_MM_PERM_ENUM)245); //A52(12-15) A53(12-15) A52(12-15) A53(12-15) A52(12-15) A53(12-15) A52(12-15) A53(12-15) + + const __m512i lhs_mat_01_60_sp2 = _mm512_shuffle_epi32(lhs_mat_01_60, (_MM_PERM_ENUM)245); //A60(4-7) A60(4-7) A61(4-7) A61(4-7) A60(4-7) A60(4-7) A61(4-7) A61(4-7) + const __m512i lhs_mat_23_60_sp2 = _mm512_shuffle_epi32(lhs_mat_23_60, (_MM_PERM_ENUM)245); //A62(4-7) A63(4-7) A62(4-7) A63(4-7) A62(4-7) A63(4-7) A62(4-7) A63(4-7) + + const __m512i lhs_mat_01_61_sp2 = _mm512_shuffle_epi32(lhs_mat_01_61, (_MM_PERM_ENUM)245); //A60(12-15) A60(12-15) A61(12-15) A61(12-15) A60(12-15) A60(12-15) A61(12-15) A61(12-15) + const __m512i lhs_mat_23_61_sp2 = _mm512_shuffle_epi32(lhs_mat_23_61, (_MM_PERM_ENUM)245); //A62(12-15) A63(12-15) A62(12-15) A63(12-15) A62(12-15) A63(12-15) A62(12-15) A63(12-15) + + const __m512i lhs_mat_01_70_sp2 = _mm512_shuffle_epi32(lhs_mat_01_70, (_MM_PERM_ENUM)245); //A70(4-7) A70(4-7) A71(4-7) A71(4-7) A70(4-7) A70(4-7) A71(4-7) A71(4-7) + const __m512i lhs_mat_23_70_sp2 = _mm512_shuffle_epi32(lhs_mat_23_70, (_MM_PERM_ENUM)245); //A72(4-7) A73(4-7) A72(4-7) A73(4-7) A72(4-7) A73(4-7) A72(4-7) A73(4-7) + + const __m512i lhs_mat_01_71_sp2 = _mm512_shuffle_epi32(lhs_mat_01_71, (_MM_PERM_ENUM)245); //A70(12-15) A70(12-15) A71(12-15) A71(12-15) A70(12-15) A70(12-15) A71(12-15) A71(12-15) + const __m512i lhs_mat_23_71_sp2 = _mm512_shuffle_epi32(lhs_mat_23_71, (_MM_PERM_ENUM)245); //A72(12-15) A73(12-15) A72(12-15) A73(12-15) A72(12-15) A73(12-15) A72(12-15) A73(12-15) + + // Shuffle pattern one - left side input + const __m512i lhs_mat_s_01_00_sp1 = _mm512_shuffle_epi32(lhs_mat_s_01_00, (_MM_PERM_ENUM)160); //A00(0-3) A00(0-3) A01(0-3) A01(0-3) A00(0-3) A00(0-3) A01(0-3) A01(0-3) + const __m512i lhs_mat_s_23_00_sp1 = _mm512_shuffle_epi32(lhs_mat_s_23_00, (_MM_PERM_ENUM)160); //A02(0-3) A03(0-3) A02(0-3) A03(0-3) A02(0-3) A03(0-3) A02(0-3) A03(0-3) + + const __m512i lhs_mat_s_01_01_sp1 = _mm512_shuffle_epi32(lhs_mat_s_01_01, (_MM_PERM_ENUM)160); //A00(8-11) A00(8-11) A01(8-11) A01(8-11) A00(8-11) A00(8-11) A01(8-11) A01(8-11) + const __m512i lhs_mat_s_23_01_sp1 = _mm512_shuffle_epi32(lhs_mat_s_23_01, (_MM_PERM_ENUM)160); //A02(8-11) A03(8-11) A02(8-11) A03(8-11) A02(8-11) A03(8-11) A02(8-11) A03(8-11) + + const __m512i lhs_mat_s_01_10_sp1 = _mm512_shuffle_epi32(lhs_mat_s_01_10, (_MM_PERM_ENUM)160); //A10(0-3) A10(0-3) A11(0-3) A11(0-3) A10(0-3) A10(0-3) A11(0-3) A11(0-3) + const __m512i lhs_mat_s_23_10_sp1 = _mm512_shuffle_epi32(lhs_mat_s_23_10, (_MM_PERM_ENUM)160); //A12(0-3) A13(0-3) A12(0-3) A13(0-3) A12(0-3) A13(0-3) A12(0-3) A13(0-3) + + const __m512i lhs_mat_s_01_11_sp1 = _mm512_shuffle_epi32(lhs_mat_s_01_11, (_MM_PERM_ENUM)160); //A10(8-11) A10(8-11) A11(8-11) A11(8-11) A10(8-11) A10(8-11) A11(8-11) A11(8-11) + const __m512i lhs_mat_s_23_11_sp1 = _mm512_shuffle_epi32(lhs_mat_s_23_11, (_MM_PERM_ENUM)160); //A12(8-11) A13(8-11) 
A12(8-11) A13(8-11) A12(8-11) A13(8-11) A12(8-11) A13(8-11) + + const __m512i lhs_mat_s_01_20_sp1 = _mm512_shuffle_epi32(lhs_mat_s_01_20, (_MM_PERM_ENUM)160); //A20(0-3) A20(0-3) A21(0-3) A21(0-3) A20(0-3) A20(0-3) A21(0-3) A21(0-3) + const __m512i lhs_mat_s_23_20_sp1 = _mm512_shuffle_epi32(lhs_mat_s_23_20, (_MM_PERM_ENUM)160); //A22(0-3) A23(0-3) A22(0-3) A23(0-3) A22(0-3) A23(0-3) A22(0-3) A23(0-3) + + const __m512i lhs_mat_s_01_21_sp1 = _mm512_shuffle_epi32(lhs_mat_s_01_21, (_MM_PERM_ENUM)160); //A20(8-11) A20(8-11) A21(8-11) A21(8-11) A20(8-11) A20(8-11) A21(8-11) A21(8-11) + const __m512i lhs_mat_s_23_21_sp1 = _mm512_shuffle_epi32(lhs_mat_s_23_21, (_MM_PERM_ENUM)160); //A22(8-11) A23(8-11) A22(8-11) A23(8-11) A22(8-11) A23(8-11) A22(8-11) A23(8-11) + + const __m512i lhs_mat_s_01_30_sp1 = _mm512_shuffle_epi32(lhs_mat_s_01_30, (_MM_PERM_ENUM)160); //A30(0-3) A30(0-3) A31(0-3) A31(0-3) A30(0-3) A30(0-3) A31(0-3) A31(0-3) + const __m512i lhs_mat_s_23_30_sp1 = _mm512_shuffle_epi32(lhs_mat_s_23_30, (_MM_PERM_ENUM)160); //A32(0-3) A33(0-3) A32(0-3) A33(0-3) A32(0-3) A33(0-3) A32(0-3) A33(0-3) + + const __m512i lhs_mat_s_01_31_sp1 = _mm512_shuffle_epi32(lhs_mat_s_01_31, (_MM_PERM_ENUM)160); //A30(8-11) A30(8-11) A31(8-11) A31(8-11) A30(8-11) A30(8-11) A31(8-11) A31(8-11) + const __m512i lhs_mat_s_23_31_sp1 = _mm512_shuffle_epi32(lhs_mat_s_23_31, (_MM_PERM_ENUM)160); //A32(8-11) A33(8-11) A32(8-11) A33(8-11) A32(8-11) A33(8-11) A32(8-11) A33(8-11) + + const __m512i lhs_mat_s_01_40_sp1 = _mm512_shuffle_epi32(lhs_mat_s_01_40, (_MM_PERM_ENUM)160); //A40(0-3) A40(0-3) A41(0-3) A41(0-3) A40(0-3) A40(0-3) A41(0-3) A41(0-3) + const __m512i lhs_mat_s_23_40_sp1 = _mm512_shuffle_epi32(lhs_mat_s_23_40, (_MM_PERM_ENUM)160); //A42(0-3) A43(0-3) A42(0-3) A43(0-3) A42(0-3) A43(0-3) A42(0-3) A43(0-3) + + const __m512i lhs_mat_s_01_41_sp1 = _mm512_shuffle_epi32(lhs_mat_s_01_41, (_MM_PERM_ENUM)160); //A40(8-11) A40(8-11) A41(8-11) A41(8-11) A40(8-11) A40(8-11) A41(8-11) A41(8-11) + const __m512i lhs_mat_s_23_41_sp1 = _mm512_shuffle_epi32(lhs_mat_s_23_41, (_MM_PERM_ENUM)160); //A42(8-11) A43(8-11) A42(8-11) A43(8-11) A42(8-11) A43(8-11) A42(8-11) A43(8-11) + + const __m512i lhs_mat_s_01_50_sp1 = _mm512_shuffle_epi32(lhs_mat_s_01_50, (_MM_PERM_ENUM)160); //A50(0-3) A50(0-3) A51(0-3) A51(0-3) A50(0-3) A50(0-3) A51(0-3) A51(0-3) + const __m512i lhs_mat_s_23_50_sp1 = _mm512_shuffle_epi32(lhs_mat_s_23_50, (_MM_PERM_ENUM)160); //A52(0-3) A53(0-3) A52(0-3) A53(0-3) A52(0-3) A53(0-3) A52(0-3) A53(0-3) + + const __m512i lhs_mat_s_01_51_sp1 = _mm512_shuffle_epi32(lhs_mat_s_01_51, (_MM_PERM_ENUM)160); //A50(8-11) A50(8-11) A51(8-11) A51(8-11) A50(8-11) A50(8-11) A51(8-11) A51(8-11) + const __m512i lhs_mat_s_23_51_sp1 = _mm512_shuffle_epi32(lhs_mat_s_23_51, (_MM_PERM_ENUM)160); //A52(8-11) A53(8-11) A52(8-11) A53(8-11) A52(8-11) A53(8-11) A52(8-11) A53(8-11) + + const __m512i lhs_mat_s_01_60_sp1 = _mm512_shuffle_epi32(lhs_mat_s_01_60, (_MM_PERM_ENUM)160); //A60(0-3) A60(0-3) A61(0-3) A61(0-3) A60(0-3) A60(0-3) A61(0-3) A61(0-3) + const __m512i lhs_mat_s_23_60_sp1 = _mm512_shuffle_epi32(lhs_mat_s_23_60, (_MM_PERM_ENUM)160); //A62(0-3) A63(0-3) A62(0-3) A63(0-3) A62(0-3) A63(0-3) A62(0-3) A63(0-3) + + const __m512i lhs_mat_s_01_61_sp1 = _mm512_shuffle_epi32(lhs_mat_s_01_61, (_MM_PERM_ENUM)160); //A60(8-11) A60(8-11) A61(8-11) A61(8-11) A60(8-11) A60(8-11) A61(8-11) A61(8-11) + const __m512i lhs_mat_s_23_61_sp1 = _mm512_shuffle_epi32(lhs_mat_s_23_61, (_MM_PERM_ENUM)160); //A62(8-11) A63(8-11) A62(8-11) A63(8-11) A62(8-11) 
A63(8-11) A62(8-11) A63(8-11) + + const __m512i lhs_mat_s_01_70_sp1 = _mm512_shuffle_epi32(lhs_mat_s_01_70, (_MM_PERM_ENUM)160); //A70(0-3) A70(0-3) A71(0-3) A71(0-3) A70(0-3) A70(0-3) A71(0-3) A71(0-3) + const __m512i lhs_mat_s_23_70_sp1 = _mm512_shuffle_epi32(lhs_mat_s_23_70, (_MM_PERM_ENUM)160); //A72(0-3) A73(0-3) A72(0-3) A73(0-3) A72(0-3) A73(0-3) A72(0-3) A73(0-3) + + const __m512i lhs_mat_s_01_71_sp1 = _mm512_shuffle_epi32(lhs_mat_s_01_71, (_MM_PERM_ENUM)160); //A70(8-11) A70(8-11) A71(8-11) A71(8-11) A70(8-11) A70(8-11) A71(8-11) A71(8-11) + const __m512i lhs_mat_s_23_71_sp1 = _mm512_shuffle_epi32(lhs_mat_s_23_71, (_MM_PERM_ENUM)160); //A72(8-11) A73(8-11) A72(8-11) A73(8-11) A72(8-11) A73(8-11) A72(8-11) A73(8-11) + + // Shuffle pattern two- left side input + const __m512i lhs_mat_s_01_00_sp2 = _mm512_shuffle_epi32(lhs_mat_s_01_00, (_MM_PERM_ENUM)245); //A00(4-7) A00(4-7) A01(4-7) A01(4-7) A00(4-7) A00(4-7) A01(4-7) A01(4-7) + const __m512i lhs_mat_s_23_00_sp2 = _mm512_shuffle_epi32(lhs_mat_s_23_00, (_MM_PERM_ENUM)245); //A02(4-7) A03(4-7) A02(4-7) A03(4-7) A02(4-7) A03(4-7) A02(4-7) A03(4-7) + + const __m512i lhs_mat_s_01_01_sp2 = _mm512_shuffle_epi32(lhs_mat_s_01_01, (_MM_PERM_ENUM)245); //A00(12-15) A00(12-15) A01(12-15) A01(12-15) A00(12-15) A00(12-15) A01(12-15) A01(12-15) + const __m512i lhs_mat_s_23_01_sp2 = _mm512_shuffle_epi32(lhs_mat_s_23_01, (_MM_PERM_ENUM)245); //A02(12-15) A03(12-15) A02(12-15) A03(12-15) A02(12-15) A03(12-15) A02(12-15) A03(12-15) + + const __m512i lhs_mat_s_01_10_sp2 = _mm512_shuffle_epi32(lhs_mat_s_01_10, (_MM_PERM_ENUM)245); //A10(4-7) A10(4-7) A11(4-7) A11(4-7) A10(4-7) A10(4-7) A11(4-7) A11(4-7) + const __m512i lhs_mat_s_23_10_sp2 = _mm512_shuffle_epi32(lhs_mat_s_23_10, (_MM_PERM_ENUM)245); //A12(4-7) A13(4-7) A12(4-7) A13(4-7) A12(4-7) A13(4-7) A12(4-7) A13(4-7) + + const __m512i lhs_mat_s_01_11_sp2 = _mm512_shuffle_epi32(lhs_mat_s_01_11, (_MM_PERM_ENUM)245); //A10(12-15) A10(12-15) A11(12-15) A11(12-15) A10(12-15) A10(12-15) A11(12-15) A11(12-15) + const __m512i lhs_mat_s_23_11_sp2 = _mm512_shuffle_epi32(lhs_mat_s_23_11, (_MM_PERM_ENUM)245); //A12(12-15) A13(12-15) A12(12-15) A13(12-15) A12(12-15) A13(12-15) A12(12-15) A13(12-15) + + const __m512i lhs_mat_s_01_20_sp2 = _mm512_shuffle_epi32(lhs_mat_s_01_20, (_MM_PERM_ENUM)245); //A20(4-7) A20(4-7) A21(4-7) A21(4-7) A20(4-7) A20(4-7) A21(4-7) A21(4-7) + const __m512i lhs_mat_s_23_20_sp2 = _mm512_shuffle_epi32(lhs_mat_s_23_20, (_MM_PERM_ENUM)245); //A22(4-7) A23(4-7) A22(4-7) A23(4-7) A22(4-7) A23(4-7) A22(4-7) A23(4-7) + + const __m512i lhs_mat_s_01_21_sp2 = _mm512_shuffle_epi32(lhs_mat_s_01_21, (_MM_PERM_ENUM)245); //A20(12-15) A20(12-15) A21(12-15) A21(12-15) A20(12-15) A20(12-15) A21(12-15) A21(12-15) + const __m512i lhs_mat_s_23_21_sp2 = _mm512_shuffle_epi32(lhs_mat_s_23_21, (_MM_PERM_ENUM)245); //A22(12-15) A23(12-15) A22(12-15) A23(12-15) A22(12-15) A23(12-15) A22(12-15) A23(12-15) + + const __m512i lhs_mat_s_01_30_sp2 = _mm512_shuffle_epi32(lhs_mat_s_01_30, (_MM_PERM_ENUM)245); //A30(4-7) A30(4-7) A31(4-7) A31(4-7) A30(4-7) A30(4-7) A31(4-7) A31(4-7) + const __m512i lhs_mat_s_23_30_sp2 = _mm512_shuffle_epi32(lhs_mat_s_23_30, (_MM_PERM_ENUM)245); //A32(4-7) A33(4-7) A32(4-7) A33(4-7) A32(4-7) A33(4-7) A32(4-7) A33(4-7) + + const __m512i lhs_mat_s_01_31_sp2 = _mm512_shuffle_epi32(lhs_mat_s_01_31, (_MM_PERM_ENUM)245); //A30(12-15) A30(12-15) A31(12-15) A31(12-15) A30(12-15) A30(12-15) A31(12-15) A31(12-15) + const __m512i lhs_mat_s_23_31_sp2 = _mm512_shuffle_epi32(lhs_mat_s_23_31, 
(_MM_PERM_ENUM)245); //A32(12-15) A33(12-15) A32(12-15) A33(12-15) A32(12-15) A33(12-15) A32(12-15) A33(12-15) + + const __m512i lhs_mat_s_01_40_sp2 = _mm512_shuffle_epi32(lhs_mat_s_01_40, (_MM_PERM_ENUM)245); //A40(4-7) A40(4-7) A41(4-7) A41(4-7) A40(4-7) A40(4-7) A41(4-7) A41(4-7) + const __m512i lhs_mat_s_23_40_sp2 = _mm512_shuffle_epi32(lhs_mat_s_23_40, (_MM_PERM_ENUM)245); //A42(4-7) A43(4-7) A42(4-7) A43(4-7) A42(4-7) A43(4-7) A42(4-7) A43(4-7) + + const __m512i lhs_mat_s_01_41_sp2 = _mm512_shuffle_epi32(lhs_mat_s_01_41, (_MM_PERM_ENUM)245); //A40(12-15) A40(12-15) A41(12-15) A41(12-15) A40(12-15) A40(12-15) A41(12-15) A41(12-15) + const __m512i lhs_mat_s_23_41_sp2 = _mm512_shuffle_epi32(lhs_mat_s_23_41, (_MM_PERM_ENUM)245); //A42(12-15) A43(12-15) A42(12-15) A43(12-15) A42(12-15) A43(12-15) A42(12-15) A43(12-15) + + const __m512i lhs_mat_s_01_50_sp2 = _mm512_shuffle_epi32(lhs_mat_s_01_50, (_MM_PERM_ENUM)245); //A50(4-7) A50(4-7) A51(4-7) A51(4-7) A50(4-7) A50(4-7) A51(4-7) A51(4-7) + const __m512i lhs_mat_s_23_50_sp2 = _mm512_shuffle_epi32(lhs_mat_s_23_50, (_MM_PERM_ENUM)245); //A52(4-7) A53(4-7) A52(4-7) A53(4-7) A52(4-7) A53(4-7) A52(4-7) A53(4-7) + + const __m512i lhs_mat_s_01_51_sp2 = _mm512_shuffle_epi32(lhs_mat_s_01_51, (_MM_PERM_ENUM)245); //A50(12-15) A50(12-15) A51(12-15) A51(12-15) A50(12-15) A50(12-15) A51(12-15) A51(12-15) + const __m512i lhs_mat_s_23_51_sp2 = _mm512_shuffle_epi32(lhs_mat_s_23_51, (_MM_PERM_ENUM)245); //A52(12-15) A53(12-15) A52(12-15) A53(12-15) A52(12-15) A53(12-15) A52(12-15) A53(12-15) + + const __m512i lhs_mat_s_01_60_sp2 = _mm512_shuffle_epi32(lhs_mat_s_01_60, (_MM_PERM_ENUM)245); //A60(4-7) A60(4-7) A61(4-7) A61(4-7) A60(4-7) A60(4-7) A61(4-7) A61(4-7) + const __m512i lhs_mat_s_23_60_sp2 = _mm512_shuffle_epi32(lhs_mat_s_23_60, (_MM_PERM_ENUM)245); //A62(4-7) A63(4-7) A62(4-7) A63(4-7) A62(4-7) A63(4-7) A62(4-7) A63(4-7) + + const __m512i lhs_mat_s_01_61_sp2 = _mm512_shuffle_epi32(lhs_mat_s_01_61, (_MM_PERM_ENUM)245); //A60(12-15) A60(12-15) A61(12-15) A61(12-15) A60(12-15) A60(12-15) A61(12-15) A61(12-15) + const __m512i lhs_mat_s_23_61_sp2 = _mm512_shuffle_epi32(lhs_mat_s_23_61, (_MM_PERM_ENUM)245); //A62(12-15) A63(12-15) A62(12-15) A63(12-15) A62(12-15) A63(12-15) A62(12-15) A63(12-15) + + const __m512i lhs_mat_s_01_70_sp2 = _mm512_shuffle_epi32(lhs_mat_s_01_70, (_MM_PERM_ENUM)245); //A70(4-7) A70(4-7) A71(4-7) A71(4-7) A70(4-7) A70(4-7) A71(4-7) A71(4-7) + const __m512i lhs_mat_s_23_70_sp2 = _mm512_shuffle_epi32(lhs_mat_s_23_70, (_MM_PERM_ENUM)245); //A72(4-7) A73(4-7) A72(4-7) A73(4-7) A72(4-7) A73(4-7) A72(4-7) A73(4-7) + + const __m512i lhs_mat_s_01_71_sp2 = _mm512_shuffle_epi32(lhs_mat_s_01_71, (_MM_PERM_ENUM)245); //A70(12-15) A70(12-15) A71(12-15) A71(12-15) A70(12-15) A70(12-15) A71(12-15) A71(12-15) + const __m512i lhs_mat_s_23_71_sp2 = _mm512_shuffle_epi32(lhs_mat_s_23_71, (_MM_PERM_ENUM)245); //A72(12-15) A73(12-15) A72(12-15) A73(12-15) A72(12-15) A73(12-15) A72(12-15) A73(12-15) + + // The values arranged in shuffle patterns are operated with dot product operation within 32 bit lane i.e corresponding bytes and multiplied and added into 32 bit integers within 32 bit lane + __m512i iacc_mat_00_0_sp1 = _mm512_add_epi16(_mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_00_sp1, lhs_mat_01_00_sp1), lhs_mat_s_01_00_sp1), _mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_01_sp1, lhs_mat_01_01_sp1), lhs_mat_s_01_01_sp1)); + __m512i iacc_mat_01_0_sp1 = _mm512_add_epi16(_mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_00_sp1, 
lhs_mat_01_00_sp1), lhs_mat_s_01_00_sp1), _mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_01_sp1, lhs_mat_01_01_sp1), lhs_mat_s_01_01_sp1)); + + __m512i iacc_mat_10_0_sp1 = _mm512_add_epi16(_mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_00_sp1, lhs_mat_23_00_sp1), lhs_mat_s_23_00_sp1), _mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_01_sp1, lhs_mat_23_01_sp1), lhs_mat_s_23_01_sp1)); + __m512i iacc_mat_11_0_sp1 = _mm512_add_epi16(_mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_00_sp1, lhs_mat_23_00_sp1), lhs_mat_s_23_00_sp1), _mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_01_sp1, lhs_mat_23_01_sp1), lhs_mat_s_23_01_sp1)); + + __m512i iacc_mat_00_1_sp1 = _mm512_add_epi16(_mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_10_sp1, lhs_mat_01_10_sp1), lhs_mat_s_01_10_sp1), _mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_11_sp1, lhs_mat_01_11_sp1), lhs_mat_s_01_11_sp1)); + __m512i iacc_mat_01_1_sp1 = _mm512_add_epi16(_mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_10_sp1, lhs_mat_01_10_sp1), lhs_mat_s_01_10_sp1), _mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_11_sp1, lhs_mat_01_11_sp1), lhs_mat_s_01_11_sp1)); + + __m512i iacc_mat_10_1_sp1 = _mm512_add_epi16(_mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_10_sp1, lhs_mat_23_10_sp1), lhs_mat_s_23_10_sp1), _mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_11_sp1, lhs_mat_23_11_sp1), lhs_mat_s_23_11_sp1)); + __m512i iacc_mat_11_1_sp1 = _mm512_add_epi16(_mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_10_sp1, lhs_mat_23_10_sp1), lhs_mat_s_23_10_sp1), _mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_11_sp1, lhs_mat_23_11_sp1), lhs_mat_s_23_11_sp1)); + + __m512i iacc_mat_00_2_sp1 = _mm512_add_epi16(_mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_20_sp1, lhs_mat_01_20_sp1), lhs_mat_s_01_20_sp1), _mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_21_sp1, lhs_mat_01_21_sp1), lhs_mat_s_01_21_sp1)); + __m512i iacc_mat_01_2_sp1 = _mm512_add_epi16(_mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_20_sp1, lhs_mat_01_20_sp1), lhs_mat_s_01_20_sp1), _mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_21_sp1, lhs_mat_01_21_sp1), lhs_mat_s_01_21_sp1)); + + __m512i iacc_mat_10_2_sp1 = _mm512_add_epi16(_mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_20_sp1, lhs_mat_23_20_sp1), lhs_mat_s_23_20_sp1), _mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_21_sp1, lhs_mat_23_21_sp1), lhs_mat_s_23_21_sp1)); + __m512i iacc_mat_11_2_sp1 = _mm512_add_epi16(_mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_20_sp1, lhs_mat_23_20_sp1), lhs_mat_s_23_20_sp1), _mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_21_sp1, lhs_mat_23_21_sp1), lhs_mat_s_23_21_sp1)); + + __m512i iacc_mat_00_3_sp1 = _mm512_add_epi16(_mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_30_sp1, lhs_mat_01_30_sp1), lhs_mat_s_01_30_sp1), _mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_31_sp1, lhs_mat_01_31_sp1), lhs_mat_s_01_31_sp1)); + __m512i iacc_mat_01_3_sp1 = _mm512_add_epi16(_mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_30_sp1, lhs_mat_01_30_sp1), lhs_mat_s_01_30_sp1), _mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_31_sp1, lhs_mat_01_31_sp1), lhs_mat_s_01_31_sp1)); + + __m512i iacc_mat_10_3_sp1 = _mm512_add_epi16(_mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_30_sp1, lhs_mat_23_30_sp1), lhs_mat_s_23_30_sp1), _mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_31_sp1, lhs_mat_23_31_sp1), lhs_mat_s_23_31_sp1)); + __m512i iacc_mat_11_3_sp1 = 
_mm512_add_epi16(_mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_30_sp1, lhs_mat_23_30_sp1), lhs_mat_s_23_30_sp1), _mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_31_sp1, lhs_mat_23_31_sp1), lhs_mat_s_23_31_sp1)); + + __m512i iacc_mat_00_4_sp1 = _mm512_add_epi16(_mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_40_sp1, lhs_mat_01_40_sp1), lhs_mat_s_01_40_sp1), _mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_41_sp1, lhs_mat_01_41_sp1), lhs_mat_s_01_41_sp1)); + __m512i iacc_mat_01_4_sp1 = _mm512_add_epi16(_mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_40_sp1, lhs_mat_01_40_sp1), lhs_mat_s_01_40_sp1), _mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_41_sp1, lhs_mat_01_41_sp1), lhs_mat_s_01_41_sp1)); + + __m512i iacc_mat_10_4_sp1 = _mm512_add_epi16(_mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_40_sp1, lhs_mat_23_40_sp1), lhs_mat_s_23_40_sp1), _mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_41_sp1, lhs_mat_23_41_sp1), lhs_mat_s_23_41_sp1)); + __m512i iacc_mat_11_4_sp1 = _mm512_add_epi16(_mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_40_sp1, lhs_mat_23_40_sp1), lhs_mat_s_23_40_sp1), _mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_41_sp1, lhs_mat_23_41_sp1), lhs_mat_s_23_41_sp1)); + + __m512i iacc_mat_00_5_sp1 = _mm512_add_epi16(_mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_50_sp1, lhs_mat_01_50_sp1), lhs_mat_s_01_50_sp1), _mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_51_sp1, lhs_mat_01_51_sp1), lhs_mat_s_01_51_sp1)); + __m512i iacc_mat_01_5_sp1 = _mm512_add_epi16(_mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_50_sp1, lhs_mat_01_50_sp1), lhs_mat_s_01_50_sp1), _mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_51_sp1, lhs_mat_01_51_sp1), lhs_mat_s_01_51_sp1)); + + __m512i iacc_mat_10_5_sp1 = _mm512_add_epi16(_mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_50_sp1, lhs_mat_23_50_sp1), lhs_mat_s_23_50_sp1), _mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_51_sp1, lhs_mat_23_51_sp1), lhs_mat_s_23_51_sp1)); + __m512i iacc_mat_11_5_sp1 = _mm512_add_epi16(_mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_50_sp1, lhs_mat_23_50_sp1), lhs_mat_s_23_50_sp1), _mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_51_sp1, lhs_mat_23_51_sp1), lhs_mat_s_23_51_sp1)); + + __m512i iacc_mat_00_6_sp1 = _mm512_add_epi16(_mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_60_sp1, lhs_mat_01_60_sp1), lhs_mat_s_01_60_sp1), _mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_61_sp1, lhs_mat_01_61_sp1), lhs_mat_s_01_61_sp1)); + __m512i iacc_mat_01_6_sp1 = _mm512_add_epi16(_mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_60_sp1, lhs_mat_01_60_sp1), lhs_mat_s_01_60_sp1), _mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_61_sp1, lhs_mat_01_61_sp1), lhs_mat_s_01_61_sp1)); + + __m512i iacc_mat_10_6_sp1 = _mm512_add_epi16(_mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_60_sp1, lhs_mat_23_60_sp1), lhs_mat_s_23_60_sp1), _mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_61_sp1, lhs_mat_23_61_sp1), lhs_mat_s_23_61_sp1)); + __m512i iacc_mat_11_6_sp1 = _mm512_add_epi16(_mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_60_sp1, lhs_mat_23_60_sp1), lhs_mat_s_23_60_sp1), _mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_61_sp1, lhs_mat_23_61_sp1), lhs_mat_s_23_61_sp1)); + + __m512i iacc_mat_00_7_sp1 = _mm512_add_epi16(_mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_70_sp1, lhs_mat_01_70_sp1), lhs_mat_s_01_70_sp1), _mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_71_sp1, 
lhs_mat_01_71_sp1), lhs_mat_s_01_71_sp1)); + __m512i iacc_mat_01_7_sp1 = _mm512_add_epi16(_mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_70_sp1, lhs_mat_01_70_sp1), lhs_mat_s_01_70_sp1), _mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_71_sp1, lhs_mat_01_71_sp1), lhs_mat_s_01_71_sp1)); + + __m512i iacc_mat_10_7_sp1 = _mm512_add_epi16(_mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_70_sp1, lhs_mat_23_70_sp1), lhs_mat_s_23_70_sp1), _mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_71_sp1, lhs_mat_23_71_sp1), lhs_mat_s_23_71_sp1)); + __m512i iacc_mat_11_7_sp1 = _mm512_add_epi16(_mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_70_sp1, lhs_mat_23_70_sp1), lhs_mat_s_23_70_sp1), _mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_71_sp1, lhs_mat_23_71_sp1), lhs_mat_s_23_71_sp1)); + + __m512i iacc_mat_00_0_sp2 = _mm512_add_epi16(_mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_00_sp2, lhs_mat_01_00_sp2), lhs_mat_s_01_00_sp2), _mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_01_sp2, lhs_mat_01_01_sp2), lhs_mat_s_01_01_sp2)); + __m512i iacc_mat_01_0_sp2 = _mm512_add_epi16(_mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_00_sp2, lhs_mat_01_00_sp2), lhs_mat_s_01_00_sp2), _mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_01_sp2, lhs_mat_01_01_sp2), lhs_mat_s_01_01_sp2)); + + __m512i iacc_mat_10_0_sp2 = _mm512_add_epi16(_mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_00_sp2, lhs_mat_23_00_sp2), lhs_mat_s_23_00_sp2), _mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_01_sp2, lhs_mat_23_01_sp2), lhs_mat_s_23_01_sp2)); + __m512i iacc_mat_11_0_sp2 = _mm512_add_epi16(_mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_00_sp2, lhs_mat_23_00_sp2), lhs_mat_s_23_00_sp2), _mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_01_sp2, lhs_mat_23_01_sp2), lhs_mat_s_23_01_sp2)); + + __m512i iacc_mat_00_1_sp2 = _mm512_add_epi16(_mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_10_sp2, lhs_mat_01_10_sp2), lhs_mat_s_01_10_sp2), _mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_11_sp2, lhs_mat_01_11_sp2), lhs_mat_s_01_11_sp2)); + __m512i iacc_mat_01_1_sp2 = _mm512_add_epi16(_mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_10_sp2, lhs_mat_01_10_sp2), lhs_mat_s_01_10_sp2), _mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_11_sp2, lhs_mat_01_11_sp2), lhs_mat_s_01_11_sp2)); + + __m512i iacc_mat_10_1_sp2 = _mm512_add_epi16(_mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_10_sp2, lhs_mat_23_10_sp2), lhs_mat_s_23_10_sp2), _mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_11_sp2, lhs_mat_23_11_sp2), lhs_mat_s_23_11_sp2)); + __m512i iacc_mat_11_1_sp2 = _mm512_add_epi16(_mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_10_sp2, lhs_mat_23_10_sp2), lhs_mat_s_23_10_sp2), _mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_11_sp2, lhs_mat_23_11_sp2), lhs_mat_s_23_11_sp2)); + + __m512i iacc_mat_00_2_sp2 = _mm512_add_epi16(_mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_20_sp2, lhs_mat_01_20_sp2), lhs_mat_s_01_20_sp2), _mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_21_sp2, lhs_mat_01_21_sp2), lhs_mat_s_01_21_sp2)); + __m512i iacc_mat_01_2_sp2 = _mm512_add_epi16(_mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_20_sp2, lhs_mat_01_20_sp2), lhs_mat_s_01_20_sp2), _mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_21_sp2, lhs_mat_01_21_sp2), lhs_mat_s_01_21_sp2)); + + __m512i iacc_mat_10_2_sp2 = _mm512_add_epi16(_mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_20_sp2, lhs_mat_23_20_sp2), 
lhs_mat_s_23_20_sp2), _mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_21_sp2, lhs_mat_23_21_sp2), lhs_mat_s_23_21_sp2)); + __m512i iacc_mat_11_2_sp2 = _mm512_add_epi16(_mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_20_sp2, lhs_mat_23_20_sp2), lhs_mat_s_23_20_sp2), _mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_21_sp2, lhs_mat_23_21_sp2), lhs_mat_s_23_21_sp2)); + + __m512i iacc_mat_00_3_sp2 = _mm512_add_epi16(_mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_30_sp2, lhs_mat_01_30_sp2), lhs_mat_s_01_30_sp2), _mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_31_sp2, lhs_mat_01_31_sp2), lhs_mat_s_01_31_sp2)); + __m512i iacc_mat_01_3_sp2 = _mm512_add_epi16(_mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_30_sp2, lhs_mat_01_30_sp2), lhs_mat_s_01_30_sp2), _mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_31_sp2, lhs_mat_01_31_sp2), lhs_mat_s_01_31_sp2)); + + __m512i iacc_mat_10_3_sp2 = _mm512_add_epi16(_mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_30_sp2, lhs_mat_23_30_sp2), lhs_mat_s_23_30_sp2), _mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_31_sp2, lhs_mat_23_31_sp2), lhs_mat_s_23_31_sp2)); + __m512i iacc_mat_11_3_sp2 = _mm512_add_epi16(_mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_30_sp2, lhs_mat_23_30_sp2), lhs_mat_s_23_30_sp2), _mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_31_sp2, lhs_mat_23_31_sp2), lhs_mat_s_23_31_sp2)); + + __m512i iacc_mat_00_4_sp2 = _mm512_add_epi16(_mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_40_sp2, lhs_mat_01_40_sp2), lhs_mat_s_01_40_sp2), _mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_41_sp2, lhs_mat_01_41_sp2), lhs_mat_s_01_41_sp2)); + __m512i iacc_mat_01_4_sp2 = _mm512_add_epi16(_mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_40_sp2, lhs_mat_01_40_sp2), lhs_mat_s_01_40_sp2), _mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_41_sp2, lhs_mat_01_41_sp2), lhs_mat_s_01_41_sp2)); + + __m512i iacc_mat_10_4_sp2 = _mm512_add_epi16(_mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_40_sp2, lhs_mat_23_40_sp2), lhs_mat_s_23_40_sp2), _mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_41_sp2, lhs_mat_23_41_sp2), lhs_mat_s_23_41_sp2)); + __m512i iacc_mat_11_4_sp2 = _mm512_add_epi16(_mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_40_sp2, lhs_mat_23_40_sp2), lhs_mat_s_23_40_sp2), _mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_41_sp2, lhs_mat_23_41_sp2), lhs_mat_s_23_41_sp2)); + + __m512i iacc_mat_00_5_sp2 = _mm512_add_epi16(_mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_50_sp2, lhs_mat_01_50_sp2), lhs_mat_s_01_50_sp2), _mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_51_sp2, lhs_mat_01_51_sp2), lhs_mat_s_01_51_sp2)); + __m512i iacc_mat_01_5_sp2 = _mm512_add_epi16(_mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_50_sp2, lhs_mat_01_50_sp2), lhs_mat_s_01_50_sp2), _mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_51_sp2, lhs_mat_01_51_sp2), lhs_mat_s_01_51_sp2)); + + __m512i iacc_mat_10_5_sp2 = _mm512_add_epi16(_mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_50_sp2, lhs_mat_23_50_sp2), lhs_mat_s_23_50_sp2), _mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_51_sp2, lhs_mat_23_51_sp2), lhs_mat_s_23_51_sp2)); + __m512i iacc_mat_11_5_sp2 = _mm512_add_epi16(_mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_50_sp2, lhs_mat_23_50_sp2), lhs_mat_s_23_50_sp2), _mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_51_sp2, lhs_mat_23_51_sp2), lhs_mat_s_23_51_sp2)); + + __m512i iacc_mat_00_6_sp2 = 
_mm512_add_epi16(_mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_60_sp2, lhs_mat_01_60_sp2), lhs_mat_s_01_60_sp2), _mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_61_sp2, lhs_mat_01_61_sp2), lhs_mat_s_01_61_sp2)); + __m512i iacc_mat_01_6_sp2 = _mm512_add_epi16(_mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_60_sp2, lhs_mat_01_60_sp2), lhs_mat_s_01_60_sp2), _mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_61_sp2, lhs_mat_01_61_sp2), lhs_mat_s_01_61_sp2)); + + __m512i iacc_mat_10_6_sp2 = _mm512_add_epi16(_mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_60_sp2, lhs_mat_23_60_sp2), lhs_mat_s_23_60_sp2), _mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_61_sp2, lhs_mat_23_61_sp2), lhs_mat_s_23_61_sp2)); + __m512i iacc_mat_11_6_sp2 = _mm512_add_epi16(_mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_60_sp2, lhs_mat_23_60_sp2), lhs_mat_s_23_60_sp2), _mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_61_sp2, lhs_mat_23_61_sp2), lhs_mat_s_23_61_sp2)); + + __m512i iacc_mat_00_7_sp2 = _mm512_add_epi16(_mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_70_sp2, lhs_mat_01_70_sp2), lhs_mat_s_01_70_sp2), _mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_71_sp2, lhs_mat_01_71_sp2), lhs_mat_s_01_71_sp2)); + __m512i iacc_mat_01_7_sp2 = _mm512_add_epi16(_mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_70_sp2, lhs_mat_01_70_sp2), lhs_mat_s_01_70_sp2), _mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_71_sp2, lhs_mat_01_71_sp2), lhs_mat_s_01_71_sp2)); + + __m512i iacc_mat_10_7_sp2 = _mm512_add_epi16(_mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_70_sp2, lhs_mat_23_70_sp2), lhs_mat_s_23_70_sp2), _mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_71_sp2, lhs_mat_23_71_sp2), lhs_mat_s_23_71_sp2)); + __m512i iacc_mat_11_7_sp2 = _mm512_add_epi16(_mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_70_sp2, lhs_mat_23_70_sp2), lhs_mat_s_23_70_sp2), _mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_71_sp2, lhs_mat_23_71_sp2), lhs_mat_s_23_71_sp2)); + + // Combine results from both shuffle patterns for each output block + __m512i iacc_mat_00_0 = _mm512_add_epi16(iacc_mat_00_0_sp1, iacc_mat_00_0_sp2); + __m512i iacc_mat_01_0 = _mm512_add_epi16(iacc_mat_01_0_sp1, iacc_mat_01_0_sp2); + __m512i iacc_mat_10_0 = _mm512_add_epi16(iacc_mat_10_0_sp1, iacc_mat_10_0_sp2); + __m512i iacc_mat_11_0 = _mm512_add_epi16(iacc_mat_11_0_sp1, iacc_mat_11_0_sp2); + + __m512i iacc_mat_00_1 = _mm512_add_epi16(iacc_mat_00_1_sp1, iacc_mat_00_1_sp2); + __m512i iacc_mat_01_1 = _mm512_add_epi16(iacc_mat_01_1_sp1, iacc_mat_01_1_sp2); + __m512i iacc_mat_10_1 = _mm512_add_epi16(iacc_mat_10_1_sp1, iacc_mat_10_1_sp2); + __m512i iacc_mat_11_1 = _mm512_add_epi16(iacc_mat_11_1_sp1, iacc_mat_11_1_sp2); + + __m512i iacc_mat_00_2 = _mm512_add_epi16(iacc_mat_00_2_sp1, iacc_mat_00_2_sp2); + __m512i iacc_mat_01_2 = _mm512_add_epi16(iacc_mat_01_2_sp1, iacc_mat_01_2_sp2); + __m512i iacc_mat_10_2 = _mm512_add_epi16(iacc_mat_10_2_sp1, iacc_mat_10_2_sp2); + __m512i iacc_mat_11_2 = _mm512_add_epi16(iacc_mat_11_2_sp1, iacc_mat_11_2_sp2); + + __m512i iacc_mat_00_3 = _mm512_add_epi16(iacc_mat_00_3_sp1, iacc_mat_00_3_sp2); + __m512i iacc_mat_01_3 = _mm512_add_epi16(iacc_mat_01_3_sp1, iacc_mat_01_3_sp2); + __m512i iacc_mat_10_3 = _mm512_add_epi16(iacc_mat_10_3_sp1, iacc_mat_10_3_sp2); + __m512i iacc_mat_11_3 = _mm512_add_epi16(iacc_mat_11_3_sp1, iacc_mat_11_3_sp2); + + __m512i iacc_mat_00_4 = _mm512_add_epi16(iacc_mat_00_4_sp1, iacc_mat_00_4_sp2); + __m512i iacc_mat_01_4 = 
_mm512_add_epi16(iacc_mat_01_4_sp1, iacc_mat_01_4_sp2); + __m512i iacc_mat_10_4 = _mm512_add_epi16(iacc_mat_10_4_sp1, iacc_mat_10_4_sp2); + __m512i iacc_mat_11_4 = _mm512_add_epi16(iacc_mat_11_4_sp1, iacc_mat_11_4_sp2); + + __m512i iacc_mat_00_5 = _mm512_add_epi16(iacc_mat_00_5_sp1, iacc_mat_00_5_sp2); + __m512i iacc_mat_01_5 = _mm512_add_epi16(iacc_mat_01_5_sp1, iacc_mat_01_5_sp2); + __m512i iacc_mat_10_5 = _mm512_add_epi16(iacc_mat_10_5_sp1, iacc_mat_10_5_sp2); + __m512i iacc_mat_11_5 = _mm512_add_epi16(iacc_mat_11_5_sp1, iacc_mat_11_5_sp2); + + __m512i iacc_mat_00_6 = _mm512_add_epi16(iacc_mat_00_6_sp1, iacc_mat_00_6_sp2); + __m512i iacc_mat_01_6 = _mm512_add_epi16(iacc_mat_01_6_sp1, iacc_mat_01_6_sp2); + __m512i iacc_mat_10_6 = _mm512_add_epi16(iacc_mat_10_6_sp1, iacc_mat_10_6_sp2); + __m512i iacc_mat_11_6 = _mm512_add_epi16(iacc_mat_11_6_sp1, iacc_mat_11_6_sp2); + + __m512i iacc_mat_00_7 = _mm512_add_epi16(iacc_mat_00_7_sp1, iacc_mat_00_7_sp2); + __m512i iacc_mat_01_7 = _mm512_add_epi16(iacc_mat_01_7_sp1, iacc_mat_01_7_sp2); + __m512i iacc_mat_10_7 = _mm512_add_epi16(iacc_mat_10_7_sp1, iacc_mat_10_7_sp2); + __m512i iacc_mat_11_7 = _mm512_add_epi16(iacc_mat_11_7_sp1, iacc_mat_11_7_sp2); + + // Output of both shuffle patterns are added in order to sum dot product outputs of all 32 values in block + iacc_mat_00_0 = _mm512_madd_epi16(iacc_mat_00_0, scale_014589CD_0); + iacc_mat_01_0 = _mm512_madd_epi16(iacc_mat_01_0, scale_2367ABEF_0); + iacc_mat_10_0 = _mm512_madd_epi16(iacc_mat_10_0, scale_014589CD_0); + iacc_mat_11_0 = _mm512_madd_epi16(iacc_mat_11_0, scale_2367ABEF_0); + + iacc_mat_00_1 = _mm512_madd_epi16(iacc_mat_00_1, scale_014589CD_1); + iacc_mat_01_1 = _mm512_madd_epi16(iacc_mat_01_1, scale_2367ABEF_1); + iacc_mat_10_1 = _mm512_madd_epi16(iacc_mat_10_1, scale_014589CD_1); + iacc_mat_11_1 = _mm512_madd_epi16(iacc_mat_11_1, scale_2367ABEF_1); + + iacc_mat_00_2 = _mm512_madd_epi16(iacc_mat_00_2, scale_014589CD_2); + iacc_mat_01_2 = _mm512_madd_epi16(iacc_mat_01_2, scale_2367ABEF_2); + iacc_mat_10_2 = _mm512_madd_epi16(iacc_mat_10_2, scale_014589CD_2); + iacc_mat_11_2 = _mm512_madd_epi16(iacc_mat_11_2, scale_2367ABEF_2); + + iacc_mat_00_3 = _mm512_madd_epi16(iacc_mat_00_3, scale_014589CD_3); + iacc_mat_01_3 = _mm512_madd_epi16(iacc_mat_01_3, scale_2367ABEF_3); + iacc_mat_10_3 = _mm512_madd_epi16(iacc_mat_10_3, scale_014589CD_3); + iacc_mat_11_3 = _mm512_madd_epi16(iacc_mat_11_3, scale_2367ABEF_3); + + iacc_mat_00_4 = _mm512_madd_epi16(iacc_mat_00_4, scale_014589CD_4); + iacc_mat_01_4 = _mm512_madd_epi16(iacc_mat_01_4, scale_2367ABEF_4); + iacc_mat_10_4 = _mm512_madd_epi16(iacc_mat_10_4, scale_014589CD_4); + iacc_mat_11_4 = _mm512_madd_epi16(iacc_mat_11_4, scale_2367ABEF_4); + + iacc_mat_00_5 = _mm512_madd_epi16(iacc_mat_00_5, scale_014589CD_5); + iacc_mat_01_5 = _mm512_madd_epi16(iacc_mat_01_5, scale_2367ABEF_5); + iacc_mat_10_5 = _mm512_madd_epi16(iacc_mat_10_5, scale_014589CD_5); + iacc_mat_11_5 = _mm512_madd_epi16(iacc_mat_11_5, scale_2367ABEF_5); + + iacc_mat_00_6 = _mm512_madd_epi16(iacc_mat_00_6, scale_014589CD_6); + iacc_mat_01_6 = _mm512_madd_epi16(iacc_mat_01_6, scale_2367ABEF_6); + iacc_mat_10_6 = _mm512_madd_epi16(iacc_mat_10_6, scale_014589CD_6); + iacc_mat_11_6 = _mm512_madd_epi16(iacc_mat_11_6, scale_2367ABEF_6); + + iacc_mat_00_7 = _mm512_madd_epi16(iacc_mat_00_7, scale_014589CD_7); + iacc_mat_01_7 = _mm512_madd_epi16(iacc_mat_01_7, scale_2367ABEF_7); + iacc_mat_10_7 = _mm512_madd_epi16(iacc_mat_10_7, scale_014589CD_7); + iacc_mat_11_7 = 
_mm512_madd_epi16(iacc_mat_11_7, scale_2367ABEF_7); + + + __m512i iacc_mat_00 = _mm512_add_epi32(_mm512_add_epi32(_mm512_add_epi32(iacc_mat_00_0, iacc_mat_00_1), _mm512_add_epi32(iacc_mat_00_2, iacc_mat_00_3)), _mm512_add_epi32(_mm512_add_epi32(iacc_mat_00_4, iacc_mat_00_5), _mm512_add_epi32(iacc_mat_00_6, iacc_mat_00_7))); + __m512i iacc_mat_01 = _mm512_add_epi32(_mm512_add_epi32(_mm512_add_epi32(iacc_mat_01_0, iacc_mat_01_1), _mm512_add_epi32(iacc_mat_01_2, iacc_mat_01_3)), _mm512_add_epi32(_mm512_add_epi32(iacc_mat_01_4, iacc_mat_01_5), _mm512_add_epi32(iacc_mat_01_6, iacc_mat_01_7))); + __m512i iacc_mat_10 = _mm512_add_epi32(_mm512_add_epi32(_mm512_add_epi32(iacc_mat_10_0, iacc_mat_10_1), _mm512_add_epi32(iacc_mat_10_2, iacc_mat_10_3)), _mm512_add_epi32(_mm512_add_epi32(iacc_mat_10_4, iacc_mat_10_5), _mm512_add_epi32(iacc_mat_10_6, iacc_mat_10_7))); + __m512i iacc_mat_11 = _mm512_add_epi32(_mm512_add_epi32(_mm512_add_epi32(iacc_mat_11_0, iacc_mat_11_1), _mm512_add_epi32(iacc_mat_11_2, iacc_mat_11_3)), _mm512_add_epi32(_mm512_add_epi32(iacc_mat_11_4, iacc_mat_11_5), _mm512_add_epi32(iacc_mat_11_6, iacc_mat_11_7))); + + // Straighten out to make 4 row vectors + __m512i iacc_row_0 = _mm512_mask_blend_epi32(0xCCCC, iacc_mat_00, _mm512_shuffle_epi32(iacc_mat_01, (_MM_PERM_ENUM)78)); + __m512i iacc_row_1 = _mm512_mask_blend_epi32(0xCCCC, _mm512_shuffle_epi32(iacc_mat_00, (_MM_PERM_ENUM)78), iacc_mat_01); + __m512i iacc_row_2 = _mm512_mask_blend_epi32(0xCCCC, iacc_mat_10, _mm512_shuffle_epi32(iacc_mat_11, (_MM_PERM_ENUM)78)); + __m512i iacc_row_3 = _mm512_mask_blend_epi32(0xCCCC, _mm512_shuffle_epi32(iacc_mat_10, (_MM_PERM_ENUM)78), iacc_mat_11); + + // Load the scale values for all the 4 Q8_K blocks and repeat them across lanes + const __m128 row_scale_f32_sse = _mm_load_ps(a_ptr[b].d); + const __m256 row_scale_f32_ymm = _mm256_set_m128(row_scale_f32_sse, row_scale_f32_sse); + const __m512 row_scale_f32 = _mm512_insertf32x8(_mm512_castps256_ps512(row_scale_f32_ymm), row_scale_f32_ymm, 1); + + // Multiply with the appropriate scales and accumulate below + acc_rows[0] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_0), _mm512_mul_ps(col_scale_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 0)), acc_rows[0]); + acc_rows[1] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_1), _mm512_mul_ps(col_scale_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 85)), acc_rows[1]); + acc_rows[2] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_2), _mm512_mul_ps(col_scale_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 170)), acc_rows[2]); + acc_rows[3] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_3), _mm512_mul_ps(col_scale_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 255)), acc_rows[3]); + } + } + + // Store the accumulated values + for (int i = 0; i < 4; i++) { + _mm512_storeu_ps((float * )(s + ((y * 4 + i) * bs + x * 8)), acc_rows[i]); + } + } + } + + if (anc != nc) { + xstart = anc/8; + y = 0; + } + #endif -*/ + for (; y < anr / 4; y += 4){ const block_q8_Kx4 * a_ptrs[4]; From 61a8c046dd1701690d535fa3ddf7ae600a108e63 Mon Sep 17 00:00:00 2001 From: Srihari-mcw Date: Tue, 12 Aug 2025 15:08:06 +0530 Subject: [PATCH 06/23] GEMV scalar implementation --- ggml/src/ggml-cpu/repack.cpp | 77 ++++++++++++++++++++++++++++++++++++ 1 file changed, 77 insertions(+) diff --git a/ggml/src/ggml-cpu/repack.cpp b/ggml/src/ggml-cpu/repack.cpp index f88cd7f627..fefe39785a 100644 --- a/ggml/src/ggml-cpu/repack.cpp +++ b/ggml/src/ggml-cpu/repack.cpp @@ -617,7 +617,84 @@ void
ggml_gemv_q2_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, } void ggml_gemv_q6_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { + const int qk = QK_K; + const int nb = n / qk; + const int ncols_interleaved = 8; + const int blocklen = 8; + assert (n % qk == 0); + assert (nc % ncols_interleaved == 0); + + UNUSED(s); + UNUSED(bs); + UNUSED(vx); + UNUSED(vy); + UNUSED(nr); + UNUSED(nc); + UNUSED(nb); + UNUSED(ncols_interleaved); + UNUSED(blocklen); + + float sumf[8]; + int sumi1, sumi2, sumi3, sumi4; + int sumi; + + const block_q8_K * a_ptr = (const block_q8_K *)vy; + for (int x = 0; x < nc / ncols_interleaved; x++) { + const block_q6_Kx8 * b_ptr = (const block_q6_Kx8 *) vx + (x * nb); + for (int j = 0; j < ncols_interleaved; j++) { + sumf[j] = 0.0; + } + for (int l = 0; l < nb; l++) { + for (int k = 0; k < (qk / (4 * blocklen)); k++) { + const int8_t *scales_0 = b_ptr[l].scales + (k / 4) * 64; + const int8_t *scales_1 = b_ptr[l].scales + (k / 4) * 64 + 16; + const int8_t *scales_2 = b_ptr[l].scales + (k / 4) * 64 + 32; + const int8_t *scales_3 = b_ptr[l].scales + (k / 4) * 64 + 48; + for (int j = 0; j < ncols_interleaved; j++) { + sumi1 = 0; + sumi2 = 0; + sumi3 = 0; + sumi4 = 0; + sumi = 0; + int offset = ((k / 2) % 2) + j * 2; + for (int i = 0; i < blocklen; ++i) { + const int hbits_index = k * ncols_interleaved * blocklen + j * blocklen + i; + const int lbits_index = (hbits_index / 32) * 64 + (hbits_index % 32); + const int v0_hbits = (int8_t) ((b_ptr[l].qh[hbits_index] & 3) << 4); + const int v1_hbits = (int8_t) (((b_ptr[l].qh[hbits_index] >> 2 ) & 3) << 4); + const int v2_hbits = (int8_t) (((b_ptr[l].qh[hbits_index] >> 4 ) & 3) << 4); + const int v3_hbits = (int8_t) (((b_ptr[l].qh[hbits_index] >> 6 ) & 3) << 4); + + const int v0_lbits = (int8_t) (b_ptr[l].ql[lbits_index] & 0xF); + const int v1_lbits = (int8_t) (b_ptr[l].ql[lbits_index + 32] & 0xF); + const int v2_lbits = (int8_t) ((b_ptr[l].ql[lbits_index] >> 4) & 0xF); + const int v3_lbits = (int8_t) ((b_ptr[l].ql[lbits_index + 32] >> 4) & 0xF); + + const int v0 = (v0_hbits | v0_lbits) - 32; + const int v1 = (v1_hbits | v1_lbits) - 32; + const int v2 = (v2_hbits | v2_lbits) - 32; + const int v3 = (v3_hbits | v3_lbits) - 32; + + sumi1 = (v0 * a_ptr[l].qs[(k >> 2) * 128 + (k % 4) * blocklen + i]); + sumi2 = (v1 * a_ptr[l].qs[(k >> 2) * 128 + (k % 4) * blocklen + i + 32]); + sumi3 = (v2 * a_ptr[l].qs[(k >> 2) * 128 + (k % 4) * blocklen + i + 64]); + sumi4 = (v3 * a_ptr[l].qs[(k >> 2) * 128 + (k % 4) * blocklen + i + 96]); + + sumi1 = sumi1 * scales_0[offset]; + sumi2 = sumi2 * scales_1[offset]; + sumi3 = sumi3 * scales_2[offset]; + sumi4 = sumi4 * scales_3[offset]; + sumi += sumi1 + sumi2 + sumi3 + sumi4; + } + sumf[j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d; + } + } + } + for (int j = 0; j < ncols_interleaved; j++) { + s[x * ncols_interleaved + j] = sumf[j]; + } + } } void ggml_gemv_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { From 6e46dc1108de160356810758565d516e0e61c910 Mon Sep 17 00:00:00 2001 From: Srihari-mcw Date: Tue, 12 Aug 2025 15:21:29 +0530 Subject: [PATCH 07/23] GEMM scalar implementation --- ggml/src/ggml-cpu/repack.cpp | 88 ++++++++++++++++++++++++++++++++++++ 1 file changed, 88 insertions(+) diff --git a/ggml/src/ggml-cpu/repack.cpp b/ggml/src/ggml-cpu/repack.cpp index fefe39785a..7182479f61 100644 ---
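For reference, the inner loop of the generic GEMV above reconstructs each 6-bit weight from a low nibble (stored in ql) and a 2-bit high part (stored in qh), recenters it by 32, and applies the signed sub-block scale together with the super-block delta. Below is a minimal standalone sketch of that per-weight dequantization, assuming the usual Q6_K convention; the helper name is illustrative and not part of the patch.

#include <stdint.h>

// Sketch only: dequantize one Q6_K weight given its already-masked low 4 bits,
// its already-masked 2 high bits, the signed sub-block scale and the super-block delta d.
static inline float dequant_q6_K_value(uint8_t low_nibble, uint8_t high_bits, int8_t sub_block_scale, float d) {
    const int q = (int)((high_bits << 4) | low_nibble) - 32;   // 6-bit value 0..63 recentred to -32..31
    return d * (float)sub_block_scale * (float)q;
}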
a/ggml/src/ggml-cpu/repack.cpp +++ b/ggml/src/ggml-cpu/repack.cpp @@ -1200,7 +1200,95 @@ void ggml_gemm_q2_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, } void ggml_gemm_q6_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { + const int qk = QK_K; + const int nb = n / qk; + const int ncols_interleaved = 8; + const int blocklen = 8; + assert (n % qk == 0); + assert (nr % 4 == 0); + assert (nc % ncols_interleaved == 0); + + UNUSED(s); + UNUSED(bs); + UNUSED(vx); + UNUSED(vy); + UNUSED(nr); + UNUSED(nc); + UNUSED(nb); + UNUSED(ncols_interleaved); + UNUSED(blocklen); + + float sumf[4][8]; + int sumi1, sumi2, sumi3, sumi4; + int sumi; + + for (int y = 0; y < nr / 4; y++) { + const block_q8_Kx4 * a_ptr = (const block_q8_Kx4 *) vy + (y * nb); + for (int x = 0; x < nc / ncols_interleaved; x++) { + const block_q6_Kx8 * b_ptr = (const block_q6_Kx8 *) vx + (x * nb); + for (int m = 0; m < 4; m++) { + for (int j = 0; j < ncols_interleaved; j++) { + sumf[m][j] = 0.0; + } + } + for (int l = 0; l < nb; l++) { + for (int k = 0; k < (qk / (4 * blocklen)); k++) { + + const int8_t *scales_0 = b_ptr[l].scales + (k / 4) * 64; + const int8_t *scales_1 = b_ptr[l].scales + (k / 4) * 64 + 16; + const int8_t *scales_2 = b_ptr[l].scales + (k / 4) * 64 + 32; + const int8_t *scales_3 = b_ptr[l].scales + (k / 4) * 64 + 48; + for (int m = 0; m < 4; m++) { + for (int j = 0; j < ncols_interleaved; j++) { + sumi1 = 0; + sumi2 = 0; + sumi3 = 0; + sumi4 = 0; + sumi = 0; + int offset = ((k / 2) % 2) + j * 2; + for (int i = 0; i < blocklen; ++i){ + const int hbits_index = k * ncols_interleaved * blocklen + j * blocklen + i; + const int lbits_index = (hbits_index / 32) * 64 + (hbits_index % 32); + const int v0_hbits = (int8_t) ((b_ptr[l].qh[hbits_index] & 3) << 4); + const int v1_hbits = (int8_t) (((b_ptr[l].qh[hbits_index] >> 2 ) & 3) << 4); + const int v2_hbits = (int8_t) (((b_ptr[l].qh[hbits_index] >> 4 ) & 3) << 4); + const int v3_hbits = (int8_t) (((b_ptr[l].qh[hbits_index] >> 6 ) & 3) << 4); + + const int v0_lbits = (int8_t) (b_ptr[l].ql[lbits_index] & 0xF); + const int v1_lbits = (int8_t) (b_ptr[l].ql[lbits_index + 32] & 0xF); + const int v2_lbits = (int8_t) ((b_ptr[l].ql[lbits_index] >> 4) & 0xF); + const int v3_lbits = (int8_t) ((b_ptr[l].ql[lbits_index + 32] >> 4) & 0xF); + + const int v0 = (v0_hbits | v0_lbits) - 32; + const int v1 = (v1_hbits | v1_lbits) - 32; + const int v2 = (v2_hbits | v2_lbits) - 32; + const int v3 = (v3_hbits | v3_lbits) - 32; + + sumi1 = (v0 * a_ptr[l].qs[(k >> 2) * 512 + (k % 4) * 4 * blocklen + m * blocklen + i]); + sumi2 = (v1 * a_ptr[l].qs[(k >> 2) * 512 + (k % 4) * 4 * blocklen + m * blocklen + i + 128]); + sumi3 = (v2 * a_ptr[l].qs[(k >> 2) * 512 + (k % 4) * 4 * blocklen + m * blocklen + i + 256]); + sumi4 = (v3 * a_ptr[l].qs[(k >> 2) * 512 + (k % 4) * 4 * blocklen + m * blocklen + i + 384]); + sumi1 = sumi1 * scales_0[offset]; + sumi2 = sumi2 * scales_1[offset]; + sumi3 = sumi3 * scales_2[offset]; + sumi4 = sumi4 * scales_3[offset]; + sumi += sumi1 + sumi2 + sumi3 + sumi4; + } + sumf[m][j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d[m]; + } + } + } + } + + for (int m = 0; m < 4; m++) { + for (int j = 0; j < ncols_interleaved; j++) { + s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j]; + } + } + } + } } void ggml_gemm_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void *
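The trickiest indexing shared by the two scalar kernels is the lookup into the interleaved scales array of block_q6_Kx8: scales_0..scales_3 select one of four 16-byte groups per k iteration, and offset = ((k / 2) % 2) + j * 2 picks the entry for interleaved column j. Below is a small sketch of the resulting byte offset, assuming the layout implied by the code above; the helper name is illustrative and not part of the patch.

#include <assert.h>

// Sketch only: byte offset of the sub-block scale used for outer index k (0..7),
// 32-value group (0..3) and interleaved column j (0..7) in the repacked scales array.
static inline int q6_Kx8_scale_offset(int k, int group, int j) {
    assert(k >= 0 && k < 8 && group >= 0 && group < 4 && j >= 0 && j < 8);
    return (k / 4) * 64 + group * 16 + ((k / 2) % 2) + j * 2;
}
// Example: k = 5, group = 2, j = 3 gives 64 + 32 + 0 + 6 = 102.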
GGML_RESTRICT vy, int nr, int nc) { From 56b1f7d6480eb6904e6dd38ffc8e66f2453f5deb Mon Sep 17 00:00:00 2001 From: Srihari-mcw Date: Tue, 12 Aug 2025 17:02:35 +0530 Subject: [PATCH 08/23] Initial cleanup of GEMM --- ggml/src/ggml-cpu/arch/x86/repack.cpp | 930 +++++++++++++------------- 1 file changed, 471 insertions(+), 459 deletions(-) diff --git a/ggml/src/ggml-cpu/arch/x86/repack.cpp b/ggml/src/ggml-cpu/arch/x86/repack.cpp index 68fecfdbb5..0f3b6e40b3 100644 --- a/ggml/src/ggml-cpu/arch/x86/repack.cpp +++ b/ggml/src/ggml-cpu/arch/x86/repack.cpp @@ -6708,33 +6708,39 @@ void ggml_gemm_q6_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo int64_t b_nb = n / QK_K; int64_t y = 0; - // Mask to mask out nibbles from packed bytes // Permute mask used for easier vector processing at later stages __m256i requiredOrder = _mm256_set_epi32(3, 2, 1, 0, 7, 6, 5, 4); int64_t xstart = 0; int anr = nr - nr % 16;; // Used to align nr with boundary of 16 - // Mask to mask out nibbles from packed bytes + // Mask to extract nibbles from packed bytes const __m256i m4 = _mm256_set1_epi8(0xF); + // Mask to extract 2 bit values from packed bytes const __m256i m2 = _mm256_set1_epi8(3); + // Vector with each byte value 32 - Used as an subtract offset for 6 bit quantized values const __m256i m32s = _mm256_set1_epi8(32); //Mask to get appropriate scales __m128i scalesmask1_sse = _mm_set_epi8(14,14,12,12,10,10,8,8,6,6,4,4,2,2,0,0); __m128i scalesmask2_sse = _mm_set_epi8(15,15,13,13,11,11,9,9,7,7,5,5,3,3,1,1); +#ifdef __AVX512F__ + int anc = nc - nc % 16; // Used to align nc with boundary of 16 + //Expanded mask to get appropriate scales __m256i scalesmask1 = _mm256_castsi128_si256(scalesmask1_sse); scalesmask1 = _mm256_permute2f128_si256(scalesmask1, scalesmask1, 0); __m256i scalesmask2 = _mm256_castsi128_si256(scalesmask2_sse); scalesmask2 = _mm256_permute2f128_si256(scalesmask2, scalesmask2, 0); -#ifdef __AVX512F__ - int anc = nc - nc % 16; // Used to align nc with boundary of 16 + // Mask to extract nibbles from packed bytes const __m512i m4_expanded = _mm512_set1_epi8(0xF); + // Mask to extract 2 bit values from packed bytes const __m512i m2_expanded = _mm512_set1_epi8(3); + // Vector with each byte set to 32 - Used as an subtraction adjustment factor for 6 bit quantization const __m512i m32s_expanded = _mm512_set1_epi8(32); + //Take group of four block_q8_Kx4 structures at each pass of the loop and perform dot product operation for (; y < anr / 4; y += 4){ const block_q8_Kx4 * a_ptrs[4]; @@ -6743,7 +6749,7 @@ void ggml_gemm_q6_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo for (int i = 0; i < 3; ++i) { a_ptrs[i + 1] = a_ptrs[i] + nb; } - // Take group of eight block_q6_kx8 structures at each pass of the loop and perform dot product operation + // Take group of two block_q6_kx8 structures at each pass of the loop and perform dot product operation for (int64_t x = 0; x < anc / 8; x += 2) { const block_q6_Kx8 * b_ptr_0 = b_ptr_start + ((x) * b_nb); @@ -6757,45 +6763,49 @@ void ggml_gemm_q6_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo // For super block for (int64_t b = 0; b < nb; b++) { - // Delta values - Load the sixteen scale values from two block_q2_kx8 structures + // Delta values - Load the sixteen scale values from two block_q6_kx8 structures const __m512 col_scale_f32 = GGML_F32Cx8x2_LOAD(b_ptr_0[b].d, b_ptr_1[b].d); for (int sb = 0; sb < QK_K / 128; sb++) { - const __m256i rhs_raw_mat_0123_0 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].ql + sb * 512)); - 
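The comment added above summarises the AVX-512 unpack pipeline: the packed low nibbles (ql) and packed 2-bit high parts (qh) are loaded separately, the relevant 2-bit field is shifted up above the nibble, and the two are OR-ed to form bytes holding the 6-bit quantized values. Below is a distilled sketch of that combine step, assuming inputs whose fields already sit in the low bits of each byte; the function and variable names are illustrative and not part of the patch.

#include <immintrin.h>

// Sketch only: merge packed low nibbles with 2-bit high parts (already shifted down
// to bits 0..1) into bytes carrying the 6-bit quantized values, as done later when
// the rhs_mat_* vectors are formed from rhs_raw_lbit_* and rhs_hbit_*.
static inline __m512i q6_combine_lbits_hbits(__m512i lbits_packed, __m512i hbits_low2) {
    const __m512i m4 = _mm512_set1_epi8(0xF);  // low-nibble mask
    const __m512i m2 = _mm512_set1_epi8(3);    // 2-bit mask
    const __m512i hbits = _mm512_slli_epi16(_mm512_and_si512(hbits_low2, m2), 4);
    return _mm512_or_si512(_mm512_and_si512(lbits_packed, m4), hbits);
}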
const __m256i rhs_raw_mat_4567_0 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].ql + 32 + sb * 512)); - const __m256i rhs_raw_mat_0123_1 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].ql + 64 + sb * 512)); - const __m256i rhs_raw_mat_4567_1 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].ql + 96 + sb * 512)); - const __m256i rhs_raw_mat_0123_2 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].ql + 128 + sb * 512)); - const __m256i rhs_raw_mat_4567_2 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].ql + 160 + sb * 512)); - const __m256i rhs_raw_mat_0123_3 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].ql + 192 + sb * 512)); - const __m256i rhs_raw_mat_4567_3 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].ql + 224 + sb * 512)); - const __m256i rhs_raw_mat_0123_4 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].ql + 256 + sb * 512)); - const __m256i rhs_raw_mat_4567_4 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].ql + 288 + sb * 512)); - const __m256i rhs_raw_mat_0123_5 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].ql + 320 + sb * 512)); - const __m256i rhs_raw_mat_4567_5 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].ql + 352 + sb * 512)); - const __m256i rhs_raw_mat_0123_6 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].ql + 384 + sb * 512)); - const __m256i rhs_raw_mat_4567_6 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].ql + 416 + sb * 512)); - const __m256i rhs_raw_mat_0123_7 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].ql + 448 + sb * 512)); - const __m256i rhs_raw_mat_4567_7 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].ql + 480 + sb * 512)); + // Load the sixteen block_q6_k for eight sub blocks quantized values interleaved with each other in chunks of eight bytes - B0,B1 ....B6,B7 + // The lower and higher packed bits are loaded, unpacked and individual bytes representing 6 bits each are formed from the same + // They are blended/permuted for further mul mat operations within the pipeline + const __m256i rhs_raw_lbit_0123_0 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].ql + sb * 512)); + const __m256i rhs_raw_lbit_4567_0 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].ql + 32 + sb * 512)); + const __m256i rhs_raw_lbit_0123_1 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].ql + 64 + sb * 512)); + const __m256i rhs_raw_lbit_4567_1 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].ql + 96 + sb * 512)); + const __m256i rhs_raw_lbit_0123_2 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].ql + 128 + sb * 512)); + const __m256i rhs_raw_lbit_4567_2 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].ql + 160 + sb * 512)); + const __m256i rhs_raw_lbit_0123_3 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].ql + 192 + sb * 512)); + const __m256i rhs_raw_lbit_4567_3 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].ql + 224 + sb * 512)); - const __m256i rhs_raw_mat_89AB_0 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].ql + sb * 512)); - const __m256i rhs_raw_mat_CDEF_0 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].ql + 32 + sb * 512)); - const __m256i rhs_raw_mat_89AB_1 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].ql + 64 + sb * 512)); - const __m256i rhs_raw_mat_CDEF_1 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].ql + 96 + sb * 512)); - const __m256i rhs_raw_mat_89AB_2 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].ql + 128 + sb * 512)); - const __m256i rhs_raw_mat_CDEF_2 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].ql + 160 + sb * 512)); - const __m256i rhs_raw_mat_89AB_3 = _mm256_loadu_si256((const __m256i 
*)(b_ptr_1[b].ql + 192 + sb * 512)); - const __m256i rhs_raw_mat_CDEF_3 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].ql + 224 + sb * 512)); + const __m256i rhs_raw_lbit_0123_4 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].ql + 256 + sb * 512)); + const __m256i rhs_raw_lbit_4567_4 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].ql + 288 + sb * 512)); + const __m256i rhs_raw_lbit_0123_5 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].ql + 320 + sb * 512)); + const __m256i rhs_raw_lbit_4567_5 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].ql + 352 + sb * 512)); + const __m256i rhs_raw_lbit_0123_6 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].ql + 384 + sb * 512)); + const __m256i rhs_raw_lbit_4567_6 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].ql + 416 + sb * 512)); + const __m256i rhs_raw_lbit_0123_7 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].ql + 448 + sb * 512)); + const __m256i rhs_raw_lbit_4567_7 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].ql + 480 + sb * 512)); - const __m256i rhs_raw_mat_89AB_4 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].ql + 256 + sb * 512)); - const __m256i rhs_raw_mat_CDEF_4 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].ql + 288 + sb * 512)); - const __m256i rhs_raw_mat_89AB_5 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].ql + 320 + sb * 512)); - const __m256i rhs_raw_mat_CDEF_5 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].ql + 352 + sb * 512)); - const __m256i rhs_raw_mat_89AB_6 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].ql + 384 + sb * 512)); - const __m256i rhs_raw_mat_CDEF_6 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].ql + 416 + sb * 512)); - const __m256i rhs_raw_mat_89AB_7 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].ql + 448 + sb * 512)); - const __m256i rhs_raw_mat_CDEF_7 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].ql + 480 + sb * 512)); + const __m256i rhs_raw_lbit_89AB_0 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].ql + sb * 512)); + const __m256i rhs_raw_lbit_CDEF_0 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].ql + 32 + sb * 512)); + const __m256i rhs_raw_lbit_89AB_1 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].ql + 64 + sb * 512)); + const __m256i rhs_raw_lbit_CDEF_1 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].ql + 96 + sb * 512)); + const __m256i rhs_raw_lbit_89AB_2 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].ql + 128 + sb * 512)); + const __m256i rhs_raw_lbit_CDEF_2 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].ql + 160 + sb * 512)); + const __m256i rhs_raw_lbit_89AB_3 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].ql + 192 + sb * 512)); + const __m256i rhs_raw_lbit_CDEF_3 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].ql + 224 + sb * 512)); + + const __m256i rhs_raw_lbit_89AB_4 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].ql + 256 + sb * 512)); + const __m256i rhs_raw_lbit_CDEF_4 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].ql + 288 + sb * 512)); + const __m256i rhs_raw_lbit_89AB_5 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].ql + 320 + sb * 512)); + const __m256i rhs_raw_lbit_CDEF_5 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].ql + 352 + sb * 512)); + const __m256i rhs_raw_lbit_89AB_6 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].ql + 384 + sb * 512)); + const __m256i rhs_raw_lbit_CDEF_6 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].ql + 416 + sb * 512)); + const __m256i rhs_raw_lbit_89AB_7 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].ql + 448 + sb * 512)); + const __m256i rhs_raw_lbit_CDEF_7 = 
_mm256_loadu_si256((const __m256i *)(b_ptr_1[b].ql + 480 + sb * 512)); const __m256i rhs_raw_hbit_0123_0 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].qh + sb * 256)); const __m256i rhs_raw_hbit_4567_0 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].qh + 32 + sb * 256)); @@ -6815,45 +6825,41 @@ void ggml_gemm_q6_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo const __m256i rhs_raw_hbit_89AB_3 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].qh + 192 + sb * 256)); const __m256i rhs_raw_hbit_CDEF_3 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].qh + 224 + sb * 256)); - // Indices 0 through 7 (first block): - const __m256i rhs_raw_mat_0145_0 = _mm256_blend_epi32(rhs_raw_mat_0123_0, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_0, requiredOrder), 240); - const __m256i rhs_raw_mat_2367_0 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_0, requiredOrder), rhs_raw_mat_4567_0, 240); - const __m256i rhs_raw_mat_0145_1 = _mm256_blend_epi32(rhs_raw_mat_0123_1, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_1, requiredOrder), 240); - const __m256i rhs_raw_mat_2367_1 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_1, requiredOrder), rhs_raw_mat_4567_1, 240); - const __m256i rhs_raw_mat_0145_2 = _mm256_blend_epi32(rhs_raw_mat_0123_2, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_2, requiredOrder), 240); - const __m256i rhs_raw_mat_2367_2 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_2, requiredOrder), rhs_raw_mat_4567_2, 240); - const __m256i rhs_raw_mat_0145_3 = _mm256_blend_epi32(rhs_raw_mat_0123_3, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_3, requiredOrder), 240); - const __m256i rhs_raw_mat_2367_3 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_3, requiredOrder), rhs_raw_mat_4567_3, 240); + const __m256i rhs_raw_lbit_0145_0 = _mm256_blend_epi32(rhs_raw_lbit_0123_0, _mm256_permutevar8x32_epi32(rhs_raw_lbit_4567_0, requiredOrder), 240); + const __m256i rhs_raw_lbit_2367_0 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_lbit_0123_0, requiredOrder), rhs_raw_lbit_4567_0, 240); + const __m256i rhs_raw_lbit_0145_1 = _mm256_blend_epi32(rhs_raw_lbit_0123_1, _mm256_permutevar8x32_epi32(rhs_raw_lbit_4567_1, requiredOrder), 240); + const __m256i rhs_raw_lbit_2367_1 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_lbit_0123_1, requiredOrder), rhs_raw_lbit_4567_1, 240); + const __m256i rhs_raw_lbit_0145_2 = _mm256_blend_epi32(rhs_raw_lbit_0123_2, _mm256_permutevar8x32_epi32(rhs_raw_lbit_4567_2, requiredOrder), 240); + const __m256i rhs_raw_lbit_2367_2 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_lbit_0123_2, requiredOrder), rhs_raw_lbit_4567_2, 240); + const __m256i rhs_raw_lbit_0145_3 = _mm256_blend_epi32(rhs_raw_lbit_0123_3, _mm256_permutevar8x32_epi32(rhs_raw_lbit_4567_3, requiredOrder), 240); + const __m256i rhs_raw_lbit_2367_3 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_lbit_0123_3, requiredOrder), rhs_raw_lbit_4567_3, 240); - // Indices 4 through 7 (second block): - const __m256i rhs_raw_mat_0145_4 = _mm256_blend_epi32(rhs_raw_mat_0123_4, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_4, requiredOrder), 240); - const __m256i rhs_raw_mat_2367_4 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_4, requiredOrder), rhs_raw_mat_4567_4, 240); - const __m256i rhs_raw_mat_0145_5 = _mm256_blend_epi32(rhs_raw_mat_0123_5, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_5, requiredOrder), 240); - const __m256i rhs_raw_mat_2367_5 = 
_mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_5, requiredOrder), rhs_raw_mat_4567_5, 240); - const __m256i rhs_raw_mat_0145_6 = _mm256_blend_epi32(rhs_raw_mat_0123_6, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_6, requiredOrder), 240); - const __m256i rhs_raw_mat_2367_6 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_6, requiredOrder), rhs_raw_mat_4567_6, 240); - const __m256i rhs_raw_mat_0145_7 = _mm256_blend_epi32(rhs_raw_mat_0123_7, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_7, requiredOrder), 240); - const __m256i rhs_raw_mat_2367_7 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_7, requiredOrder), rhs_raw_mat_4567_7, 240); + const __m256i rhs_raw_lbit_0145_4 = _mm256_blend_epi32(rhs_raw_lbit_0123_4, _mm256_permutevar8x32_epi32(rhs_raw_lbit_4567_4, requiredOrder), 240); + const __m256i rhs_raw_lbit_2367_4 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_lbit_0123_4, requiredOrder), rhs_raw_lbit_4567_4, 240); + const __m256i rhs_raw_lbit_0145_5 = _mm256_blend_epi32(rhs_raw_lbit_0123_5, _mm256_permutevar8x32_epi32(rhs_raw_lbit_4567_5, requiredOrder), 240); + const __m256i rhs_raw_lbit_2367_5 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_lbit_0123_5, requiredOrder), rhs_raw_lbit_4567_5, 240); + const __m256i rhs_raw_lbit_0145_6 = _mm256_blend_epi32(rhs_raw_lbit_0123_6, _mm256_permutevar8x32_epi32(rhs_raw_lbit_4567_6, requiredOrder), 240); + const __m256i rhs_raw_lbit_2367_6 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_lbit_0123_6, requiredOrder), rhs_raw_lbit_4567_6, 240); + const __m256i rhs_raw_lbit_0145_7 = _mm256_blend_epi32(rhs_raw_lbit_0123_7, _mm256_permutevar8x32_epi32(rhs_raw_lbit_4567_7, requiredOrder), 240); + const __m256i rhs_raw_lbit_2367_7 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_lbit_0123_7, requiredOrder), rhs_raw_lbit_4567_7, 240); - // Indices 8 through F (first block): - const __m256i rhs_raw_mat_89CD_0 = _mm256_blend_epi32(rhs_raw_mat_89AB_0, _mm256_permutevar8x32_epi32(rhs_raw_mat_CDEF_0, requiredOrder), 240); - const __m256i rhs_raw_mat_ABEF_0 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_89AB_0, requiredOrder), rhs_raw_mat_CDEF_0, 240); - const __m256i rhs_raw_mat_89CD_1 = _mm256_blend_epi32(rhs_raw_mat_89AB_1, _mm256_permutevar8x32_epi32(rhs_raw_mat_CDEF_1, requiredOrder), 240); - const __m256i rhs_raw_mat_ABEF_1 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_89AB_1, requiredOrder), rhs_raw_mat_CDEF_1, 240); - const __m256i rhs_raw_mat_89CD_2 = _mm256_blend_epi32(rhs_raw_mat_89AB_2, _mm256_permutevar8x32_epi32(rhs_raw_mat_CDEF_2, requiredOrder), 240); - const __m256i rhs_raw_mat_ABEF_2 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_89AB_2, requiredOrder), rhs_raw_mat_CDEF_2, 240); - const __m256i rhs_raw_mat_89CD_3 = _mm256_blend_epi32(rhs_raw_mat_89AB_3, _mm256_permutevar8x32_epi32(rhs_raw_mat_CDEF_3, requiredOrder), 240); - const __m256i rhs_raw_mat_ABEF_3 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_89AB_3, requiredOrder), rhs_raw_mat_CDEF_3, 240); + const __m256i rhs_raw_lbit_89CD_0 = _mm256_blend_epi32(rhs_raw_lbit_89AB_0, _mm256_permutevar8x32_epi32(rhs_raw_lbit_CDEF_0, requiredOrder), 240); + const __m256i rhs_raw_lbit_ABEF_0 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_lbit_89AB_0, requiredOrder), rhs_raw_lbit_CDEF_0, 240); + const __m256i rhs_raw_lbit_89CD_1 = _mm256_blend_epi32(rhs_raw_lbit_89AB_1, _mm256_permutevar8x32_epi32(rhs_raw_lbit_CDEF_1, requiredOrder), 240); + const 
__m256i rhs_raw_lbit_ABEF_1 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_lbit_89AB_1, requiredOrder), rhs_raw_lbit_CDEF_1, 240); + const __m256i rhs_raw_lbit_89CD_2 = _mm256_blend_epi32(rhs_raw_lbit_89AB_2, _mm256_permutevar8x32_epi32(rhs_raw_lbit_CDEF_2, requiredOrder), 240); + const __m256i rhs_raw_lbit_ABEF_2 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_lbit_89AB_2, requiredOrder), rhs_raw_lbit_CDEF_2, 240); + const __m256i rhs_raw_lbit_89CD_3 = _mm256_blend_epi32(rhs_raw_lbit_89AB_3, _mm256_permutevar8x32_epi32(rhs_raw_lbit_CDEF_3, requiredOrder), 240); + const __m256i rhs_raw_lbit_ABEF_3 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_lbit_89AB_3, requiredOrder), rhs_raw_lbit_CDEF_3, 240); - // Indices 8 through F (second block): - const __m256i rhs_raw_mat_89CD_4 = _mm256_blend_epi32(rhs_raw_mat_89AB_4, _mm256_permutevar8x32_epi32(rhs_raw_mat_CDEF_4, requiredOrder), 240); - const __m256i rhs_raw_mat_ABEF_4 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_89AB_4, requiredOrder), rhs_raw_mat_CDEF_4, 240); - const __m256i rhs_raw_mat_89CD_5 = _mm256_blend_epi32(rhs_raw_mat_89AB_5, _mm256_permutevar8x32_epi32(rhs_raw_mat_CDEF_5, requiredOrder), 240); - const __m256i rhs_raw_mat_ABEF_5 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_89AB_5, requiredOrder), rhs_raw_mat_CDEF_5, 240); - const __m256i rhs_raw_mat_89CD_6 = _mm256_blend_epi32(rhs_raw_mat_89AB_6, _mm256_permutevar8x32_epi32(rhs_raw_mat_CDEF_6, requiredOrder), 240); - const __m256i rhs_raw_mat_ABEF_6 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_89AB_6, requiredOrder), rhs_raw_mat_CDEF_6, 240); - const __m256i rhs_raw_mat_89CD_7 = _mm256_blend_epi32(rhs_raw_mat_89AB_7, _mm256_permutevar8x32_epi32(rhs_raw_mat_CDEF_7, requiredOrder), 240); - const __m256i rhs_raw_mat_ABEF_7 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_89AB_7, requiredOrder), rhs_raw_mat_CDEF_7, 240); + const __m256i rhs_raw_lbit_89CD_4 = _mm256_blend_epi32(rhs_raw_lbit_89AB_4, _mm256_permutevar8x32_epi32(rhs_raw_lbit_CDEF_4, requiredOrder), 240); + const __m256i rhs_raw_lbit_ABEF_4 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_lbit_89AB_4, requiredOrder), rhs_raw_lbit_CDEF_4, 240); + const __m256i rhs_raw_lbit_89CD_5 = _mm256_blend_epi32(rhs_raw_lbit_89AB_5, _mm256_permutevar8x32_epi32(rhs_raw_lbit_CDEF_5, requiredOrder), 240); + const __m256i rhs_raw_lbit_ABEF_5 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_lbit_89AB_5, requiredOrder), rhs_raw_lbit_CDEF_5, 240); + const __m256i rhs_raw_lbit_89CD_6 = _mm256_blend_epi32(rhs_raw_lbit_89AB_6, _mm256_permutevar8x32_epi32(rhs_raw_lbit_CDEF_6, requiredOrder), 240); + const __m256i rhs_raw_lbit_ABEF_6 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_lbit_89AB_6, requiredOrder), rhs_raw_lbit_CDEF_6, 240); + const __m256i rhs_raw_lbit_89CD_7 = _mm256_blend_epi32(rhs_raw_lbit_89AB_7, _mm256_permutevar8x32_epi32(rhs_raw_lbit_CDEF_7, requiredOrder), 240); + const __m256i rhs_raw_lbit_ABEF_7 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_lbit_89AB_7, requiredOrder), rhs_raw_lbit_CDEF_7, 240); const __m256i rhs_raw_hbit_0145_0 = _mm256_blend_epi32(rhs_raw_hbit_0123_0, _mm256_permutevar8x32_epi32(rhs_raw_hbit_4567_0, requiredOrder), 240); const __m256i rhs_raw_hbit_2367_0 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_hbit_0123_0, requiredOrder), rhs_raw_hbit_4567_0, 240); @@ -6873,25 +6879,25 @@ void ggml_gemm_q6_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo const __m256i 
rhs_raw_hbit_89CD_3 = _mm256_blend_epi32(rhs_raw_hbit_89AB_3, _mm256_permutevar8x32_epi32(rhs_raw_hbit_CDEF_3, requiredOrder), 240); const __m256i rhs_raw_hbit_ABEF_3 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_hbit_89AB_3, requiredOrder), rhs_raw_hbit_CDEF_3, 240); - const __m512i rhs_raw_mat_014589CD_0 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_mat_0145_0), rhs_raw_mat_89CD_0, 1); - const __m512i rhs_raw_mat_2367ABEF_0 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_mat_2367_0), rhs_raw_mat_ABEF_0, 1); - const __m512i rhs_raw_mat_014589CD_1 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_mat_0145_1), rhs_raw_mat_89CD_1, 1); - const __m512i rhs_raw_mat_2367ABEF_1 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_mat_2367_1), rhs_raw_mat_ABEF_1, 1); + const __m512i rhs_raw_lbit_014589CD_0 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_lbit_0145_0), rhs_raw_lbit_89CD_0, 1); + const __m512i rhs_raw_lbit_2367ABEF_0 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_lbit_2367_0), rhs_raw_lbit_ABEF_0, 1); + const __m512i rhs_raw_lbit_014589CD_1 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_lbit_0145_1), rhs_raw_lbit_89CD_1, 1); + const __m512i rhs_raw_lbit_2367ABEF_1 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_lbit_2367_1), rhs_raw_lbit_ABEF_1, 1); - const __m512i rhs_raw_mat_014589CD_2 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_mat_0145_2), rhs_raw_mat_89CD_2, 1); - const __m512i rhs_raw_mat_2367ABEF_2 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_mat_2367_2), rhs_raw_mat_ABEF_2, 1); - const __m512i rhs_raw_mat_014589CD_3 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_mat_0145_3), rhs_raw_mat_89CD_3, 1); - const __m512i rhs_raw_mat_2367ABEF_3 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_mat_2367_3), rhs_raw_mat_ABEF_3, 1); + const __m512i rhs_raw_lbit_014589CD_2 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_lbit_0145_2), rhs_raw_lbit_89CD_2, 1); + const __m512i rhs_raw_lbit_2367ABEF_2 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_lbit_2367_2), rhs_raw_lbit_ABEF_2, 1); + const __m512i rhs_raw_lbit_014589CD_3 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_lbit_0145_3), rhs_raw_lbit_89CD_3, 1); + const __m512i rhs_raw_lbit_2367ABEF_3 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_lbit_2367_3), rhs_raw_lbit_ABEF_3, 1); - const __m512i rhs_raw_mat_014589CD_4 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_mat_0145_4), rhs_raw_mat_89CD_4, 1); - const __m512i rhs_raw_mat_2367ABEF_4 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_mat_2367_4), rhs_raw_mat_ABEF_4, 1); - const __m512i rhs_raw_mat_014589CD_5 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_mat_0145_5), rhs_raw_mat_89CD_5, 1); - const __m512i rhs_raw_mat_2367ABEF_5 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_mat_2367_5), rhs_raw_mat_ABEF_5, 1); + const __m512i rhs_raw_lbit_014589CD_4 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_lbit_0145_4), rhs_raw_lbit_89CD_4, 1); + const __m512i rhs_raw_lbit_2367ABEF_4 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_lbit_2367_4), rhs_raw_lbit_ABEF_4, 1); + const __m512i rhs_raw_lbit_014589CD_5 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_lbit_0145_5), rhs_raw_lbit_89CD_5, 1); + const __m512i rhs_raw_lbit_2367ABEF_5 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_lbit_2367_5), rhs_raw_lbit_ABEF_5, 1); - const __m512i rhs_raw_mat_014589CD_6 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_mat_0145_6), rhs_raw_mat_89CD_6, 1); - const __m512i rhs_raw_mat_2367ABEF_6 
= _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_mat_2367_6), rhs_raw_mat_ABEF_6, 1); - const __m512i rhs_raw_mat_014589CD_7 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_mat_0145_7), rhs_raw_mat_89CD_7, 1); - const __m512i rhs_raw_mat_2367ABEF_7 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_mat_2367_7), rhs_raw_mat_ABEF_7, 1); + const __m512i rhs_raw_lbit_014589CD_6 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_lbit_0145_6), rhs_raw_lbit_89CD_6, 1); + const __m512i rhs_raw_lbit_2367ABEF_6 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_lbit_2367_6), rhs_raw_lbit_ABEF_6, 1); + const __m512i rhs_raw_lbit_014589CD_7 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_lbit_0145_7), rhs_raw_lbit_89CD_7, 1); + const __m512i rhs_raw_lbit_2367ABEF_7 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_lbit_2367_7), rhs_raw_lbit_ABEF_7, 1); const __m512i rhs_raw_hbit_014589CD_0 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_hbit_0145_0), rhs_raw_hbit_89CD_0, 1); const __m512i rhs_raw_hbit_2367ABEF_0 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_hbit_2367_0), rhs_raw_hbit_ABEF_0, 1); @@ -6904,206 +6910,210 @@ void ggml_gemm_q6_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo const __m512i rhs_raw_hbit_2367ABEF_3 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_hbit_2367_3), rhs_raw_hbit_ABEF_3, 1); // 2-bit -> 8-bit - // Values of the 0th,2nd,4th,6th sub blocks of eight block_q6_K structures for the sb loop - const __m512i rhs_hbit_014589CD_00 = _mm512_slli_epi16(_mm512_and_si512(rhs_raw_hbit_014589CD_0, m2_expanded), 4); //B00(0-7) B01(0-7) B04(0-7) B05(0-7) - const __m512i rhs_hbit_014589CD_20 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_014589CD_0, 2), m2_expanded), 4); //B20(0-7) B21(0-7) B24(0-7) B25(0-7) - const __m512i rhs_hbit_014589CD_40 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_014589CD_0, 4), m2_expanded), 4); //B40(0-7) B41(0-7) B44(0-7) B45(0-7) - const __m512i rhs_hbit_014589CD_60 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_014589CD_0, 6), m2_expanded), 4); //B60(0-7) B61(0-7) B64(0-7) B65(0-7) + // hbit Values of the 0th,2nd,4th,6th sub blocks of eight block_q6_K structures for the sb loop + const __m512i rhs_hbit_014589CD_00 = _mm512_slli_epi16(_mm512_and_si512(rhs_raw_hbit_014589CD_0, m2_expanded), 4); //Index : 0 - 7 + const __m512i rhs_hbit_014589CD_20 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_014589CD_0, 2), m2_expanded), 4); //Index : 32 - 39 + const __m512i rhs_hbit_014589CD_40 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_014589CD_0, 4), m2_expanded), 4); //Index : 64 - 71 + const __m512i rhs_hbit_014589CD_60 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_014589CD_0, 6), m2_expanded), 4); //Index : 96 - 103 - const __m512i rhs_hbit_2367ABEF_00 = _mm512_slli_epi16(_mm512_and_si512(rhs_raw_hbit_2367ABEF_0, m2_expanded), 4); //B02(0-7) B03(0-7) B06(0-7) B07(0-7) - const __m512i rhs_hbit_2367ABEF_20 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_2367ABEF_0, 2), m2_expanded), 4); //B22(0-7) B23(0-7) B26(0-7) B27(0-7) - const __m512i rhs_hbit_2367ABEF_40 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_2367ABEF_0, 4), m2_expanded), 4); //B42(0-7) B43(0-7) B46(0-7) B47(0-7) - const __m512i rhs_hbit_2367ABEF_60 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_2367ABEF_0, 6), m2_expanded), 4); //B62(0-7) B63(0-7) B66(0-7) B67(0-7) + 
const __m512i rhs_hbit_2367ABEF_00 = _mm512_slli_epi16(_mm512_and_si512(rhs_raw_hbit_2367ABEF_0, m2_expanded), 4); //Index : 0 - 7 + const __m512i rhs_hbit_2367ABEF_20 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_2367ABEF_0, 2), m2_expanded), 4); //Index : 32 - 39 + const __m512i rhs_hbit_2367ABEF_40 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_2367ABEF_0, 4), m2_expanded), 4); //Index : 64 - 71 + const __m512i rhs_hbit_2367ABEF_60 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_2367ABEF_0, 6), m2_expanded), 4); //Index : 96 - 103 - const __m512i rhs_hbit_014589CD_01 = _mm512_slli_epi16(_mm512_and_si512(rhs_raw_hbit_014589CD_1, m2_expanded), 4); //B00(8-15) B01(8-15) B04(8-15) B05(8-15) - const __m512i rhs_hbit_014589CD_21 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_014589CD_1, 2), m2_expanded), 4); //B20(8-15) B21(8-15) B24(8-15) B25(8-15) - const __m512i rhs_hbit_014589CD_41 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_014589CD_1, 4), m2_expanded), 4); //B40(8-15) B41(8-15) B44(8-15) B45(8-15) - const __m512i rhs_hbit_014589CD_61 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_014589CD_1, 6), m2_expanded), 4); //B60(8-15) B61(8-15) B64(8-15) B65(8-15) + // Comments indicate the indices of elements from individual super block in non interleaved fashion + // Index : 8 - 15, 72 - 79 + const __m512i rhs_hbit_014589CD_01 = _mm512_slli_epi16(_mm512_and_si512(rhs_raw_hbit_014589CD_1, m2_expanded), 4); //Index : 8 - 15 + const __m512i rhs_hbit_014589CD_21 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_014589CD_1, 2), m2_expanded), 4); //Index : 40 - 47 + const __m512i rhs_hbit_014589CD_41 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_014589CD_1, 4), m2_expanded), 4); //Index : 72 - 79 + const __m512i rhs_hbit_014589CD_61 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_014589CD_1, 6), m2_expanded), 4); //Index : 104 - 111 - const __m512i rhs_hbit_2367ABEF_01 = _mm512_slli_epi16(_mm512_and_si512(rhs_raw_hbit_2367ABEF_1, m2_expanded), 4); //B02(8-15) B03(8-15) B06(8-15) B07(8-15) - const __m512i rhs_hbit_2367ABEF_21 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_2367ABEF_1, 2), m2_expanded), 4); //B22(8-15) B23(8-15) B26(8-15) B27(8-15) - const __m512i rhs_hbit_2367ABEF_41 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_2367ABEF_1, 4), m2_expanded), 4); //B42(8-15) B43(8-15) B46(8-15) B47(8-15) - const __m512i rhs_hbit_2367ABEF_61 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_2367ABEF_1, 6), m2_expanded), 4); //B62(8-15) B63(8-15) B66(8-15) B67(8-15) + const __m512i rhs_hbit_2367ABEF_01 = _mm512_slli_epi16(_mm512_and_si512(rhs_raw_hbit_2367ABEF_1, m2_expanded), 4); //Index : 8 - 15 + const __m512i rhs_hbit_2367ABEF_21 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_2367ABEF_1, 2), m2_expanded), 4); //Index : 40 - 47 + const __m512i rhs_hbit_2367ABEF_41 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_2367ABEF_1, 4), m2_expanded), 4); //Index : 72 - 79 + const __m512i rhs_hbit_2367ABEF_61 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_2367ABEF_1, 6), m2_expanded), 4); //Index : 104 - 111 - // Values of the 1st,3rd,5th,7th sub blocks of eight block_q6_K structures for the sb loop - const __m512i rhs_hbit_014589CD_10 = _mm512_slli_epi16(_mm512_and_si512(rhs_raw_hbit_014589CD_2, 
m2_expanded), 4); //B10(0-7) B11(0-7) B14(0-7) B15(0-7) - const __m512i rhs_hbit_014589CD_30 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_014589CD_2, 2), m2_expanded), 4); //B30(0-7) B31(0-7) B34(0-7) B35(0-7) - const __m512i rhs_hbit_014589CD_50 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_014589CD_2, 4), m2_expanded), 4); //B50(0-7) B51(0-7) B54(0-7) B55(0-7) - const __m512i rhs_hbit_014589CD_70 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_014589CD_2, 6), m2_expanded), 4); //B70(0-7) B71(0-7) B74(0-7) B75(0-7) + // hbit values of the 1st,3rd,5th,7th sub blocks of eight block_q6_K structures for the sb loop + const __m512i rhs_hbit_014589CD_10 = _mm512_slli_epi16(_mm512_and_si512(rhs_raw_hbit_014589CD_2, m2_expanded), 4); //Index : 16 - 23 + const __m512i rhs_hbit_014589CD_30 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_014589CD_2, 2), m2_expanded), 4); //Index : 48 - 55 + const __m512i rhs_hbit_014589CD_50 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_014589CD_2, 4), m2_expanded), 4); //Index : 80 - 87 + const __m512i rhs_hbit_014589CD_70 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_014589CD_2, 6), m2_expanded), 4); //Index : 112 - 119 - const __m512i rhs_hbit_2367ABEF_10 = _mm512_slli_epi16(_mm512_and_si512(rhs_raw_hbit_2367ABEF_2, m2_expanded), 4); //B12(0-7) B13(0-7) B16(0-7) B17(0-7) - const __m512i rhs_hbit_2367ABEF_30 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_2367ABEF_2, 2), m2_expanded), 4); //B32(0-7) B33(0-7) B36(0-7) B37(0-7) - const __m512i rhs_hbit_2367ABEF_50 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_2367ABEF_2, 4), m2_expanded), 4); //B52(0-7) B53(0-7) B56(0-7) B57(0-7) - const __m512i rhs_hbit_2367ABEF_70 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_2367ABEF_2, 6), m2_expanded), 4); //B72(0-7) B73(0-7) B76(0-7) B77(0-7) + const __m512i rhs_hbit_2367ABEF_10 = _mm512_slli_epi16(_mm512_and_si512(rhs_raw_hbit_2367ABEF_2, m2_expanded), 4); //Index : 16 - 23 + const __m512i rhs_hbit_2367ABEF_30 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_2367ABEF_2, 2), m2_expanded), 4); //Index : 48 - 55 + const __m512i rhs_hbit_2367ABEF_50 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_2367ABEF_2, 4), m2_expanded), 4); //Index : 80 - 87 + const __m512i rhs_hbit_2367ABEF_70 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_2367ABEF_2, 6), m2_expanded), 4); //Index : 112 - 119 - const __m512i rhs_hbit_014589CD_11 = _mm512_slli_epi16(_mm512_and_si512(rhs_raw_hbit_014589CD_3, m2_expanded), 4); //B10(8-15) B11(8-15) B14(8-15) B15(8-15) - const __m512i rhs_hbit_014589CD_31 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_014589CD_3, 2), m2_expanded), 4); //B30(8-15) B31(8-15) B34(8-15) B35(8-15) - const __m512i rhs_hbit_014589CD_51 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_014589CD_3, 4), m2_expanded), 4); //B50(8-15) B51(8-15) B54(8-15) B55(8-15) - const __m512i rhs_hbit_014589CD_71 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_014589CD_3, 6), m2_expanded), 4); //B70(8-15) B71(8-15) B74(8-15) B75(8-15) + const __m512i rhs_hbit_014589CD_11 = _mm512_slli_epi16(_mm512_and_si512(rhs_raw_hbit_014589CD_3, m2_expanded), 4); //Index : 24 - 31 + const __m512i rhs_hbit_014589CD_31 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_014589CD_3, 2), 
m2_expanded), 4); //Index : 56 - 63 + const __m512i rhs_hbit_014589CD_51 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_014589CD_3, 4), m2_expanded), 4); //Index : 88 - 95 + const __m512i rhs_hbit_014589CD_71 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_014589CD_3, 6), m2_expanded), 4); //Index : 120 - 127 - const __m512i rhs_hbit_2367ABEF_11 = _mm512_slli_epi16(_mm512_and_si512(rhs_raw_hbit_2367ABEF_3, m2_expanded), 4); //B12(8-15) B13(8-15) B16(8-15) B17(8-15) - const __m512i rhs_hbit_2367ABEF_31 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_2367ABEF_3, 2), m2_expanded), 4); //B32(8-15) B33(8-15) B36(8-15) B37(8-15) - const __m512i rhs_hbit_2367ABEF_51 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_2367ABEF_3, 4), m2_expanded), 4); //B52(8-15) B53(8-15) B56(8-15) B57(8-15) - const __m512i rhs_hbit_2367ABEF_71 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_2367ABEF_3, 6), m2_expanded), 4); //B72(8-15) B73(8-15) B76(8-15) B77(8-15) + const __m512i rhs_hbit_2367ABEF_11 = _mm512_slli_epi16(_mm512_and_si512(rhs_raw_hbit_2367ABEF_3, m2_expanded), 4); //Index : 24 - 31 + const __m512i rhs_hbit_2367ABEF_31 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_2367ABEF_3, 2), m2_expanded), 4); //Index : 56 - 63 + const __m512i rhs_hbit_2367ABEF_51 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_2367ABEF_3, 4), m2_expanded), 4); //Index : 88 - 95 + const __m512i rhs_hbit_2367ABEF_71 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_2367ABEF_3, 6), m2_expanded), 4); //Index : 120 - 127 - // 0 -7, 64 - 71 - const __m512i rhs_mat_014589CD_00 = _mm512_or_si512(_mm512_and_si512(rhs_raw_mat_014589CD_0, m4_expanded), rhs_hbit_014589CD_00); - const __m512i rhs_mat_014589CD_40 = _mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_014589CD_0, 4), m4_expanded), rhs_hbit_014589CD_40); + // 4 bit values are unpacked/denibbled and bitwise or-ed with the hbit values to form the 6 bit quantized values - const __m512i rhs_mat_2367ABEF_00 = _mm512_or_si512(_mm512_and_si512(rhs_raw_mat_2367ABEF_0, m4_expanded), rhs_hbit_2367ABEF_00); - const __m512i rhs_mat_2367ABEF_40 = _mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_2367ABEF_0, 4), m4_expanded), rhs_hbit_2367ABEF_40); + // Comments indicate the indices of elements from individual super block in non interleaved fashion + // Index : 0 -7, 64 - 71 + const __m512i rhs_mat_014589CD_00 = _mm512_or_si512(_mm512_and_si512(rhs_raw_lbit_014589CD_0, m4_expanded), rhs_hbit_014589CD_00); + const __m512i rhs_mat_014589CD_40 = _mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_lbit_014589CD_0, 4), m4_expanded), rhs_hbit_014589CD_40); - // 8 - 15, 72 - 79 - const __m512i rhs_mat_014589CD_01 = _mm512_or_si512(_mm512_and_si512(rhs_raw_mat_014589CD_1, m4_expanded), rhs_hbit_014589CD_01); - const __m512i rhs_mat_014589CD_41 = _mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_014589CD_1, 4), m4_expanded), rhs_hbit_014589CD_41); + const __m512i rhs_mat_2367ABEF_00 = _mm512_or_si512(_mm512_and_si512(rhs_raw_lbit_2367ABEF_0, m4_expanded), rhs_hbit_2367ABEF_00); + const __m512i rhs_mat_2367ABEF_40 = _mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_lbit_2367ABEF_0, 4), m4_expanded), rhs_hbit_2367ABEF_40); - const __m512i rhs_mat_2367ABEF_01 = _mm512_or_si512(_mm512_and_si512(rhs_raw_mat_2367ABEF_1, m4_expanded), rhs_hbit_2367ABEF_01); - const __m512i rhs_mat_2367ABEF_41 = 
_mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_2367ABEF_1, 4), m4_expanded), rhs_hbit_2367ABEF_41); + // Index : 8 - 15, 72 - 79 + const __m512i rhs_mat_014589CD_01 = _mm512_or_si512(_mm512_and_si512(rhs_raw_lbit_014589CD_1, m4_expanded), rhs_hbit_014589CD_01); + const __m512i rhs_mat_014589CD_41 = _mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_lbit_014589CD_1, 4), m4_expanded), rhs_hbit_014589CD_41); - // 16 - 23, 80 - 87 - const __m512i rhs_mat_014589CD_10 = _mm512_or_si512(_mm512_and_si512(rhs_raw_mat_014589CD_2, m4_expanded), rhs_hbit_014589CD_10); - const __m512i rhs_mat_014589CD_50 = _mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_014589CD_2, 4), m4_expanded), rhs_hbit_014589CD_50); + const __m512i rhs_mat_2367ABEF_01 = _mm512_or_si512(_mm512_and_si512(rhs_raw_lbit_2367ABEF_1, m4_expanded), rhs_hbit_2367ABEF_01); + const __m512i rhs_mat_2367ABEF_41 = _mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_lbit_2367ABEF_1, 4), m4_expanded), rhs_hbit_2367ABEF_41); - const __m512i rhs_mat_2367ABEF_10 = _mm512_or_si512(_mm512_and_si512(rhs_raw_mat_2367ABEF_2, m4_expanded), rhs_hbit_2367ABEF_10); - const __m512i rhs_mat_2367ABEF_50 = _mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_2367ABEF_2, 4), m4_expanded), rhs_hbit_2367ABEF_50); + // Index : 16 - 23, 80 - 87 + const __m512i rhs_mat_014589CD_10 = _mm512_or_si512(_mm512_and_si512(rhs_raw_lbit_014589CD_2, m4_expanded), rhs_hbit_014589CD_10); + const __m512i rhs_mat_014589CD_50 = _mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_lbit_014589CD_2, 4), m4_expanded), rhs_hbit_014589CD_50); - // 24 - 31, 88 - 95 - const __m512i rhs_mat_014589CD_11 = _mm512_or_si512(_mm512_and_si512(rhs_raw_mat_014589CD_3, m4_expanded), rhs_hbit_014589CD_11); - const __m512i rhs_mat_014589CD_51 = _mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_014589CD_3, 4), m4_expanded), rhs_hbit_014589CD_51); + const __m512i rhs_mat_2367ABEF_10 = _mm512_or_si512(_mm512_and_si512(rhs_raw_lbit_2367ABEF_2, m4_expanded), rhs_hbit_2367ABEF_10); + const __m512i rhs_mat_2367ABEF_50 = _mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_lbit_2367ABEF_2, 4), m4_expanded), rhs_hbit_2367ABEF_50); - const __m512i rhs_mat_2367ABEF_11 = _mm512_or_si512(_mm512_and_si512(rhs_raw_mat_2367ABEF_3, m4_expanded), rhs_hbit_2367ABEF_11); - const __m512i rhs_mat_2367ABEF_51 = _mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_2367ABEF_3, 4), m4_expanded), rhs_hbit_2367ABEF_51); + // Index : 24 - 31, 88 - 95 + const __m512i rhs_mat_014589CD_11 = _mm512_or_si512(_mm512_and_si512(rhs_raw_lbit_014589CD_3, m4_expanded), rhs_hbit_014589CD_11); + const __m512i rhs_mat_014589CD_51 = _mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_lbit_014589CD_3, 4), m4_expanded), rhs_hbit_014589CD_51); - // 32 - 39, 96 - 103 - const __m512i rhs_mat_014589CD_20 = _mm512_or_si512(_mm512_and_si512(rhs_raw_mat_014589CD_4, m4_expanded), rhs_hbit_014589CD_20); - const __m512i rhs_mat_014589CD_60 = _mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_014589CD_4, 4), m4_expanded), rhs_hbit_014589CD_60); + const __m512i rhs_mat_2367ABEF_11 = _mm512_or_si512(_mm512_and_si512(rhs_raw_lbit_2367ABEF_3, m4_expanded), rhs_hbit_2367ABEF_11); + const __m512i rhs_mat_2367ABEF_51 = _mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_lbit_2367ABEF_3, 4), m4_expanded), rhs_hbit_2367ABEF_51); - const __m512i rhs_mat_2367ABEF_20 = _mm512_or_si512(_mm512_and_si512(rhs_raw_mat_2367ABEF_4, m4_expanded), 
rhs_hbit_2367ABEF_20); - const __m512i rhs_mat_2367ABEF_60 = _mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_2367ABEF_4, 4), m4_expanded), rhs_hbit_2367ABEF_60); + // Index : 32 - 39, 96 - 103 + const __m512i rhs_mat_014589CD_20 = _mm512_or_si512(_mm512_and_si512(rhs_raw_lbit_014589CD_4, m4_expanded), rhs_hbit_014589CD_20); + const __m512i rhs_mat_014589CD_60 = _mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_lbit_014589CD_4, 4), m4_expanded), rhs_hbit_014589CD_60); - // 40 - 47, 104 - 111 - const __m512i rhs_mat_014589CD_21 = _mm512_or_si512(_mm512_and_si512(rhs_raw_mat_014589CD_5, m4_expanded), rhs_hbit_014589CD_21); - const __m512i rhs_mat_014589CD_61 = _mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_014589CD_5, 4), m4_expanded), rhs_hbit_014589CD_61); + const __m512i rhs_mat_2367ABEF_20 = _mm512_or_si512(_mm512_and_si512(rhs_raw_lbit_2367ABEF_4, m4_expanded), rhs_hbit_2367ABEF_20); + const __m512i rhs_mat_2367ABEF_60 = _mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_lbit_2367ABEF_4, 4), m4_expanded), rhs_hbit_2367ABEF_60); - const __m512i rhs_mat_2367ABEF_21 = _mm512_or_si512(_mm512_and_si512(rhs_raw_mat_2367ABEF_5, m4_expanded), rhs_hbit_2367ABEF_21); - const __m512i rhs_mat_2367ABEF_61 = _mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_2367ABEF_5, 4), m4_expanded), rhs_hbit_2367ABEF_61); + // Index : 40 - 47, 104 - 111 + const __m512i rhs_mat_014589CD_21 = _mm512_or_si512(_mm512_and_si512(rhs_raw_lbit_014589CD_5, m4_expanded), rhs_hbit_014589CD_21); + const __m512i rhs_mat_014589CD_61 = _mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_lbit_014589CD_5, 4), m4_expanded), rhs_hbit_014589CD_61); - // 48 - 55, 112 - 119 - const __m512i rhs_mat_014589CD_30 = _mm512_or_si512(_mm512_and_si512(rhs_raw_mat_014589CD_6, m4_expanded), rhs_hbit_014589CD_30); - const __m512i rhs_mat_014589CD_70 = _mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_014589CD_6, 4), m4_expanded), rhs_hbit_014589CD_70); + const __m512i rhs_mat_2367ABEF_21 = _mm512_or_si512(_mm512_and_si512(rhs_raw_lbit_2367ABEF_5, m4_expanded), rhs_hbit_2367ABEF_21); + const __m512i rhs_mat_2367ABEF_61 = _mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_lbit_2367ABEF_5, 4), m4_expanded), rhs_hbit_2367ABEF_61); - const __m512i rhs_mat_2367ABEF_30 = _mm512_or_si512(_mm512_and_si512(rhs_raw_mat_2367ABEF_6, m4_expanded), rhs_hbit_2367ABEF_30); - const __m512i rhs_mat_2367ABEF_70 = _mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_2367ABEF_6, 4), m4_expanded), rhs_hbit_2367ABEF_70); + // Index : 48 - 55, 112 - 119 + const __m512i rhs_mat_014589CD_30 = _mm512_or_si512(_mm512_and_si512(rhs_raw_lbit_014589CD_6, m4_expanded), rhs_hbit_014589CD_30); + const __m512i rhs_mat_014589CD_70 = _mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_lbit_014589CD_6, 4), m4_expanded), rhs_hbit_014589CD_70); - // 56 - 63, 120 - 127 - const __m512i rhs_mat_014589CD_31 = _mm512_or_si512(_mm512_and_si512(rhs_raw_mat_014589CD_7, m4_expanded), rhs_hbit_014589CD_31); - const __m512i rhs_mat_014589CD_71 = _mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_014589CD_7, 4), m4_expanded), rhs_hbit_014589CD_71); + const __m512i rhs_mat_2367ABEF_30 = _mm512_or_si512(_mm512_and_si512(rhs_raw_lbit_2367ABEF_6, m4_expanded), rhs_hbit_2367ABEF_30); + const __m512i rhs_mat_2367ABEF_70 = _mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_lbit_2367ABEF_6, 4), m4_expanded), rhs_hbit_2367ABEF_70); - const __m512i rhs_mat_2367ABEF_31 = 
_mm512_or_si512(_mm512_and_si512(rhs_raw_mat_2367ABEF_7, m4_expanded), rhs_hbit_2367ABEF_31); - const __m512i rhs_mat_2367ABEF_71 = _mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_2367ABEF_7, 4), m4_expanded), rhs_hbit_2367ABEF_71); + // Index : 56 - 63, 120 - 127 + const __m512i rhs_mat_014589CD_31 = _mm512_or_si512(_mm512_and_si512(rhs_raw_lbit_014589CD_7, m4_expanded), rhs_hbit_014589CD_31); + const __m512i rhs_mat_014589CD_71 = _mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_lbit_014589CD_7, 4), m4_expanded), rhs_hbit_014589CD_71); + + const __m512i rhs_mat_2367ABEF_31 = _mm512_or_si512(_mm512_and_si512(rhs_raw_lbit_2367ABEF_7, m4_expanded), rhs_hbit_2367ABEF_31); + const __m512i rhs_mat_2367ABEF_71 = _mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_lbit_2367ABEF_7, 4), m4_expanded), rhs_hbit_2367ABEF_71); // Shuffle pattern one - right side input - const __m512i rhs_mat_014589CD_00_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_00, (_MM_PERM_ENUM)136); //B00(0-3) B01(0-3) B00(0-3) B01(0-3) B04(0-3) B05(0-3) B04(0-3) B05(0-3) - const __m512i rhs_mat_2367ABEF_00_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_00, (_MM_PERM_ENUM)136); //B02(0-3) B03(0-3) B02(0-3) B03(0-3) B06(0-3) B07(0-3) B06(0-3) B07(0-3) + const __m512i rhs_mat_014589CD_00_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_00, (_MM_PERM_ENUM)136); //B00(0-3) B01(0-3) B00(0-3) B01(0-3) B04(0-3) B05(0-3) B04(0-3) B05(0-3) B08(0-3) B09(0-3) B08(0-3) B09(0-3) B0C(0-3) B0D(0-3) B0C(0-3) B0D(0-3) + const __m512i rhs_mat_2367ABEF_00_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_00, (_MM_PERM_ENUM)136); //B02(0-3) B03(0-3) B02(0-3) B03(0-3) B06(0-3) B07(0-3) B06(0-3) B07(0-3) B0A(0-3) B0B(0-3) B0A(0-3) B0B(0-3) B0E(0-3) B0F(0-3) B0E(0-3) B0F(0-3) - const __m512i rhs_mat_014589CD_01_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_01, (_MM_PERM_ENUM)136); //B00(8-11) B01(8-11) B00(8-11) B01(8-11) B04(8-11) B05(8-11) B04(8-11) B05(8-11) - const __m512i rhs_mat_2367ABEF_01_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_01, (_MM_PERM_ENUM)136); //B02(8-11) B03(8-11) B02(8-11) B03(8-11) B06(8-11) B07(8-11) B06(8-11) B07(8-11) + const __m512i rhs_mat_014589CD_01_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_01, (_MM_PERM_ENUM)136); //B00(8-11) B01(8-11) B00(8-11) B01(8-11) B04(8-11) B05(8-11) B04(8-11) B05(8-11) B08(8-11) B09(8-11) B08(8-11) B09(8-11) B0C(8-11) B0D(8-11) B0C(8-11) B0D(8-11) + const __m512i rhs_mat_2367ABEF_01_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_01, (_MM_PERM_ENUM)136); //B02(8-11) B03(8-11) B02(8-11) B03(8-11) B06(8-11) B07(8-11) B06(8-11) B07(8-11) B0A(8-11) B0B(8-11) B0A(8-11) B0B(8-11) B0E(8-11) B0F(8-11) B0E(8-11) B0F(8-11) - const __m512i rhs_mat_014589CD_10_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_10, (_MM_PERM_ENUM)136); //B10(0-3) B11(0-3) B10(0-3) B11(0-3) B14(0-3) B15(0-3) B14(0-3) B15(0-3) - const __m512i rhs_mat_2367ABEF_10_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_10, (_MM_PERM_ENUM)136); //B12(0-3) B13(0-3) B12(0-3) B13(0-3) B16(0-3) B17(0-3) B16(0-3) B17(0-3) + const __m512i rhs_mat_014589CD_10_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_10, (_MM_PERM_ENUM)136); //B10(0-3) B11(0-3) B10(0-3) B11(0-3) B14(0-3) B15(0-3) B14(0-3) B15(0-3) B18(0-3) B19(0-3) B18(0-3) B19(0-3) B1C(0-3) B1D(0-3) B1C(0-3) B1D(0-3) + const __m512i rhs_mat_2367ABEF_10_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_10, (_MM_PERM_ENUM)136); //B12(0-3) B13(0-3) B12(0-3) B13(0-3) B16(0-3) B17(0-3) B16(0-3) B17(0-3) B1A(0-3) B1B(0-3) B1A(0-3) B1B(0-3) B1E(0-3) B1F(0-3) B1E(0-3) B1F(0-3) - const __m512i 
rhs_mat_014589CD_11_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_11, (_MM_PERM_ENUM)136); //B10(8-11) B11(8-11) B10(8-11) B11(8-11) B14(8-11) B15(8-11) B14(8-11) B15(8-11) - const __m512i rhs_mat_2367ABEF_11_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_11, (_MM_PERM_ENUM)136); //B12(8-11) B13(8-11) B12(8-11) B13(8-11) B16(8-11) B17(8-11) B16(8-11) B17(8-11) + const __m512i rhs_mat_014589CD_11_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_11, (_MM_PERM_ENUM)136); //B10(8-11) B11(8-11) B10(8-11) B11(8-11) B14(8-11) B15(8-11) B14(8-11) B15(8-11) B18(8-11) B19(8-11) B18(8-11) B19(8-11) B1C(8-11) B1D(8-11) B1C(8-11) B1D(8-11) + const __m512i rhs_mat_2367ABEF_11_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_11, (_MM_PERM_ENUM)136); //B12(8-11) B13(8-11) B12(8-11) B13(8-11) B16(8-11) B17(8-11) B16(8-11) B17(8-11) B1A(8-11) B1B(8-11) B1A(8-11) B1B(8-11) B1E(8-11) B1F(8-11) B1E(8-11) B1F(8-11) - const __m512i rhs_mat_014589CD_20_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_20, (_MM_PERM_ENUM)136); //B20(0-3) B21(0-3) B20(0-3) B21(0-3) B24(0-3) B25(0-3) B24(0-3) B25(0-3) - const __m512i rhs_mat_2367ABEF_20_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_20, (_MM_PERM_ENUM)136); //B22(0-3) B23(0-3) B22(0-3) B23(0-3) B26(0-3) B27(0-3) B26(0-3) B27(0-3) + const __m512i rhs_mat_014589CD_20_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_20, (_MM_PERM_ENUM)136); //B20(0-3) B21(0-3) B20(0-3) B21(0-3) B24(0-3) B25(0-3) B24(0-3) B25(0-3) B28(0-3) B29(0-3) B28(0-3) B29(0-3) B2C(0-3) B2D(0-3) B2C(0-3) B2D(0-3) + const __m512i rhs_mat_2367ABEF_20_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_20, (_MM_PERM_ENUM)136); //B22(0-3) B23(0-3) B22(0-3) B23(0-3) B26(0-3) B27(0-3) B26(0-3) B27(0-3) B2A(0-3) B2B(0-3) B2A(0-3) B2B(0-3) B2E(0-3) B2F(0-3) B2E(0-3) B2F(0-3) - const __m512i rhs_mat_014589CD_21_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_21, (_MM_PERM_ENUM)136); //B20(8-11) B21(8-11) B20(8-11) B21(8-11) B24(8-11) B25(8-11) B24(8-11) B25(8-11) - const __m512i rhs_mat_2367ABEF_21_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_21, (_MM_PERM_ENUM)136); //B22(8-11) B23(8-11) B22(8-11) B23(8-11) B26(8-11) B27(8-11) B26(8-11) B27(8-11) + const __m512i rhs_mat_014589CD_21_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_21, (_MM_PERM_ENUM)136); //B20(8-11) B21(8-11) B20(8-11) B21(8-11) B24(8-11) B25(8-11) B24(8-11) B25(8-11) B28(8-11) B29(8-11) B28(8-11) B29(8-11) B2C(8-11) B2D(8-11) B2C(8-11) B2D(8-11) + const __m512i rhs_mat_2367ABEF_21_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_21, (_MM_PERM_ENUM)136); //B22(8-11) B23(8-11) B22(8-11) B23(8-11) B26(8-11) B27(8-11) B26(8-11) B27(8-11) B2A(8-11) B2B(8-11) B2A(8-11) B2B(8-11) B2E(8-11) B2F(8-11) B2E(8-11) B2F(8-11) - const __m512i rhs_mat_014589CD_30_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_30, (_MM_PERM_ENUM)136); //B30(0-3) B31(0-3) B30(0-3) B31(0-3) B34(0-3) B35(0-3) B34(0-3) B35(0-3) - const __m512i rhs_mat_2367ABEF_30_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_30, (_MM_PERM_ENUM)136); //B32(0-3) B33(0-3) B32(0-3) B33(0-3) B36(0-3) B37(0-3) B36(0-3) B37(0-3) + const __m512i rhs_mat_014589CD_30_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_30, (_MM_PERM_ENUM)136); //B30(0-3) B31(0-3) B30(0-3) B31(0-3) B34(0-3) B35(0-3) B34(0-3) B35(0-3) B38(0-3) B39(0-3) B38(0-3) B39(0-3) B3C(0-3) B3D(0-3) B3C(0-3) B3D(0-3) + const __m512i rhs_mat_2367ABEF_30_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_30, (_MM_PERM_ENUM)136); //B32(0-3) B33(0-3) B32(0-3) B33(0-3) B36(0-3) B37(0-3) B36(0-3) B37(0-3) B3A(0-3) B3B(0-3) B3A(0-3) B3B(0-3) B3E(0-3) B3F(0-3) B3E(0-3) B3F(0-3) - const __m512i 
rhs_mat_014589CD_31_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_31, (_MM_PERM_ENUM)136); //B30(8-11) B31(8-11) B30(8-11) B31(8-11) B34(8-11) B35(8-11) B34(8-11) B35(8-11 - const __m512i rhs_mat_2367ABEF_31_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_31, (_MM_PERM_ENUM)136); //B32(8-11) B33(8-11) B32(8-11) B33(8-11) B36(8-11) B37(8-11) B36(8-11) B37(8-11) + const __m512i rhs_mat_014589CD_31_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_31, (_MM_PERM_ENUM)136); //B30(8-11) B31(8-11) B30(8-11) B31(8-11) B34(8-11) B35(8-11) B34(8-11) B35(8-11) B38(8-11) B39(8-11) B38(8-11) B39(8-11) B3C(8-11) B3D(8-11) B3C(8-11) B3D(8-11) + const __m512i rhs_mat_2367ABEF_31_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_31, (_MM_PERM_ENUM)136); //B32(8-11) B33(8-11) B32(8-11) B33(8-11) B36(8-11) B37(8-11) B36(8-11) B37(8-11) B3A(8-11) B3B(8-11) B3A(8-11) B3B(8-11) B3E(8-11) B3F(8-11) B3E(8-11) B3F(8-11) - const __m512i rhs_mat_014589CD_40_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_40, (_MM_PERM_ENUM)136); //B40(0-3) B41(0-3) B40(0-3) B41(0-3) B44(0-3) B45(0-3) B44(0-3) B45(0-3) - const __m512i rhs_mat_2367ABEF_40_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_40, (_MM_PERM_ENUM)136); //B42(0-3) B43(0-3) B42(0-3) B43(0-3) B46(0-3) B47(0-3) B46(0-3) B47(0-3) + const __m512i rhs_mat_014589CD_40_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_40, (_MM_PERM_ENUM)136); //B40(0-3) B41(0-3) B40(0-3) B41(0-3) B44(0-3) B45(0-3) B44(0-3) B45(0-3) B48(0-3) B49(0-3) B48(0-3) B49(0-3) B4C(0-3) B4D(0-3) B4C(0-3) B4D(0-3) + const __m512i rhs_mat_2367ABEF_40_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_40, (_MM_PERM_ENUM)136); //B42(0-3) B43(0-3) B42(0-3) B43(0-3) B46(0-3) B47(0-3) B46(0-3) B47(0-3) B4A(0-3) B4B(0-3) B4A(0-3) B4B(0-3) B4E(0-3) B4F(0-3) B4E(0-3) B4F(0-3) - const __m512i rhs_mat_014589CD_41_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_41, (_MM_PERM_ENUM)136); //B40(8-11) B41(8-11) B40(8-11) B41(8-11) B44(8-11) B45(8-11) B44(8-11) B45(8-11) - const __m512i rhs_mat_2367ABEF_41_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_41, (_MM_PERM_ENUM)136); //B42(8-11) B43(8-11) B42(8-11) B43(8-11) B46(8-11) B47(8-11) B46(8-11) B47(8-11) + const __m512i rhs_mat_014589CD_41_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_41, (_MM_PERM_ENUM)136); //B40(8-11) B41(8-11) B40(8-11) B41(8-11) B44(8-11) B45(8-11) B44(8-11) B45(8-11) B48(8-11) B49(8-11) B48(8-11) B49(8-11) B4C(8-11) B4D(8-11) B4C(8-11) B4D(8-11) + const __m512i rhs_mat_2367ABEF_41_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_41, (_MM_PERM_ENUM)136); //B42(8-11) B43(8-11) B42(8-11) B43(8-11) B46(8-11) B47(8-11) B46(8-11) B47(8-11) B4A(8-11) B4B(8-11) B4A(8-11) B4B(8-11) B4E(8-11) B4F(8-11) B4E(8-11) B4F(8-11) - const __m512i rhs_mat_014589CD_50_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_50, (_MM_PERM_ENUM)136); //B50(0-3) B51(0-3) B50(0-3) B51(0-3) B54(0-3) B55(0-3) B54(0-3) B55(0-3) - const __m512i rhs_mat_2367ABEF_50_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_50, (_MM_PERM_ENUM)136); //B52(0-3) B53(0-3) B52(0-3) B53(0-3) B56(0-3) B57(0-3) B56(0-3) B57(0-3) + const __m512i rhs_mat_014589CD_50_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_50, (_MM_PERM_ENUM)136); //B50(0-3) B51(0-3) B50(0-3) B51(0-3) B54(0-3) B55(0-3) B54(0-3) B55(0-3) B58(0-3) B59(0-3) B58(0-3) B59(0-3) B5C(0-3) B5D(0-3) B5C(0-3) B5D(0-3) + const __m512i rhs_mat_2367ABEF_50_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_50, (_MM_PERM_ENUM)136); //B52(0-3) B53(0-3) B52(0-3) B53(0-3) B56(0-3) B57(0-3) B56(0-3) B57(0-3) B5A(0-3) B5B(0-3) B5A(0-3) B5B(0-3) B5E(0-3) B5F(0-3) B5E(0-3) B5F(0-3) - const __m512i 
rhs_mat_014589CD_51_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_51, (_MM_PERM_ENUM)136); //B50(8-11) B51(8-11) B50(8-11) B51(8-11) B54(8-11) B55(8-11) B54(8-11) B55(8-11) - const __m512i rhs_mat_2367ABEF_51_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_51, (_MM_PERM_ENUM)136); //B52(8-11) B53(8-11) B52(8-11) B53(8-11) B56(8-11) B57(8-11) B56(8-11) B57(8-11) + const __m512i rhs_mat_014589CD_51_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_51, (_MM_PERM_ENUM)136); //B50(8-11) B51(8-11) B50(8-11) B51(8-11) B54(8-11) B55(8-11) B54(8-11) B55(8-11) B58(8-11) B59(8-11) B58(8-11) B59(8-11) B5C(8-11) B5D(8-11) B5C(8-11) B5D(8-11) + const __m512i rhs_mat_2367ABEF_51_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_51, (_MM_PERM_ENUM)136); //B52(8-11) B53(8-11) B52(8-11) B53(8-11) B56(8-11) B57(8-11) B56(8-11) B57(8-11) B5A(8-11) B5B(8-11) B5A(8-11) B5B(8-11) B5E(8-11) B5F(8-11) B5E(8-11) B5F(8-11) - const __m512i rhs_mat_014589CD_60_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_60, (_MM_PERM_ENUM)136); //B60(0-3) B61(0-3) B60(0-3) B61(0-3) B64(0-3) B65(0-3) B64(0-3) B65(0-3) - const __m512i rhs_mat_2367ABEF_60_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_60, (_MM_PERM_ENUM)136); //B62(0-3) B63(0-3) B62(0-3) B63(0-3) B66(0-3) B67(0-3) B66(0-3) B67(0-3) + const __m512i rhs_mat_014589CD_60_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_60, (_MM_PERM_ENUM)136); //B60(0-3) B61(0-3) B60(0-3) B61(0-3) B64(0-3) B65(0-3) B64(0-3) B65(0-3) B68(0-3) B69(0-3) B68(0-3) B69(0-3) B6C(0-3) B6D(0-3) B6C(0-3) B6D(0-3) + const __m512i rhs_mat_2367ABEF_60_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_60, (_MM_PERM_ENUM)136); //B62(0-3) B63(0-3) B62(0-3) B63(0-3) B66(0-3) B67(0-3) B66(0-3) B67(0-3) B6A(0-3) B6B(0-3) B6A(0-3) B6B(0-3) B6E(0-3) B6F(0-3) B6E(0-3) B6F(0-3) - const __m512i rhs_mat_014589CD_61_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_61, (_MM_PERM_ENUM)136); //B60(8-11) B61(8-11) B60(8-11) B61(8-11) B64(8-11) B65(8-11) B64(8-11) B65(8-11) - const __m512i rhs_mat_2367ABEF_61_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_61, (_MM_PERM_ENUM)136); //B62(8-11) B63(8-11) B62(8-11) B63(8-11) B66(8-11) B67(8-11) B66(8-11) B67(8-11) + const __m512i rhs_mat_014589CD_61_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_61, (_MM_PERM_ENUM)136); //B60(8-11) B61(8-11) B60(8-11) B61(8-11) B64(8-11) B65(8-11) B64(8-11) B65(8-11) B68(8-11) B69(8-11) B68(8-11) B69(8-11) B6C(8-11) B6D(8-11) B6C(8-11) B6D(8-11) + const __m512i rhs_mat_2367ABEF_61_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_61, (_MM_PERM_ENUM)136); //B62(8-11) B63(8-11) B62(8-11) B63(8-11) B66(8-11) B67(8-11) B66(8-11) B67(8-11) B6A(8-11) B6B(8-11) B6A(8-11) B6B(8-11) B6E(8-11) B6F(8-11) B6E(8-11) B6F(8-11) - const __m512i rhs_mat_014589CD_70_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_70, (_MM_PERM_ENUM)136); //B70(0-3) B71(0-3) B70(0-3) B71(0-3) B74(0-3) B75(0-3) B74(0-3) B75(0-3) - const __m512i rhs_mat_2367ABEF_70_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_70, (_MM_PERM_ENUM)136); //B72(0-3) B73(0-3) B72(0-3) B73(0-3) B76(0-3) B77(0-3) B76(0-3) B77(0-3) + const __m512i rhs_mat_014589CD_70_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_70, (_MM_PERM_ENUM)136); //B70(0-3) B71(0-3) B70(0-3) B71(0-3) B74(0-3) B75(0-3) B74(0-3) B75(0-3) B78(0-3) B79(0-3) B78(0-3) B79(0-3) B7C(0-3) B7D(0-3) B7C(0-3) B7D(0-3) + const __m512i rhs_mat_2367ABEF_70_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_70, (_MM_PERM_ENUM)136); //B72(0-3) B73(0-3) B72(0-3) B73(0-3) B76(0-3) B77(0-3) B76(0-3) B77(0-3) B7A(0-3) B7B(0-3) B7A(0-3) B7B(0-3) B7E(0-3) B7F(0-3) B7E(0-3) B7F(0-3) - const __m512i 
rhs_mat_014589CD_71_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_71, (_MM_PERM_ENUM)136); //B70(8-11) B71(8-11) B70(8-11) B71(8-11) B74(8-11) B75(8-11) B74(8-11) B75(8-11) - const __m512i rhs_mat_2367ABEF_71_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_71, (_MM_PERM_ENUM)136); //B72(8-11) B73(8-11) B72(8-11) B73(8-11) B76(8-11) B77(8-11) B76(8-11) B77(8-11) + const __m512i rhs_mat_014589CD_71_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_71, (_MM_PERM_ENUM)136); //B70(8-11) B71(8-11) B70(8-11) B71(8-11) B74(8-11) B75(8-11) B74(8-11) B75(8-11) B78(8-11) B79(8-11) B78(8-11) B79(8-11) B7C(8-11) B7D(8-11) B7C(8-11) B7D(8-11) + const __m512i rhs_mat_2367ABEF_71_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_71, (_MM_PERM_ENUM)136); //B72(8-11) B73(8-11) B72(8-11) B73(8-11) B76(8-11) B77(8-11) B76(8-11) B77(8-11) B7A(8-11) B7B(8-11) B7A(8-11) B7B(8-11) B7E(8-11) B7F(8-11) B7E(8-11) B7F(8-11) // Shuffle pattern two - right side input - const __m512i rhs_mat_014589CD_00_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_00, (_MM_PERM_ENUM)221); //B00(4-7) B01(4-7) B00(4-7) B01(4-7) B04(4-7) B05(4-7) B04(4-7) B05(4-7) - const __m512i rhs_mat_2367ABEF_00_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_00, (_MM_PERM_ENUM)221); //B02(4-7) B03(4-7) B02(4-7) B03(4-7) B06(4-7) B07(4-7) B06(4-7) B07(4-7) + const __m512i rhs_mat_014589CD_00_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_00, (_MM_PERM_ENUM)221); //B00(4-7) B01(4-7) B00(4-7) B01(4-7) B04(4-7) B05(4-7) B04(4-7) B05(4-7) B08(4-7) B09(4-7) B08(4-7) B09(4-7) B0C(4-7) B0D(4-7) B0C(4-7) B0D(4-7) + const __m512i rhs_mat_2367ABEF_00_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_00, (_MM_PERM_ENUM)221); //B02(4-7) B03(4-7) B02(4-7) B03(4-7) B06(4-7) B07(4-7) B06(4-7) B07(4-7) B0A(4-7) B0B(4-7) B0A(4-7) B0B(4-7) B0E(4-7) B0F(4-7) B0E(4-7) B0F(4-7) - const __m512i rhs_mat_014589CD_01_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_01, (_MM_PERM_ENUM)221); //B00(12-15) B01(12-15) B00(12-15) B01(12-15) B04(12-15) B05(12-15) B04(12-15) B05(12-15) - const __m512i rhs_mat_2367ABEF_01_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_01, (_MM_PERM_ENUM)221); //B02(12-15) B03(12-15) B02(12-15) B03(12-15) B06(12-15) B07(12-15) B06(12-15) B07(12-15) + const __m512i rhs_mat_014589CD_01_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_01, (_MM_PERM_ENUM)221); //B00(12-15) B01(12-15) B00(12-15) B01(12-15) B04(12-15) B05(12-15) B04(12-15) B05(12-15) B08(12-15) B09(12-15) B08(12-15) B09(12-15) B0C(12-15) B0D(12-15) B0C(12-15) B0D(12-15) + const __m512i rhs_mat_2367ABEF_01_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_01, (_MM_PERM_ENUM)221); //B02(12-15) B03(12-15) B02(12-15) B03(12-15) B06(12-15) B07(12-15) B06(12-15) B07(12-15) B0A(12-15) B0B(12-15) B0A(12-15) B0B(12-15) B0E(12-15) B0F(12-15) B0E(12-15) B0F(12-15) - const __m512i rhs_mat_014589CD_10_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_10, (_MM_PERM_ENUM)221); //B10(4-7) B11(4-7) B10(4-7) B11(4-7) B14(4-7) B15(4-7) B14(4-7) B15(4-7) - const __m512i rhs_mat_2367ABEF_10_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_10, (_MM_PERM_ENUM)221); //B12(4-7) B13(4-7) B12(4-7) B13(4-7) B16(4-7) B17(4-7) B16(4-7) B17(4-7) + const __m512i rhs_mat_014589CD_10_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_10, (_MM_PERM_ENUM)221); //B10(4-7) B11(4-7) B10(4-7) B11(4-7) B14(4-7) B15(4-7) B14(4-7) B15(4-7) B18(4-7) B19(4-7) B18(4-7) B19(4-7) B1C(4-7) B1D(4-7) B1C(4-7) B1D(4-7) + const __m512i rhs_mat_2367ABEF_10_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_10, (_MM_PERM_ENUM)221); //B12(4-7) B13(4-7) B12(4-7) B13(4-7) B16(4-7) B17(4-7) B16(4-7) B17(4-7) B1A(4-7) 
B1B(4-7) B1A(4-7) B1B(4-7) B1E(4-7) B1F(4-7) B1E(4-7) B1F(4-7) - const __m512i rhs_mat_014589CD_11_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_11, (_MM_PERM_ENUM)221); //B10(12-15) B11(12-15) B10(12-15) B11(12-15) B14(12-15) B15(12-15) B14(12-15) B15(12-15) - const __m512i rhs_mat_2367ABEF_11_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_11, (_MM_PERM_ENUM)221); //B12(12-15) B13(12-15) B12(12-15) B13(12-15) B16(12-15) B17(12-15) B16(12-15) B17(12-15) + const __m512i rhs_mat_014589CD_11_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_11, (_MM_PERM_ENUM)221); //B10(12-15) B11(12-15) B10(12-15) B11(12-15) B14(12-15) B15(12-15) B14(12-15) B15(12-15) B18(12-15) B19(12-15) B18(12-15) B19(12-15) B1C(12-15) B1D(12-15) B1C(12-15) B1D(12-15) + const __m512i rhs_mat_2367ABEF_11_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_11, (_MM_PERM_ENUM)221); //B12(12-15) B13(12-15) B12(12-15) B13(12-15) B16(12-15) B17(12-15) B16(12-15) B17(12-15) B1A(12-15) B1B(12-15) B1A(12-15) B1B(12-15) B1E(12-15) B1F(12-15) B1E(12-15) B1F(12-15) - const __m512i rhs_mat_014589CD_20_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_20, (_MM_PERM_ENUM)221); //B20(4-7) B21(4-7) B20(4-7) B21(4-7) B24(4-7) B25(4-7) B24(4-7) B25(4-7) - const __m512i rhs_mat_2367ABEF_20_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_20, (_MM_PERM_ENUM)221); //B22(4-7) B23(4-7) B22(4-7) B23(4-7) B26(4-7) B27(4-7) B26(4-7) B27(4-7) + const __m512i rhs_mat_014589CD_20_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_20, (_MM_PERM_ENUM)221); //B20(4-7) B21(4-7) B20(4-7) B21(4-7) B24(4-7) B25(4-7) B24(4-7) B25(4-7) B28(4-7) B29(4-7) B28(4-7) B29(4-7) B2C(4-7) B2D(4-7) B2C(4-7) B2D(4-7) + const __m512i rhs_mat_2367ABEF_20_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_20, (_MM_PERM_ENUM)221); //B22(4-7) B23(4-7) B22(4-7) B23(4-7) B26(4-7) B27(4-7) B26(4-7) B27(4-7) B2A(4-7) B2B(4-7) B2A(4-7) B2B(4-7) B2E(4-7) B2F(4-7) B2E(4-7) B2F(4-7) - const __m512i rhs_mat_014589CD_21_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_21, (_MM_PERM_ENUM)221); //B20(12-15) B21(12-15) B20(12-15) B21(12-15) B24(12-15) B25(12-15) B24(12-15) B25(12-15) - const __m512i rhs_mat_2367ABEF_21_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_21, (_MM_PERM_ENUM)221); //B22(12-15) B23(12-15) B22(12-15) B23(12-15) B26(12-15) B27(12-15) B26(12-15) B27(12-15) + const __m512i rhs_mat_014589CD_21_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_21, (_MM_PERM_ENUM)221); //B20(12-15) B21(12-15) B20(12-15) B21(12-15) B24(12-15) B25(12-15) B24(12-15) B25(12-15) B28(12-15) B29(12-15) B28(12-15) B29(12-15) B2C(12-15) B2D(12-15) B2C(12-15) B2D(12-15) + const __m512i rhs_mat_2367ABEF_21_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_21, (_MM_PERM_ENUM)221); //B22(12-15) B23(12-15) B22(12-15) B23(12-15) B26(12-15) B27(12-15) B26(12-15) B27(12-15) B2A(12-15) B2B(12-15) B2A(12-15) B2B(12-15) B2E(12-15) B2F(12-15) B2E(12-15) B2F(12-15) - const __m512i rhs_mat_014589CD_30_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_30, (_MM_PERM_ENUM)221); //B30(4-7) B31(4-7) B30(4-7) B31(4-7) B34(4-7) B35(4-7) B34(4-7) B35(4-7) - const __m512i rhs_mat_2367ABEF_30_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_30, (_MM_PERM_ENUM)221); //B32(4-7) B33(4-7) B32(4-7) B33(4-7) B36(4-7) B37(4-7) B36(4-7) B37(4-7) + const __m512i rhs_mat_014589CD_30_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_30, (_MM_PERM_ENUM)221); //B30(4-7) B31(4-7) B30(4-7) B31(4-7) B34(4-7) B35(4-7) B34(4-7) B35(4-7) B38(4-7) B39(4-7) B38(4-7) B39(4-7) B3C(4-7) B3D(4-7) B3C(4-7) B3D(4-7) + const __m512i rhs_mat_2367ABEF_30_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_30, (_MM_PERM_ENUM)221); 
//B32(4-7) B33(4-7) B32(4-7) B33(4-7) B36(4-7) B37(4-7) B36(4-7) B37(4-7) B3A(4-7) B3B(4-7) B3A(4-7) B3B(4-7) B3E(4-7) B3F(4-7) B3E(4-7) B3F(4-7) - const __m512i rhs_mat_014589CD_31_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_31, (_MM_PERM_ENUM)221); //B30(12-15) B31(12-15) B30(12-15) B31(12-15) B34(12-15) B35(12-15) B34(12-15) B35(12-15) - const __m512i rhs_mat_2367ABEF_31_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_31, (_MM_PERM_ENUM)221); //B32(12-15) B33(12-15) B32(12-15) B33(12-15) B36(12-15) B37(12-15) B36(12-15) B37(12-15) + const __m512i rhs_mat_014589CD_31_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_31, (_MM_PERM_ENUM)221); //B30(12-15) B31(12-15) B30(12-15) B31(12-15) B34(12-15) B35(12-15) B34(12-15) B35(12-15) B38(12-15) B39(12-15) B38(12-15) B39(12-15) B3C(12-15) B3D(12-15) B3C(12-15) B3D(12-15) + const __m512i rhs_mat_2367ABEF_31_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_31, (_MM_PERM_ENUM)221); //B32(12-15) B33(12-15) B32(12-15) B33(12-15) B36(12-15) B37(12-15) B36(12-15) B37(12-15) B3A(12-15) B3B(12-15) B3A(12-15) B3B(12-15) B3E(12-15) B3F(12-15) B3E(12-15) B3F(12-15) - const __m512i rhs_mat_014589CD_40_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_40, (_MM_PERM_ENUM)221); //B40(4-7) B41(4-7) B40(4-7) B41(4-7) B44(4-7) B45(4-7) B44(4-7) B45(4-7) - const __m512i rhs_mat_2367ABEF_40_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_40, (_MM_PERM_ENUM)221); //B42(4-7) B43(4-7) B42(4-7) B43(4-7) B46(4-7) B47(4-7) B46(4-7) B47(4-7) + const __m512i rhs_mat_014589CD_40_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_40, (_MM_PERM_ENUM)221); //B40(4-7) B41(4-7) B40(4-7) B41(4-7) B44(4-7) B45(4-7) B44(4-7) B45(4-7) B48(4-7) B49(4-7) B48(4-7) B49(4-7) B4C(4-7) B4D(4-7) B4C(4-7) B4D(4-7) + const __m512i rhs_mat_2367ABEF_40_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_40, (_MM_PERM_ENUM)221); //B42(4-7) B43(4-7) B42(4-7) B43(4-7) B46(4-7) B47(4-7) B46(4-7) B47(4-7) B4A(4-7) B4B(4-7) B4A(4-7) B4B(4-7) B4E(4-7) B4F(4-7) B4E(4-7) B4F(4-7) - const __m512i rhs_mat_014589CD_41_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_41, (_MM_PERM_ENUM)221); //B40(12-15) B41(12-15) B40(12-15) B41(12-15) B44(12-15) B45(12-15) B44(12-15) B45(12-15) - const __m512i rhs_mat_2367ABEF_41_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_41, (_MM_PERM_ENUM)221); //B42(12-15) B43(12-15) B42(12-15) B43(12-15) B46(12-15) B47(12-15) B46(12-15) B47(12-15) + const __m512i rhs_mat_014589CD_41_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_41, (_MM_PERM_ENUM)221); //B40(12-15) B41(12-15) B40(12-15) B41(12-15) B44(12-15) B45(12-15) B44(12-15) B45(12-15) B48(12-15) B49(12-15) B48(12-15) B49(12-15) B4C(12-15) B4D(12-15) B4C(12-15) B4D(12-15) + const __m512i rhs_mat_2367ABEF_41_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_41, (_MM_PERM_ENUM)221); //B42(12-15) B43(12-15) B42(12-15) B43(12-15) B46(12-15) B47(12-15) B46(12-15) B47(12-15) B4A(12-15) B4B(12-15) B4A(12-15) B4B(12-15) B4E(12-15) B4F(12-15) B4E(12-15) B4F(12-15) - const __m512i rhs_mat_014589CD_50_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_50, (_MM_PERM_ENUM)221); //B50(4-7) B51(4-7) B50(4-7) B51(4-7) B54(4-7) B55(4-7) B54(4-7) B55(4-7) - const __m512i rhs_mat_2367ABEF_50_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_50, (_MM_PERM_ENUM)221); //B52(4-7) B53(4-7) B52(4-7) B53(4-7) B56(4-7) B57(4-7) B56(4-7) B57(4-7) + const __m512i rhs_mat_014589CD_50_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_50, (_MM_PERM_ENUM)221); //B50(4-7) B51(4-7) B50(4-7) B51(4-7) B54(4-7) B55(4-7) B54(4-7) B55(4-7) B58(4-7) B59(4-7) B58(4-7) B59(4-7) B5C(4-7) B5D(4-7) B5C(4-7) B5D(4-7) + const __m512i 
rhs_mat_2367ABEF_50_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_50, (_MM_PERM_ENUM)221); //B52(4-7) B53(4-7) B52(4-7) B53(4-7) B56(4-7) B57(4-7) B56(4-7) B57(4-7) B5A(4-7) B5B(4-7) B5A(4-7) B5B(4-7) B5E(4-7) B5F(4-7) B5E(4-7) B5F(4-7) - const __m512i rhs_mat_014589CD_51_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_51, (_MM_PERM_ENUM)221); //B50(12-15) B51(12-15) B50(12-15) B51(12-15) B54(12-15) B55(12-15) B54(12-15) B55(12-15) - const __m512i rhs_mat_2367ABEF_51_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_51, (_MM_PERM_ENUM)221); //B52(12-15) B53(12-15) B52(12-15) B53(12-15) B56(12-15) B57(12-15) B56(12-15) B57(12-15) + const __m512i rhs_mat_014589CD_51_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_51, (_MM_PERM_ENUM)221); //B50(12-15) B51(12-15) B50(12-15) B51(12-15) B54(12-15) B55(12-15) B54(12-15) B55(12-15) B58(12-15) B59(12-15) B58(12-15) B59(12-15) B5C(12-15) B5D(12-15) B5C(12-15) B5D(12-15) + const __m512i rhs_mat_2367ABEF_51_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_51, (_MM_PERM_ENUM)221); //B52(12-15) B53(12-15) B52(12-15) B53(12-15) B56(12-15) B57(12-15) B56(12-15) B57(12-15) B5A(12-15) B5B(12-15) B5A(12-15) B5B(12-15) B5E(12-15) B5F(12-15) B5E(12-15) B5F(12-15) - const __m512i rhs_mat_014589CD_60_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_60, (_MM_PERM_ENUM)221); //B60(4-7) B61(4-7) B60(4-7) B61(4-7) B64(4-7) B65(4-7) B64(4-7) B65(4-7) - const __m512i rhs_mat_2367ABEF_60_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_60, (_MM_PERM_ENUM)221); //B62(4-7) B63(4-7) B62(4-7) B63(4-7) B66(4-7) B67(4-7) B66(4-7) B67(4-7) + const __m512i rhs_mat_014589CD_60_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_60, (_MM_PERM_ENUM)221); //B60(4-7) B61(4-7) B60(4-7) B61(4-7) B64(4-7) B65(4-7) B64(4-7) B65(4-7) B68(4-7) B69(4-7) B68(4-7) B69(4-7) B6C(4-7) B6D(4-7) B6C(4-7) B6D(4-7) + const __m512i rhs_mat_2367ABEF_60_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_60, (_MM_PERM_ENUM)221); //B62(4-7) B63(4-7) B62(4-7) B63(4-7) B66(4-7) B67(4-7) B66(4-7) B67(4-7) B6A(4-7) B6B(4-7) B6A(4-7) B6B(4-7) B6E(4-7) B6F(4-7) B6E(4-7) B6F(4-7) - const __m512i rhs_mat_014589CD_61_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_61, (_MM_PERM_ENUM)221); //B60(12-15) B61(12-15) B60(12-15) B61(12-15) B64(12-15) B65(12-15) B64(12-15) B65(12-15) - const __m512i rhs_mat_2367ABEF_61_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_61, (_MM_PERM_ENUM)221); //B62(12-15) B63(12-15) B62(12-15) B63(12-15) B66(12-15) B67(12-15) B66(12-15) B67(12-15) + const __m512i rhs_mat_014589CD_61_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_61, (_MM_PERM_ENUM)221); //B60(12-15) B61(12-15) B60(12-15) B61(12-15) B64(12-15) B65(12-15) B64(12-15) B65(12-15) B68(12-15) B69(12-15) B68(12-15) B69(12-15) B6C(12-15) B6D(12-15) B6C(12-15) B6D(12-15) + const __m512i rhs_mat_2367ABEF_61_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_61, (_MM_PERM_ENUM)221); //B62(12-15) B63(12-15) B62(12-15) B63(12-15) B66(12-15) B67(12-15) B66(12-15) B67(12-15) B6A(12-15) B6B(12-15) B6A(12-15) B6B(12-15) B6E(12-15) B6F(12-15) B6E(12-15) B6F(12-15) - const __m512i rhs_mat_014589CD_70_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_70, (_MM_PERM_ENUM)221); //B70(4-7) B71(4-7) B70(4-7) B71(4-7) B74(4-7) B75(4-7) B74(4-7) B75(4-7) - const __m512i rhs_mat_2367ABEF_70_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_70, (_MM_PERM_ENUM)221); //B72(4-7) B73(4-7) B72(4-7) B73(4-7) B76(4-7) B77(4-7) B76(4-7) B77(4-7) + const __m512i rhs_mat_014589CD_70_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_70, (_MM_PERM_ENUM)221); //B70(4-7) B71(4-7) B70(4-7) B71(4-7) B74(4-7) B75(4-7) B74(4-7) B75(4-7) B78(4-7) 
B79(4-7) B78(4-7) B79(4-7) B7C(4-7) B7D(4-7) B7C(4-7) B7D(4-7)
+ const __m512i rhs_mat_2367ABEF_70_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_70, (_MM_PERM_ENUM)221); //B72(4-7) B73(4-7) B72(4-7) B73(4-7) B76(4-7) B77(4-7) B76(4-7) B77(4-7) B7A(4-7) B7B(4-7) B7A(4-7) B7B(4-7) B7E(4-7) B7F(4-7) B7E(4-7) B7F(4-7)
- const __m512i rhs_mat_014589CD_71_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_71, (_MM_PERM_ENUM)221); //B70(12-15) B71(12-15) B70(12-15) B71(12-15) B74(12-15) B75(12-15) B74(12-15) B75(12-15)
- const __m512i rhs_mat_2367ABEF_71_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_71, (_MM_PERM_ENUM)221); //B72(12-15) B73(12-15) B72(12-15) B73(12-15) B76(12-15) B77(12-15) B76(12-15) B77(12-15)
+ const __m512i rhs_mat_014589CD_71_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_71, (_MM_PERM_ENUM)221); //B70(12-15) B71(12-15) B70(12-15) B71(12-15) B74(12-15) B75(12-15) B74(12-15) B75(12-15) B78(12-15) B79(12-15) B78(12-15) B79(12-15) B7C(12-15) B7D(12-15) B7C(12-15) B7D(12-15)
+ const __m512i rhs_mat_2367ABEF_71_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_71, (_MM_PERM_ENUM)221); //B72(12-15) B73(12-15) B72(12-15) B73(12-15) B76(12-15) B77(12-15) B76(12-15) B77(12-15) B7A(12-15) B7B(12-15) B7A(12-15) B7B(12-15) B7E(12-15) B7F(12-15) B7E(12-15) B7F(12-15)
 //Scales of corresponding sub blocks from different Q6_K structures are stored together
 //s00 s01 s10 s11 s20 s21 ...... s70 s71
- // Combine mins and scales for sub-blocks: 0-1, 2-3, 4-5, 6-7 in the sb loop
 const __m128i scales_01_0 = _mm_loadu_si128((const __m128i *)(b_ptr_0[b].scales + sb * 64));
 const __m128i scales_23_0 = _mm_loadu_si128((const __m128i *)(b_ptr_0[b].scales + 16 + sb * 64));
 const __m128i scales_45_0 = _mm_loadu_si128((const __m128i *)(b_ptr_0[b].scales + 32 + sb * 64));
@@ -7114,7 +7124,6 @@ void ggml_gemm_q6_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
 const __m128i scales_45_1 = _mm_loadu_si128((const __m128i *)(b_ptr_1[b].scales + 32 + sb * 64));
 const __m128i scales_67_1 = _mm_loadu_si128((const __m128i *)(b_ptr_1[b].scales + 48 + sb * 64));
- // Combine mins and scales for sub-blocks: 0-1, 2-3, 4-5, 6-7 in the sb loop
 const __m256i scales_01 = _mm256_insertf128_si256(_mm256_castsi128_si256(scales_01_0), scales_01_1, 1);
 const __m256i scales_23 = _mm256_insertf128_si256(_mm256_castsi128_si256(scales_23_0), scales_23_1, 1);
 const __m256i scales_45 = _mm256_insertf128_si256(_mm256_castsi128_si256(scales_45_0), scales_45_1, 1);
@@ -7207,7 +7216,6 @@ void ggml_gemm_q6_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
 __m256i lhs_mat_ymm_01_71 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_71, lhs_mat_ymm_0123_71, 0);
 __m256i lhs_mat_ymm_23_71 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_71, lhs_mat_ymm_0123_71, 17);
-
 __m512i lhs_mat_01_00 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_01_00), lhs_mat_ymm_01_00, 1);
 __m512i lhs_mat_23_00 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_23_00), lhs_mat_ymm_23_00, 1);
 __m512i lhs_mat_01_01 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_01_01), lhs_mat_ymm_01_01, 1);
@@ -7281,201 +7289,205 @@ void ggml_gemm_q6_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
 __m512i lhs_mat_s_01_71 = _mm512_maddubs_epi16(m32s_expanded, lhs_mat_01_71);
 __m512i lhs_mat_s_23_71 = _mm512_maddubs_epi16(m32s_expanded, lhs_mat_23_71);
- // Shuffle pattern one - left side input
- const __m512i lhs_mat_01_00_sp1 = _mm512_shuffle_epi32(lhs_mat_01_00, (_MM_PERM_ENUM)160); //A00(0-3) A00(0-3) A01(0-3) A01(0-3) A00(0-3) A00(0-3) A01(0-3)
A01(0-3) - const __m512i lhs_mat_23_00_sp1 = _mm512_shuffle_epi32(lhs_mat_23_00, (_MM_PERM_ENUM)160); //A02(0-3) A03(0-3) A02(0-3) A03(0-3) A02(0-3) A03(0-3) A02(0-3) A03(0-3) + // Shuffle pattern one – left-side input - const __m512i lhs_mat_01_01_sp1 = _mm512_shuffle_epi32(lhs_mat_01_01, (_MM_PERM_ENUM)160); //A00(8-11) A00(8-11) A01(8-11) A01(8-11) A00(8-11) A00(8-11) A01(8-11) A01(8-11) - const __m512i lhs_mat_23_01_sp1 = _mm512_shuffle_epi32(lhs_mat_23_01, (_MM_PERM_ENUM)160); //A02(8-11) A03(8-11) A02(8-11) A03(8-11) A02(8-11) A03(8-11) A02(8-11) A03(8-11) + const __m512i lhs_mat_01_00_sp1 = _mm512_shuffle_epi32(lhs_mat_01_00, (_MM_PERM_ENUM)160); //A00(0-3) A00(0-3) A01(0-3) A01(0-3) + const __m512i lhs_mat_23_00_sp1 = _mm512_shuffle_epi32(lhs_mat_23_00, (_MM_PERM_ENUM)160); //A02(0-3) A03(0-3) A02(0-3) A03(0-3) - const __m512i lhs_mat_01_10_sp1 = _mm512_shuffle_epi32(lhs_mat_01_10, (_MM_PERM_ENUM)160); //A10(0-3) A10(0-3) A11(0-3) A11(0-3) A10(0-3) A10(0-3) A11(0-3) A11(0-3) - const __m512i lhs_mat_23_10_sp1 = _mm512_shuffle_epi32(lhs_mat_23_10, (_MM_PERM_ENUM)160); //A12(0-3) A13(0-3) A12(0-3) A13(0-3) A12(0-3) A13(0-3) A12(0-3) A13(0-3) + const __m512i lhs_mat_01_01_sp1 = _mm512_shuffle_epi32(lhs_mat_01_01, (_MM_PERM_ENUM)160); //A00(8-11) A00(8-11) A01(8-11) A01(8-11) + const __m512i lhs_mat_23_01_sp1 = _mm512_shuffle_epi32(lhs_mat_23_01, (_MM_PERM_ENUM)160); //A02(8-11) A03(8-11) A02(8-11) A03(8-11) - const __m512i lhs_mat_01_11_sp1 = _mm512_shuffle_epi32(lhs_mat_01_11, (_MM_PERM_ENUM)160); //A10(8-11) A10(8-11) A11(8-11) A11(8-11) A10(8-11) A10(8-11) A11(8-11) A11(8-11) - const __m512i lhs_mat_23_11_sp1 = _mm512_shuffle_epi32(lhs_mat_23_11, (_MM_PERM_ENUM)160); //A12(8-11) A13(8-11) A12(8-11) A13(8-11) A12(8-11) A13(8-11) A12(8-11) A13(8-11) + const __m512i lhs_mat_01_10_sp1 = _mm512_shuffle_epi32(lhs_mat_01_10, (_MM_PERM_ENUM)160); //A10(0-3) A10(0-3) A11(0-3) A11(0-3) + const __m512i lhs_mat_23_10_sp1 = _mm512_shuffle_epi32(lhs_mat_23_10, (_MM_PERM_ENUM)160); //A12(0-3) A13(0-3) A12(0-3) A13(0-3) - const __m512i lhs_mat_01_20_sp1 = _mm512_shuffle_epi32(lhs_mat_01_20, (_MM_PERM_ENUM)160); //A20(0-3) A20(0-3) A21(0-3) A21(0-3) A20(0-3) A20(0-3) A21(0-3) A21(0-3) - const __m512i lhs_mat_23_20_sp1 = _mm512_shuffle_epi32(lhs_mat_23_20, (_MM_PERM_ENUM)160); //A22(0-3) A23(0-3) A22(0-3) A23(0-3) A22(0-3) A23(0-3) A22(0-3) A23(0-3) + const __m512i lhs_mat_01_11_sp1 = _mm512_shuffle_epi32(lhs_mat_01_11, (_MM_PERM_ENUM)160); //A10(8-11) A10(8-11) A11(8-11) A11(8-11) + const __m512i lhs_mat_23_11_sp1 = _mm512_shuffle_epi32(lhs_mat_23_11, (_MM_PERM_ENUM)160); //A12(8-11) A13(8-11) A12(8-11) A13(8-11) - const __m512i lhs_mat_01_21_sp1 = _mm512_shuffle_epi32(lhs_mat_01_21, (_MM_PERM_ENUM)160); //A20(8-11) A20(8-11) A21(8-11) A21(8-11) A20(8-11) A20(8-11) A21(8-11) A21(8-11) - const __m512i lhs_mat_23_21_sp1 = _mm512_shuffle_epi32(lhs_mat_23_21, (_MM_PERM_ENUM)160); //A22(8-11) A23(8-11) A22(8-11) A23(8-11) A22(8-11) A23(8-11) A22(8-11) A23(8-11) + const __m512i lhs_mat_01_20_sp1 = _mm512_shuffle_epi32(lhs_mat_01_20, (_MM_PERM_ENUM)160); //A20(0-3) A20(0-3) A21(0-3) A21(0-3) + const __m512i lhs_mat_23_20_sp1 = _mm512_shuffle_epi32(lhs_mat_23_20, (_MM_PERM_ENUM)160); //A22(0-3) A23(0-3) A22(0-3) A23(0-3) - const __m512i lhs_mat_01_30_sp1 = _mm512_shuffle_epi32(lhs_mat_01_30, (_MM_PERM_ENUM)160); //A30(0-3) A30(0-3) A31(0-3) A31(0-3) A30(0-3) A30(0-3) A31(0-3) A31(0-3) - const __m512i lhs_mat_23_30_sp1 = _mm512_shuffle_epi32(lhs_mat_23_30, (_MM_PERM_ENUM)160); //A32(0-3) A33(0-3) A32(0-3) 
A33(0-3) A32(0-3) A33(0-3) A32(0-3) A33(0-3) + const __m512i lhs_mat_01_21_sp1 = _mm512_shuffle_epi32(lhs_mat_01_21, (_MM_PERM_ENUM)160); //A20(8-11) A20(8-11) A21(8-11) A21(8-11) + const __m512i lhs_mat_23_21_sp1 = _mm512_shuffle_epi32(lhs_mat_23_21, (_MM_PERM_ENUM)160); //A22(8-11) A23(8-11) A22(8-11) A23(8-11) - const __m512i lhs_mat_01_31_sp1 = _mm512_shuffle_epi32(lhs_mat_01_31, (_MM_PERM_ENUM)160); //A30(8-11) A30(8-11) A31(8-11) A31(8-11) A30(8-11) A30(8-11) A31(8-11) A31(8-11) - const __m512i lhs_mat_23_31_sp1 = _mm512_shuffle_epi32(lhs_mat_23_31, (_MM_PERM_ENUM)160); //A32(8-11) A33(8-11) A32(8-11) A33(8-11) A32(8-11) A33(8-11) A32(8-11) A33(8-11) + const __m512i lhs_mat_01_30_sp1 = _mm512_shuffle_epi32(lhs_mat_01_30, (_MM_PERM_ENUM)160); //A30(0-3) A30(0-3) A31(0-3) A31(0-3) + const __m512i lhs_mat_23_30_sp1 = _mm512_shuffle_epi32(lhs_mat_23_30, (_MM_PERM_ENUM)160); //A32(0-3) A33(0-3) A32(0-3) A33(0-3) - const __m512i lhs_mat_01_40_sp1 = _mm512_shuffle_epi32(lhs_mat_01_40, (_MM_PERM_ENUM)160); //A40(0-3) A40(0-3) A41(0-3) A41(0-3) A40(0-3) A40(0-3) A41(0-3) A41(0-3) - const __m512i lhs_mat_23_40_sp1 = _mm512_shuffle_epi32(lhs_mat_23_40, (_MM_PERM_ENUM)160); //A42(0-3) A43(0-3) A42(0-3) A43(0-3) A42(0-3) A43(0-3) A42(0-3) A43(0-3) + const __m512i lhs_mat_01_31_sp1 = _mm512_shuffle_epi32(lhs_mat_01_31, (_MM_PERM_ENUM)160); //A30(8-11) A30(8-11) A31(8-11) A31(8-11) + const __m512i lhs_mat_23_31_sp1 = _mm512_shuffle_epi32(lhs_mat_23_31, (_MM_PERM_ENUM)160); //A32(8-11) A33(8-11) A32(8-11) A33(8-11) - const __m512i lhs_mat_01_41_sp1 = _mm512_shuffle_epi32(lhs_mat_01_41, (_MM_PERM_ENUM)160); //A40(8-11) A40(8-11) A41(8-11) A41(8-11) A40(8-11) A40(8-11) A41(8-11) A41(8-11) - const __m512i lhs_mat_23_41_sp1 = _mm512_shuffle_epi32(lhs_mat_23_41, (_MM_PERM_ENUM)160); //A42(8-11) A43(8-11) A42(8-11) A43(8-11) A42(8-11) A43(8-11) A42(8-11) A43(8-11) + const __m512i lhs_mat_01_40_sp1 = _mm512_shuffle_epi32(lhs_mat_01_40, (_MM_PERM_ENUM)160); //A40(0-3) A40(0-3) A41(0-3) A41(0-3) + const __m512i lhs_mat_23_40_sp1 = _mm512_shuffle_epi32(lhs_mat_23_40, (_MM_PERM_ENUM)160); //A42(0-3) A43(0-3) A42(0-3) A43(0-3) - const __m512i lhs_mat_01_50_sp1 = _mm512_shuffle_epi32(lhs_mat_01_50, (_MM_PERM_ENUM)160); //A50(0-3) A50(0-3) A51(0-3) A51(0-3) A50(0-3) A50(0-3) A51(0-3) A51(0-3) - const __m512i lhs_mat_23_50_sp1 = _mm512_shuffle_epi32(lhs_mat_23_50, (_MM_PERM_ENUM)160); //A52(0-3) A53(0-3) A52(0-3) A53(0-3) A52(0-3) A53(0-3) A52(0-3) A53(0-3) + const __m512i lhs_mat_01_41_sp1 = _mm512_shuffle_epi32(lhs_mat_01_41, (_MM_PERM_ENUM)160); //A40(8-11) A40(8-11) A41(8-11) A41(8-11) + const __m512i lhs_mat_23_41_sp1 = _mm512_shuffle_epi32(lhs_mat_23_41, (_MM_PERM_ENUM)160); //A42(8-11) A43(8-11) A42(8-11) A43(8-11) - const __m512i lhs_mat_01_51_sp1 = _mm512_shuffle_epi32(lhs_mat_01_51, (_MM_PERM_ENUM)160); //A50(8-11) A50(8-11) A51(8-11) A51(8-11) A50(8-11) A50(8-11) A51(8-11) A51(8-11) - const __m512i lhs_mat_23_51_sp1 = _mm512_shuffle_epi32(lhs_mat_23_51, (_MM_PERM_ENUM)160); //A52(8-11) A53(8-11) A52(8-11) A53(8-11) A52(8-11) A53(8-11) A52(8-11) A53(8-11) + const __m512i lhs_mat_01_50_sp1 = _mm512_shuffle_epi32(lhs_mat_01_50, (_MM_PERM_ENUM)160); //A50(0-3) A50(0-3) A51(0-3) A51(0-3) + const __m512i lhs_mat_23_50_sp1 = _mm512_shuffle_epi32(lhs_mat_23_50, (_MM_PERM_ENUM)160); //A52(0-3) A53(0-3) A52(0-3) A53(0-3) - const __m512i lhs_mat_01_60_sp1 = _mm512_shuffle_epi32(lhs_mat_01_60, (_MM_PERM_ENUM)160); //A60(0-3) A60(0-3) A61(0-3) A61(0-3) A60(0-3) A60(0-3) A61(0-3) A61(0-3) - const __m512i 
lhs_mat_23_60_sp1 = _mm512_shuffle_epi32(lhs_mat_23_60, (_MM_PERM_ENUM)160); //A62(0-3) A63(0-3) A62(0-3) A63(0-3) A62(0-3) A63(0-3) A62(0-3) A63(0-3) + const __m512i lhs_mat_01_51_sp1 = _mm512_shuffle_epi32(lhs_mat_01_51, (_MM_PERM_ENUM)160); //A50(8-11) A50(8-11) A51(8-11) A51(8-11) + const __m512i lhs_mat_23_51_sp1 = _mm512_shuffle_epi32(lhs_mat_23_51, (_MM_PERM_ENUM)160); //A52(8-11) A53(8-11) A52(8-11) A53(8-11) - const __m512i lhs_mat_01_61_sp1 = _mm512_shuffle_epi32(lhs_mat_01_61, (_MM_PERM_ENUM)160); //A60(8-11) A60(8-11) A61(8-11) A61(8-11) A60(8-11) A60(8-11) A61(8-11) A61(8-11) - const __m512i lhs_mat_23_61_sp1 = _mm512_shuffle_epi32(lhs_mat_23_61, (_MM_PERM_ENUM)160); //A62(8-11) A63(8-11) A62(8-11) A63(8-11) A62(8-11) A63(8-11) A62(8-11) A63(8-11) + const __m512i lhs_mat_01_60_sp1 = _mm512_shuffle_epi32(lhs_mat_01_60, (_MM_PERM_ENUM)160); //A60(0-3) A60(0-3) A61(0-3) A61(0-3) + const __m512i lhs_mat_23_60_sp1 = _mm512_shuffle_epi32(lhs_mat_23_60, (_MM_PERM_ENUM)160); //A62(0-3) A63(0-3) A62(0-3) A63(0-3) - const __m512i lhs_mat_01_70_sp1 = _mm512_shuffle_epi32(lhs_mat_01_70, (_MM_PERM_ENUM)160); //A70(0-3) A70(0-3) A71(0-3) A71(0-3) A70(0-3) A70(0-3) A71(0-3) A71(0-3) - const __m512i lhs_mat_23_70_sp1 = _mm512_shuffle_epi32(lhs_mat_23_70, (_MM_PERM_ENUM)160); //A72(0-3) A73(0-3) A72(0-3) A73(0-3) A72(0-3) A73(0-3) A72(0-3) A73(0-3) + const __m512i lhs_mat_01_61_sp1 = _mm512_shuffle_epi32(lhs_mat_01_61, (_MM_PERM_ENUM)160); //A60(8-11) A60(8-11) A61(8-11) A61(8-11) + const __m512i lhs_mat_23_61_sp1 = _mm512_shuffle_epi32(lhs_mat_23_61, (_MM_PERM_ENUM)160); //A62(8-11) A63(8-11) A62(8-11) A63(8-11) - const __m512i lhs_mat_01_71_sp1 = _mm512_shuffle_epi32(lhs_mat_01_71, (_MM_PERM_ENUM)160); //A70(8-11) A70(8-11) A71(8-11) A71(8-11) A70(8-11) A70(8-11) A71(8-11) A71(8-11) - const __m512i lhs_mat_23_71_sp1 = _mm512_shuffle_epi32(lhs_mat_23_71, (_MM_PERM_ENUM)160); //A72(8-11) A73(8-11) A72(8-11) A73(8-11) A72(8-11) A73(8-11) A72(8-11) A73(8-11) + const __m512i lhs_mat_01_70_sp1 = _mm512_shuffle_epi32(lhs_mat_01_70, (_MM_PERM_ENUM)160); //A70(0-3) A70(0-3) A71(0-3) A71(0-3) + const __m512i lhs_mat_23_70_sp1 = _mm512_shuffle_epi32(lhs_mat_23_70, (_MM_PERM_ENUM)160); //A72(0-3) A73(0-3) A72(0-3) A73(0-3) - // Shuffle pattern two- left side input - const __m512i lhs_mat_01_00_sp2 = _mm512_shuffle_epi32(lhs_mat_01_00, (_MM_PERM_ENUM)245); //A00(4-7) A00(4-7) A01(4-7) A01(4-7) A00(4-7) A00(4-7) A01(4-7) A01(4-7) - const __m512i lhs_mat_23_00_sp2 = _mm512_shuffle_epi32(lhs_mat_23_00, (_MM_PERM_ENUM)245); //A02(4-7) A03(4-7) A02(4-7) A03(4-7) A02(4-7) A03(4-7) A02(4-7) A03(4-7) + const __m512i lhs_mat_01_71_sp1 = _mm512_shuffle_epi32(lhs_mat_01_71, (_MM_PERM_ENUM)160); //A70(8-11) A70(8-11) A71(8-11) A71(8-11) + const __m512i lhs_mat_23_71_sp1 = _mm512_shuffle_epi32(lhs_mat_23_71, (_MM_PERM_ENUM)160); //A72(8-11) A73(8-11) A72(8-11) A73(8-11) - const __m512i lhs_mat_01_01_sp2 = _mm512_shuffle_epi32(lhs_mat_01_01, (_MM_PERM_ENUM)245); //A00(12-15) A00(12-15) A01(12-15) A01(12-15) A00(12-15) A00(12-15) A01(12-15) A01(12-15) - const __m512i lhs_mat_23_01_sp2 = _mm512_shuffle_epi32(lhs_mat_23_01, (_MM_PERM_ENUM)245); //A02(12-15) A03(12-15) A02(12-15) A03(12-15) A02(12-15) A03(12-15) A02(12-15) A03(12-15) + // Shuffle pattern two – left-side input - const __m512i lhs_mat_01_10_sp2 = _mm512_shuffle_epi32(lhs_mat_01_10, (_MM_PERM_ENUM)245); //A10(4-7) A10(4-7) A11(4-7) A11(4-7) A10(4-7) A10(4-7) A11(4-7) A11(4-7) - const __m512i lhs_mat_23_10_sp2 = _mm512_shuffle_epi32(lhs_mat_23_10, 
(_MM_PERM_ENUM)245); //A12(4-7) A13(4-7) A12(4-7) A13(4-7) A12(4-7) A13(4-7) A12(4-7) A13(4-7) + const __m512i lhs_mat_01_00_sp2 = _mm512_shuffle_epi32(lhs_mat_01_00, (_MM_PERM_ENUM)245); //A00(4-7) A00(4-7) A01(4-7) A01(4-7) + const __m512i lhs_mat_23_00_sp2 = _mm512_shuffle_epi32(lhs_mat_23_00, (_MM_PERM_ENUM)245); //A02(4-7) A03(4-7) A02(4-7) A03(4-7) - const __m512i lhs_mat_01_11_sp2 = _mm512_shuffle_epi32(lhs_mat_01_11, (_MM_PERM_ENUM)245); //A10(12-15) A10(12-15) A11(12-15) A11(12-15) A10(12-15) A10(12-15) A11(12-15) A11(12-15) - const __m512i lhs_mat_23_11_sp2 = _mm512_shuffle_epi32(lhs_mat_23_11, (_MM_PERM_ENUM)245); //A12(12-15) A13(12-15) A12(12-15) A13(12-15) A12(12-15) A13(12-15) A12(12-15) A13(12-15) + const __m512i lhs_mat_01_01_sp2 = _mm512_shuffle_epi32(lhs_mat_01_01, (_MM_PERM_ENUM)245); //A00(12-15) A00(12-15) A01(12-15) A01(12-15) + const __m512i lhs_mat_23_01_sp2 = _mm512_shuffle_epi32(lhs_mat_23_01, (_MM_PERM_ENUM)245); //A02(12-15) A03(12-15) A02(12-15) A03(12-15) - const __m512i lhs_mat_01_20_sp2 = _mm512_shuffle_epi32(lhs_mat_01_20, (_MM_PERM_ENUM)245); //A20(4-7) A20(4-7) A21(4-7) A21(4-7) A20(4-7) A20(4-7) A21(4-7) A21(4-7) - const __m512i lhs_mat_23_20_sp2 = _mm512_shuffle_epi32(lhs_mat_23_20, (_MM_PERM_ENUM)245); //A22(4-7) A23(4-7) A22(4-7) A23(4-7) A22(4-7) A23(4-7) A22(4-7) A23(4-7) + const __m512i lhs_mat_01_10_sp2 = _mm512_shuffle_epi32(lhs_mat_01_10, (_MM_PERM_ENUM)245); //A10(4-7) A10(4-7) A11(4-7) A11(4-7) + const __m512i lhs_mat_23_10_sp2 = _mm512_shuffle_epi32(lhs_mat_23_10, (_MM_PERM_ENUM)245); //A12(4-7) A13(4-7) A12(4-7) A13(4-7) - const __m512i lhs_mat_01_21_sp2 = _mm512_shuffle_epi32(lhs_mat_01_21, (_MM_PERM_ENUM)245); //A20(12-15) A20(12-15) A21(12-15) A21(12-15) A20(12-15) A20(12-15) A21(12-15) A21(12-15) - const __m512i lhs_mat_23_21_sp2 = _mm512_shuffle_epi32(lhs_mat_23_21, (_MM_PERM_ENUM)245); //A22(12-15) A23(12-15) A22(12-15) A23(12-15) A22(12-15) A23(12-15) A22(12-15) A23(12-15) + const __m512i lhs_mat_01_11_sp2 = _mm512_shuffle_epi32(lhs_mat_01_11, (_MM_PERM_ENUM)245); //A10(12-15) A10(12-15) A11(12-15) A11(12-15) + const __m512i lhs_mat_23_11_sp2 = _mm512_shuffle_epi32(lhs_mat_23_11, (_MM_PERM_ENUM)245); //A12(12-15) A13(12-15) A12(12-15) A13(12-15) - const __m512i lhs_mat_01_30_sp2 = _mm512_shuffle_epi32(lhs_mat_01_30, (_MM_PERM_ENUM)245); //A30(4-7) A30(4-7) A31(4-7) A31(4-7) A30(4-7) A30(4-7) A31(4-7) A31(4-7) - const __m512i lhs_mat_23_30_sp2 = _mm512_shuffle_epi32(lhs_mat_23_30, (_MM_PERM_ENUM)245); //A32(4-7) A33(4-7) A32(4-7) A33(4-7) A32(4-7) A33(4-7) A32(4-7) A33(4-7) + const __m512i lhs_mat_01_20_sp2 = _mm512_shuffle_epi32(lhs_mat_01_20, (_MM_PERM_ENUM)245); //A20(4-7) A20(4-7) A21(4-7) A21(4-7) + const __m512i lhs_mat_23_20_sp2 = _mm512_shuffle_epi32(lhs_mat_23_20, (_MM_PERM_ENUM)245); //A22(4-7) A23(4-7) A22(4-7) A23(4-7) - const __m512i lhs_mat_01_31_sp2 = _mm512_shuffle_epi32(lhs_mat_01_31, (_MM_PERM_ENUM)245); //A30(12-15) A30(12-15) A31(12-15) A31(12-15) A30(12-15) A30(12-15) A31(12-15) A31(12-15) - const __m512i lhs_mat_23_31_sp2 = _mm512_shuffle_epi32(lhs_mat_23_31, (_MM_PERM_ENUM)245); //A32(12-15) A33(12-15) A32(12-15) A33(12-15) A32(12-15) A33(12-15) A32(12-15) A33(12-15) + const __m512i lhs_mat_01_21_sp2 = _mm512_shuffle_epi32(lhs_mat_01_21, (_MM_PERM_ENUM)245); //A20(12-15) A20(12-15) A21(12-15) A21(12-15) + const __m512i lhs_mat_23_21_sp2 = _mm512_shuffle_epi32(lhs_mat_23_21, (_MM_PERM_ENUM)245); //A22(12-15) A23(12-15) A22(12-15) A23(12-15) - const __m512i lhs_mat_01_40_sp2 = _mm512_shuffle_epi32(lhs_mat_01_40, 
(_MM_PERM_ENUM)245); //A40(4-7) A40(4-7) A41(4-7) A41(4-7) A40(4-7) A40(4-7) A41(4-7) A41(4-7) - const __m512i lhs_mat_23_40_sp2 = _mm512_shuffle_epi32(lhs_mat_23_40, (_MM_PERM_ENUM)245); //A42(4-7) A43(4-7) A42(4-7) A43(4-7) A42(4-7) A43(4-7) A42(4-7) A43(4-7) + const __m512i lhs_mat_01_30_sp2 = _mm512_shuffle_epi32(lhs_mat_01_30, (_MM_PERM_ENUM)245); //A30(4-7) A30(4-7) A31(4-7) A31(4-7) + const __m512i lhs_mat_23_30_sp2 = _mm512_shuffle_epi32(lhs_mat_23_30, (_MM_PERM_ENUM)245); //A32(4-7) A33(4-7) A32(4-7) A33(4-7) - const __m512i lhs_mat_01_41_sp2 = _mm512_shuffle_epi32(lhs_mat_01_41, (_MM_PERM_ENUM)245); //A40(12-15) A40(12-15) A41(12-15) A41(12-15) A40(12-15) A40(12-15) A41(12-15) A41(12-15) - const __m512i lhs_mat_23_41_sp2 = _mm512_shuffle_epi32(lhs_mat_23_41, (_MM_PERM_ENUM)245); //A42(12-15) A43(12-15) A42(12-15) A43(12-15) A42(12-15) A43(12-15) A42(12-15) A43(12-15) + const __m512i lhs_mat_01_31_sp2 = _mm512_shuffle_epi32(lhs_mat_01_31, (_MM_PERM_ENUM)245); //A30(12-15) A30(12-15) A31(12-15) A31(12-15) + const __m512i lhs_mat_23_31_sp2 = _mm512_shuffle_epi32(lhs_mat_23_31, (_MM_PERM_ENUM)245); //A32(12-15) A33(12-15) A32(12-15) A33(12-15) - const __m512i lhs_mat_01_50_sp2 = _mm512_shuffle_epi32(lhs_mat_01_50, (_MM_PERM_ENUM)245); //A50(4-7) A50(4-7) A51(4-7) A51(4-7) A50(4-7) A50(4-7) A51(4-7) A51(4-7) - const __m512i lhs_mat_23_50_sp2 = _mm512_shuffle_epi32(lhs_mat_23_50, (_MM_PERM_ENUM)245); //A52(4-7) A53(4-7) A52(4-7) A53(4-7) A52(4-7) A53(4-7) A52(4-7) A53(4-7) + const __m512i lhs_mat_01_40_sp2 = _mm512_shuffle_epi32(lhs_mat_01_40, (_MM_PERM_ENUM)245); //A40(4-7) A40(4-7) A41(4-7) A41(4-7) + const __m512i lhs_mat_23_40_sp2 = _mm512_shuffle_epi32(lhs_mat_23_40, (_MM_PERM_ENUM)245); //A42(4-7) A43(4-7) A42(4-7) A43(4-7) - const __m512i lhs_mat_01_51_sp2 = _mm512_shuffle_epi32(lhs_mat_01_51, (_MM_PERM_ENUM)245); //A50(12-15) A50(12-15) A51(12-15) A51(12-15) A50(12-15) A50(12-15) A51(12-15) A51(12-15) - const __m512i lhs_mat_23_51_sp2 = _mm512_shuffle_epi32(lhs_mat_23_51, (_MM_PERM_ENUM)245); //A52(12-15) A53(12-15) A52(12-15) A53(12-15) A52(12-15) A53(12-15) A52(12-15) A53(12-15) + const __m512i lhs_mat_01_41_sp2 = _mm512_shuffle_epi32(lhs_mat_01_41, (_MM_PERM_ENUM)245); //A40(12-15) A40(12-15) A41(12-15) A41(12-15) + const __m512i lhs_mat_23_41_sp2 = _mm512_shuffle_epi32(lhs_mat_23_41, (_MM_PERM_ENUM)245); //A42(12-15) A43(12-15) A42(12-15) A43(12-15) - const __m512i lhs_mat_01_60_sp2 = _mm512_shuffle_epi32(lhs_mat_01_60, (_MM_PERM_ENUM)245); //A60(4-7) A60(4-7) A61(4-7) A61(4-7) A60(4-7) A60(4-7) A61(4-7) A61(4-7) - const __m512i lhs_mat_23_60_sp2 = _mm512_shuffle_epi32(lhs_mat_23_60, (_MM_PERM_ENUM)245); //A62(4-7) A63(4-7) A62(4-7) A63(4-7) A62(4-7) A63(4-7) A62(4-7) A63(4-7) + const __m512i lhs_mat_01_50_sp2 = _mm512_shuffle_epi32(lhs_mat_01_50, (_MM_PERM_ENUM)245); //A50(4-7) A50(4-7) A51(4-7) A51(4-7) + const __m512i lhs_mat_23_50_sp2 = _mm512_shuffle_epi32(lhs_mat_23_50, (_MM_PERM_ENUM)245); //A52(4-7) A53(4-7) A52(4-7) A53(4-7) - const __m512i lhs_mat_01_61_sp2 = _mm512_shuffle_epi32(lhs_mat_01_61, (_MM_PERM_ENUM)245); //A60(12-15) A60(12-15) A61(12-15) A61(12-15) A60(12-15) A60(12-15) A61(12-15) A61(12-15) - const __m512i lhs_mat_23_61_sp2 = _mm512_shuffle_epi32(lhs_mat_23_61, (_MM_PERM_ENUM)245); //A62(12-15) A63(12-15) A62(12-15) A63(12-15) A62(12-15) A63(12-15) A62(12-15) A63(12-15) + const __m512i lhs_mat_01_51_sp2 = _mm512_shuffle_epi32(lhs_mat_01_51, (_MM_PERM_ENUM)245); //A50(12-15) A50(12-15) A51(12-15) A51(12-15) + const __m512i lhs_mat_23_51_sp2 = 
_mm512_shuffle_epi32(lhs_mat_23_51, (_MM_PERM_ENUM)245); //A52(12-15) A53(12-15) A52(12-15) A53(12-15) - const __m512i lhs_mat_01_70_sp2 = _mm512_shuffle_epi32(lhs_mat_01_70, (_MM_PERM_ENUM)245); //A70(4-7) A70(4-7) A71(4-7) A71(4-7) A70(4-7) A70(4-7) A71(4-7) A71(4-7) - const __m512i lhs_mat_23_70_sp2 = _mm512_shuffle_epi32(lhs_mat_23_70, (_MM_PERM_ENUM)245); //A72(4-7) A73(4-7) A72(4-7) A73(4-7) A72(4-7) A73(4-7) A72(4-7) A73(4-7) + const __m512i lhs_mat_01_60_sp2 = _mm512_shuffle_epi32(lhs_mat_01_60, (_MM_PERM_ENUM)245); //A60(4-7) A60(4-7) A61(4-7) A61(4-7) + const __m512i lhs_mat_23_60_sp2 = _mm512_shuffle_epi32(lhs_mat_23_60, (_MM_PERM_ENUM)245); //A62(4-7) A63(4-7) A62(4-7) A63(4-7) - const __m512i lhs_mat_01_71_sp2 = _mm512_shuffle_epi32(lhs_mat_01_71, (_MM_PERM_ENUM)245); //A70(12-15) A70(12-15) A71(12-15) A71(12-15) A70(12-15) A70(12-15) A71(12-15) A71(12-15) - const __m512i lhs_mat_23_71_sp2 = _mm512_shuffle_epi32(lhs_mat_23_71, (_MM_PERM_ENUM)245); //A72(12-15) A73(12-15) A72(12-15) A73(12-15) A72(12-15) A73(12-15) A72(12-15) A73(12-15) + const __m512i lhs_mat_01_61_sp2 = _mm512_shuffle_epi32(lhs_mat_01_61, (_MM_PERM_ENUM)245); //A60(12-15) A60(12-15) A61(12-15) A61(12-15) + const __m512i lhs_mat_23_61_sp2 = _mm512_shuffle_epi32(lhs_mat_23_61, (_MM_PERM_ENUM)245); //A62(12-15) A63(12-15) A62(12-15) A63(12-15) - // Shuffle pattern one - left side input - const __m512i lhs_mat_s_01_00_sp1 = _mm512_shuffle_epi32(lhs_mat_s_01_00, (_MM_PERM_ENUM)160); //A00(0-3) A00(0-3) A01(0-3) A01(0-3) A00(0-3) A00(0-3) A01(0-3) A01(0-3) - const __m512i lhs_mat_s_23_00_sp1 = _mm512_shuffle_epi32(lhs_mat_s_23_00, (_MM_PERM_ENUM)160); //A02(0-3) A03(0-3) A02(0-3) A03(0-3) A02(0-3) A03(0-3) A02(0-3) A03(0-3) + const __m512i lhs_mat_01_70_sp2 = _mm512_shuffle_epi32(lhs_mat_01_70, (_MM_PERM_ENUM)245); //A70(4-7) A70(4-7) A71(4-7) A71(4-7) + const __m512i lhs_mat_23_70_sp2 = _mm512_shuffle_epi32(lhs_mat_23_70, (_MM_PERM_ENUM)245); //A72(4-7) A73(4-7) A72(4-7) A73(4-7) - const __m512i lhs_mat_s_01_01_sp1 = _mm512_shuffle_epi32(lhs_mat_s_01_01, (_MM_PERM_ENUM)160); //A00(8-11) A00(8-11) A01(8-11) A01(8-11) A00(8-11) A00(8-11) A01(8-11) A01(8-11) - const __m512i lhs_mat_s_23_01_sp1 = _mm512_shuffle_epi32(lhs_mat_s_23_01, (_MM_PERM_ENUM)160); //A02(8-11) A03(8-11) A02(8-11) A03(8-11) A02(8-11) A03(8-11) A02(8-11) A03(8-11) + const __m512i lhs_mat_01_71_sp2 = _mm512_shuffle_epi32(lhs_mat_01_71, (_MM_PERM_ENUM)245); //A70(12-15) A70(12-15) A71(12-15) A71(12-15) + const __m512i lhs_mat_23_71_sp2 = _mm512_shuffle_epi32(lhs_mat_23_71, (_MM_PERM_ENUM)245); //A72(12-15) A73(12-15) A72(12-15) A73(12-15) - const __m512i lhs_mat_s_01_10_sp1 = _mm512_shuffle_epi32(lhs_mat_s_01_10, (_MM_PERM_ENUM)160); //A10(0-3) A10(0-3) A11(0-3) A11(0-3) A10(0-3) A10(0-3) A11(0-3) A11(0-3) - const __m512i lhs_mat_s_23_10_sp1 = _mm512_shuffle_epi32(lhs_mat_s_23_10, (_MM_PERM_ENUM)160); //A12(0-3) A13(0-3) A12(0-3) A13(0-3) A12(0-3) A13(0-3) A12(0-3) A13(0-3) + // Shuffle pattern one – left-side input - const __m512i lhs_mat_s_01_11_sp1 = _mm512_shuffle_epi32(lhs_mat_s_01_11, (_MM_PERM_ENUM)160); //A10(8-11) A10(8-11) A11(8-11) A11(8-11) A10(8-11) A10(8-11) A11(8-11) A11(8-11) - const __m512i lhs_mat_s_23_11_sp1 = _mm512_shuffle_epi32(lhs_mat_s_23_11, (_MM_PERM_ENUM)160); //A12(8-11) A13(8-11) A12(8-11) A13(8-11) A12(8-11) A13(8-11) A12(8-11) A13(8-11) + const __m512i lhs_mat_s_01_00_sp1 = _mm512_shuffle_epi32(lhs_mat_s_01_00, (_MM_PERM_ENUM)160); //A00(0-3) A00(0-3) A01(0-3) A01(0-3) + const __m512i lhs_mat_s_23_00_sp1 = 
_mm512_shuffle_epi32(lhs_mat_s_23_00, (_MM_PERM_ENUM)160); //A02(0-3) A03(0-3) A02(0-3) A03(0-3) - const __m512i lhs_mat_s_01_20_sp1 = _mm512_shuffle_epi32(lhs_mat_s_01_20, (_MM_PERM_ENUM)160); //A20(0-3) A20(0-3) A21(0-3) A21(0-3) A20(0-3) A20(0-3) A21(0-3) A21(0-3) - const __m512i lhs_mat_s_23_20_sp1 = _mm512_shuffle_epi32(lhs_mat_s_23_20, (_MM_PERM_ENUM)160); //A22(0-3) A23(0-3) A22(0-3) A23(0-3) A22(0-3) A23(0-3) A22(0-3) A23(0-3) + const __m512i lhs_mat_s_01_01_sp1 = _mm512_shuffle_epi32(lhs_mat_s_01_01, (_MM_PERM_ENUM)160); //A00(8-11) A00(8-11) A01(8-11) A01(8-11) + const __m512i lhs_mat_s_23_01_sp1 = _mm512_shuffle_epi32(lhs_mat_s_23_01, (_MM_PERM_ENUM)160); //A02(8-11) A03(8-11) A02(8-11) A03(8-11) - const __m512i lhs_mat_s_01_21_sp1 = _mm512_shuffle_epi32(lhs_mat_s_01_21, (_MM_PERM_ENUM)160); //A20(8-11) A20(8-11) A21(8-11) A21(8-11) A20(8-11) A20(8-11) A21(8-11) A21(8-11) - const __m512i lhs_mat_s_23_21_sp1 = _mm512_shuffle_epi32(lhs_mat_s_23_21, (_MM_PERM_ENUM)160); //A22(8-11) A23(8-11) A22(8-11) A23(8-11) A22(8-11) A23(8-11) A22(8-11) A23(8-11) + const __m512i lhs_mat_s_01_10_sp1 = _mm512_shuffle_epi32(lhs_mat_s_01_10, (_MM_PERM_ENUM)160); //A10(0-3) A10(0-3) A11(0-3) A11(0-3) + const __m512i lhs_mat_s_23_10_sp1 = _mm512_shuffle_epi32(lhs_mat_s_23_10, (_MM_PERM_ENUM)160); //A12(0-3) A13(0-3) A12(0-3) A13(0-3) - const __m512i lhs_mat_s_01_30_sp1 = _mm512_shuffle_epi32(lhs_mat_s_01_30, (_MM_PERM_ENUM)160); //A30(0-3) A30(0-3) A31(0-3) A31(0-3) A30(0-3) A30(0-3) A31(0-3) A31(0-3) - const __m512i lhs_mat_s_23_30_sp1 = _mm512_shuffle_epi32(lhs_mat_s_23_30, (_MM_PERM_ENUM)160); //A32(0-3) A33(0-3) A32(0-3) A33(0-3) A32(0-3) A33(0-3) A32(0-3) A33(0-3) + const __m512i lhs_mat_s_01_11_sp1 = _mm512_shuffle_epi32(lhs_mat_s_01_11, (_MM_PERM_ENUM)160); //A10(8-11) A10(8-11) A11(8-11) A11(8-11) + const __m512i lhs_mat_s_23_11_sp1 = _mm512_shuffle_epi32(lhs_mat_s_23_11, (_MM_PERM_ENUM)160); //A12(8-11) A13(8-11) A12(8-11) A13(8-11) - const __m512i lhs_mat_s_01_31_sp1 = _mm512_shuffle_epi32(lhs_mat_s_01_31, (_MM_PERM_ENUM)160); //A30(8-11) A30(8-11) A31(8-11) A31(8-11) A30(8-11) A30(8-11) A31(8-11) A31(8-11) - const __m512i lhs_mat_s_23_31_sp1 = _mm512_shuffle_epi32(lhs_mat_s_23_31, (_MM_PERM_ENUM)160); //A32(8-11) A33(8-11) A32(8-11) A33(8-11) A32(8-11) A33(8-11) A32(8-11) A33(8-11) + const __m512i lhs_mat_s_01_20_sp1 = _mm512_shuffle_epi32(lhs_mat_s_01_20, (_MM_PERM_ENUM)160); //A20(0-3) A20(0-3) A21(0-3) A21(0-3) + const __m512i lhs_mat_s_23_20_sp1 = _mm512_shuffle_epi32(lhs_mat_s_23_20, (_MM_PERM_ENUM)160); //A22(0-3) A23(0-3) A22(0-3) A23(0-3) - const __m512i lhs_mat_s_01_40_sp1 = _mm512_shuffle_epi32(lhs_mat_s_01_40, (_MM_PERM_ENUM)160); //A40(0-3) A40(0-3) A41(0-3) A41(0-3) A40(0-3) A40(0-3) A41(0-3) A41(0-3) - const __m512i lhs_mat_s_23_40_sp1 = _mm512_shuffle_epi32(lhs_mat_s_23_40, (_MM_PERM_ENUM)160); //A42(0-3) A43(0-3) A42(0-3) A43(0-3) A42(0-3) A43(0-3) A42(0-3) A43(0-3) + const __m512i lhs_mat_s_01_21_sp1 = _mm512_shuffle_epi32(lhs_mat_s_01_21, (_MM_PERM_ENUM)160); //A20(8-11) A20(8-11) A21(8-11) A21(8-11) + const __m512i lhs_mat_s_23_21_sp1 = _mm512_shuffle_epi32(lhs_mat_s_23_21, (_MM_PERM_ENUM)160); //A22(8-11) A23(8-11) A22(8-11) A23(8-11) - const __m512i lhs_mat_s_01_41_sp1 = _mm512_shuffle_epi32(lhs_mat_s_01_41, (_MM_PERM_ENUM)160); //A40(8-11) A40(8-11) A41(8-11) A41(8-11) A40(8-11) A40(8-11) A41(8-11) A41(8-11) - const __m512i lhs_mat_s_23_41_sp1 = _mm512_shuffle_epi32(lhs_mat_s_23_41, (_MM_PERM_ENUM)160); //A42(8-11) A43(8-11) A42(8-11) A43(8-11) A42(8-11) A43(8-11) 
A42(8-11) A43(8-11) + const __m512i lhs_mat_s_01_30_sp1 = _mm512_shuffle_epi32(lhs_mat_s_01_30, (_MM_PERM_ENUM)160); //A30(0-3) A30(0-3) A31(0-3) A31(0-3) + const __m512i lhs_mat_s_23_30_sp1 = _mm512_shuffle_epi32(lhs_mat_s_23_30, (_MM_PERM_ENUM)160); //A32(0-3) A33(0-3) A32(0-3) A33(0-3) - const __m512i lhs_mat_s_01_50_sp1 = _mm512_shuffle_epi32(lhs_mat_s_01_50, (_MM_PERM_ENUM)160); //A50(0-3) A50(0-3) A51(0-3) A51(0-3) A50(0-3) A50(0-3) A51(0-3) A51(0-3) - const __m512i lhs_mat_s_23_50_sp1 = _mm512_shuffle_epi32(lhs_mat_s_23_50, (_MM_PERM_ENUM)160); //A52(0-3) A53(0-3) A52(0-3) A53(0-3) A52(0-3) A53(0-3) A52(0-3) A53(0-3) + const __m512i lhs_mat_s_01_31_sp1 = _mm512_shuffle_epi32(lhs_mat_s_01_31, (_MM_PERM_ENUM)160); //A30(8-11) A30(8-11) A31(8-11) A31(8-11) + const __m512i lhs_mat_s_23_31_sp1 = _mm512_shuffle_epi32(lhs_mat_s_23_31, (_MM_PERM_ENUM)160); //A32(8-11) A33(8-11) A32(8-11) A33(8-11) - const __m512i lhs_mat_s_01_51_sp1 = _mm512_shuffle_epi32(lhs_mat_s_01_51, (_MM_PERM_ENUM)160); //A50(8-11) A50(8-11) A51(8-11) A51(8-11) A50(8-11) A50(8-11) A51(8-11) A51(8-11) - const __m512i lhs_mat_s_23_51_sp1 = _mm512_shuffle_epi32(lhs_mat_s_23_51, (_MM_PERM_ENUM)160); //A52(8-11) A53(8-11) A52(8-11) A53(8-11) A52(8-11) A53(8-11) A52(8-11) A53(8-11) + const __m512i lhs_mat_s_01_40_sp1 = _mm512_shuffle_epi32(lhs_mat_s_01_40, (_MM_PERM_ENUM)160); //A40(0-3) A40(0-3) A41(0-3) A41(0-3) + const __m512i lhs_mat_s_23_40_sp1 = _mm512_shuffle_epi32(lhs_mat_s_23_40, (_MM_PERM_ENUM)160); //A42(0-3) A43(0-3) A42(0-3) A43(0-3) - const __m512i lhs_mat_s_01_60_sp1 = _mm512_shuffle_epi32(lhs_mat_s_01_60, (_MM_PERM_ENUM)160); //A60(0-3) A60(0-3) A61(0-3) A61(0-3) A60(0-3) A60(0-3) A61(0-3) A61(0-3) - const __m512i lhs_mat_s_23_60_sp1 = _mm512_shuffle_epi32(lhs_mat_s_23_60, (_MM_PERM_ENUM)160); //A62(0-3) A63(0-3) A62(0-3) A63(0-3) A62(0-3) A63(0-3) A62(0-3) A63(0-3) + const __m512i lhs_mat_s_01_41_sp1 = _mm512_shuffle_epi32(lhs_mat_s_01_41, (_MM_PERM_ENUM)160); //A40(8-11) A40(8-11) A41(8-11) A41(8-11) + const __m512i lhs_mat_s_23_41_sp1 = _mm512_shuffle_epi32(lhs_mat_s_23_41, (_MM_PERM_ENUM)160); //A42(8-11) A43(8-11) A42(8-11) A43(8-11) - const __m512i lhs_mat_s_01_61_sp1 = _mm512_shuffle_epi32(lhs_mat_s_01_61, (_MM_PERM_ENUM)160); //A60(8-11) A60(8-11) A61(8-11) A61(8-11) A60(8-11) A60(8-11) A61(8-11) A61(8-11) - const __m512i lhs_mat_s_23_61_sp1 = _mm512_shuffle_epi32(lhs_mat_s_23_61, (_MM_PERM_ENUM)160); //A62(8-11) A63(8-11) A62(8-11) A63(8-11) A62(8-11) A63(8-11) A62(8-11) A63(8-11) + const __m512i lhs_mat_s_01_50_sp1 = _mm512_shuffle_epi32(lhs_mat_s_01_50, (_MM_PERM_ENUM)160); //A50(0-3) A50(0-3) A51(0-3) A51(0-3) + const __m512i lhs_mat_s_23_50_sp1 = _mm512_shuffle_epi32(lhs_mat_s_23_50, (_MM_PERM_ENUM)160); //A52(0-3) A53(0-3) A52(0-3) A53(0-3) - const __m512i lhs_mat_s_01_70_sp1 = _mm512_shuffle_epi32(lhs_mat_s_01_70, (_MM_PERM_ENUM)160); //A70(0-3) A70(0-3) A71(0-3) A71(0-3) A70(0-3) A70(0-3) A71(0-3) A71(0-3) - const __m512i lhs_mat_s_23_70_sp1 = _mm512_shuffle_epi32(lhs_mat_s_23_70, (_MM_PERM_ENUM)160); //A72(0-3) A73(0-3) A72(0-3) A73(0-3) A72(0-3) A73(0-3) A72(0-3) A73(0-3) + const __m512i lhs_mat_s_01_51_sp1 = _mm512_shuffle_epi32(lhs_mat_s_01_51, (_MM_PERM_ENUM)160); //A50(8-11) A50(8-11) A51(8-11) A51(8-11) + const __m512i lhs_mat_s_23_51_sp1 = _mm512_shuffle_epi32(lhs_mat_s_23_51, (_MM_PERM_ENUM)160); //A52(8-11) A53(8-11) A52(8-11) A53(8-11) - const __m512i lhs_mat_s_01_71_sp1 = _mm512_shuffle_epi32(lhs_mat_s_01_71, (_MM_PERM_ENUM)160); //A70(8-11) A70(8-11) A71(8-11) A71(8-11) A70(8-11) 
A70(8-11) A71(8-11) A71(8-11) - const __m512i lhs_mat_s_23_71_sp1 = _mm512_shuffle_epi32(lhs_mat_s_23_71, (_MM_PERM_ENUM)160); //A72(8-11) A73(8-11) A72(8-11) A73(8-11) A72(8-11) A73(8-11) A72(8-11) A73(8-11) + const __m512i lhs_mat_s_01_60_sp1 = _mm512_shuffle_epi32(lhs_mat_s_01_60, (_MM_PERM_ENUM)160); //A60(0-3) A60(0-3) A61(0-3) A61(0-3) + const __m512i lhs_mat_s_23_60_sp1 = _mm512_shuffle_epi32(lhs_mat_s_23_60, (_MM_PERM_ENUM)160); //A62(0-3) A63(0-3) A62(0-3) A63(0-3) - // Shuffle pattern two- left side input - const __m512i lhs_mat_s_01_00_sp2 = _mm512_shuffle_epi32(lhs_mat_s_01_00, (_MM_PERM_ENUM)245); //A00(4-7) A00(4-7) A01(4-7) A01(4-7) A00(4-7) A00(4-7) A01(4-7) A01(4-7) - const __m512i lhs_mat_s_23_00_sp2 = _mm512_shuffle_epi32(lhs_mat_s_23_00, (_MM_PERM_ENUM)245); //A02(4-7) A03(4-7) A02(4-7) A03(4-7) A02(4-7) A03(4-7) A02(4-7) A03(4-7) + const __m512i lhs_mat_s_01_61_sp1 = _mm512_shuffle_epi32(lhs_mat_s_01_61, (_MM_PERM_ENUM)160); //A60(8-11) A60(8-11) A61(8-11) A61(8-11) + const __m512i lhs_mat_s_23_61_sp1 = _mm512_shuffle_epi32(lhs_mat_s_23_61, (_MM_PERM_ENUM)160); //A62(8-11) A63(8-11) A62(8-11) A63(8-11) - const __m512i lhs_mat_s_01_01_sp2 = _mm512_shuffle_epi32(lhs_mat_s_01_01, (_MM_PERM_ENUM)245); //A00(12-15) A00(12-15) A01(12-15) A01(12-15) A00(12-15) A00(12-15) A01(12-15) A01(12-15) - const __m512i lhs_mat_s_23_01_sp2 = _mm512_shuffle_epi32(lhs_mat_s_23_01, (_MM_PERM_ENUM)245); //A02(12-15) A03(12-15) A02(12-15) A03(12-15) A02(12-15) A03(12-15) A02(12-15) A03(12-15) + const __m512i lhs_mat_s_01_70_sp1 = _mm512_shuffle_epi32(lhs_mat_s_01_70, (_MM_PERM_ENUM)160); //A70(0-3) A70(0-3) A71(0-3) A71(0-3) + const __m512i lhs_mat_s_23_70_sp1 = _mm512_shuffle_epi32(lhs_mat_s_23_70, (_MM_PERM_ENUM)160); //A72(0-3) A73(0-3) A72(0-3) A73(0-3) - const __m512i lhs_mat_s_01_10_sp2 = _mm512_shuffle_epi32(lhs_mat_s_01_10, (_MM_PERM_ENUM)245); //A10(4-7) A10(4-7) A11(4-7) A11(4-7) A10(4-7) A10(4-7) A11(4-7) A11(4-7) - const __m512i lhs_mat_s_23_10_sp2 = _mm512_shuffle_epi32(lhs_mat_s_23_10, (_MM_PERM_ENUM)245); //A12(4-7) A13(4-7) A12(4-7) A13(4-7) A12(4-7) A13(4-7) A12(4-7) A13(4-7) + const __m512i lhs_mat_s_01_71_sp1 = _mm512_shuffle_epi32(lhs_mat_s_01_71, (_MM_PERM_ENUM)160); //A70(8-11) A70(8-11) A71(8-11) A71(8-11) + const __m512i lhs_mat_s_23_71_sp1 = _mm512_shuffle_epi32(lhs_mat_s_23_71, (_MM_PERM_ENUM)160); //A72(8-11) A73(8-11) A72(8-11) A73(8-11) - const __m512i lhs_mat_s_01_11_sp2 = _mm512_shuffle_epi32(lhs_mat_s_01_11, (_MM_PERM_ENUM)245); //A10(12-15) A10(12-15) A11(12-15) A11(12-15) A10(12-15) A10(12-15) A11(12-15) A11(12-15) - const __m512i lhs_mat_s_23_11_sp2 = _mm512_shuffle_epi32(lhs_mat_s_23_11, (_MM_PERM_ENUM)245); //A12(12-15) A13(12-15) A12(12-15) A13(12-15) A12(12-15) A13(12-15) A12(12-15) A13(12-15) + // Shuffle pattern two – left-side input - const __m512i lhs_mat_s_01_20_sp2 = _mm512_shuffle_epi32(lhs_mat_s_01_20, (_MM_PERM_ENUM)245); //A20(4-7) A20(4-7) A21(4-7) A21(4-7) A20(4-7) A20(4-7) A21(4-7) A21(4-7) - const __m512i lhs_mat_s_23_20_sp2 = _mm512_shuffle_epi32(lhs_mat_s_23_20, (_MM_PERM_ENUM)245); //A22(4-7) A23(4-7) A22(4-7) A23(4-7) A22(4-7) A23(4-7) A22(4-7) A23(4-7) + const __m512i lhs_mat_s_01_00_sp2 = _mm512_shuffle_epi32(lhs_mat_s_01_00, (_MM_PERM_ENUM)245); //A00(4-7) A00(4-7) A01(4-7) A01(4-7) + const __m512i lhs_mat_s_23_00_sp2 = _mm512_shuffle_epi32(lhs_mat_s_23_00, (_MM_PERM_ENUM)245); //A02(4-7) A03(4-7) A02(4-7) A03(4-7) - const __m512i lhs_mat_s_01_21_sp2 = _mm512_shuffle_epi32(lhs_mat_s_01_21, (_MM_PERM_ENUM)245); //A20(12-15) A20(12-15) 
A21(12-15) A21(12-15) A20(12-15) A20(12-15) A21(12-15) A21(12-15) - const __m512i lhs_mat_s_23_21_sp2 = _mm512_shuffle_epi32(lhs_mat_s_23_21, (_MM_PERM_ENUM)245); //A22(12-15) A23(12-15) A22(12-15) A23(12-15) A22(12-15) A23(12-15) A22(12-15) A23(12-15) + const __m512i lhs_mat_s_01_01_sp2 = _mm512_shuffle_epi32(lhs_mat_s_01_01, (_MM_PERM_ENUM)245); //A00(12-15) A00(12-15) A01(12-15) A01(12-15) + const __m512i lhs_mat_s_23_01_sp2 = _mm512_shuffle_epi32(lhs_mat_s_23_01, (_MM_PERM_ENUM)245); //A02(12-15) A03(12-15) A02(12-15) A03(12-15) - const __m512i lhs_mat_s_01_30_sp2 = _mm512_shuffle_epi32(lhs_mat_s_01_30, (_MM_PERM_ENUM)245); //A30(4-7) A30(4-7) A31(4-7) A31(4-7) A30(4-7) A30(4-7) A31(4-7) A31(4-7) - const __m512i lhs_mat_s_23_30_sp2 = _mm512_shuffle_epi32(lhs_mat_s_23_30, (_MM_PERM_ENUM)245); //A32(4-7) A33(4-7) A32(4-7) A33(4-7) A32(4-7) A33(4-7) A32(4-7) A33(4-7) + const __m512i lhs_mat_s_01_10_sp2 = _mm512_shuffle_epi32(lhs_mat_s_01_10, (_MM_PERM_ENUM)245); //A10(4-7) A10(4-7) A11(4-7) A11(4-7) + const __m512i lhs_mat_s_23_10_sp2 = _mm512_shuffle_epi32(lhs_mat_s_23_10, (_MM_PERM_ENUM)245); //A12(4-7) A13(4-7) A12(4-7) A13(4-7) - const __m512i lhs_mat_s_01_31_sp2 = _mm512_shuffle_epi32(lhs_mat_s_01_31, (_MM_PERM_ENUM)245); //A30(12-15) A30(12-15) A31(12-15) A31(12-15) A30(12-15) A30(12-15) A31(12-15) A31(12-15) - const __m512i lhs_mat_s_23_31_sp2 = _mm512_shuffle_epi32(lhs_mat_s_23_31, (_MM_PERM_ENUM)245); //A32(12-15) A33(12-15) A32(12-15) A33(12-15) A32(12-15) A33(12-15) A32(12-15) A33(12-15) + const __m512i lhs_mat_s_01_11_sp2 = _mm512_shuffle_epi32(lhs_mat_s_01_11, (_MM_PERM_ENUM)245); //A10(12-15) A10(12-15) A11(12-15) A11(12-15) + const __m512i lhs_mat_s_23_11_sp2 = _mm512_shuffle_epi32(lhs_mat_s_23_11, (_MM_PERM_ENUM)245); //A12(12-15) A13(12-15) A12(12-15) A13(12-15) - const __m512i lhs_mat_s_01_40_sp2 = _mm512_shuffle_epi32(lhs_mat_s_01_40, (_MM_PERM_ENUM)245); //A40(4-7) A40(4-7) A41(4-7) A41(4-7) A40(4-7) A40(4-7) A41(4-7) A41(4-7) - const __m512i lhs_mat_s_23_40_sp2 = _mm512_shuffle_epi32(lhs_mat_s_23_40, (_MM_PERM_ENUM)245); //A42(4-7) A43(4-7) A42(4-7) A43(4-7) A42(4-7) A43(4-7) A42(4-7) A43(4-7) + const __m512i lhs_mat_s_01_20_sp2 = _mm512_shuffle_epi32(lhs_mat_s_01_20, (_MM_PERM_ENUM)245); //A20(4-7) A20(4-7) A21(4-7) A21(4-7) + const __m512i lhs_mat_s_23_20_sp2 = _mm512_shuffle_epi32(lhs_mat_s_23_20, (_MM_PERM_ENUM)245); //A22(4-7) A23(4-7) A22(4-7) A23(4-7) - const __m512i lhs_mat_s_01_41_sp2 = _mm512_shuffle_epi32(lhs_mat_s_01_41, (_MM_PERM_ENUM)245); //A40(12-15) A40(12-15) A41(12-15) A41(12-15) A40(12-15) A40(12-15) A41(12-15) A41(12-15) - const __m512i lhs_mat_s_23_41_sp2 = _mm512_shuffle_epi32(lhs_mat_s_23_41, (_MM_PERM_ENUM)245); //A42(12-15) A43(12-15) A42(12-15) A43(12-15) A42(12-15) A43(12-15) A42(12-15) A43(12-15) + const __m512i lhs_mat_s_01_21_sp2 = _mm512_shuffle_epi32(lhs_mat_s_01_21, (_MM_PERM_ENUM)245); //A20(12-15) A20(12-15) A21(12-15) A21(12-15) + const __m512i lhs_mat_s_23_21_sp2 = _mm512_shuffle_epi32(lhs_mat_s_23_21, (_MM_PERM_ENUM)245); //A22(12-15) A23(12-15) A22(12-15) A23(12-15) - const __m512i lhs_mat_s_01_50_sp2 = _mm512_shuffle_epi32(lhs_mat_s_01_50, (_MM_PERM_ENUM)245); //A50(4-7) A50(4-7) A51(4-7) A51(4-7) A50(4-7) A50(4-7) A51(4-7) A51(4-7) - const __m512i lhs_mat_s_23_50_sp2 = _mm512_shuffle_epi32(lhs_mat_s_23_50, (_MM_PERM_ENUM)245); //A52(4-7) A53(4-7) A52(4-7) A53(4-7) A52(4-7) A53(4-7) A52(4-7) A53(4-7) + const __m512i lhs_mat_s_01_30_sp2 = _mm512_shuffle_epi32(lhs_mat_s_01_30, (_MM_PERM_ENUM)245); //A30(4-7) A30(4-7) A31(4-7) 
A31(4-7) + const __m512i lhs_mat_s_23_30_sp2 = _mm512_shuffle_epi32(lhs_mat_s_23_30, (_MM_PERM_ENUM)245); //A32(4-7) A33(4-7) A32(4-7) A33(4-7) - const __m512i lhs_mat_s_01_51_sp2 = _mm512_shuffle_epi32(lhs_mat_s_01_51, (_MM_PERM_ENUM)245); //A50(12-15) A50(12-15) A51(12-15) A51(12-15) A50(12-15) A50(12-15) A51(12-15) A51(12-15) - const __m512i lhs_mat_s_23_51_sp2 = _mm512_shuffle_epi32(lhs_mat_s_23_51, (_MM_PERM_ENUM)245); //A52(12-15) A53(12-15) A52(12-15) A53(12-15) A52(12-15) A53(12-15) A52(12-15) A53(12-15) + const __m512i lhs_mat_s_01_31_sp2 = _mm512_shuffle_epi32(lhs_mat_s_01_31, (_MM_PERM_ENUM)245); //A30(12-15) A30(12-15) A31(12-15) A31(12-15) + const __m512i lhs_mat_s_23_31_sp2 = _mm512_shuffle_epi32(lhs_mat_s_23_31, (_MM_PERM_ENUM)245); //A32(12-15) A33(12-15) A32(12-15) A33(12-15) - const __m512i lhs_mat_s_01_60_sp2 = _mm512_shuffle_epi32(lhs_mat_s_01_60, (_MM_PERM_ENUM)245); //A60(4-7) A60(4-7) A61(4-7) A61(4-7) A60(4-7) A60(4-7) A61(4-7) A61(4-7) - const __m512i lhs_mat_s_23_60_sp2 = _mm512_shuffle_epi32(lhs_mat_s_23_60, (_MM_PERM_ENUM)245); //A62(4-7) A63(4-7) A62(4-7) A63(4-7) A62(4-7) A63(4-7) A62(4-7) A63(4-7) + const __m512i lhs_mat_s_01_40_sp2 = _mm512_shuffle_epi32(lhs_mat_s_01_40, (_MM_PERM_ENUM)245); //A40(4-7) A40(4-7) A41(4-7) A41(4-7) + const __m512i lhs_mat_s_23_40_sp2 = _mm512_shuffle_epi32(lhs_mat_s_23_40, (_MM_PERM_ENUM)245); //A42(4-7) A43(4-7) A42(4-7) A43(4-7) - const __m512i lhs_mat_s_01_61_sp2 = _mm512_shuffle_epi32(lhs_mat_s_01_61, (_MM_PERM_ENUM)245); //A60(12-15) A60(12-15) A61(12-15) A61(12-15) A60(12-15) A60(12-15) A61(12-15) A61(12-15) - const __m512i lhs_mat_s_23_61_sp2 = _mm512_shuffle_epi32(lhs_mat_s_23_61, (_MM_PERM_ENUM)245); //A62(12-15) A63(12-15) A62(12-15) A63(12-15) A62(12-15) A63(12-15) A62(12-15) A63(12-15) + const __m512i lhs_mat_s_01_41_sp2 = _mm512_shuffle_epi32(lhs_mat_s_01_41, (_MM_PERM_ENUM)245); //A40(12-15) A40(12-15) A41(12-15) A41(12-15) + const __m512i lhs_mat_s_23_41_sp2 = _mm512_shuffle_epi32(lhs_mat_s_23_41, (_MM_PERM_ENUM)245); //A42(12-15) A43(12-15) A42(12-15) A43(12-15) - const __m512i lhs_mat_s_01_70_sp2 = _mm512_shuffle_epi32(lhs_mat_s_01_70, (_MM_PERM_ENUM)245); //A70(4-7) A70(4-7) A71(4-7) A71(4-7) A70(4-7) A70(4-7) A71(4-7) A71(4-7) - const __m512i lhs_mat_s_23_70_sp2 = _mm512_shuffle_epi32(lhs_mat_s_23_70, (_MM_PERM_ENUM)245); //A72(4-7) A73(4-7) A72(4-7) A73(4-7) A72(4-7) A73(4-7) A72(4-7) A73(4-7) + const __m512i lhs_mat_s_01_50_sp2 = _mm512_shuffle_epi32(lhs_mat_s_01_50, (_MM_PERM_ENUM)245); //A50(4-7) A50(4-7) A51(4-7) A51(4-7) + const __m512i lhs_mat_s_23_50_sp2 = _mm512_shuffle_epi32(lhs_mat_s_23_50, (_MM_PERM_ENUM)245); //A52(4-7) A53(4-7) A52(4-7) A53(4-7) - const __m512i lhs_mat_s_01_71_sp2 = _mm512_shuffle_epi32(lhs_mat_s_01_71, (_MM_PERM_ENUM)245); //A70(12-15) A70(12-15) A71(12-15) A71(12-15) A70(12-15) A70(12-15) A71(12-15) A71(12-15) - const __m512i lhs_mat_s_23_71_sp2 = _mm512_shuffle_epi32(lhs_mat_s_23_71, (_MM_PERM_ENUM)245); //A72(12-15) A73(12-15) A72(12-15) A73(12-15) A72(12-15) A73(12-15) A72(12-15) A73(12-15) + const __m512i lhs_mat_s_01_51_sp2 = _mm512_shuffle_epi32(lhs_mat_s_01_51, (_MM_PERM_ENUM)245); //A50(12-15) A50(12-15) A51(12-15) A51(12-15) + const __m512i lhs_mat_s_23_51_sp2 = _mm512_shuffle_epi32(lhs_mat_s_23_51, (_MM_PERM_ENUM)245); //A52(12-15) A53(12-15) A52(12-15) A53(12-15) + + const __m512i lhs_mat_s_01_60_sp2 = _mm512_shuffle_epi32(lhs_mat_s_01_60, (_MM_PERM_ENUM)245); //A60(4-7) A60(4-7) A61(4-7) A61(4-7) + const __m512i lhs_mat_s_23_60_sp2 = 
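// A minimal sketch of how the two shuffle immediates used above are derived:
// _mm512_shuffle_epi32 picks one source dword per destination dword inside
// every 128-bit lane, two control bits per slot.
//   160 = 0b10'10'00'00 -> dwords (0,0,2,2): bytes 0-3 of the two interleaved rows, duplicated
//   245 = 0b11'11'01'01 -> dwords (1,1,3,3): bytes 4-7 of the two interleaved rows, duplicated
static inline void shuffle_epi32_lane_model(const int src[4], int dst[4], unsigned imm) {
    for (int i = 0; i < 4; ++i) {
        dst[i] = src[(imm >> (2 * i)) & 3];   // two control bits select the source dword for slot i
    }
}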
_mm512_shuffle_epi32(lhs_mat_s_23_60, (_MM_PERM_ENUM)245); //A62(4-7) A63(4-7) A62(4-7) A63(4-7) + + const __m512i lhs_mat_s_01_61_sp2 = _mm512_shuffle_epi32(lhs_mat_s_01_61, (_MM_PERM_ENUM)245); //A60(12-15) A60(12-15) A61(12-15) A61(12-15) + const __m512i lhs_mat_s_23_61_sp2 = _mm512_shuffle_epi32(lhs_mat_s_23_61, (_MM_PERM_ENUM)245); //A62(12-15) A63(12-15) A62(12-15) A63(12-15) + + const __m512i lhs_mat_s_01_70_sp2 = _mm512_shuffle_epi32(lhs_mat_s_01_70, (_MM_PERM_ENUM)245); //A70(4-7) A70(4-7) A71(4-7) A71(4-7) + const __m512i lhs_mat_s_23_70_sp2 = _mm512_shuffle_epi32(lhs_mat_s_23_70, (_MM_PERM_ENUM)245); //A72(4-7) A73(4-7) A72(4-7) A73(4-7) + + const __m512i lhs_mat_s_01_71_sp2 = _mm512_shuffle_epi32(lhs_mat_s_01_71, (_MM_PERM_ENUM)245); //A70(12-15) A70(12-15) A71(12-15) A71(12-15) + const __m512i lhs_mat_s_23_71_sp2 = _mm512_shuffle_epi32(lhs_mat_s_23_71, (_MM_PERM_ENUM)245); //A72(12-15) A73(12-15) A72(12-15) A73(12-15) // The values arranged in shuffle patterns are operated with dot product operation within 32 bit lane i.e corresponding bytes and multiplied and added into 32 bit integers within 32 bit lane __m512i iacc_mat_00_0_sp1 = _mm512_add_epi16(_mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_00_sp1, lhs_mat_01_00_sp1), lhs_mat_s_01_00_sp1), _mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_01_sp1, lhs_mat_01_01_sp1), lhs_mat_s_01_01_sp1)); @@ -7856,46 +7868,46 @@ void ggml_gemm_q6_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo // 2-bit -> 8-bit // Values of the 0th,2nd,4th,6th sub blocks of eight block_q6_K structures for the sb loop - const __m512i rhs_hbit_014589CD_00 = _mm512_slli_epi16(_mm512_and_si512(rhs_raw_hbit_014589CD_0, m2_expanded), 4); //B00(0-7) B01(0-7) B04(0-7) B05(0-7) - const __m512i rhs_hbit_014589CD_20 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_014589CD_0, 2), m2_expanded), 4); //B20(0-7) B21(0-7) B24(0-7) B25(0-7) - const __m512i rhs_hbit_014589CD_40 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_014589CD_0, 4), m2_expanded), 4); //B40(0-7) B41(0-7) B44(0-7) B45(0-7) - const __m512i rhs_hbit_014589CD_60 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_014589CD_0, 6), m2_expanded), 4); //B60(0-7) B61(0-7) B64(0-7) B65(0-7) + const __m512i rhs_hbit_014589CD_00 = _mm512_slli_epi16(_mm512_and_si512(rhs_raw_hbit_014589CD_0, m2_expanded), 4); //Index : 0 - 7 + const __m512i rhs_hbit_014589CD_20 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_014589CD_0, 2), m2_expanded), 4); //Index : 32 - 39 + const __m512i rhs_hbit_014589CD_40 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_014589CD_0, 4), m2_expanded), 4); //Index : 64 - 71 + const __m512i rhs_hbit_014589CD_60 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_014589CD_0, 6), m2_expanded), 4); //Index : 96 - 103 - const __m512i rhs_hbit_2367ABEF_00 = _mm512_slli_epi16(_mm512_and_si512(rhs_raw_hbit_2367ABEF_0, m2_expanded), 4); //B02(0-7) B03(0-7) B06(0-7) B07(0-7) - const __m512i rhs_hbit_2367ABEF_20 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_2367ABEF_0, 2), m2_expanded), 4); //B22(0-7) B23(0-7) B26(0-7) B27(0-7) - const __m512i rhs_hbit_2367ABEF_40 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_2367ABEF_0, 4), m2_expanded), 4); //B42(0-7) B43(0-7) B46(0-7) B47(0-7) - const __m512i rhs_hbit_2367ABEF_60 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_2367ABEF_0, 6), m2_expanded), 4); 
//B62(0-7) B63(0-7) B66(0-7) B67(0-7) + const __m512i rhs_hbit_2367ABEF_00 = _mm512_slli_epi16(_mm512_and_si512(rhs_raw_hbit_2367ABEF_0, m2_expanded), 4); //Index : 0 - 7 + const __m512i rhs_hbit_2367ABEF_20 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_2367ABEF_0, 2), m2_expanded), 4); //Index : 32 - 39 + const __m512i rhs_hbit_2367ABEF_40 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_2367ABEF_0, 4), m2_expanded), 4); //Index : 64 - 71 + const __m512i rhs_hbit_2367ABEF_60 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_2367ABEF_0, 6), m2_expanded), 4); //Index : 96 - 103 - const __m512i rhs_hbit_014589CD_01 = _mm512_slli_epi16(_mm512_and_si512(rhs_raw_hbit_014589CD_1, m2_expanded), 4); //B00(8-15) B01(8-15) B04(8-15) B05(8-15) - const __m512i rhs_hbit_014589CD_21 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_014589CD_1, 2), m2_expanded), 4); //B20(8-15) B21(8-15) B24(8-15) B25(8-15) - const __m512i rhs_hbit_014589CD_41 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_014589CD_1, 4), m2_expanded), 4); //B40(8-15) B41(8-15) B44(8-15) B45(8-15) - const __m512i rhs_hbit_014589CD_61 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_014589CD_1, 6), m2_expanded), 4); //B60(8-15) B61(8-15) B64(8-15) B65(8-15) + const __m512i rhs_hbit_014589CD_01 = _mm512_slli_epi16(_mm512_and_si512(rhs_raw_hbit_014589CD_1, m2_expanded), 4); //Index : 8 - 15 + const __m512i rhs_hbit_014589CD_21 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_014589CD_1, 2), m2_expanded), 4); //Index : 40 - 47 + const __m512i rhs_hbit_014589CD_41 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_014589CD_1, 4), m2_expanded), 4); //Index : 72 - 79 + const __m512i rhs_hbit_014589CD_61 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_014589CD_1, 6), m2_expanded), 4); //Index : 104 - 111 - const __m512i rhs_hbit_2367ABEF_01 = _mm512_slli_epi16(_mm512_and_si512(rhs_raw_hbit_2367ABEF_1, m2_expanded), 4); //B02(8-15) B03(8-15) B06(8-15) B07(8-15) - const __m512i rhs_hbit_2367ABEF_21 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_2367ABEF_1, 2), m2_expanded), 4); //B22(8-15) B23(8-15) B26(8-15) B27(8-15) - const __m512i rhs_hbit_2367ABEF_41 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_2367ABEF_1, 4), m2_expanded), 4); //B42(8-15) B43(8-15) B46(8-15) B47(8-15) - const __m512i rhs_hbit_2367ABEF_61 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_2367ABEF_1, 6), m2_expanded), 4); //B62(8-15) B63(8-15) B66(8-15) B67(8-15) + const __m512i rhs_hbit_2367ABEF_01 = _mm512_slli_epi16(_mm512_and_si512(rhs_raw_hbit_2367ABEF_1, m2_expanded), 4); //Index : 8 - 15 + const __m512i rhs_hbit_2367ABEF_21 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_2367ABEF_1, 2), m2_expanded), 4); //Index : 40 - 47 + const __m512i rhs_hbit_2367ABEF_41 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_2367ABEF_1, 4), m2_expanded), 4); //Index : 72 - 79 + const __m512i rhs_hbit_2367ABEF_61 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_2367ABEF_1, 6), m2_expanded), 4); //Index : 104 - 111 // Values of the 1st,3rd,5th,7th sub blocks of eight block_q6_K structures for the sb loop - const __m512i rhs_hbit_014589CD_10 = _mm512_slli_epi16(_mm512_and_si512(rhs_raw_hbit_014589CD_2, m2_expanded), 4); //B10(0-7) B11(0-7) B14(0-7) B15(0-7) - const __m512i rhs_hbit_014589CD_30 = 
_mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_014589CD_2, 2), m2_expanded), 4); //B30(0-7) B31(0-7) B34(0-7) B35(0-7) - const __m512i rhs_hbit_014589CD_50 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_014589CD_2, 4), m2_expanded), 4); //B50(0-7) B51(0-7) B54(0-7) B55(0-7) - const __m512i rhs_hbit_014589CD_70 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_014589CD_2, 6), m2_expanded), 4); //B70(0-7) B71(0-7) B74(0-7) B75(0-7) + const __m512i rhs_hbit_014589CD_10 = _mm512_slli_epi16(_mm512_and_si512(rhs_raw_hbit_014589CD_2, m2_expanded), 4); //Index : 16 - 23 + const __m512i rhs_hbit_014589CD_30 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_014589CD_2, 2), m2_expanded), 4); //Index : 48 - 55 + const __m512i rhs_hbit_014589CD_50 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_014589CD_2, 4), m2_expanded), 4); //Index : 80 - 87 + const __m512i rhs_hbit_014589CD_70 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_014589CD_2, 6), m2_expanded), 4); //Index : 112 - 119 - const __m512i rhs_hbit_2367ABEF_10 = _mm512_slli_epi16(_mm512_and_si512(rhs_raw_hbit_2367ABEF_2, m2_expanded), 4); //B12(0-7) B13(0-7) B16(0-7) B17(0-7) - const __m512i rhs_hbit_2367ABEF_30 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_2367ABEF_2, 2), m2_expanded), 4); //B32(0-7) B33(0-7) B36(0-7) B37(0-7) - const __m512i rhs_hbit_2367ABEF_50 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_2367ABEF_2, 4), m2_expanded), 4); //B52(0-7) B53(0-7) B56(0-7) B57(0-7) - const __m512i rhs_hbit_2367ABEF_70 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_2367ABEF_2, 6), m2_expanded), 4); //B72(0-7) B73(0-7) B76(0-7) B77(0-7) + const __m512i rhs_hbit_2367ABEF_10 = _mm512_slli_epi16(_mm512_and_si512(rhs_raw_hbit_2367ABEF_2, m2_expanded), 4); //Index : 16 - 23 + const __m512i rhs_hbit_2367ABEF_30 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_2367ABEF_2, 2), m2_expanded), 4); //Index : 48 - 55 + const __m512i rhs_hbit_2367ABEF_50 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_2367ABEF_2, 4), m2_expanded), 4); //Index : 80 - 87 + const __m512i rhs_hbit_2367ABEF_70 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_2367ABEF_2, 6), m2_expanded), 4); //Index : 112 - 119 - const __m512i rhs_hbit_014589CD_11 = _mm512_slli_epi16(_mm512_and_si512(rhs_raw_hbit_014589CD_3, m2_expanded), 4); //B10(8-15) B11(8-15) B14(8-15) B15(8-15) - const __m512i rhs_hbit_014589CD_31 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_014589CD_3, 2), m2_expanded), 4); //B30(8-15) B31(8-15) B34(8-15) B35(8-15) - const __m512i rhs_hbit_014589CD_51 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_014589CD_3, 4), m2_expanded), 4); //B50(8-15) B51(8-15) B54(8-15) B55(8-15) - const __m512i rhs_hbit_014589CD_71 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_014589CD_3, 6), m2_expanded), 4); //B70(8-15) B71(8-15) B74(8-15) B75(8-15) + const __m512i rhs_hbit_014589CD_11 = _mm512_slli_epi16(_mm512_and_si512(rhs_raw_hbit_014589CD_3, m2_expanded), 4); //Index : 24 - 31 + const __m512i rhs_hbit_014589CD_31 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_014589CD_3, 2), m2_expanded), 4); //Index : 56 - 63 + const __m512i rhs_hbit_014589CD_51 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_014589CD_3, 4), m2_expanded), 4); //Index : 88 - 95 + const 
__m512i rhs_hbit_014589CD_71 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_014589CD_3, 6), m2_expanded), 4); //Index : 120 - 127 - const __m512i rhs_hbit_2367ABEF_11 = _mm512_slli_epi16(_mm512_and_si512(rhs_raw_hbit_2367ABEF_3, m2_expanded), 4); //B12(8-15) B13(8-15) B16(8-15) B17(8-15) - const __m512i rhs_hbit_2367ABEF_31 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_2367ABEF_3, 2), m2_expanded), 4); //B32(8-15) B33(8-15) B36(8-15) B37(8-15) - const __m512i rhs_hbit_2367ABEF_51 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_2367ABEF_3, 4), m2_expanded), 4); //B52(8-15) B53(8-15) B56(8-15) B57(8-15) - const __m512i rhs_hbit_2367ABEF_71 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_2367ABEF_3, 6), m2_expanded), 4); //B72(8-15) B73(8-15) B76(8-15) B77(8-15) + const __m512i rhs_hbit_2367ABEF_11 = _mm512_slli_epi16(_mm512_and_si512(rhs_raw_hbit_2367ABEF_3, m2_expanded), 4); //Index : 24 - 31 + const __m512i rhs_hbit_2367ABEF_31 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_2367ABEF_3, 2), m2_expanded), 4); //Index : 56 - 63 + const __m512i rhs_hbit_2367ABEF_51 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_2367ABEF_3, 4), m2_expanded), 4); //Index : 88 - 95 + const __m512i rhs_hbit_2367ABEF_71 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_2367ABEF_3, 6), m2_expanded), 4); //Index : 120 - 127 // 0 -7, 64 - 71 const __m512i rhs_mat_014589CD_00 = _mm512_or_si512(_mm512_and_si512(rhs_raw_mat_014589CD_0, m4_expanded), rhs_hbit_014589CD_00); @@ -8728,46 +8740,46 @@ void ggml_gemm_q6_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo // 2-bit -> 8-bit // Values of the 0th,2nd,4th,6th sub blocks of eight block_q6_K structures for the sb loop - const __m256i rhs_hbit_0145_00 = _mm256_slli_epi16(_mm256_and_si256(rhs_raw_hbit_0145_0, m2), 4); //B00(0-7) B01(0-7) B04(0-7) B05(0-7) - const __m256i rhs_hbit_0145_20 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0145_0, 2), m2), 4); //B20(0-7) B21(0-7) B24(0-7) B25(0-7) - const __m256i rhs_hbit_0145_40 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0145_0, 4), m2), 4); //B40(0-7) B41(0-7) B44(0-7) B45(0-7) - const __m256i rhs_hbit_0145_60 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0145_0, 6), m2), 4); //B60(0-7) B61(0-7) B64(0-7) B65(0-7) + const __m256i rhs_hbit_0145_00 = _mm256_slli_epi16(_mm256_and_si256(rhs_raw_hbit_0145_0, m2), 4); //Index : 0 - 7 + const __m256i rhs_hbit_0145_20 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0145_0, 2), m2), 4); //Index : 32 - 39 + const __m256i rhs_hbit_0145_40 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0145_0, 4), m2), 4); //Index : 64 - 71 + const __m256i rhs_hbit_0145_60 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0145_0, 6), m2), 4); //Index : 96 - 103 - const __m256i rhs_hbit_2367_00 = _mm256_slli_epi16(_mm256_and_si256(rhs_raw_hbit_2367_0, m2), 4); //B02(0-7) B03(0-7) B06(0-7) B07(0-7) - const __m256i rhs_hbit_2367_20 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_2367_0, 2), m2), 4); //B22(0-7) B23(0-7) B26(0-7) B27(0-7) - const __m256i rhs_hbit_2367_40 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_2367_0, 4), m2), 4); //B42(0-7) B43(0-7) B46(0-7) B47(0-7) - const __m256i rhs_hbit_2367_60 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_2367_0, 6), m2), 
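// A minimal scalar sketch of the 4-bit/2-bit recombination performed by the
// intrinsics above: the two qh bits are shifted to bit position 4 and OR'd
// with a ql nibble, giving an unsigned 6-bit quant in [0, 63]; the -32 offset
// of Q6_K is applied later through the subtracted lhs sum terms. Shown for the
// low-nibble case; the high nibble of ql pairs with the upper qh crumbs the same way.
static inline unsigned char q6_reconstruct(unsigned char ql_byte, unsigned char qh_byte, int crumb) {
    const unsigned char lo = ql_byte & 0x0F;                  // 4 low bits from ql
    const unsigned char hi = (qh_byte >> (2 * crumb)) & 0x03; // 2 high bits from qh, crumb in 0..3
    return (unsigned char)((hi << 4) | lo);                   // value in 0..63
}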
4); //B62(0-7) B63(0-7) B66(0-7) B67(0-7) + const __m256i rhs_hbit_2367_00 = _mm256_slli_epi16(_mm256_and_si256(rhs_raw_hbit_2367_0, m2), 4); //Index : 0 - 7 + const __m256i rhs_hbit_2367_20 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_2367_0, 2), m2), 4); //Index : 32 - 39 + const __m256i rhs_hbit_2367_40 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_2367_0, 4), m2), 4); //Index : 64 - 71 + const __m256i rhs_hbit_2367_60 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_2367_0, 6), m2), 4); //Index : 96 - 103 - const __m256i rhs_hbit_0145_01 = _mm256_slli_epi16(_mm256_and_si256(rhs_raw_hbit_0145_1, m2), 4); //B00(8-15) B01(8-15) B04(8-15) B05(8-15) - const __m256i rhs_hbit_0145_21 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0145_1, 2), m2), 4); //B20(8-15) B21(8-15) B24(8-15) B25(8-15) - const __m256i rhs_hbit_0145_41 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0145_1, 4), m2), 4); //B40(8-15) B41(8-15) B44(8-15) B45(8-15) - const __m256i rhs_hbit_0145_61 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0145_1, 6), m2), 4); //B60(8-15) B61(8-15) B64(8-15) B65(8-15) + const __m256i rhs_hbit_0145_01 = _mm256_slli_epi16(_mm256_and_si256(rhs_raw_hbit_0145_1, m2), 4); //Index : 8 - 15 + const __m256i rhs_hbit_0145_21 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0145_1, 2), m2), 4); //Index : 40 - 47 + const __m256i rhs_hbit_0145_41 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0145_1, 4), m2), 4); //Index : 72 - 79 + const __m256i rhs_hbit_0145_61 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0145_1, 6), m2), 4); //Index : 104 - 111 - const __m256i rhs_hbit_2367_01 = _mm256_slli_epi16(_mm256_and_si256(rhs_raw_hbit_2367_1, m2), 4); //B02(8-15) B03(8-15) B06(8-15) B07(8-15) - const __m256i rhs_hbit_2367_21 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_2367_1, 2), m2), 4); //B22(8-15) B23(8-15) B26(8-15) B27(8-15) - const __m256i rhs_hbit_2367_41 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_2367_1, 4), m2), 4); //B42(8-15) B43(8-15) B46(8-15) B47(8-15) - const __m256i rhs_hbit_2367_61 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_2367_1, 6), m2), 4); //B62(8-15) B63(8-15) B66(8-15) B67(8-15) + const __m256i rhs_hbit_2367_01 = _mm256_slli_epi16(_mm256_and_si256(rhs_raw_hbit_2367_1, m2), 4); //Index : 8 - 15 + const __m256i rhs_hbit_2367_21 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_2367_1, 2), m2), 4); //Index : 40 - 47 + const __m256i rhs_hbit_2367_41 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_2367_1, 4), m2), 4); //Index : 72 - 79 + const __m256i rhs_hbit_2367_61 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_2367_1, 6), m2), 4); //Index : 104 - 111 // Values of the 1st,3rd,5th,7th sub blocks of eight block_q6_K structures for the sb loop - const __m256i rhs_hbit_0145_10 = _mm256_slli_epi16(_mm256_and_si256(rhs_raw_hbit_0145_2, m2), 4); //B10(0-7) B11(0-7) B14(0-7) B15(0-7) - const __m256i rhs_hbit_0145_30 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0145_2, 2), m2), 4); //B30(0-7) B31(0-7) B34(0-7) B35(0-7) - const __m256i rhs_hbit_0145_50 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0145_2, 4), m2), 4); //B50(0-7) B51(0-7) B54(0-7) B55(0-7) - const __m256i rhs_hbit_0145_70 = 
_mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0145_2, 6), m2), 4); //B70(0-7) B71(0-7) B74(0-7) B75(0-7) + const __m256i rhs_hbit_0145_10 = _mm256_slli_epi16(_mm256_and_si256(rhs_raw_hbit_0145_2, m2), 4); //Index : 16 - 23 + const __m256i rhs_hbit_0145_30 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0145_2, 2), m2), 4); //Index : 48 - 55 + const __m256i rhs_hbit_0145_50 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0145_2, 4), m2), 4); //Index : 80 - 87 + const __m256i rhs_hbit_0145_70 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0145_2, 6), m2), 4); //Index : 112 - 119 - const __m256i rhs_hbit_2367_10 = _mm256_slli_epi16(_mm256_and_si256(rhs_raw_hbit_2367_2, m2), 4); //B12(0-7) B13(0-7) B16(0-7) B17(0-7) - const __m256i rhs_hbit_2367_30 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_2367_2, 2), m2), 4); //B32(0-7) B33(0-7) B36(0-7) B37(0-7) - const __m256i rhs_hbit_2367_50 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_2367_2, 4), m2), 4); //B52(0-7) B53(0-7) B56(0-7) B57(0-7) - const __m256i rhs_hbit_2367_70 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_2367_2, 6), m2), 4); //B72(0-7) B73(0-7) B76(0-7) B77(0-7) + const __m256i rhs_hbit_2367_10 = _mm256_slli_epi16(_mm256_and_si256(rhs_raw_hbit_2367_2, m2), 4); //Index : 16 - 23 + const __m256i rhs_hbit_2367_30 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_2367_2, 2), m2), 4); //Index : 48 - 55 + const __m256i rhs_hbit_2367_50 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_2367_2, 4), m2), 4); //Index : 80 - 87 + const __m256i rhs_hbit_2367_70 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_2367_2, 6), m2), 4); //Index : 112 - 119 - const __m256i rhs_hbit_0145_11 = _mm256_slli_epi16(_mm256_and_si256(rhs_raw_hbit_0145_3, m2), 4); //B10(8-15) B11(8-15) B14(8-15) B15(8-15) - const __m256i rhs_hbit_0145_31 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0145_3, 2), m2), 4); //B30(8-15) B31(8-15) B34(8-15) B35(8-15) - const __m256i rhs_hbit_0145_51 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0145_3, 4), m2), 4); //B50(8-15) B51(8-15) B54(8-15) B55(8-15) - const __m256i rhs_hbit_0145_71 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0145_3, 6), m2), 4); //B70(8-15) B71(8-15) B74(8-15) B75(8-15) + const __m256i rhs_hbit_0145_11 = _mm256_slli_epi16(_mm256_and_si256(rhs_raw_hbit_0145_3, m2), 4); //Index : 24 - 31 + const __m256i rhs_hbit_0145_31 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0145_3, 2), m2), 4); //Index : 56 - 63 + const __m256i rhs_hbit_0145_51 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0145_3, 4), m2), 4); //Index : 88 - 95 + const __m256i rhs_hbit_0145_71 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0145_3, 6), m2), 4); //Index : 120 - 127 - const __m256i rhs_hbit_2367_11 = _mm256_slli_epi16(_mm256_and_si256(rhs_raw_hbit_2367_3, m2), 4); //B12(8-15) B13(8-15) B16(8-15) B17(8-15) - const __m256i rhs_hbit_2367_31 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_2367_3, 2), m2), 4); //B32(8-15) B33(8-15) B36(8-15) B37(8-15) - const __m256i rhs_hbit_2367_51 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_2367_3, 4), m2), 4); //B52(8-15) B53(8-15) B56(8-15) B57(8-15) - const __m256i rhs_hbit_2367_71 = 
_mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_2367_3, 6), m2), 4); //B72(8-15) B73(8-15) B76(8-15) B77(8-15) + const __m256i rhs_hbit_2367_11 = _mm256_slli_epi16(_mm256_and_si256(rhs_raw_hbit_2367_3, m2), 4); //Index : 24 - 31 + const __m256i rhs_hbit_2367_31 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_2367_3, 2), m2), 4); //Index : 56 - 63 + const __m256i rhs_hbit_2367_51 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_2367_3, 4), m2), 4); //Index : 88 - 95 + const __m256i rhs_hbit_2367_71 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_2367_3, 6), m2), 4); //Index : 120 - 127 // 0 -7, 64 - 71 const __m256i rhs_mat_0145_00 = _mm256_or_si256(_mm256_and_si256(rhs_raw_mat_0145_0, m4), rhs_hbit_0145_00); @@ -9535,46 +9547,46 @@ void ggml_gemm_q6_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo // 2-bit -> 8-bit // Values of the 0th,2nd,4th,6th sub blocks of eight block_q6_K structures for the sb loop - const __m256i rhs_hbit_0145_00 = _mm256_slli_epi16(_mm256_and_si256(rhs_raw_hbit_0145_0, m2), 4); //B00(0-7) B01(0-7) B04(0-7) B05(0-7) - const __m256i rhs_hbit_0145_20 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0145_0, 2), m2), 4); //B20(0-7) B21(0-7) B24(0-7) B25(0-7) - const __m256i rhs_hbit_0145_40 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0145_0, 4), m2), 4); //B40(0-7) B41(0-7) B44(0-7) B45(0-7) - const __m256i rhs_hbit_0145_60 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0145_0, 6), m2), 4); //B60(0-7) B61(0-7) B64(0-7) B65(0-7) + const __m256i rhs_hbit_0145_00 = _mm256_slli_epi16(_mm256_and_si256(rhs_raw_hbit_0145_0, m2), 4); //Index : 0 - 7 + const __m256i rhs_hbit_0145_20 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0145_0, 2), m2), 4); //Index : 32 - 39 + const __m256i rhs_hbit_0145_40 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0145_0, 4), m2), 4); //Index : 64 - 71 + const __m256i rhs_hbit_0145_60 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0145_0, 6), m2), 4); //Index : 96 - 103 - const __m256i rhs_hbit_2367_00 = _mm256_slli_epi16(_mm256_and_si256(rhs_raw_hbit_2367_0, m2), 4); //B02(0-7) B03(0-7) B06(0-7) B07(0-7) - const __m256i rhs_hbit_2367_20 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_2367_0, 2), m2), 4); //B22(0-7) B23(0-7) B26(0-7) B27(0-7) - const __m256i rhs_hbit_2367_40 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_2367_0, 4), m2), 4); //B42(0-7) B43(0-7) B46(0-7) B47(0-7) - const __m256i rhs_hbit_2367_60 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_2367_0, 6), m2), 4); //B62(0-7) B63(0-7) B66(0-7) B67(0-7) + const __m256i rhs_hbit_2367_00 = _mm256_slli_epi16(_mm256_and_si256(rhs_raw_hbit_2367_0, m2), 4); //Index : 0 - 7 + const __m256i rhs_hbit_2367_20 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_2367_0, 2), m2), 4); //Index : 32 - 39 + const __m256i rhs_hbit_2367_40 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_2367_0, 4), m2), 4); //Index : 64 - 71 + const __m256i rhs_hbit_2367_60 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_2367_0, 6), m2), 4); //Index : 96 - 103 - const __m256i rhs_hbit_0145_01 = _mm256_slli_epi16(_mm256_and_si256(rhs_raw_hbit_0145_1, m2), 4); //B00(8-15) B01(8-15) B04(8-15) B05(8-15) - const __m256i rhs_hbit_0145_21 = 
_mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0145_1, 2), m2), 4); //B20(8-15) B21(8-15) B24(8-15) B25(8-15) - const __m256i rhs_hbit_0145_41 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0145_1, 4), m2), 4); //B40(8-15) B41(8-15) B44(8-15) B45(8-15) - const __m256i rhs_hbit_0145_61 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0145_1, 6), m2), 4); //B60(8-15) B61(8-15) B64(8-15) B65(8-15) + const __m256i rhs_hbit_0145_01 = _mm256_slli_epi16(_mm256_and_si256(rhs_raw_hbit_0145_1, m2), 4); //Index : 8 - 15 + const __m256i rhs_hbit_0145_21 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0145_1, 2), m2), 4); //Index : 40 - 47 + const __m256i rhs_hbit_0145_41 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0145_1, 4), m2), 4); //Index : 72 - 79 + const __m256i rhs_hbit_0145_61 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0145_1, 6), m2), 4); //Index : 104 - 111 - const __m256i rhs_hbit_2367_01 = _mm256_slli_epi16(_mm256_and_si256(rhs_raw_hbit_2367_1, m2), 4); //B02(8-15) B03(8-15) B06(8-15) B07(8-15) - const __m256i rhs_hbit_2367_21 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_2367_1, 2), m2), 4); //B22(8-15) B23(8-15) B26(8-15) B27(8-15) - const __m256i rhs_hbit_2367_41 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_2367_1, 4), m2), 4); //B42(8-15) B43(8-15) B46(8-15) B47(8-15) - const __m256i rhs_hbit_2367_61 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_2367_1, 6), m2), 4); //B62(8-15) B63(8-15) B66(8-15) B67(8-15) + const __m256i rhs_hbit_2367_01 = _mm256_slli_epi16(_mm256_and_si256(rhs_raw_hbit_2367_1, m2), 4); //Index : 8 - 15 + const __m256i rhs_hbit_2367_21 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_2367_1, 2), m2), 4); //Index : 40 - 47 + const __m256i rhs_hbit_2367_41 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_2367_1, 4), m2), 4); //Index : 72 - 79 + const __m256i rhs_hbit_2367_61 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_2367_1, 6), m2), 4); //Index : 104 - 111 // Values of the 1st,3rd,5th,7th sub blocks of eight block_q6_K structures for the sb loop - const __m256i rhs_hbit_0145_10 = _mm256_slli_epi16(_mm256_and_si256(rhs_raw_hbit_0145_2, m2), 4); //B10(0-7) B11(0-7) B14(0-7) B15(0-7) - const __m256i rhs_hbit_0145_30 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0145_2, 2), m2), 4); //B30(0-7) B31(0-7) B34(0-7) B35(0-7) - const __m256i rhs_hbit_0145_50 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0145_2, 4), m2), 4); //B50(0-7) B51(0-7) B54(0-7) B55(0-7) - const __m256i rhs_hbit_0145_70 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0145_2, 6), m2), 4); //B70(0-7) B71(0-7) B74(0-7) B75(0-7) + const __m256i rhs_hbit_0145_10 = _mm256_slli_epi16(_mm256_and_si256(rhs_raw_hbit_0145_2, m2), 4); //Index : 16 - 23 + const __m256i rhs_hbit_0145_30 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0145_2, 2), m2), 4); //Index : 48 - 55 + const __m256i rhs_hbit_0145_50 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0145_2, 4), m2), 4); //Index : 80 - 87 + const __m256i rhs_hbit_0145_70 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0145_2, 6), m2), 4); //Index : 112 - 119 - const __m256i rhs_hbit_2367_10 = _mm256_slli_epi16(_mm256_and_si256(rhs_raw_hbit_2367_2, m2), 4); //B12(0-7) B13(0-7) B16(0-7) 
B17(0-7) - const __m256i rhs_hbit_2367_30 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_2367_2, 2), m2), 4); //B32(0-7) B33(0-7) B36(0-7) B37(0-7) - const __m256i rhs_hbit_2367_50 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_2367_2, 4), m2), 4); //B52(0-7) B53(0-7) B56(0-7) B57(0-7) - const __m256i rhs_hbit_2367_70 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_2367_2, 6), m2), 4); //B72(0-7) B73(0-7) B76(0-7) B77(0-7) + const __m256i rhs_hbit_2367_10 = _mm256_slli_epi16(_mm256_and_si256(rhs_raw_hbit_2367_2, m2), 4); //Index : 16 - 23 + const __m256i rhs_hbit_2367_30 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_2367_2, 2), m2), 4); //Index : 48 - 55 + const __m256i rhs_hbit_2367_50 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_2367_2, 4), m2), 4); //Index : 80 - 87 + const __m256i rhs_hbit_2367_70 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_2367_2, 6), m2), 4); //Index : 112 - 119 - const __m256i rhs_hbit_0145_11 = _mm256_slli_epi16(_mm256_and_si256(rhs_raw_hbit_0145_3, m2), 4); //B10(8-15) B11(8-15) B14(8-15) B15(8-15) - const __m256i rhs_hbit_0145_31 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0145_3, 2), m2), 4); //B30(8-15) B31(8-15) B34(8-15) B35(8-15) - const __m256i rhs_hbit_0145_51 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0145_3, 4), m2), 4); //B50(8-15) B51(8-15) B54(8-15) B55(8-15) - const __m256i rhs_hbit_0145_71 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0145_3, 6), m2), 4); //B70(8-15) B71(8-15) B74(8-15) B75(8-15) + const __m256i rhs_hbit_0145_11 = _mm256_slli_epi16(_mm256_and_si256(rhs_raw_hbit_0145_3, m2), 4); //Index : 24 - 31 + const __m256i rhs_hbit_0145_31 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0145_3, 2), m2), 4); //Index : 56 - 63 + const __m256i rhs_hbit_0145_51 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0145_3, 4), m2), 4); //Index : 88 - 95 + const __m256i rhs_hbit_0145_71 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0145_3, 6), m2), 4); //Index : 120 - 127 - const __m256i rhs_hbit_2367_11 = _mm256_slli_epi16(_mm256_and_si256(rhs_raw_hbit_2367_3, m2), 4); //B12(8-15) B13(8-15) B16(8-15) B17(8-15) - const __m256i rhs_hbit_2367_31 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_2367_3, 2), m2), 4); //B32(8-15) B33(8-15) B36(8-15) B37(8-15) - const __m256i rhs_hbit_2367_51 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_2367_3, 4), m2), 4); //B52(8-15) B53(8-15) B56(8-15) B57(8-15) - const __m256i rhs_hbit_2367_71 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_2367_3, 6), m2), 4); //B72(8-15) B73(8-15) B76(8-15) B77(8-15) + const __m256i rhs_hbit_2367_11 = _mm256_slli_epi16(_mm256_and_si256(rhs_raw_hbit_2367_3, m2), 4); //Index : 24 - 31 + const __m256i rhs_hbit_2367_31 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_2367_3, 2), m2), 4); //Index : 56 - 63 + const __m256i rhs_hbit_2367_51 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_2367_3, 4), m2), 4); //Index : 88 - 95 + const __m256i rhs_hbit_2367_71 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_2367_3, 6), m2), 4); //Index : 120 - 127 // 0 -7, 64 - 71 const __m256i rhs_mat_0145_00 = _mm256_or_si256(_mm256_and_si256(rhs_raw_mat_0145_0, m4), rhs_hbit_0145_00); From e1c3c053c04c7147d5709ae384787aecad9d461f Mon 
Sep 17 00:00:00 2001 From: Srihari-mcw Date: Tue, 12 Aug 2025 17:23:20 +0530 Subject: [PATCH 09/23] Further cleanup of GEMM --- ggml/src/ggml-cpu/arch/x86/repack.cpp | 679 +++++++++++++------------- 1 file changed, 341 insertions(+), 338 deletions(-) diff --git a/ggml/src/ggml-cpu/arch/x86/repack.cpp b/ggml/src/ggml-cpu/arch/x86/repack.cpp index 0f3b6e40b3..52aa99f2bd 100644 --- a/ggml/src/ggml-cpu/arch/x86/repack.cpp +++ b/ggml/src/ggml-cpu/arch/x86/repack.cpp @@ -7489,7 +7489,7 @@ void ggml_gemm_q6_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo const __m512i lhs_mat_s_01_71_sp2 = _mm512_shuffle_epi32(lhs_mat_s_01_71, (_MM_PERM_ENUM)245); //A70(12-15) A70(12-15) A71(12-15) A71(12-15) const __m512i lhs_mat_s_23_71_sp2 = _mm512_shuffle_epi32(lhs_mat_s_23_71, (_MM_PERM_ENUM)245); //A72(12-15) A73(12-15) A72(12-15) A73(12-15) - // The values arranged in shuffle patterns are operated with dot product operation within 32 bit lane i.e corresponding bytes and multiplied and added into 32 bit integers within 32 bit lane + // The values arranged in shuffle patterns are operated with dot product operation within 16 bit lane i.e corresponding bytes and multiplied and added into 16 bit integers within 16 bit lane __m512i iacc_mat_00_0_sp1 = _mm512_add_epi16(_mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_00_sp1, lhs_mat_01_00_sp1), lhs_mat_s_01_00_sp1), _mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_01_sp1, lhs_mat_01_01_sp1), lhs_mat_s_01_01_sp1)); __m512i iacc_mat_01_0_sp1 = _mm512_add_epi16(_mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_00_sp1, lhs_mat_01_00_sp1), lhs_mat_s_01_00_sp1), _mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_01_sp1, lhs_mat_01_01_sp1), lhs_mat_s_01_01_sp1)); @@ -7627,7 +7627,7 @@ void ggml_gemm_q6_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo __m512i iacc_mat_10_7 = _mm512_add_epi16(iacc_mat_10_7_sp1, iacc_mat_10_7_sp2); __m512i iacc_mat_11_7 = _mm512_add_epi16(iacc_mat_11_7_sp1, iacc_mat_11_7_sp2); - // Output of both shuffle patterns are added in order to sum dot product outputs of all 32 values in block + // Multiply madd of quants with scales iacc_mat_00_0 = _mm512_madd_epi16(iacc_mat_00_0, scale_014589CD_0); iacc_mat_01_0 = _mm512_madd_epi16(iacc_mat_01_0, scale_2367ABEF_0); iacc_mat_10_0 = _mm512_madd_epi16(iacc_mat_10_0, scale_014589CD_0); @@ -7668,7 +7668,6 @@ void ggml_gemm_q6_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo iacc_mat_10_7 = _mm512_madd_epi16(iacc_mat_10_7, scale_014589CD_7); iacc_mat_11_7 = _mm512_madd_epi16(iacc_mat_11_7, scale_2367ABEF_7); - __m512i iacc_mat_00 = _mm512_add_epi32(_mm512_add_epi32(_mm512_add_epi32(iacc_mat_00_0, iacc_mat_00_1), _mm512_add_epi32(iacc_mat_00_2, iacc_mat_00_3)), _mm512_add_epi32(_mm512_add_epi32(iacc_mat_00_4, iacc_mat_00_5), _mm512_add_epi32(iacc_mat_00_6, iacc_mat_00_7))); __m512i iacc_mat_01 = _mm512_add_epi32(_mm512_add_epi32(_mm512_add_epi32(iacc_mat_01_0, iacc_mat_01_1), _mm512_add_epi32(iacc_mat_01_2, iacc_mat_01_3)), _mm512_add_epi32(_mm512_add_epi32(iacc_mat_01_4, iacc_mat_01_5), _mm512_add_epi32(iacc_mat_01_6, iacc_mat_01_7))); __m512i iacc_mat_10 = _mm512_add_epi32(_mm512_add_epi32(_mm512_add_epi32(iacc_mat_10_0, iacc_mat_10_1), _mm512_add_epi32(iacc_mat_10_2, iacc_mat_10_3)), _mm512_add_epi32(_mm512_add_epi32(iacc_mat_10_4, iacc_mat_10_5), _mm512_add_epi32(iacc_mat_10_6, iacc_mat_10_7))); @@ -7706,7 +7705,7 @@ void ggml_gemm_q6_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo const block_q8_Kx4 * a_ptr = a_ptr_start 
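// A minimal scalar sketch of the accumulation pattern, assuming the lhs_mat_s_*
// terms hold 32 times the pairwise sums of the activation bytes:
//   sum((q - 32) * a) == sum(q * a) - 32 * sum(a)
// which lets maddubs operate on the unsigned 6-bit weights directly; madd_epi16
// then multiplies each 16-bit partial sum by its sub-block scale into 32 bits.
static inline int q6_dot_pair_model(unsigned char q0, unsigned char q1,
                                    signed char a0, signed char a1, short scale) {
    const int prod = q0 * (int)a0 + q1 * (int)a1;   // one 16-bit lane of maddubs
    const int bias = 32 * ((int)a0 + (int)a1);      // assumed content of the lhs_mat_s term
    return (prod - bias) * (int)scale;              // the madd-by-scale step
}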
+ (y * nb); - // Take group of eight block_q6_kx8 structures at each pass of the loop and perform dot product operation + // Take group of two block_q6_kx8 structures at each pass of the loop and perform dot product operation for (int64_t x = 0; x < anc / 8; x += 2) { const block_q6_Kx8 * b_ptr_0 = b_ptr_start + ((x) * b_nb); @@ -7720,45 +7719,49 @@ void ggml_gemm_q6_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo // For super block for (int64_t b = 0; b < nb; b++) { - // Delta values - Load the sixteen scale values from two block_q2_kx8 structures + // Delta values - Load the sixteen scale values from two block_q6_kx8 structures const __m512 col_scale_f32 = GGML_F32Cx8x2_LOAD(b_ptr_0[b].d, b_ptr_1[b].d); for (int sb = 0; sb < QK_K / 128; sb++) { - const __m256i rhs_raw_mat_0123_0 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].ql + sb * 512)); - const __m256i rhs_raw_mat_4567_0 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].ql + 32 + sb * 512)); - const __m256i rhs_raw_mat_0123_1 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].ql + 64 + sb * 512)); - const __m256i rhs_raw_mat_4567_1 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].ql + 96 + sb * 512)); - const __m256i rhs_raw_mat_0123_2 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].ql + 128 + sb * 512)); - const __m256i rhs_raw_mat_4567_2 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].ql + 160 + sb * 512)); - const __m256i rhs_raw_mat_0123_3 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].ql + 192 + sb * 512)); - const __m256i rhs_raw_mat_4567_3 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].ql + 224 + sb * 512)); - const __m256i rhs_raw_mat_0123_4 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].ql + 256 + sb * 512)); - const __m256i rhs_raw_mat_4567_4 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].ql + 288 + sb * 512)); - const __m256i rhs_raw_mat_0123_5 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].ql + 320 + sb * 512)); - const __m256i rhs_raw_mat_4567_5 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].ql + 352 + sb * 512)); - const __m256i rhs_raw_mat_0123_6 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].ql + 384 + sb * 512)); - const __m256i rhs_raw_mat_4567_6 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].ql + 416 + sb * 512)); - const __m256i rhs_raw_mat_0123_7 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].ql + 448 + sb * 512)); - const __m256i rhs_raw_mat_4567_7 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].ql + 480 + sb * 512)); + // Load the sixteen block_q6_k for eight sub blocks quantized values interleaved with each other in chunks of eight bytes - B0,B1 ....B6,B7 + // The lower and higher packed bits are loaded, unpacked and individual bytes representing 6 bits each are formed from the same + // They are blended/permuted for further mul mat operations within the pipeline + const __m256i rhs_raw_lbit_0123_0 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].ql + sb * 512)); + const __m256i rhs_raw_lbit_4567_0 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].ql + 32 + sb * 512)); + const __m256i rhs_raw_lbit_0123_1 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].ql + 64 + sb * 512)); + const __m256i rhs_raw_lbit_4567_1 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].ql + 96 + sb * 512)); + const __m256i rhs_raw_lbit_0123_2 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].ql + 128 + sb * 512)); + const __m256i rhs_raw_lbit_4567_2 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].ql + 160 + sb * 512)); + const __m256i rhs_raw_lbit_0123_3 = _mm256_loadu_si256((const 
__m256i *)(b_ptr_0[b].ql + 192 + sb * 512)); + const __m256i rhs_raw_lbit_4567_3 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].ql + 224 + sb * 512)); - const __m256i rhs_raw_mat_89AB_0 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].ql + sb * 512)); - const __m256i rhs_raw_mat_CDEF_0 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].ql + 32 + sb * 512)); - const __m256i rhs_raw_mat_89AB_1 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].ql + 64 + sb * 512)); - const __m256i rhs_raw_mat_CDEF_1 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].ql + 96 + sb * 512)); - const __m256i rhs_raw_mat_89AB_2 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].ql + 128 + sb * 512)); - const __m256i rhs_raw_mat_CDEF_2 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].ql + 160 + sb * 512)); - const __m256i rhs_raw_mat_89AB_3 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].ql + 192 + sb * 512)); - const __m256i rhs_raw_mat_CDEF_3 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].ql + 224 + sb * 512)); + const __m256i rhs_raw_lbit_0123_4 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].ql + 256 + sb * 512)); + const __m256i rhs_raw_lbit_4567_4 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].ql + 288 + sb * 512)); + const __m256i rhs_raw_lbit_0123_5 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].ql + 320 + sb * 512)); + const __m256i rhs_raw_lbit_4567_5 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].ql + 352 + sb * 512)); + const __m256i rhs_raw_lbit_0123_6 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].ql + 384 + sb * 512)); + const __m256i rhs_raw_lbit_4567_6 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].ql + 416 + sb * 512)); + const __m256i rhs_raw_lbit_0123_7 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].ql + 448 + sb * 512)); + const __m256i rhs_raw_lbit_4567_7 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].ql + 480 + sb * 512)); - const __m256i rhs_raw_mat_89AB_4 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].ql + 256 + sb * 512)); - const __m256i rhs_raw_mat_CDEF_4 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].ql + 288 + sb * 512)); - const __m256i rhs_raw_mat_89AB_5 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].ql + 320 + sb * 512)); - const __m256i rhs_raw_mat_CDEF_5 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].ql + 352 + sb * 512)); - const __m256i rhs_raw_mat_89AB_6 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].ql + 384 + sb * 512)); - const __m256i rhs_raw_mat_CDEF_6 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].ql + 416 + sb * 512)); - const __m256i rhs_raw_mat_89AB_7 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].ql + 448 + sb * 512)); - const __m256i rhs_raw_mat_CDEF_7 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].ql + 480 + sb * 512)); + const __m256i rhs_raw_lbit_89AB_0 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].ql + sb * 512)); + const __m256i rhs_raw_lbit_CDEF_0 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].ql + 32 + sb * 512)); + const __m256i rhs_raw_lbit_89AB_1 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].ql + 64 + sb * 512)); + const __m256i rhs_raw_lbit_CDEF_1 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].ql + 96 + sb * 512)); + const __m256i rhs_raw_lbit_89AB_2 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].ql + 128 + sb * 512)); + const __m256i rhs_raw_lbit_CDEF_2 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].ql + 160 + sb * 512)); + const __m256i rhs_raw_lbit_89AB_3 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].ql + 192 + sb * 512)); + const __m256i rhs_raw_lbit_CDEF_3 = 
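// A minimal sketch of the ql addressing used by the loads above, assuming the
// repacked block_q6_Kx8 stores the low bits as 8-byte runs interleaved across
// the eight source blocks: each 32-byte load then covers four blocks, sixteen
// loads cover one 128-value half, and sb selects which 512-byte half is read.
static inline const unsigned char * q6x8_ql_chunk(const unsigned char * ql, int sb, int chunk) {
    // sb: 0 or 1 (the QK_K / 128 halves), chunk: 0..15 (32-byte chunks per half)
    return ql + sb * 512 + chunk * 32;
}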
_mm256_loadu_si256((const __m256i *)(b_ptr_1[b].ql + 224 + sb * 512)); + + const __m256i rhs_raw_lbit_89AB_4 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].ql + 256 + sb * 512)); + const __m256i rhs_raw_lbit_CDEF_4 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].ql + 288 + sb * 512)); + const __m256i rhs_raw_lbit_89AB_5 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].ql + 320 + sb * 512)); + const __m256i rhs_raw_lbit_CDEF_5 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].ql + 352 + sb * 512)); + const __m256i rhs_raw_lbit_89AB_6 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].ql + 384 + sb * 512)); + const __m256i rhs_raw_lbit_CDEF_6 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].ql + 416 + sb * 512)); + const __m256i rhs_raw_lbit_89AB_7 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].ql + 448 + sb * 512)); + const __m256i rhs_raw_lbit_CDEF_7 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].ql + 480 + sb * 512)); const __m256i rhs_raw_hbit_0123_0 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].qh + sb * 256)); const __m256i rhs_raw_hbit_4567_0 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].qh + 32 + sb * 256)); @@ -7778,45 +7781,41 @@ void ggml_gemm_q6_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo const __m256i rhs_raw_hbit_89AB_3 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].qh + 192 + sb * 256)); const __m256i rhs_raw_hbit_CDEF_3 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].qh + 224 + sb * 256)); - // Indices 0 through 7 (first block): - const __m256i rhs_raw_mat_0145_0 = _mm256_blend_epi32(rhs_raw_mat_0123_0, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_0, requiredOrder), 240); - const __m256i rhs_raw_mat_2367_0 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_0, requiredOrder), rhs_raw_mat_4567_0, 240); - const __m256i rhs_raw_mat_0145_1 = _mm256_blend_epi32(rhs_raw_mat_0123_1, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_1, requiredOrder), 240); - const __m256i rhs_raw_mat_2367_1 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_1, requiredOrder), rhs_raw_mat_4567_1, 240); - const __m256i rhs_raw_mat_0145_2 = _mm256_blend_epi32(rhs_raw_mat_0123_2, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_2, requiredOrder), 240); - const __m256i rhs_raw_mat_2367_2 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_2, requiredOrder), rhs_raw_mat_4567_2, 240); - const __m256i rhs_raw_mat_0145_3 = _mm256_blend_epi32(rhs_raw_mat_0123_3, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_3, requiredOrder), 240); - const __m256i rhs_raw_mat_2367_3 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_3, requiredOrder), rhs_raw_mat_4567_3, 240); + const __m256i rhs_raw_lbit_0145_0 = _mm256_blend_epi32(rhs_raw_lbit_0123_0, _mm256_permutevar8x32_epi32(rhs_raw_lbit_4567_0, requiredOrder), 240); + const __m256i rhs_raw_lbit_2367_0 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_lbit_0123_0, requiredOrder), rhs_raw_lbit_4567_0, 240); + const __m256i rhs_raw_lbit_0145_1 = _mm256_blend_epi32(rhs_raw_lbit_0123_1, _mm256_permutevar8x32_epi32(rhs_raw_lbit_4567_1, requiredOrder), 240); + const __m256i rhs_raw_lbit_2367_1 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_lbit_0123_1, requiredOrder), rhs_raw_lbit_4567_1, 240); + const __m256i rhs_raw_lbit_0145_2 = _mm256_blend_epi32(rhs_raw_lbit_0123_2, _mm256_permutevar8x32_epi32(rhs_raw_lbit_4567_2, requiredOrder), 240); + const __m256i rhs_raw_lbit_2367_2 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_lbit_0123_2, requiredOrder), 
rhs_raw_lbit_4567_2, 240); + const __m256i rhs_raw_lbit_0145_3 = _mm256_blend_epi32(rhs_raw_lbit_0123_3, _mm256_permutevar8x32_epi32(rhs_raw_lbit_4567_3, requiredOrder), 240); + const __m256i rhs_raw_lbit_2367_3 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_lbit_0123_3, requiredOrder), rhs_raw_lbit_4567_3, 240); - // Indices 4 through 7 (second block): - const __m256i rhs_raw_mat_0145_4 = _mm256_blend_epi32(rhs_raw_mat_0123_4, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_4, requiredOrder), 240); - const __m256i rhs_raw_mat_2367_4 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_4, requiredOrder), rhs_raw_mat_4567_4, 240); - const __m256i rhs_raw_mat_0145_5 = _mm256_blend_epi32(rhs_raw_mat_0123_5, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_5, requiredOrder), 240); - const __m256i rhs_raw_mat_2367_5 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_5, requiredOrder), rhs_raw_mat_4567_5, 240); - const __m256i rhs_raw_mat_0145_6 = _mm256_blend_epi32(rhs_raw_mat_0123_6, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_6, requiredOrder), 240); - const __m256i rhs_raw_mat_2367_6 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_6, requiredOrder), rhs_raw_mat_4567_6, 240); - const __m256i rhs_raw_mat_0145_7 = _mm256_blend_epi32(rhs_raw_mat_0123_7, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_7, requiredOrder), 240); - const __m256i rhs_raw_mat_2367_7 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_7, requiredOrder), rhs_raw_mat_4567_7, 240); + const __m256i rhs_raw_lbit_0145_4 = _mm256_blend_epi32(rhs_raw_lbit_0123_4, _mm256_permutevar8x32_epi32(rhs_raw_lbit_4567_4, requiredOrder), 240); + const __m256i rhs_raw_lbit_2367_4 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_lbit_0123_4, requiredOrder), rhs_raw_lbit_4567_4, 240); + const __m256i rhs_raw_lbit_0145_5 = _mm256_blend_epi32(rhs_raw_lbit_0123_5, _mm256_permutevar8x32_epi32(rhs_raw_lbit_4567_5, requiredOrder), 240); + const __m256i rhs_raw_lbit_2367_5 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_lbit_0123_5, requiredOrder), rhs_raw_lbit_4567_5, 240); + const __m256i rhs_raw_lbit_0145_6 = _mm256_blend_epi32(rhs_raw_lbit_0123_6, _mm256_permutevar8x32_epi32(rhs_raw_lbit_4567_6, requiredOrder), 240); + const __m256i rhs_raw_lbit_2367_6 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_lbit_0123_6, requiredOrder), rhs_raw_lbit_4567_6, 240); + const __m256i rhs_raw_lbit_0145_7 = _mm256_blend_epi32(rhs_raw_lbit_0123_7, _mm256_permutevar8x32_epi32(rhs_raw_lbit_4567_7, requiredOrder), 240); + const __m256i rhs_raw_lbit_2367_7 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_lbit_0123_7, requiredOrder), rhs_raw_lbit_4567_7, 240); - // Indices 8 through F (first block): - const __m256i rhs_raw_mat_89CD_0 = _mm256_blend_epi32(rhs_raw_mat_89AB_0, _mm256_permutevar8x32_epi32(rhs_raw_mat_CDEF_0, requiredOrder), 240); - const __m256i rhs_raw_mat_ABEF_0 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_89AB_0, requiredOrder), rhs_raw_mat_CDEF_0, 240); - const __m256i rhs_raw_mat_89CD_1 = _mm256_blend_epi32(rhs_raw_mat_89AB_1, _mm256_permutevar8x32_epi32(rhs_raw_mat_CDEF_1, requiredOrder), 240); - const __m256i rhs_raw_mat_ABEF_1 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_89AB_1, requiredOrder), rhs_raw_mat_CDEF_1, 240); - const __m256i rhs_raw_mat_89CD_2 = _mm256_blend_epi32(rhs_raw_mat_89AB_2, _mm256_permutevar8x32_epi32(rhs_raw_mat_CDEF_2, requiredOrder), 240); - const __m256i rhs_raw_mat_ABEF_2 = 
_mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_89AB_2, requiredOrder), rhs_raw_mat_CDEF_2, 240); - const __m256i rhs_raw_mat_89CD_3 = _mm256_blend_epi32(rhs_raw_mat_89AB_3, _mm256_permutevar8x32_epi32(rhs_raw_mat_CDEF_3, requiredOrder), 240); - const __m256i rhs_raw_mat_ABEF_3 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_89AB_3, requiredOrder), rhs_raw_mat_CDEF_3, 240); + const __m256i rhs_raw_lbit_89CD_0 = _mm256_blend_epi32(rhs_raw_lbit_89AB_0, _mm256_permutevar8x32_epi32(rhs_raw_lbit_CDEF_0, requiredOrder), 240); + const __m256i rhs_raw_lbit_ABEF_0 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_lbit_89AB_0, requiredOrder), rhs_raw_lbit_CDEF_0, 240); + const __m256i rhs_raw_lbit_89CD_1 = _mm256_blend_epi32(rhs_raw_lbit_89AB_1, _mm256_permutevar8x32_epi32(rhs_raw_lbit_CDEF_1, requiredOrder), 240); + const __m256i rhs_raw_lbit_ABEF_1 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_lbit_89AB_1, requiredOrder), rhs_raw_lbit_CDEF_1, 240); + const __m256i rhs_raw_lbit_89CD_2 = _mm256_blend_epi32(rhs_raw_lbit_89AB_2, _mm256_permutevar8x32_epi32(rhs_raw_lbit_CDEF_2, requiredOrder), 240); + const __m256i rhs_raw_lbit_ABEF_2 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_lbit_89AB_2, requiredOrder), rhs_raw_lbit_CDEF_2, 240); + const __m256i rhs_raw_lbit_89CD_3 = _mm256_blend_epi32(rhs_raw_lbit_89AB_3, _mm256_permutevar8x32_epi32(rhs_raw_lbit_CDEF_3, requiredOrder), 240); + const __m256i rhs_raw_lbit_ABEF_3 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_lbit_89AB_3, requiredOrder), rhs_raw_lbit_CDEF_3, 240); - // Indices 8 through F (second block): - const __m256i rhs_raw_mat_89CD_4 = _mm256_blend_epi32(rhs_raw_mat_89AB_4, _mm256_permutevar8x32_epi32(rhs_raw_mat_CDEF_4, requiredOrder), 240); - const __m256i rhs_raw_mat_ABEF_4 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_89AB_4, requiredOrder), rhs_raw_mat_CDEF_4, 240); - const __m256i rhs_raw_mat_89CD_5 = _mm256_blend_epi32(rhs_raw_mat_89AB_5, _mm256_permutevar8x32_epi32(rhs_raw_mat_CDEF_5, requiredOrder), 240); - const __m256i rhs_raw_mat_ABEF_5 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_89AB_5, requiredOrder), rhs_raw_mat_CDEF_5, 240); - const __m256i rhs_raw_mat_89CD_6 = _mm256_blend_epi32(rhs_raw_mat_89AB_6, _mm256_permutevar8x32_epi32(rhs_raw_mat_CDEF_6, requiredOrder), 240); - const __m256i rhs_raw_mat_ABEF_6 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_89AB_6, requiredOrder), rhs_raw_mat_CDEF_6, 240); - const __m256i rhs_raw_mat_89CD_7 = _mm256_blend_epi32(rhs_raw_mat_89AB_7, _mm256_permutevar8x32_epi32(rhs_raw_mat_CDEF_7, requiredOrder), 240); - const __m256i rhs_raw_mat_ABEF_7 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_89AB_7, requiredOrder), rhs_raw_mat_CDEF_7, 240); + const __m256i rhs_raw_lbit_89CD_4 = _mm256_blend_epi32(rhs_raw_lbit_89AB_4, _mm256_permutevar8x32_epi32(rhs_raw_lbit_CDEF_4, requiredOrder), 240); + const __m256i rhs_raw_lbit_ABEF_4 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_lbit_89AB_4, requiredOrder), rhs_raw_lbit_CDEF_4, 240); + const __m256i rhs_raw_lbit_89CD_5 = _mm256_blend_epi32(rhs_raw_lbit_89AB_5, _mm256_permutevar8x32_epi32(rhs_raw_lbit_CDEF_5, requiredOrder), 240); + const __m256i rhs_raw_lbit_ABEF_5 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_lbit_89AB_5, requiredOrder), rhs_raw_lbit_CDEF_5, 240); + const __m256i rhs_raw_lbit_89CD_6 = _mm256_blend_epi32(rhs_raw_lbit_89AB_6, _mm256_permutevar8x32_epi32(rhs_raw_lbit_CDEF_6, requiredOrder), 240); + const 
__m256i rhs_raw_lbit_ABEF_6 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_lbit_89AB_6, requiredOrder), rhs_raw_lbit_CDEF_6, 240); + const __m256i rhs_raw_lbit_89CD_7 = _mm256_blend_epi32(rhs_raw_lbit_89AB_7, _mm256_permutevar8x32_epi32(rhs_raw_lbit_CDEF_7, requiredOrder), 240); + const __m256i rhs_raw_lbit_ABEF_7 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_lbit_89AB_7, requiredOrder), rhs_raw_lbit_CDEF_7, 240); const __m256i rhs_raw_hbit_0145_0 = _mm256_blend_epi32(rhs_raw_hbit_0123_0, _mm256_permutevar8x32_epi32(rhs_raw_hbit_4567_0, requiredOrder), 240); const __m256i rhs_raw_hbit_2367_0 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_hbit_0123_0, requiredOrder), rhs_raw_hbit_4567_0, 240); @@ -7836,25 +7835,25 @@ void ggml_gemm_q6_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo const __m256i rhs_raw_hbit_89CD_3 = _mm256_blend_epi32(rhs_raw_hbit_89AB_3, _mm256_permutevar8x32_epi32(rhs_raw_hbit_CDEF_3, requiredOrder), 240); const __m256i rhs_raw_hbit_ABEF_3 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_hbit_89AB_3, requiredOrder), rhs_raw_hbit_CDEF_3, 240); - const __m512i rhs_raw_mat_014589CD_0 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_mat_0145_0), rhs_raw_mat_89CD_0, 1); - const __m512i rhs_raw_mat_2367ABEF_0 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_mat_2367_0), rhs_raw_mat_ABEF_0, 1); - const __m512i rhs_raw_mat_014589CD_1 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_mat_0145_1), rhs_raw_mat_89CD_1, 1); - const __m512i rhs_raw_mat_2367ABEF_1 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_mat_2367_1), rhs_raw_mat_ABEF_1, 1); + const __m512i rhs_raw_lbit_014589CD_0 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_lbit_0145_0), rhs_raw_lbit_89CD_0, 1); + const __m512i rhs_raw_lbit_2367ABEF_0 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_lbit_2367_0), rhs_raw_lbit_ABEF_0, 1); + const __m512i rhs_raw_lbit_014589CD_1 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_lbit_0145_1), rhs_raw_lbit_89CD_1, 1); + const __m512i rhs_raw_lbit_2367ABEF_1 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_lbit_2367_1), rhs_raw_lbit_ABEF_1, 1); - const __m512i rhs_raw_mat_014589CD_2 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_mat_0145_2), rhs_raw_mat_89CD_2, 1); - const __m512i rhs_raw_mat_2367ABEF_2 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_mat_2367_2), rhs_raw_mat_ABEF_2, 1); - const __m512i rhs_raw_mat_014589CD_3 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_mat_0145_3), rhs_raw_mat_89CD_3, 1); - const __m512i rhs_raw_mat_2367ABEF_3 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_mat_2367_3), rhs_raw_mat_ABEF_3, 1); + const __m512i rhs_raw_lbit_014589CD_2 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_lbit_0145_2), rhs_raw_lbit_89CD_2, 1); + const __m512i rhs_raw_lbit_2367ABEF_2 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_lbit_2367_2), rhs_raw_lbit_ABEF_2, 1); + const __m512i rhs_raw_lbit_014589CD_3 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_lbit_0145_3), rhs_raw_lbit_89CD_3, 1); + const __m512i rhs_raw_lbit_2367ABEF_3 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_lbit_2367_3), rhs_raw_lbit_ABEF_3, 1); - const __m512i rhs_raw_mat_014589CD_4 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_mat_0145_4), rhs_raw_mat_89CD_4, 1); - const __m512i rhs_raw_mat_2367ABEF_4 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_mat_2367_4), rhs_raw_mat_ABEF_4, 1); - const __m512i rhs_raw_mat_014589CD_5 = 
_mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_mat_0145_5), rhs_raw_mat_89CD_5, 1); - const __m512i rhs_raw_mat_2367ABEF_5 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_mat_2367_5), rhs_raw_mat_ABEF_5, 1); + const __m512i rhs_raw_lbit_014589CD_4 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_lbit_0145_4), rhs_raw_lbit_89CD_4, 1); + const __m512i rhs_raw_lbit_2367ABEF_4 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_lbit_2367_4), rhs_raw_lbit_ABEF_4, 1); + const __m512i rhs_raw_lbit_014589CD_5 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_lbit_0145_5), rhs_raw_lbit_89CD_5, 1); + const __m512i rhs_raw_lbit_2367ABEF_5 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_lbit_2367_5), rhs_raw_lbit_ABEF_5, 1); - const __m512i rhs_raw_mat_014589CD_6 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_mat_0145_6), rhs_raw_mat_89CD_6, 1); - const __m512i rhs_raw_mat_2367ABEF_6 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_mat_2367_6), rhs_raw_mat_ABEF_6, 1); - const __m512i rhs_raw_mat_014589CD_7 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_mat_0145_7), rhs_raw_mat_89CD_7, 1); - const __m512i rhs_raw_mat_2367ABEF_7 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_mat_2367_7), rhs_raw_mat_ABEF_7, 1); + const __m512i rhs_raw_lbit_014589CD_6 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_lbit_0145_6), rhs_raw_lbit_89CD_6, 1); + const __m512i rhs_raw_lbit_2367ABEF_6 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_lbit_2367_6), rhs_raw_lbit_ABEF_6, 1); + const __m512i rhs_raw_lbit_014589CD_7 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_lbit_0145_7), rhs_raw_lbit_89CD_7, 1); + const __m512i rhs_raw_lbit_2367ABEF_7 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_lbit_2367_7), rhs_raw_lbit_ABEF_7, 1); const __m512i rhs_raw_hbit_014589CD_0 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_hbit_0145_0), rhs_raw_hbit_89CD_0, 1); const __m512i rhs_raw_hbit_2367ABEF_0 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_hbit_2367_0), rhs_raw_hbit_ABEF_0, 1); @@ -7867,7 +7866,7 @@ void ggml_gemm_q6_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo const __m512i rhs_raw_hbit_2367ABEF_3 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_hbit_2367_3), rhs_raw_hbit_ABEF_3, 1); // 2-bit -> 8-bit - // Values of the 0th,2nd,4th,6th sub blocks of eight block_q6_K structures for the sb loop + // hbit Values of the 0th,2nd,4th,6th sub blocks of eight block_q6_K structures for the sb loop const __m512i rhs_hbit_014589CD_00 = _mm512_slli_epi16(_mm512_and_si512(rhs_raw_hbit_014589CD_0, m2_expanded), 4); //Index : 0 - 7 const __m512i rhs_hbit_014589CD_20 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_014589CD_0, 2), m2_expanded), 4); //Index : 32 - 39 const __m512i rhs_hbit_014589CD_40 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_014589CD_0, 4), m2_expanded), 4); //Index : 64 - 71 @@ -7878,6 +7877,8 @@ void ggml_gemm_q6_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo const __m512i rhs_hbit_2367ABEF_40 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_2367ABEF_0, 4), m2_expanded), 4); //Index : 64 - 71 const __m512i rhs_hbit_2367ABEF_60 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_2367ABEF_0, 6), m2_expanded), 4); //Index : 96 - 103 + // Comments indicate the indices of elements from individual super block in non interleaved fashion + // Index : 8 - 15, 72 - 79 const __m512i rhs_hbit_014589CD_01 = 
_mm512_slli_epi16(_mm512_and_si512(rhs_raw_hbit_014589CD_1, m2_expanded), 4); //Index : 8 - 15 const __m512i rhs_hbit_014589CD_21 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_014589CD_1, 2), m2_expanded), 4); //Index : 40 - 47 const __m512i rhs_hbit_014589CD_41 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_014589CD_1, 4), m2_expanded), 4); //Index : 72 - 79 @@ -7888,7 +7889,7 @@ void ggml_gemm_q6_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo const __m512i rhs_hbit_2367ABEF_41 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_2367ABEF_1, 4), m2_expanded), 4); //Index : 72 - 79 const __m512i rhs_hbit_2367ABEF_61 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_2367ABEF_1, 6), m2_expanded), 4); //Index : 104 - 111 - // Values of the 1st,3rd,5th,7th sub blocks of eight block_q6_K structures for the sb loop + // hbit values of the 1st,3rd,5th,7th sub blocks of eight block_q6_K structures for the sb loop const __m512i rhs_hbit_014589CD_10 = _mm512_slli_epi16(_mm512_and_si512(rhs_raw_hbit_014589CD_2, m2_expanded), 4); //Index : 16 - 23 const __m512i rhs_hbit_014589CD_30 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_014589CD_2, 2), m2_expanded), 4); //Index : 48 - 55 const __m512i rhs_hbit_014589CD_50 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_014589CD_2, 4), m2_expanded), 4); //Index : 80 - 87 @@ -7909,164 +7910,166 @@ void ggml_gemm_q6_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo const __m512i rhs_hbit_2367ABEF_51 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_2367ABEF_3, 4), m2_expanded), 4); //Index : 88 - 95 const __m512i rhs_hbit_2367ABEF_71 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_2367ABEF_3, 6), m2_expanded), 4); //Index : 120 - 127 - // 0 -7, 64 - 71 - const __m512i rhs_mat_014589CD_00 = _mm512_or_si512(_mm512_and_si512(rhs_raw_mat_014589CD_0, m4_expanded), rhs_hbit_014589CD_00); - const __m512i rhs_mat_014589CD_40 = _mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_014589CD_0, 4), m4_expanded), rhs_hbit_014589CD_40); + // 4 bit values are unpacked/denibbled and bitwise or-ed with the hbit values to form the 6 bit quantized values - const __m512i rhs_mat_2367ABEF_00 = _mm512_or_si512(_mm512_and_si512(rhs_raw_mat_2367ABEF_0, m4_expanded), rhs_hbit_2367ABEF_00); - const __m512i rhs_mat_2367ABEF_40 = _mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_2367ABEF_0, 4), m4_expanded), rhs_hbit_2367ABEF_40); + // Comments indicate the indices of elements from individual super block in non interleaved fashion + // Index : 0 -7, 64 - 71 + const __m512i rhs_mat_014589CD_00 = _mm512_or_si512(_mm512_and_si512(rhs_raw_lbit_014589CD_0, m4_expanded), rhs_hbit_014589CD_00); + const __m512i rhs_mat_014589CD_40 = _mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_lbit_014589CD_0, 4), m4_expanded), rhs_hbit_014589CD_40); - // 8 - 15, 72 - 79 - const __m512i rhs_mat_014589CD_01 = _mm512_or_si512(_mm512_and_si512(rhs_raw_mat_014589CD_1, m4_expanded), rhs_hbit_014589CD_01); - const __m512i rhs_mat_014589CD_41 = _mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_014589CD_1, 4), m4_expanded), rhs_hbit_014589CD_41); + const __m512i rhs_mat_2367ABEF_00 = _mm512_or_si512(_mm512_and_si512(rhs_raw_lbit_2367ABEF_0, m4_expanded), rhs_hbit_2367ABEF_00); + const __m512i rhs_mat_2367ABEF_40 = _mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_lbit_2367ABEF_0, 4), 
m4_expanded), rhs_hbit_2367ABEF_40); - const __m512i rhs_mat_2367ABEF_01 = _mm512_or_si512(_mm512_and_si512(rhs_raw_mat_2367ABEF_1, m4_expanded), rhs_hbit_2367ABEF_01); - const __m512i rhs_mat_2367ABEF_41 = _mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_2367ABEF_1, 4), m4_expanded), rhs_hbit_2367ABEF_41); + // Index : 8 - 15, 72 - 79 + const __m512i rhs_mat_014589CD_01 = _mm512_or_si512(_mm512_and_si512(rhs_raw_lbit_014589CD_1, m4_expanded), rhs_hbit_014589CD_01); + const __m512i rhs_mat_014589CD_41 = _mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_lbit_014589CD_1, 4), m4_expanded), rhs_hbit_014589CD_41); - // 16 - 23, 80 - 87 - const __m512i rhs_mat_014589CD_10 = _mm512_or_si512(_mm512_and_si512(rhs_raw_mat_014589CD_2, m4_expanded), rhs_hbit_014589CD_10); - const __m512i rhs_mat_014589CD_50 = _mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_014589CD_2, 4), m4_expanded), rhs_hbit_014589CD_50); + const __m512i rhs_mat_2367ABEF_01 = _mm512_or_si512(_mm512_and_si512(rhs_raw_lbit_2367ABEF_1, m4_expanded), rhs_hbit_2367ABEF_01); + const __m512i rhs_mat_2367ABEF_41 = _mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_lbit_2367ABEF_1, 4), m4_expanded), rhs_hbit_2367ABEF_41); - const __m512i rhs_mat_2367ABEF_10 = _mm512_or_si512(_mm512_and_si512(rhs_raw_mat_2367ABEF_2, m4_expanded), rhs_hbit_2367ABEF_10); - const __m512i rhs_mat_2367ABEF_50 = _mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_2367ABEF_2, 4), m4_expanded), rhs_hbit_2367ABEF_50); + // Index : 16 - 23, 80 - 87 + const __m512i rhs_mat_014589CD_10 = _mm512_or_si512(_mm512_and_si512(rhs_raw_lbit_014589CD_2, m4_expanded), rhs_hbit_014589CD_10); + const __m512i rhs_mat_014589CD_50 = _mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_lbit_014589CD_2, 4), m4_expanded), rhs_hbit_014589CD_50); - // 24 - 31, 88 - 95 - const __m512i rhs_mat_014589CD_11 = _mm512_or_si512(_mm512_and_si512(rhs_raw_mat_014589CD_3, m4_expanded), rhs_hbit_014589CD_11); - const __m512i rhs_mat_014589CD_51 = _mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_014589CD_3, 4), m4_expanded), rhs_hbit_014589CD_51); + const __m512i rhs_mat_2367ABEF_10 = _mm512_or_si512(_mm512_and_si512(rhs_raw_lbit_2367ABEF_2, m4_expanded), rhs_hbit_2367ABEF_10); + const __m512i rhs_mat_2367ABEF_50 = _mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_lbit_2367ABEF_2, 4), m4_expanded), rhs_hbit_2367ABEF_50); - const __m512i rhs_mat_2367ABEF_11 = _mm512_or_si512(_mm512_and_si512(rhs_raw_mat_2367ABEF_3, m4_expanded), rhs_hbit_2367ABEF_11); - const __m512i rhs_mat_2367ABEF_51 = _mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_2367ABEF_3, 4), m4_expanded), rhs_hbit_2367ABEF_51); + // Index : 24 - 31, 88 - 95 + const __m512i rhs_mat_014589CD_11 = _mm512_or_si512(_mm512_and_si512(rhs_raw_lbit_014589CD_3, m4_expanded), rhs_hbit_014589CD_11); + const __m512i rhs_mat_014589CD_51 = _mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_lbit_014589CD_3, 4), m4_expanded), rhs_hbit_014589CD_51); - // 32 - 39, 96 - 103 - const __m512i rhs_mat_014589CD_20 = _mm512_or_si512(_mm512_and_si512(rhs_raw_mat_014589CD_4, m4_expanded), rhs_hbit_014589CD_20); - const __m512i rhs_mat_014589CD_60 = _mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_014589CD_4, 4), m4_expanded), rhs_hbit_014589CD_60); + const __m512i rhs_mat_2367ABEF_11 = _mm512_or_si512(_mm512_and_si512(rhs_raw_lbit_2367ABEF_3, m4_expanded), rhs_hbit_2367ABEF_11); + const __m512i rhs_mat_2367ABEF_51 = 
_mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_lbit_2367ABEF_3, 4), m4_expanded), rhs_hbit_2367ABEF_51); - const __m512i rhs_mat_2367ABEF_20 = _mm512_or_si512(_mm512_and_si512(rhs_raw_mat_2367ABEF_4, m4_expanded), rhs_hbit_2367ABEF_20); - const __m512i rhs_mat_2367ABEF_60 = _mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_2367ABEF_4, 4), m4_expanded), rhs_hbit_2367ABEF_60); + // Index : 32 - 39, 96 - 103 + const __m512i rhs_mat_014589CD_20 = _mm512_or_si512(_mm512_and_si512(rhs_raw_lbit_014589CD_4, m4_expanded), rhs_hbit_014589CD_20); + const __m512i rhs_mat_014589CD_60 = _mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_lbit_014589CD_4, 4), m4_expanded), rhs_hbit_014589CD_60); - // 40 - 47, 104 - 111 - const __m512i rhs_mat_014589CD_21 = _mm512_or_si512(_mm512_and_si512(rhs_raw_mat_014589CD_5, m4_expanded), rhs_hbit_014589CD_21); - const __m512i rhs_mat_014589CD_61 = _mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_014589CD_5, 4), m4_expanded), rhs_hbit_014589CD_61); + const __m512i rhs_mat_2367ABEF_20 = _mm512_or_si512(_mm512_and_si512(rhs_raw_lbit_2367ABEF_4, m4_expanded), rhs_hbit_2367ABEF_20); + const __m512i rhs_mat_2367ABEF_60 = _mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_lbit_2367ABEF_4, 4), m4_expanded), rhs_hbit_2367ABEF_60); - const __m512i rhs_mat_2367ABEF_21 = _mm512_or_si512(_mm512_and_si512(rhs_raw_mat_2367ABEF_5, m4_expanded), rhs_hbit_2367ABEF_21); - const __m512i rhs_mat_2367ABEF_61 = _mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_2367ABEF_5, 4), m4_expanded), rhs_hbit_2367ABEF_61); + // Index : 40 - 47, 104 - 111 + const __m512i rhs_mat_014589CD_21 = _mm512_or_si512(_mm512_and_si512(rhs_raw_lbit_014589CD_5, m4_expanded), rhs_hbit_014589CD_21); + const __m512i rhs_mat_014589CD_61 = _mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_lbit_014589CD_5, 4), m4_expanded), rhs_hbit_014589CD_61); - // 48 - 55, 112 - 119 - const __m512i rhs_mat_014589CD_30 = _mm512_or_si512(_mm512_and_si512(rhs_raw_mat_014589CD_6, m4_expanded), rhs_hbit_014589CD_30); - const __m512i rhs_mat_014589CD_70 = _mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_014589CD_6, 4), m4_expanded), rhs_hbit_014589CD_70); + const __m512i rhs_mat_2367ABEF_21 = _mm512_or_si512(_mm512_and_si512(rhs_raw_lbit_2367ABEF_5, m4_expanded), rhs_hbit_2367ABEF_21); + const __m512i rhs_mat_2367ABEF_61 = _mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_lbit_2367ABEF_5, 4), m4_expanded), rhs_hbit_2367ABEF_61); - const __m512i rhs_mat_2367ABEF_30 = _mm512_or_si512(_mm512_and_si512(rhs_raw_mat_2367ABEF_6, m4_expanded), rhs_hbit_2367ABEF_30); - const __m512i rhs_mat_2367ABEF_70 = _mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_2367ABEF_6, 4), m4_expanded), rhs_hbit_2367ABEF_70); + // Index : 48 - 55, 112 - 119 + const __m512i rhs_mat_014589CD_30 = _mm512_or_si512(_mm512_and_si512(rhs_raw_lbit_014589CD_6, m4_expanded), rhs_hbit_014589CD_30); + const __m512i rhs_mat_014589CD_70 = _mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_lbit_014589CD_6, 4), m4_expanded), rhs_hbit_014589CD_70); - // 56 - 63, 120 - 127 - const __m512i rhs_mat_014589CD_31 = _mm512_or_si512(_mm512_and_si512(rhs_raw_mat_014589CD_7, m4_expanded), rhs_hbit_014589CD_31); - const __m512i rhs_mat_014589CD_71 = _mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_014589CD_7, 4), m4_expanded), rhs_hbit_014589CD_71); + const __m512i rhs_mat_2367ABEF_30 = _mm512_or_si512(_mm512_and_si512(rhs_raw_lbit_2367ABEF_6, m4_expanded), 
rhs_hbit_2367ABEF_30); + const __m512i rhs_mat_2367ABEF_70 = _mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_lbit_2367ABEF_6, 4), m4_expanded), rhs_hbit_2367ABEF_70); - const __m512i rhs_mat_2367ABEF_31 = _mm512_or_si512(_mm512_and_si512(rhs_raw_mat_2367ABEF_7, m4_expanded), rhs_hbit_2367ABEF_31); - const __m512i rhs_mat_2367ABEF_71 = _mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_2367ABEF_7, 4), m4_expanded), rhs_hbit_2367ABEF_71); + // Index : 56 - 63, 120 - 127 + const __m512i rhs_mat_014589CD_31 = _mm512_or_si512(_mm512_and_si512(rhs_raw_lbit_014589CD_7, m4_expanded), rhs_hbit_014589CD_31); + const __m512i rhs_mat_014589CD_71 = _mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_lbit_014589CD_7, 4), m4_expanded), rhs_hbit_014589CD_71); + + const __m512i rhs_mat_2367ABEF_31 = _mm512_or_si512(_mm512_and_si512(rhs_raw_lbit_2367ABEF_7, m4_expanded), rhs_hbit_2367ABEF_31); + const __m512i rhs_mat_2367ABEF_71 = _mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_lbit_2367ABEF_7, 4), m4_expanded), rhs_hbit_2367ABEF_71); // Shuffle pattern one - right side input - const __m512i rhs_mat_014589CD_00_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_00, (_MM_PERM_ENUM)136); //B00(0-3) B01(0-3) B00(0-3) B01(0-3) B04(0-3) B05(0-3) B04(0-3) B05(0-3) - const __m512i rhs_mat_2367ABEF_00_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_00, (_MM_PERM_ENUM)136); //B02(0-3) B03(0-3) B02(0-3) B03(0-3) B06(0-3) B07(0-3) B06(0-3) B07(0-3) + const __m512i rhs_mat_014589CD_00_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_00, (_MM_PERM_ENUM)136); //B00(0-3) B01(0-3) B00(0-3) B01(0-3) B04(0-3) B05(0-3) B04(0-3) B05(0-3) B08(0-3) B09(0-3) B08(0-3) B09(0-3) B0C(0-3) B0D(0-3) B0C(0-3) B0D(0-3) + const __m512i rhs_mat_2367ABEF_00_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_00, (_MM_PERM_ENUM)136); //B02(0-3) B03(0-3) B02(0-3) B03(0-3) B06(0-3) B07(0-3) B06(0-3) B07(0-3) B0A(0-3) B0B(0-3) B0A(0-3) B0B(0-3) B0E(0-3) B0F(0-3) B0E(0-3) B0F(0-3) - const __m512i rhs_mat_014589CD_01_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_01, (_MM_PERM_ENUM)136); //B00(8-11) B01(8-11) B00(8-11) B01(8-11) B04(8-11) B05(8-11) B04(8-11) B05(8-11) - const __m512i rhs_mat_2367ABEF_01_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_01, (_MM_PERM_ENUM)136); //B02(8-11) B03(8-11) B02(8-11) B03(8-11) B06(8-11) B07(8-11) B06(8-11) B07(8-11) + const __m512i rhs_mat_014589CD_01_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_01, (_MM_PERM_ENUM)136); //B00(8-11) B01(8-11) B00(8-11) B01(8-11) B04(8-11) B05(8-11) B04(8-11) B05(8-11) B08(8-11) B09(8-11) B08(8-11) B09(8-11) B0C(8-11) B0D(8-11) B0C(8-11) B0D(8-11) + const __m512i rhs_mat_2367ABEF_01_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_01, (_MM_PERM_ENUM)136); //B02(8-11) B03(8-11) B02(8-11) B03(8-11) B06(8-11) B07(8-11) B06(8-11) B07(8-11) B0A(8-11) B0B(8-11) B0A(8-11) B0B(8-11) B0E(8-11) B0F(8-11) B0E(8-11) B0F(8-11) - const __m512i rhs_mat_014589CD_10_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_10, (_MM_PERM_ENUM)136); //B10(0-3) B11(0-3) B10(0-3) B11(0-3) B14(0-3) B15(0-3) B14(0-3) B15(0-3) - const __m512i rhs_mat_2367ABEF_10_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_10, (_MM_PERM_ENUM)136); //B12(0-3) B13(0-3) B12(0-3) B13(0-3) B16(0-3) B17(0-3) B16(0-3) B17(0-3) + const __m512i rhs_mat_014589CD_10_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_10, (_MM_PERM_ENUM)136); //B10(0-3) B11(0-3) B10(0-3) B11(0-3) B14(0-3) B15(0-3) B14(0-3) B15(0-3) B18(0-3) B19(0-3) B18(0-3) B19(0-3) B1C(0-3) B1D(0-3) B1C(0-3) B1D(0-3) + const __m512i rhs_mat_2367ABEF_10_sp1 = 
_mm512_shuffle_epi32(rhs_mat_2367ABEF_10, (_MM_PERM_ENUM)136); //B12(0-3) B13(0-3) B12(0-3) B13(0-3) B16(0-3) B17(0-3) B16(0-3) B17(0-3) B1A(0-3) B1B(0-3) B1A(0-3) B1B(0-3) B1E(0-3) B1F(0-3) B1E(0-3) B1F(0-3) - const __m512i rhs_mat_014589CD_11_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_11, (_MM_PERM_ENUM)136); //B10(8-11) B11(8-11) B10(8-11) B11(8-11) B14(8-11) B15(8-11) B14(8-11) B15(8-11) - const __m512i rhs_mat_2367ABEF_11_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_11, (_MM_PERM_ENUM)136); //B12(8-11) B13(8-11) B12(8-11) B13(8-11) B16(8-11) B17(8-11) B16(8-11) B17(8-11) + const __m512i rhs_mat_014589CD_11_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_11, (_MM_PERM_ENUM)136); //B10(8-11) B11(8-11) B10(8-11) B11(8-11) B14(8-11) B15(8-11) B14(8-11) B15(8-11) B18(8-11) B19(8-11) B18(8-11) B19(8-11) B1C(8-11) B1D(8-11) B1C(8-11) B1D(8-11) + const __m512i rhs_mat_2367ABEF_11_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_11, (_MM_PERM_ENUM)136); //B12(8-11) B13(8-11) B12(8-11) B13(8-11) B16(8-11) B17(8-11) B16(8-11) B17(8-11) B1A(8-11) B1B(8-11) B1A(8-11) B1B(8-11) B1E(8-11) B1F(8-11) B1E(8-11) B1F(8-11) - const __m512i rhs_mat_014589CD_20_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_20, (_MM_PERM_ENUM)136); //B20(0-3) B21(0-3) B20(0-3) B21(0-3) B24(0-3) B25(0-3) B24(0-3) B25(0-3) - const __m512i rhs_mat_2367ABEF_20_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_20, (_MM_PERM_ENUM)136); //B22(0-3) B23(0-3) B22(0-3) B23(0-3) B26(0-3) B27(0-3) B26(0-3) B27(0-3) + const __m512i rhs_mat_014589CD_20_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_20, (_MM_PERM_ENUM)136); //B20(0-3) B21(0-3) B20(0-3) B21(0-3) B24(0-3) B25(0-3) B24(0-3) B25(0-3) B28(0-3) B29(0-3) B28(0-3) B29(0-3) B2C(0-3) B2D(0-3) B2C(0-3) B2D(0-3) + const __m512i rhs_mat_2367ABEF_20_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_20, (_MM_PERM_ENUM)136); //B22(0-3) B23(0-3) B22(0-3) B23(0-3) B26(0-3) B27(0-3) B26(0-3) B27(0-3) B2A(0-3) B2B(0-3) B2A(0-3) B2B(0-3) B2E(0-3) B2F(0-3) B2E(0-3) B2F(0-3) - const __m512i rhs_mat_014589CD_21_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_21, (_MM_PERM_ENUM)136); //B20(8-11) B21(8-11) B20(8-11) B21(8-11) B24(8-11) B25(8-11) B24(8-11) B25(8-11) - const __m512i rhs_mat_2367ABEF_21_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_21, (_MM_PERM_ENUM)136); //B22(8-11) B23(8-11) B22(8-11) B23(8-11) B26(8-11) B27(8-11) B26(8-11) B27(8-11) + const __m512i rhs_mat_014589CD_21_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_21, (_MM_PERM_ENUM)136); //B20(8-11) B21(8-11) B20(8-11) B21(8-11) B24(8-11) B25(8-11) B24(8-11) B25(8-11) B28(8-11) B29(8-11) B28(8-11) B29(8-11) B2C(8-11) B2D(8-11) B2C(8-11) B2D(8-11) + const __m512i rhs_mat_2367ABEF_21_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_21, (_MM_PERM_ENUM)136); //B22(8-11) B23(8-11) B22(8-11) B23(8-11) B26(8-11) B27(8-11) B26(8-11) B27(8-11) B2A(8-11) B2B(8-11) B2A(8-11) B2B(8-11) B2E(8-11) B2F(8-11) B2E(8-11) B2F(8-11) - const __m512i rhs_mat_014589CD_30_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_30, (_MM_PERM_ENUM)136); //B30(0-3) B31(0-3) B30(0-3) B31(0-3) B34(0-3) B35(0-3) B34(0-3) B35(0-3) - const __m512i rhs_mat_2367ABEF_30_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_30, (_MM_PERM_ENUM)136); //B32(0-3) B33(0-3) B32(0-3) B33(0-3) B36(0-3) B37(0-3) B36(0-3) B37(0-3) + const __m512i rhs_mat_014589CD_30_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_30, (_MM_PERM_ENUM)136); //B30(0-3) B31(0-3) B30(0-3) B31(0-3) B34(0-3) B35(0-3) B34(0-3) B35(0-3) B38(0-3) B39(0-3) B38(0-3) B39(0-3) B3C(0-3) B3D(0-3) B3C(0-3) B3D(0-3) + const __m512i rhs_mat_2367ABEF_30_sp1 = 
_mm512_shuffle_epi32(rhs_mat_2367ABEF_30, (_MM_PERM_ENUM)136); //B32(0-3) B33(0-3) B32(0-3) B33(0-3) B36(0-3) B37(0-3) B36(0-3) B37(0-3) B3A(0-3) B3B(0-3) B3A(0-3) B3B(0-3) B3E(0-3) B3F(0-3) B3E(0-3) B3F(0-3) - const __m512i rhs_mat_014589CD_31_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_31, (_MM_PERM_ENUM)136); //B30(8-11) B31(8-11) B30(8-11) B31(8-11) B34(8-11) B35(8-11) B34(8-11) B35(8-11 - const __m512i rhs_mat_2367ABEF_31_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_31, (_MM_PERM_ENUM)136); //B32(8-11) B33(8-11) B32(8-11) B33(8-11) B36(8-11) B37(8-11) B36(8-11) B37(8-11) + const __m512i rhs_mat_014589CD_31_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_31, (_MM_PERM_ENUM)136); //B30(8-11) B31(8-11) B30(8-11) B31(8-11) B34(8-11) B35(8-11) B34(8-11) B35(8-11) B38(8-11) B39(8-11) B38(8-11) B39(8-11) B3C(8-11) B3D(8-11) B3C(8-11) B3D(8-11) + const __m512i rhs_mat_2367ABEF_31_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_31, (_MM_PERM_ENUM)136); //B32(8-11) B33(8-11) B32(8-11) B33(8-11) B36(8-11) B37(8-11) B36(8-11) B37(8-11) B3A(8-11) B3B(8-11) B3A(8-11) B3B(8-11) B3E(8-11) B3F(8-11) B3E(8-11) B3F(8-11) - const __m512i rhs_mat_014589CD_40_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_40, (_MM_PERM_ENUM)136); //B40(0-3) B41(0-3) B40(0-3) B41(0-3) B44(0-3) B45(0-3) B44(0-3) B45(0-3) - const __m512i rhs_mat_2367ABEF_40_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_40, (_MM_PERM_ENUM)136); //B42(0-3) B43(0-3) B42(0-3) B43(0-3) B46(0-3) B47(0-3) B46(0-3) B47(0-3) + const __m512i rhs_mat_014589CD_40_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_40, (_MM_PERM_ENUM)136); //B40(0-3) B41(0-3) B40(0-3) B41(0-3) B44(0-3) B45(0-3) B44(0-3) B45(0-3) B48(0-3) B49(0-3) B48(0-3) B49(0-3) B4C(0-3) B4D(0-3) B4C(0-3) B4D(0-3) + const __m512i rhs_mat_2367ABEF_40_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_40, (_MM_PERM_ENUM)136); //B42(0-3) B43(0-3) B42(0-3) B43(0-3) B46(0-3) B47(0-3) B46(0-3) B47(0-3) B4A(0-3) B4B(0-3) B4A(0-3) B4B(0-3) B4E(0-3) B4F(0-3) B4E(0-3) B4F(0-3) - const __m512i rhs_mat_014589CD_41_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_41, (_MM_PERM_ENUM)136); //B40(8-11) B41(8-11) B40(8-11) B41(8-11) B44(8-11) B45(8-11) B44(8-11) B45(8-11) - const __m512i rhs_mat_2367ABEF_41_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_41, (_MM_PERM_ENUM)136); //B42(8-11) B43(8-11) B42(8-11) B43(8-11) B46(8-11) B47(8-11) B46(8-11) B47(8-11) + const __m512i rhs_mat_014589CD_41_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_41, (_MM_PERM_ENUM)136); //B40(8-11) B41(8-11) B40(8-11) B41(8-11) B44(8-11) B45(8-11) B44(8-11) B45(8-11) B48(8-11) B49(8-11) B48(8-11) B49(8-11) B4C(8-11) B4D(8-11) B4C(8-11) B4D(8-11) + const __m512i rhs_mat_2367ABEF_41_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_41, (_MM_PERM_ENUM)136); //B42(8-11) B43(8-11) B42(8-11) B43(8-11) B46(8-11) B47(8-11) B46(8-11) B47(8-11) B4A(8-11) B4B(8-11) B4A(8-11) B4B(8-11) B4E(8-11) B4F(8-11) B4E(8-11) B4F(8-11) - const __m512i rhs_mat_014589CD_50_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_50, (_MM_PERM_ENUM)136); //B50(0-3) B51(0-3) B50(0-3) B51(0-3) B54(0-3) B55(0-3) B54(0-3) B55(0-3) - const __m512i rhs_mat_2367ABEF_50_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_50, (_MM_PERM_ENUM)136); //B52(0-3) B53(0-3) B52(0-3) B53(0-3) B56(0-3) B57(0-3) B56(0-3) B57(0-3) + const __m512i rhs_mat_014589CD_50_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_50, (_MM_PERM_ENUM)136); //B50(0-3) B51(0-3) B50(0-3) B51(0-3) B54(0-3) B55(0-3) B54(0-3) B55(0-3) B58(0-3) B59(0-3) B58(0-3) B59(0-3) B5C(0-3) B5D(0-3) B5C(0-3) B5D(0-3) + const __m512i rhs_mat_2367ABEF_50_sp1 = 
_mm512_shuffle_epi32(rhs_mat_2367ABEF_50, (_MM_PERM_ENUM)136); //B52(0-3) B53(0-3) B52(0-3) B53(0-3) B56(0-3) B57(0-3) B56(0-3) B57(0-3) B5A(0-3) B5B(0-3) B5A(0-3) B5B(0-3) B5E(0-3) B5F(0-3) B5E(0-3) B5F(0-3) - const __m512i rhs_mat_014589CD_51_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_51, (_MM_PERM_ENUM)136); //B50(8-11) B51(8-11) B50(8-11) B51(8-11) B54(8-11) B55(8-11) B54(8-11) B55(8-11) - const __m512i rhs_mat_2367ABEF_51_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_51, (_MM_PERM_ENUM)136); //B52(8-11) B53(8-11) B52(8-11) B53(8-11) B56(8-11) B57(8-11) B56(8-11) B57(8-11) + const __m512i rhs_mat_014589CD_51_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_51, (_MM_PERM_ENUM)136); //B50(8-11) B51(8-11) B50(8-11) B51(8-11) B54(8-11) B55(8-11) B54(8-11) B55(8-11) B58(8-11) B59(8-11) B58(8-11) B59(8-11) B5C(8-11) B5D(8-11) B5C(8-11) B5D(8-11) + const __m512i rhs_mat_2367ABEF_51_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_51, (_MM_PERM_ENUM)136); //B52(8-11) B53(8-11) B52(8-11) B53(8-11) B56(8-11) B57(8-11) B56(8-11) B57(8-11) B5A(8-11) B5B(8-11) B5A(8-11) B5B(8-11) B5E(8-11) B5F(8-11) B5E(8-11) B5F(8-11) - const __m512i rhs_mat_014589CD_60_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_60, (_MM_PERM_ENUM)136); //B60(0-3) B61(0-3) B60(0-3) B61(0-3) B64(0-3) B65(0-3) B64(0-3) B65(0-3) - const __m512i rhs_mat_2367ABEF_60_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_60, (_MM_PERM_ENUM)136); //B62(0-3) B63(0-3) B62(0-3) B63(0-3) B66(0-3) B67(0-3) B66(0-3) B67(0-3) + const __m512i rhs_mat_014589CD_60_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_60, (_MM_PERM_ENUM)136); //B60(0-3) B61(0-3) B60(0-3) B61(0-3) B64(0-3) B65(0-3) B64(0-3) B65(0-3) B68(0-3) B69(0-3) B68(0-3) B69(0-3) B6C(0-3) B6D(0-3) B6C(0-3) B6D(0-3) + const __m512i rhs_mat_2367ABEF_60_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_60, (_MM_PERM_ENUM)136); //B62(0-3) B63(0-3) B62(0-3) B63(0-3) B66(0-3) B67(0-3) B66(0-3) B67(0-3) B6A(0-3) B6B(0-3) B6A(0-3) B6B(0-3) B6E(0-3) B6F(0-3) B6E(0-3) B6F(0-3) - const __m512i rhs_mat_014589CD_61_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_61, (_MM_PERM_ENUM)136); //B60(8-11) B61(8-11) B60(8-11) B61(8-11) B64(8-11) B65(8-11) B64(8-11) B65(8-11) - const __m512i rhs_mat_2367ABEF_61_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_61, (_MM_PERM_ENUM)136); //B62(8-11) B63(8-11) B62(8-11) B63(8-11) B66(8-11) B67(8-11) B66(8-11) B67(8-11) + const __m512i rhs_mat_014589CD_61_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_61, (_MM_PERM_ENUM)136); //B60(8-11) B61(8-11) B60(8-11) B61(8-11) B64(8-11) B65(8-11) B64(8-11) B65(8-11) B68(8-11) B69(8-11) B68(8-11) B69(8-11) B6C(8-11) B6D(8-11) B6C(8-11) B6D(8-11) + const __m512i rhs_mat_2367ABEF_61_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_61, (_MM_PERM_ENUM)136); //B62(8-11) B63(8-11) B62(8-11) B63(8-11) B66(8-11) B67(8-11) B66(8-11) B67(8-11) B6A(8-11) B6B(8-11) B6A(8-11) B6B(8-11) B6E(8-11) B6F(8-11) B6E(8-11) B6F(8-11) - const __m512i rhs_mat_014589CD_70_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_70, (_MM_PERM_ENUM)136); //B70(0-3) B71(0-3) B70(0-3) B71(0-3) B74(0-3) B75(0-3) B74(0-3) B75(0-3) - const __m512i rhs_mat_2367ABEF_70_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_70, (_MM_PERM_ENUM)136); //B72(0-3) B73(0-3) B72(0-3) B73(0-3) B76(0-3) B77(0-3) B76(0-3) B77(0-3) + const __m512i rhs_mat_014589CD_70_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_70, (_MM_PERM_ENUM)136); //B70(0-3) B71(0-3) B70(0-3) B71(0-3) B74(0-3) B75(0-3) B74(0-3) B75(0-3) B78(0-3) B79(0-3) B78(0-3) B79(0-3) B7C(0-3) B7D(0-3) B7C(0-3) B7D(0-3) + const __m512i rhs_mat_2367ABEF_70_sp1 = 
_mm512_shuffle_epi32(rhs_mat_2367ABEF_70, (_MM_PERM_ENUM)136); //B72(0-3) B73(0-3) B72(0-3) B73(0-3) B76(0-3) B77(0-3) B76(0-3) B77(0-3) B7A(0-3) B7B(0-3) B7A(0-3) B7B(0-3) B7E(0-3) B7F(0-3) B7E(0-3) B7F(0-3) - const __m512i rhs_mat_014589CD_71_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_71, (_MM_PERM_ENUM)136); //B70(8-11) B71(8-11) B70(8-11) B71(8-11) B74(8-11) B75(8-11) B74(8-11) B75(8-11) - const __m512i rhs_mat_2367ABEF_71_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_71, (_MM_PERM_ENUM)136); //B72(8-11) B73(8-11) B72(8-11) B73(8-11) B76(8-11) B77(8-11) B76(8-11) B77(8-11) + const __m512i rhs_mat_014589CD_71_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_71, (_MM_PERM_ENUM)136); //B70(8-11) B71(8-11) B70(8-11) B71(8-11) B74(8-11) B75(8-11) B74(8-11) B75(8-11) B78(8-11) B79(8-11) B78(8-11) B79(8-11) B7C(8-11) B7D(8-11) B7C(8-11) B7D(8-11) + const __m512i rhs_mat_2367ABEF_71_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_71, (_MM_PERM_ENUM)136); //B72(8-11) B73(8-11) B72(8-11) B73(8-11) B76(8-11) B77(8-11) B76(8-11) B77(8-11) B7A(8-11) B7B(8-11) B7A(8-11) B7B(8-11) B7E(8-11) B7F(8-11) B7E(8-11) B7F(8-11) // Shuffle pattern two - right side input - const __m512i rhs_mat_014589CD_00_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_00, (_MM_PERM_ENUM)221); //B00(4-7) B01(4-7) B00(4-7) B01(4-7) B04(4-7) B05(4-7) B04(4-7) B05(4-7) - const __m512i rhs_mat_2367ABEF_00_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_00, (_MM_PERM_ENUM)221); //B02(4-7) B03(4-7) B02(4-7) B03(4-7) B06(4-7) B07(4-7) B06(4-7) B07(4-7) + const __m512i rhs_mat_014589CD_00_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_00, (_MM_PERM_ENUM)221); //B00(4-7) B01(4-7) B00(4-7) B01(4-7) B04(4-7) B05(4-7) B04(4-7) B05(4-7) B08(4-7) B09(4-7) B08(4-7) B09(4-7) B0C(4-7) B0D(4-7) B0C(4-7) B0D(4-7) + const __m512i rhs_mat_2367ABEF_00_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_00, (_MM_PERM_ENUM)221); //B02(4-7) B03(4-7) B02(4-7) B03(4-7) B06(4-7) B07(4-7) B06(4-7) B07(4-7) B0A(4-7) B0B(4-7) B0A(4-7) B0B(4-7) B0E(4-7) B0F(4-7) B0E(4-7) B0F(4-7) - const __m512i rhs_mat_014589CD_01_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_01, (_MM_PERM_ENUM)221); //B00(12-15) B01(12-15) B00(12-15) B01(12-15) B04(12-15) B05(12-15) B04(12-15) B05(12-15) - const __m512i rhs_mat_2367ABEF_01_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_01, (_MM_PERM_ENUM)221); //B02(12-15) B03(12-15) B02(12-15) B03(12-15) B06(12-15) B07(12-15) B06(12-15) B07(12-15) + const __m512i rhs_mat_014589CD_01_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_01, (_MM_PERM_ENUM)221); //B00(12-15) B01(12-15) B00(12-15) B01(12-15) B04(12-15) B05(12-15) B04(12-15) B05(12-15) B08(12-15) B09(12-15) B08(12-15) B09(12-15) B0C(12-15) B0D(12-15) B0C(12-15) B0D(12-15) + const __m512i rhs_mat_2367ABEF_01_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_01, (_MM_PERM_ENUM)221); //B02(12-15) B03(12-15) B02(12-15) B03(12-15) B06(12-15) B07(12-15) B06(12-15) B07(12-15) B0A(12-15) B0B(12-15) B0A(12-15) B0B(12-15) B0E(12-15) B0F(12-15) B0E(12-15) B0F(12-15) - const __m512i rhs_mat_014589CD_10_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_10, (_MM_PERM_ENUM)221); //B10(4-7) B11(4-7) B10(4-7) B11(4-7) B14(4-7) B15(4-7) B14(4-7) B15(4-7) - const __m512i rhs_mat_2367ABEF_10_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_10, (_MM_PERM_ENUM)221); //B12(4-7) B13(4-7) B12(4-7) B13(4-7) B16(4-7) B17(4-7) B16(4-7) B17(4-7) + const __m512i rhs_mat_014589CD_10_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_10, (_MM_PERM_ENUM)221); //B10(4-7) B11(4-7) B10(4-7) B11(4-7) B14(4-7) B15(4-7) B14(4-7) B15(4-7) B18(4-7) B19(4-7) B18(4-7) B19(4-7) 
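// Illustrative aside, a minimal sketch and not part of this hunk: the two dword-shuffle
// controls used on the repacked RHS are 136 (0x88) and 221 (0xDD). Within every 128-bit
// lane, 0x88 selects dwords (0,2,0,2) and 0xDD selects dwords (1,3,1,3), which is what
// separates each B row into its (0-3)/(8-11) halves for shuffle pattern one and its
// (4-7)/(12-15) halves for shuffle pattern two, as the annotations above and below show.
static_assert(((136 >> 0) & 3) == 0 && ((136 >> 2) & 3) == 2 &&
              ((136 >> 4) & 3) == 0 && ((136 >> 6) & 3) == 2,
              "0x88 -> per-lane dword order (0,2,0,2)");
static_assert(((221 >> 0) & 3) == 1 && ((221 >> 2) & 3) == 3 &&
              ((221 >> 4) & 3) == 1 && ((221 >> 6) & 3) == 3,
              "0xDD -> per-lane dword order (1,3,1,3)");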
B1C(4-7) B1D(4-7) B1C(4-7) B1D(4-7) + const __m512i rhs_mat_2367ABEF_10_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_10, (_MM_PERM_ENUM)221); //B12(4-7) B13(4-7) B12(4-7) B13(4-7) B16(4-7) B17(4-7) B16(4-7) B17(4-7) B1A(4-7) B1B(4-7) B1A(4-7) B1B(4-7) B1E(4-7) B1F(4-7) B1E(4-7) B1F(4-7) - const __m512i rhs_mat_014589CD_11_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_11, (_MM_PERM_ENUM)221); //B10(12-15) B11(12-15) B10(12-15) B11(12-15) B14(12-15) B15(12-15) B14(12-15) B15(12-15) - const __m512i rhs_mat_2367ABEF_11_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_11, (_MM_PERM_ENUM)221); //B12(12-15) B13(12-15) B12(12-15) B13(12-15) B16(12-15) B17(12-15) B16(12-15) B17(12-15) + const __m512i rhs_mat_014589CD_11_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_11, (_MM_PERM_ENUM)221); //B10(12-15) B11(12-15) B10(12-15) B11(12-15) B14(12-15) B15(12-15) B14(12-15) B15(12-15) B18(12-15) B19(12-15) B18(12-15) B19(12-15) B1C(12-15) B1D(12-15) B1C(12-15) B1D(12-15) + const __m512i rhs_mat_2367ABEF_11_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_11, (_MM_PERM_ENUM)221); //B12(12-15) B13(12-15) B12(12-15) B13(12-15) B16(12-15) B17(12-15) B16(12-15) B17(12-15) B1A(12-15) B1B(12-15) B1A(12-15) B1B(12-15) B1E(12-15) B1F(12-15) B1E(12-15) B1F(12-15) - const __m512i rhs_mat_014589CD_20_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_20, (_MM_PERM_ENUM)221); //B20(4-7) B21(4-7) B20(4-7) B21(4-7) B24(4-7) B25(4-7) B24(4-7) B25(4-7) - const __m512i rhs_mat_2367ABEF_20_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_20, (_MM_PERM_ENUM)221); //B22(4-7) B23(4-7) B22(4-7) B23(4-7) B26(4-7) B27(4-7) B26(4-7) B27(4-7) + const __m512i rhs_mat_014589CD_20_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_20, (_MM_PERM_ENUM)221); //B20(4-7) B21(4-7) B20(4-7) B21(4-7) B24(4-7) B25(4-7) B24(4-7) B25(4-7) B28(4-7) B29(4-7) B28(4-7) B29(4-7) B2C(4-7) B2D(4-7) B2C(4-7) B2D(4-7) + const __m512i rhs_mat_2367ABEF_20_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_20, (_MM_PERM_ENUM)221); //B22(4-7) B23(4-7) B22(4-7) B23(4-7) B26(4-7) B27(4-7) B26(4-7) B27(4-7) B2A(4-7) B2B(4-7) B2A(4-7) B2B(4-7) B2E(4-7) B2F(4-7) B2E(4-7) B2F(4-7) - const __m512i rhs_mat_014589CD_21_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_21, (_MM_PERM_ENUM)221); //B20(12-15) B21(12-15) B20(12-15) B21(12-15) B24(12-15) B25(12-15) B24(12-15) B25(12-15) - const __m512i rhs_mat_2367ABEF_21_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_21, (_MM_PERM_ENUM)221); //B22(12-15) B23(12-15) B22(12-15) B23(12-15) B26(12-15) B27(12-15) B26(12-15) B27(12-15) + const __m512i rhs_mat_014589CD_21_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_21, (_MM_PERM_ENUM)221); //B20(12-15) B21(12-15) B20(12-15) B21(12-15) B24(12-15) B25(12-15) B24(12-15) B25(12-15) B28(12-15) B29(12-15) B28(12-15) B29(12-15) B2C(12-15) B2D(12-15) B2C(12-15) B2D(12-15) + const __m512i rhs_mat_2367ABEF_21_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_21, (_MM_PERM_ENUM)221); //B22(12-15) B23(12-15) B22(12-15) B23(12-15) B26(12-15) B27(12-15) B26(12-15) B27(12-15) B2A(12-15) B2B(12-15) B2A(12-15) B2B(12-15) B2E(12-15) B2F(12-15) B2E(12-15) B2F(12-15) - const __m512i rhs_mat_014589CD_30_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_30, (_MM_PERM_ENUM)221); //B30(4-7) B31(4-7) B30(4-7) B31(4-7) B34(4-7) B35(4-7) B34(4-7) B35(4-7) - const __m512i rhs_mat_2367ABEF_30_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_30, (_MM_PERM_ENUM)221); //B32(4-7) B33(4-7) B32(4-7) B33(4-7) B36(4-7) B37(4-7) B36(4-7) B37(4-7) + const __m512i rhs_mat_014589CD_30_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_30, (_MM_PERM_ENUM)221); //B30(4-7) B31(4-7) B30(4-7) 
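// Note on the annotations, not part of this hunk: in the B comments above and below, the
// first digit appears to name the sub block (0-7) handled in this sb iteration, the second
// hex digit the interleaved Q6_K block (0-F, i.e. the two groups of eight blocks addressed
// through b_ptr_0 and b_ptr_1), and the parenthesised range the quantized values taken from
// that sub block, e.g. B3A(4-7) = values 4-7 of sub block 3 of interleaved block A.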
B31(4-7) B34(4-7) B35(4-7) B34(4-7) B35(4-7) B38(4-7) B39(4-7) B38(4-7) B39(4-7) B3C(4-7) B3D(4-7) B3C(4-7) B3D(4-7) + const __m512i rhs_mat_2367ABEF_30_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_30, (_MM_PERM_ENUM)221); //B32(4-7) B33(4-7) B32(4-7) B33(4-7) B36(4-7) B37(4-7) B36(4-7) B37(4-7) B3A(4-7) B3B(4-7) B3A(4-7) B3B(4-7) B3E(4-7) B3F(4-7) B3E(4-7) B3F(4-7) - const __m512i rhs_mat_014589CD_31_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_31, (_MM_PERM_ENUM)221); //B30(12-15) B31(12-15) B30(12-15) B31(12-15) B34(12-15) B35(12-15) B34(12-15) B35(12-15) - const __m512i rhs_mat_2367ABEF_31_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_31, (_MM_PERM_ENUM)221); //B32(12-15) B33(12-15) B32(12-15) B33(12-15) B36(12-15) B37(12-15) B36(12-15) B37(12-15) + const __m512i rhs_mat_014589CD_31_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_31, (_MM_PERM_ENUM)221); //B30(12-15) B31(12-15) B30(12-15) B31(12-15) B34(12-15) B35(12-15) B34(12-15) B35(12-15) B38(12-15) B39(12-15) B38(12-15) B39(12-15) B3C(12-15) B3D(12-15) B3C(12-15) B3D(12-15) + const __m512i rhs_mat_2367ABEF_31_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_31, (_MM_PERM_ENUM)221); //B32(12-15) B33(12-15) B32(12-15) B33(12-15) B36(12-15) B37(12-15) B36(12-15) B37(12-15) B3A(12-15) B3B(12-15) B3A(12-15) B3B(12-15) B3E(12-15) B3F(12-15) B3E(12-15) B3F(12-15) - const __m512i rhs_mat_014589CD_40_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_40, (_MM_PERM_ENUM)221); //B40(4-7) B41(4-7) B40(4-7) B41(4-7) B44(4-7) B45(4-7) B44(4-7) B45(4-7) - const __m512i rhs_mat_2367ABEF_40_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_40, (_MM_PERM_ENUM)221); //B42(4-7) B43(4-7) B42(4-7) B43(4-7) B46(4-7) B47(4-7) B46(4-7) B47(4-7) + const __m512i rhs_mat_014589CD_40_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_40, (_MM_PERM_ENUM)221); //B40(4-7) B41(4-7) B40(4-7) B41(4-7) B44(4-7) B45(4-7) B44(4-7) B45(4-7) B48(4-7) B49(4-7) B48(4-7) B49(4-7) B4C(4-7) B4D(4-7) B4C(4-7) B4D(4-7) + const __m512i rhs_mat_2367ABEF_40_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_40, (_MM_PERM_ENUM)221); //B42(4-7) B43(4-7) B42(4-7) B43(4-7) B46(4-7) B47(4-7) B46(4-7) B47(4-7) B4A(4-7) B4B(4-7) B4A(4-7) B4B(4-7) B4E(4-7) B4F(4-7) B4E(4-7) B4F(4-7) - const __m512i rhs_mat_014589CD_41_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_41, (_MM_PERM_ENUM)221); //B40(12-15) B41(12-15) B40(12-15) B41(12-15) B44(12-15) B45(12-15) B44(12-15) B45(12-15) - const __m512i rhs_mat_2367ABEF_41_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_41, (_MM_PERM_ENUM)221); //B42(12-15) B43(12-15) B42(12-15) B43(12-15) B46(12-15) B47(12-15) B46(12-15) B47(12-15) + const __m512i rhs_mat_014589CD_41_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_41, (_MM_PERM_ENUM)221); //B40(12-15) B41(12-15) B40(12-15) B41(12-15) B44(12-15) B45(12-15) B44(12-15) B45(12-15) B48(12-15) B49(12-15) B48(12-15) B49(12-15) B4C(12-15) B4D(12-15) B4C(12-15) B4D(12-15) + const __m512i rhs_mat_2367ABEF_41_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_41, (_MM_PERM_ENUM)221); //B42(12-15) B43(12-15) B42(12-15) B43(12-15) B46(12-15) B47(12-15) B46(12-15) B47(12-15) B4A(12-15) B4B(12-15) B4A(12-15) B4B(12-15) B4E(12-15) B4F(12-15) B4E(12-15) B4F(12-15) - const __m512i rhs_mat_014589CD_50_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_50, (_MM_PERM_ENUM)221); //B50(4-7) B51(4-7) B50(4-7) B51(4-7) B54(4-7) B55(4-7) B54(4-7) B55(4-7) - const __m512i rhs_mat_2367ABEF_50_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_50, (_MM_PERM_ENUM)221); //B52(4-7) B53(4-7) B52(4-7) B53(4-7) B56(4-7) B57(4-7) B56(4-7) B57(4-7) + const __m512i rhs_mat_014589CD_50_sp2 = 
_mm512_shuffle_epi32(rhs_mat_014589CD_50, (_MM_PERM_ENUM)221); //B50(4-7) B51(4-7) B50(4-7) B51(4-7) B54(4-7) B55(4-7) B54(4-7) B55(4-7) B58(4-7) B59(4-7) B58(4-7) B59(4-7) B5C(4-7) B5D(4-7) B5C(4-7) B5D(4-7) + const __m512i rhs_mat_2367ABEF_50_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_50, (_MM_PERM_ENUM)221); //B52(4-7) B53(4-7) B52(4-7) B53(4-7) B56(4-7) B57(4-7) B56(4-7) B57(4-7) B5A(4-7) B5B(4-7) B5A(4-7) B5B(4-7) B5E(4-7) B5F(4-7) B5E(4-7) B5F(4-7) - const __m512i rhs_mat_014589CD_51_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_51, (_MM_PERM_ENUM)221); //B50(12-15) B51(12-15) B50(12-15) B51(12-15) B54(12-15) B55(12-15) B54(12-15) B55(12-15) - const __m512i rhs_mat_2367ABEF_51_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_51, (_MM_PERM_ENUM)221); //B52(12-15) B53(12-15) B52(12-15) B53(12-15) B56(12-15) B57(12-15) B56(12-15) B57(12-15) + const __m512i rhs_mat_014589CD_51_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_51, (_MM_PERM_ENUM)221); //B50(12-15) B51(12-15) B50(12-15) B51(12-15) B54(12-15) B55(12-15) B54(12-15) B55(12-15) B58(12-15) B59(12-15) B58(12-15) B59(12-15) B5C(12-15) B5D(12-15) B5C(12-15) B5D(12-15) + const __m512i rhs_mat_2367ABEF_51_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_51, (_MM_PERM_ENUM)221); //B52(12-15) B53(12-15) B52(12-15) B53(12-15) B56(12-15) B57(12-15) B56(12-15) B57(12-15) B5A(12-15) B5B(12-15) B5A(12-15) B5B(12-15) B5E(12-15) B5F(12-15) B5E(12-15) B5F(12-15) - const __m512i rhs_mat_014589CD_60_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_60, (_MM_PERM_ENUM)221); //B60(4-7) B61(4-7) B60(4-7) B61(4-7) B64(4-7) B65(4-7) B64(4-7) B65(4-7) - const __m512i rhs_mat_2367ABEF_60_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_60, (_MM_PERM_ENUM)221); //B62(4-7) B63(4-7) B62(4-7) B63(4-7) B66(4-7) B67(4-7) B66(4-7) B67(4-7) + const __m512i rhs_mat_014589CD_60_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_60, (_MM_PERM_ENUM)221); //B60(4-7) B61(4-7) B60(4-7) B61(4-7) B64(4-7) B65(4-7) B64(4-7) B65(4-7) B68(4-7) B69(4-7) B68(4-7) B69(4-7) B6C(4-7) B6D(4-7) B6C(4-7) B6D(4-7) + const __m512i rhs_mat_2367ABEF_60_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_60, (_MM_PERM_ENUM)221); //B62(4-7) B63(4-7) B62(4-7) B63(4-7) B66(4-7) B67(4-7) B66(4-7) B67(4-7) B6A(4-7) B6B(4-7) B6A(4-7) B6B(4-7) B6E(4-7) B6F(4-7) B6E(4-7) B6F(4-7) - const __m512i rhs_mat_014589CD_61_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_61, (_MM_PERM_ENUM)221); //B60(12-15) B61(12-15) B60(12-15) B61(12-15) B64(12-15) B65(12-15) B64(12-15) B65(12-15) - const __m512i rhs_mat_2367ABEF_61_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_61, (_MM_PERM_ENUM)221); //B62(12-15) B63(12-15) B62(12-15) B63(12-15) B66(12-15) B67(12-15) B66(12-15) B67(12-15) + const __m512i rhs_mat_014589CD_61_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_61, (_MM_PERM_ENUM)221); //B60(12-15) B61(12-15) B60(12-15) B61(12-15) B64(12-15) B65(12-15) B64(12-15) B65(12-15) B68(12-15) B69(12-15) B68(12-15) B69(12-15) B6C(12-15) B6D(12-15) B6C(12-15) B6D(12-15) + const __m512i rhs_mat_2367ABEF_61_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_61, (_MM_PERM_ENUM)221); //B62(12-15) B63(12-15) B62(12-15) B63(12-15) B66(12-15) B67(12-15) B66(12-15) B67(12-15) B6A(12-15) B6B(12-15) B6A(12-15) B6B(12-15) B6E(12-15) B6F(12-15) B6E(12-15) B6F(12-15) - const __m512i rhs_mat_014589CD_70_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_70, (_MM_PERM_ENUM)221); //B70(4-7) B71(4-7) B70(4-7) B71(4-7) B74(4-7) B75(4-7) B74(4-7) B75(4-7) - const __m512i rhs_mat_2367ABEF_70_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_70, (_MM_PERM_ENUM)221); //B72(4-7) B73(4-7) B72(4-7) B73(4-7) 
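// Illustrative aside, a scalar reference sketch with a hypothetical helper name, not part
// of this hunk: the 6-bit values shuffled above were rebuilt earlier in the hunk by masking
// the ql nibble and OR-ing in the qh bits shifted left by 4, i.e. per weight:
const auto q6_reconstruct = [](uint8_t ql_nibble, uint8_t qh_2bit) -> uint8_t {
    // (ql & 0xF) | ((qh & 0x3) << 4) gives the raw 6-bit value in [0, 63]
    return (uint8_t)((ql_nibble & 0x0F) | ((qh_2bit & 0x03) << 4));
};
// The usual -32 offset of Q6_K is not folded in at this point; it appears to be handled
// separately through the m32s_expanded maddubs terms computed on the activations below.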
B76(4-7) B77(4-7) B76(4-7) B77(4-7) + const __m512i rhs_mat_014589CD_70_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_70, (_MM_PERM_ENUM)221); //B70(4-7) B71(4-7) B70(4-7) B71(4-7) B74(4-7) B75(4-7) B74(4-7) B75(4-7) B78(4-7) B79(4-7) B78(4-7) B79(4-7) B7C(4-7) B7D(4-7) B7C(4-7) B7D(4-7) + const __m512i rhs_mat_2367ABEF_70_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_70, (_MM_PERM_ENUM)221); //B72(4-7) B73(4-7) B72(4-7) B73(4-7) B76(4-7) B77(4-7) B76(4-7) B77(4-7) B7A(4-7) B7B(4-7) B7A(4-7) B7B(4-7) B7E(4-7) B7F(4-7) B7E(4-7) B7F(4-7) - const __m512i rhs_mat_014589CD_71_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_71, (_MM_PERM_ENUM)221); //B70(12-15) B71(12-15) B70(12-15) B71(12-15) B74(12-15) B75(12-15) B74(12-15) B75(12-15) - const __m512i rhs_mat_2367ABEF_71_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_71, (_MM_PERM_ENUM)221); //B72(12-15) B73(12-15) B72(12-15) B73(12-15) B76(12-15) B77(12-15) B76(12-15) B77(12-15) + const __m512i rhs_mat_014589CD_71_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_71, (_MM_PERM_ENUM)221); //B70(12-15) B71(12-15) B70(12-15) B71(12-15) B74(12-15) B75(12-15) B74(12-15) B75(12-15) B78(12-15) B79(12-15) B78(12-15) B79(12-15) B7C(12-15) B7D(12-15) B7C(12-15) B7D(12-15) + const __m512i rhs_mat_2367ABEF_71_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_71, (_MM_PERM_ENUM)221); //B72(12-15) B73(12-15) B72(12-15) B73(12-15) B76(12-15) B77(12-15) B76(12-15) B77(12-15) B7A(12-15) B7B(12-15) B7A(12-15) B7B(12-15) B7E(12-15) B7F(12-15) B7E(12-15) B7F(12-15) //Scales of corresponding sub blocks from different Q6_K structures are stored together //s00 s01 s10 s11 s20 s21 ...... s70 s71 - // Combine mins and scales for sub-blocks: 0-1, 2-3, 4-5, 6-7 in the sb loop const __m128i scales_01_0 = _mm_loadu_si128((const __m128i *)(b_ptr_0[b].scales + sb * 64)); const __m128i scales_23_0 = _mm_loadu_si128((const __m128i *)(b_ptr_0[b].scales + 16 + sb * 64)); const __m128i scales_45_0 = _mm_loadu_si128((const __m128i *)(b_ptr_0[b].scales + 32 + sb * 64)); @@ -8077,7 +8080,6 @@ void ggml_gemm_q6_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo const __m128i scales_45_1 = _mm_loadu_si128((const __m128i *)(b_ptr_1[b].scales + 32 + sb * 64)); const __m128i scales_67_1 = _mm_loadu_si128((const __m128i *)(b_ptr_1[b].scales + 48 + sb * 64)); - // Combine mins and scales for sub-blocks: 0-1, 2-3, 4-5, 6-7 in the sb loop const __m256i scales_01 = _mm256_insertf128_si256(_mm256_castsi128_si256(scales_01_0), scales_01_1, 1); const __m256i scales_23 = _mm256_insertf128_si256(_mm256_castsi128_si256(scales_23_0), scales_23_1, 1); const __m256i scales_45 = _mm256_insertf128_si256(_mm256_castsi128_si256(scales_45_0), scales_45_1, 1); @@ -8116,7 +8118,6 @@ void ggml_gemm_q6_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo const __m512i scale_014589CD_7 = _mm512_shuffle_epi32(scales_7, (_MM_PERM_ENUM)68); const __m512i scale_2367ABEF_7 = _mm512_shuffle_epi32(scales_7, (_MM_PERM_ENUM)238); - // Load the four block_q8_k quantized values interleaved with each other in chunks of eight bytes - A0,A1,A2,A3 // Loaded as set of 128 bit vectors and repeated and stored into a 256 bit vector before again repeating into 512 bit vector __m256i lhs_mat_ymm_0123_00 = _mm256_loadu_si256((const __m256i * )((a_ptr[b].qs + 512 * sb))); @@ -8169,7 +8170,6 @@ void ggml_gemm_q6_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo __m256i lhs_mat_ymm_01_71 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_71, lhs_mat_ymm_0123_71, 0); __m256i lhs_mat_ymm_23_71 = 
_mm256_permute2f128_si256(lhs_mat_ymm_0123_71, lhs_mat_ymm_0123_71, 17); - __m512i lhs_mat_01_00 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_01_00), lhs_mat_ymm_01_00, 1); __m512i lhs_mat_23_00 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_23_00), lhs_mat_ymm_23_00, 1); __m512i lhs_mat_01_01 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_01_01), lhs_mat_ymm_01_01, 1); @@ -8243,203 +8243,207 @@ void ggml_gemm_q6_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo __m512i lhs_mat_s_01_71 = _mm512_maddubs_epi16(m32s_expanded, lhs_mat_01_71); __m512i lhs_mat_s_23_71 = _mm512_maddubs_epi16(m32s_expanded, lhs_mat_23_71); - // Shuffle pattern one - left side input - const __m512i lhs_mat_01_00_sp1 = _mm512_shuffle_epi32(lhs_mat_01_00, (_MM_PERM_ENUM)160); //A00(0-3) A00(0-3) A01(0-3) A01(0-3) A00(0-3) A00(0-3) A01(0-3) A01(0-3) - const __m512i lhs_mat_23_00_sp1 = _mm512_shuffle_epi32(lhs_mat_23_00, (_MM_PERM_ENUM)160); //A02(0-3) A03(0-3) A02(0-3) A03(0-3) A02(0-3) A03(0-3) A02(0-3) A03(0-3) + // Shuffle pattern one – left-side input - const __m512i lhs_mat_01_01_sp1 = _mm512_shuffle_epi32(lhs_mat_01_01, (_MM_PERM_ENUM)160); //A00(8-11) A00(8-11) A01(8-11) A01(8-11) A00(8-11) A00(8-11) A01(8-11) A01(8-11) - const __m512i lhs_mat_23_01_sp1 = _mm512_shuffle_epi32(lhs_mat_23_01, (_MM_PERM_ENUM)160); //A02(8-11) A03(8-11) A02(8-11) A03(8-11) A02(8-11) A03(8-11) A02(8-11) A03(8-11) + const __m512i lhs_mat_01_00_sp1 = _mm512_shuffle_epi32(lhs_mat_01_00, (_MM_PERM_ENUM)160); //A00(0-3) A00(0-3) A01(0-3) A01(0-3) + const __m512i lhs_mat_23_00_sp1 = _mm512_shuffle_epi32(lhs_mat_23_00, (_MM_PERM_ENUM)160); //A02(0-3) A03(0-3) A02(0-3) A03(0-3) - const __m512i lhs_mat_01_10_sp1 = _mm512_shuffle_epi32(lhs_mat_01_10, (_MM_PERM_ENUM)160); //A10(0-3) A10(0-3) A11(0-3) A11(0-3) A10(0-3) A10(0-3) A11(0-3) A11(0-3) - const __m512i lhs_mat_23_10_sp1 = _mm512_shuffle_epi32(lhs_mat_23_10, (_MM_PERM_ENUM)160); //A12(0-3) A13(0-3) A12(0-3) A13(0-3) A12(0-3) A13(0-3) A12(0-3) A13(0-3) + const __m512i lhs_mat_01_01_sp1 = _mm512_shuffle_epi32(lhs_mat_01_01, (_MM_PERM_ENUM)160); //A00(8-11) A00(8-11) A01(8-11) A01(8-11) + const __m512i lhs_mat_23_01_sp1 = _mm512_shuffle_epi32(lhs_mat_23_01, (_MM_PERM_ENUM)160); //A02(8-11) A03(8-11) A02(8-11) A03(8-11) - const __m512i lhs_mat_01_11_sp1 = _mm512_shuffle_epi32(lhs_mat_01_11, (_MM_PERM_ENUM)160); //A10(8-11) A10(8-11) A11(8-11) A11(8-11) A10(8-11) A10(8-11) A11(8-11) A11(8-11) - const __m512i lhs_mat_23_11_sp1 = _mm512_shuffle_epi32(lhs_mat_23_11, (_MM_PERM_ENUM)160); //A12(8-11) A13(8-11) A12(8-11) A13(8-11) A12(8-11) A13(8-11) A12(8-11) A13(8-11) + const __m512i lhs_mat_01_10_sp1 = _mm512_shuffle_epi32(lhs_mat_01_10, (_MM_PERM_ENUM)160); //A10(0-3) A10(0-3) A11(0-3) A11(0-3) + const __m512i lhs_mat_23_10_sp1 = _mm512_shuffle_epi32(lhs_mat_23_10, (_MM_PERM_ENUM)160); //A12(0-3) A13(0-3) A12(0-3) A13(0-3) - const __m512i lhs_mat_01_20_sp1 = _mm512_shuffle_epi32(lhs_mat_01_20, (_MM_PERM_ENUM)160); //A20(0-3) A20(0-3) A21(0-3) A21(0-3) A20(0-3) A20(0-3) A21(0-3) A21(0-3) - const __m512i lhs_mat_23_20_sp1 = _mm512_shuffle_epi32(lhs_mat_23_20, (_MM_PERM_ENUM)160); //A22(0-3) A23(0-3) A22(0-3) A23(0-3) A22(0-3) A23(0-3) A22(0-3) A23(0-3) + const __m512i lhs_mat_01_11_sp1 = _mm512_shuffle_epi32(lhs_mat_01_11, (_MM_PERM_ENUM)160); //A10(8-11) A10(8-11) A11(8-11) A11(8-11) + const __m512i lhs_mat_23_11_sp1 = _mm512_shuffle_epi32(lhs_mat_23_11, (_MM_PERM_ENUM)160); //A12(8-11) A13(8-11) A12(8-11) A13(8-11) - const __m512i lhs_mat_01_21_sp1 = 
_mm512_shuffle_epi32(lhs_mat_01_21, (_MM_PERM_ENUM)160); //A20(8-11) A20(8-11) A21(8-11) A21(8-11) A20(8-11) A20(8-11) A21(8-11) A21(8-11) - const __m512i lhs_mat_23_21_sp1 = _mm512_shuffle_epi32(lhs_mat_23_21, (_MM_PERM_ENUM)160); //A22(8-11) A23(8-11) A22(8-11) A23(8-11) A22(8-11) A23(8-11) A22(8-11) A23(8-11) + const __m512i lhs_mat_01_20_sp1 = _mm512_shuffle_epi32(lhs_mat_01_20, (_MM_PERM_ENUM)160); //A20(0-3) A20(0-3) A21(0-3) A21(0-3) + const __m512i lhs_mat_23_20_sp1 = _mm512_shuffle_epi32(lhs_mat_23_20, (_MM_PERM_ENUM)160); //A22(0-3) A23(0-3) A22(0-3) A23(0-3) - const __m512i lhs_mat_01_30_sp1 = _mm512_shuffle_epi32(lhs_mat_01_30, (_MM_PERM_ENUM)160); //A30(0-3) A30(0-3) A31(0-3) A31(0-3) A30(0-3) A30(0-3) A31(0-3) A31(0-3) - const __m512i lhs_mat_23_30_sp1 = _mm512_shuffle_epi32(lhs_mat_23_30, (_MM_PERM_ENUM)160); //A32(0-3) A33(0-3) A32(0-3) A33(0-3) A32(0-3) A33(0-3) A32(0-3) A33(0-3) + const __m512i lhs_mat_01_21_sp1 = _mm512_shuffle_epi32(lhs_mat_01_21, (_MM_PERM_ENUM)160); //A20(8-11) A20(8-11) A21(8-11) A21(8-11) + const __m512i lhs_mat_23_21_sp1 = _mm512_shuffle_epi32(lhs_mat_23_21, (_MM_PERM_ENUM)160); //A22(8-11) A23(8-11) A22(8-11) A23(8-11) - const __m512i lhs_mat_01_31_sp1 = _mm512_shuffle_epi32(lhs_mat_01_31, (_MM_PERM_ENUM)160); //A30(8-11) A30(8-11) A31(8-11) A31(8-11) A30(8-11) A30(8-11) A31(8-11) A31(8-11) - const __m512i lhs_mat_23_31_sp1 = _mm512_shuffle_epi32(lhs_mat_23_31, (_MM_PERM_ENUM)160); //A32(8-11) A33(8-11) A32(8-11) A33(8-11) A32(8-11) A33(8-11) A32(8-11) A33(8-11) + const __m512i lhs_mat_01_30_sp1 = _mm512_shuffle_epi32(lhs_mat_01_30, (_MM_PERM_ENUM)160); //A30(0-3) A30(0-3) A31(0-3) A31(0-3) + const __m512i lhs_mat_23_30_sp1 = _mm512_shuffle_epi32(lhs_mat_23_30, (_MM_PERM_ENUM)160); //A32(0-3) A33(0-3) A32(0-3) A33(0-3) - const __m512i lhs_mat_01_40_sp1 = _mm512_shuffle_epi32(lhs_mat_01_40, (_MM_PERM_ENUM)160); //A40(0-3) A40(0-3) A41(0-3) A41(0-3) A40(0-3) A40(0-3) A41(0-3) A41(0-3) - const __m512i lhs_mat_23_40_sp1 = _mm512_shuffle_epi32(lhs_mat_23_40, (_MM_PERM_ENUM)160); //A42(0-3) A43(0-3) A42(0-3) A43(0-3) A42(0-3) A43(0-3) A42(0-3) A43(0-3) + const __m512i lhs_mat_01_31_sp1 = _mm512_shuffle_epi32(lhs_mat_01_31, (_MM_PERM_ENUM)160); //A30(8-11) A30(8-11) A31(8-11) A31(8-11) + const __m512i lhs_mat_23_31_sp1 = _mm512_shuffle_epi32(lhs_mat_23_31, (_MM_PERM_ENUM)160); //A32(8-11) A33(8-11) A32(8-11) A33(8-11) - const __m512i lhs_mat_01_41_sp1 = _mm512_shuffle_epi32(lhs_mat_01_41, (_MM_PERM_ENUM)160); //A40(8-11) A40(8-11) A41(8-11) A41(8-11) A40(8-11) A40(8-11) A41(8-11) A41(8-11) - const __m512i lhs_mat_23_41_sp1 = _mm512_shuffle_epi32(lhs_mat_23_41, (_MM_PERM_ENUM)160); //A42(8-11) A43(8-11) A42(8-11) A43(8-11) A42(8-11) A43(8-11) A42(8-11) A43(8-11) + const __m512i lhs_mat_01_40_sp1 = _mm512_shuffle_epi32(lhs_mat_01_40, (_MM_PERM_ENUM)160); //A40(0-3) A40(0-3) A41(0-3) A41(0-3) + const __m512i lhs_mat_23_40_sp1 = _mm512_shuffle_epi32(lhs_mat_23_40, (_MM_PERM_ENUM)160); //A42(0-3) A43(0-3) A42(0-3) A43(0-3) - const __m512i lhs_mat_01_50_sp1 = _mm512_shuffle_epi32(lhs_mat_01_50, (_MM_PERM_ENUM)160); //A50(0-3) A50(0-3) A51(0-3) A51(0-3) A50(0-3) A50(0-3) A51(0-3) A51(0-3) - const __m512i lhs_mat_23_50_sp1 = _mm512_shuffle_epi32(lhs_mat_23_50, (_MM_PERM_ENUM)160); //A52(0-3) A53(0-3) A52(0-3) A53(0-3) A52(0-3) A53(0-3) A52(0-3) A53(0-3) + const __m512i lhs_mat_01_41_sp1 = _mm512_shuffle_epi32(lhs_mat_01_41, (_MM_PERM_ENUM)160); //A40(8-11) A40(8-11) A41(8-11) A41(8-11) + const __m512i lhs_mat_23_41_sp1 = _mm512_shuffle_epi32(lhs_mat_23_41, 
(_MM_PERM_ENUM)160); //A42(8-11) A43(8-11) A42(8-11) A43(8-11) - const __m512i lhs_mat_01_51_sp1 = _mm512_shuffle_epi32(lhs_mat_01_51, (_MM_PERM_ENUM)160); //A50(8-11) A50(8-11) A51(8-11) A51(8-11) A50(8-11) A50(8-11) A51(8-11) A51(8-11) - const __m512i lhs_mat_23_51_sp1 = _mm512_shuffle_epi32(lhs_mat_23_51, (_MM_PERM_ENUM)160); //A52(8-11) A53(8-11) A52(8-11) A53(8-11) A52(8-11) A53(8-11) A52(8-11) A53(8-11) + const __m512i lhs_mat_01_50_sp1 = _mm512_shuffle_epi32(lhs_mat_01_50, (_MM_PERM_ENUM)160); //A50(0-3) A50(0-3) A51(0-3) A51(0-3) + const __m512i lhs_mat_23_50_sp1 = _mm512_shuffle_epi32(lhs_mat_23_50, (_MM_PERM_ENUM)160); //A52(0-3) A53(0-3) A52(0-3) A53(0-3) - const __m512i lhs_mat_01_60_sp1 = _mm512_shuffle_epi32(lhs_mat_01_60, (_MM_PERM_ENUM)160); //A60(0-3) A60(0-3) A61(0-3) A61(0-3) A60(0-3) A60(0-3) A61(0-3) A61(0-3) - const __m512i lhs_mat_23_60_sp1 = _mm512_shuffle_epi32(lhs_mat_23_60, (_MM_PERM_ENUM)160); //A62(0-3) A63(0-3) A62(0-3) A63(0-3) A62(0-3) A63(0-3) A62(0-3) A63(0-3) + const __m512i lhs_mat_01_51_sp1 = _mm512_shuffle_epi32(lhs_mat_01_51, (_MM_PERM_ENUM)160); //A50(8-11) A50(8-11) A51(8-11) A51(8-11) + const __m512i lhs_mat_23_51_sp1 = _mm512_shuffle_epi32(lhs_mat_23_51, (_MM_PERM_ENUM)160); //A52(8-11) A53(8-11) A52(8-11) A53(8-11) - const __m512i lhs_mat_01_61_sp1 = _mm512_shuffle_epi32(lhs_mat_01_61, (_MM_PERM_ENUM)160); //A60(8-11) A60(8-11) A61(8-11) A61(8-11) A60(8-11) A60(8-11) A61(8-11) A61(8-11) - const __m512i lhs_mat_23_61_sp1 = _mm512_shuffle_epi32(lhs_mat_23_61, (_MM_PERM_ENUM)160); //A62(8-11) A63(8-11) A62(8-11) A63(8-11) A62(8-11) A63(8-11) A62(8-11) A63(8-11) + const __m512i lhs_mat_01_60_sp1 = _mm512_shuffle_epi32(lhs_mat_01_60, (_MM_PERM_ENUM)160); //A60(0-3) A60(0-3) A61(0-3) A61(0-3) + const __m512i lhs_mat_23_60_sp1 = _mm512_shuffle_epi32(lhs_mat_23_60, (_MM_PERM_ENUM)160); //A62(0-3) A63(0-3) A62(0-3) A63(0-3) - const __m512i lhs_mat_01_70_sp1 = _mm512_shuffle_epi32(lhs_mat_01_70, (_MM_PERM_ENUM)160); //A70(0-3) A70(0-3) A71(0-3) A71(0-3) A70(0-3) A70(0-3) A71(0-3) A71(0-3) - const __m512i lhs_mat_23_70_sp1 = _mm512_shuffle_epi32(lhs_mat_23_70, (_MM_PERM_ENUM)160); //A72(0-3) A73(0-3) A72(0-3) A73(0-3) A72(0-3) A73(0-3) A72(0-3) A73(0-3) + const __m512i lhs_mat_01_61_sp1 = _mm512_shuffle_epi32(lhs_mat_01_61, (_MM_PERM_ENUM)160); //A60(8-11) A60(8-11) A61(8-11) A61(8-11) + const __m512i lhs_mat_23_61_sp1 = _mm512_shuffle_epi32(lhs_mat_23_61, (_MM_PERM_ENUM)160); //A62(8-11) A63(8-11) A62(8-11) A63(8-11) - const __m512i lhs_mat_01_71_sp1 = _mm512_shuffle_epi32(lhs_mat_01_71, (_MM_PERM_ENUM)160); //A70(8-11) A70(8-11) A71(8-11) A71(8-11) A70(8-11) A70(8-11) A71(8-11) A71(8-11) - const __m512i lhs_mat_23_71_sp1 = _mm512_shuffle_epi32(lhs_mat_23_71, (_MM_PERM_ENUM)160); //A72(8-11) A73(8-11) A72(8-11) A73(8-11) A72(8-11) A73(8-11) A72(8-11) A73(8-11) + const __m512i lhs_mat_01_70_sp1 = _mm512_shuffle_epi32(lhs_mat_01_70, (_MM_PERM_ENUM)160); //A70(0-3) A70(0-3) A71(0-3) A71(0-3) + const __m512i lhs_mat_23_70_sp1 = _mm512_shuffle_epi32(lhs_mat_23_70, (_MM_PERM_ENUM)160); //A72(0-3) A73(0-3) A72(0-3) A73(0-3) - // Shuffle pattern two- left side input - const __m512i lhs_mat_01_00_sp2 = _mm512_shuffle_epi32(lhs_mat_01_00, (_MM_PERM_ENUM)245); //A00(4-7) A00(4-7) A01(4-7) A01(4-7) A00(4-7) A00(4-7) A01(4-7) A01(4-7) - const __m512i lhs_mat_23_00_sp2 = _mm512_shuffle_epi32(lhs_mat_23_00, (_MM_PERM_ENUM)245); //A02(4-7) A03(4-7) A02(4-7) A03(4-7) A02(4-7) A03(4-7) A02(4-7) A03(4-7) + const __m512i lhs_mat_01_71_sp1 = _mm512_shuffle_epi32(lhs_mat_01_71, 
(_MM_PERM_ENUM)160); //A70(8-11) A70(8-11) A71(8-11) A71(8-11) + const __m512i lhs_mat_23_71_sp1 = _mm512_shuffle_epi32(lhs_mat_23_71, (_MM_PERM_ENUM)160); //A72(8-11) A73(8-11) A72(8-11) A73(8-11) - const __m512i lhs_mat_01_01_sp2 = _mm512_shuffle_epi32(lhs_mat_01_01, (_MM_PERM_ENUM)245); //A00(12-15) A00(12-15) A01(12-15) A01(12-15) A00(12-15) A00(12-15) A01(12-15) A01(12-15) - const __m512i lhs_mat_23_01_sp2 = _mm512_shuffle_epi32(lhs_mat_23_01, (_MM_PERM_ENUM)245); //A02(12-15) A03(12-15) A02(12-15) A03(12-15) A02(12-15) A03(12-15) A02(12-15) A03(12-15) + // Shuffle pattern two – left-side input - const __m512i lhs_mat_01_10_sp2 = _mm512_shuffle_epi32(lhs_mat_01_10, (_MM_PERM_ENUM)245); //A10(4-7) A10(4-7) A11(4-7) A11(4-7) A10(4-7) A10(4-7) A11(4-7) A11(4-7) - const __m512i lhs_mat_23_10_sp2 = _mm512_shuffle_epi32(lhs_mat_23_10, (_MM_PERM_ENUM)245); //A12(4-7) A13(4-7) A12(4-7) A13(4-7) A12(4-7) A13(4-7) A12(4-7) A13(4-7) + const __m512i lhs_mat_01_00_sp2 = _mm512_shuffle_epi32(lhs_mat_01_00, (_MM_PERM_ENUM)245); //A00(4-7) A00(4-7) A01(4-7) A01(4-7) + const __m512i lhs_mat_23_00_sp2 = _mm512_shuffle_epi32(lhs_mat_23_00, (_MM_PERM_ENUM)245); //A02(4-7) A03(4-7) A02(4-7) A03(4-7) - const __m512i lhs_mat_01_11_sp2 = _mm512_shuffle_epi32(lhs_mat_01_11, (_MM_PERM_ENUM)245); //A10(12-15) A10(12-15) A11(12-15) A11(12-15) A10(12-15) A10(12-15) A11(12-15) A11(12-15) - const __m512i lhs_mat_23_11_sp2 = _mm512_shuffle_epi32(lhs_mat_23_11, (_MM_PERM_ENUM)245); //A12(12-15) A13(12-15) A12(12-15) A13(12-15) A12(12-15) A13(12-15) A12(12-15) A13(12-15) + const __m512i lhs_mat_01_01_sp2 = _mm512_shuffle_epi32(lhs_mat_01_01, (_MM_PERM_ENUM)245); //A00(12-15) A00(12-15) A01(12-15) A01(12-15) + const __m512i lhs_mat_23_01_sp2 = _mm512_shuffle_epi32(lhs_mat_23_01, (_MM_PERM_ENUM)245); //A02(12-15) A03(12-15) A02(12-15) A03(12-15) - const __m512i lhs_mat_01_20_sp2 = _mm512_shuffle_epi32(lhs_mat_01_20, (_MM_PERM_ENUM)245); //A20(4-7) A20(4-7) A21(4-7) A21(4-7) A20(4-7) A20(4-7) A21(4-7) A21(4-7) - const __m512i lhs_mat_23_20_sp2 = _mm512_shuffle_epi32(lhs_mat_23_20, (_MM_PERM_ENUM)245); //A22(4-7) A23(4-7) A22(4-7) A23(4-7) A22(4-7) A23(4-7) A22(4-7) A23(4-7) + const __m512i lhs_mat_01_10_sp2 = _mm512_shuffle_epi32(lhs_mat_01_10, (_MM_PERM_ENUM)245); //A10(4-7) A10(4-7) A11(4-7) A11(4-7) + const __m512i lhs_mat_23_10_sp2 = _mm512_shuffle_epi32(lhs_mat_23_10, (_MM_PERM_ENUM)245); //A12(4-7) A13(4-7) A12(4-7) A13(4-7) - const __m512i lhs_mat_01_21_sp2 = _mm512_shuffle_epi32(lhs_mat_01_21, (_MM_PERM_ENUM)245); //A20(12-15) A20(12-15) A21(12-15) A21(12-15) A20(12-15) A20(12-15) A21(12-15) A21(12-15) - const __m512i lhs_mat_23_21_sp2 = _mm512_shuffle_epi32(lhs_mat_23_21, (_MM_PERM_ENUM)245); //A22(12-15) A23(12-15) A22(12-15) A23(12-15) A22(12-15) A23(12-15) A22(12-15) A23(12-15) + const __m512i lhs_mat_01_11_sp2 = _mm512_shuffle_epi32(lhs_mat_01_11, (_MM_PERM_ENUM)245); //A10(12-15) A10(12-15) A11(12-15) A11(12-15) + const __m512i lhs_mat_23_11_sp2 = _mm512_shuffle_epi32(lhs_mat_23_11, (_MM_PERM_ENUM)245); //A12(12-15) A13(12-15) A12(12-15) A13(12-15) - const __m512i lhs_mat_01_30_sp2 = _mm512_shuffle_epi32(lhs_mat_01_30, (_MM_PERM_ENUM)245); //A30(4-7) A30(4-7) A31(4-7) A31(4-7) A30(4-7) A30(4-7) A31(4-7) A31(4-7) - const __m512i lhs_mat_23_30_sp2 = _mm512_shuffle_epi32(lhs_mat_23_30, (_MM_PERM_ENUM)245); //A32(4-7) A33(4-7) A32(4-7) A33(4-7) A32(4-7) A33(4-7) A32(4-7) A33(4-7) + const __m512i lhs_mat_01_20_sp2 = _mm512_shuffle_epi32(lhs_mat_01_20, (_MM_PERM_ENUM)245); //A20(4-7) A20(4-7) A21(4-7) A21(4-7) + 
const __m512i lhs_mat_23_20_sp2 = _mm512_shuffle_epi32(lhs_mat_23_20, (_MM_PERM_ENUM)245); //A22(4-7) A23(4-7) A22(4-7) A23(4-7) - const __m512i lhs_mat_01_31_sp2 = _mm512_shuffle_epi32(lhs_mat_01_31, (_MM_PERM_ENUM)245); //A30(12-15) A30(12-15) A31(12-15) A31(12-15) A30(12-15) A30(12-15) A31(12-15) A31(12-15) - const __m512i lhs_mat_23_31_sp2 = _mm512_shuffle_epi32(lhs_mat_23_31, (_MM_PERM_ENUM)245); //A32(12-15) A33(12-15) A32(12-15) A33(12-15) A32(12-15) A33(12-15) A32(12-15) A33(12-15) + const __m512i lhs_mat_01_21_sp2 = _mm512_shuffle_epi32(lhs_mat_01_21, (_MM_PERM_ENUM)245); //A20(12-15) A20(12-15) A21(12-15) A21(12-15) + const __m512i lhs_mat_23_21_sp2 = _mm512_shuffle_epi32(lhs_mat_23_21, (_MM_PERM_ENUM)245); //A22(12-15) A23(12-15) A22(12-15) A23(12-15) - const __m512i lhs_mat_01_40_sp2 = _mm512_shuffle_epi32(lhs_mat_01_40, (_MM_PERM_ENUM)245); //A40(4-7) A40(4-7) A41(4-7) A41(4-7) A40(4-7) A40(4-7) A41(4-7) A41(4-7) - const __m512i lhs_mat_23_40_sp2 = _mm512_shuffle_epi32(lhs_mat_23_40, (_MM_PERM_ENUM)245); //A42(4-7) A43(4-7) A42(4-7) A43(4-7) A42(4-7) A43(4-7) A42(4-7) A43(4-7) + const __m512i lhs_mat_01_30_sp2 = _mm512_shuffle_epi32(lhs_mat_01_30, (_MM_PERM_ENUM)245); //A30(4-7) A30(4-7) A31(4-7) A31(4-7) + const __m512i lhs_mat_23_30_sp2 = _mm512_shuffle_epi32(lhs_mat_23_30, (_MM_PERM_ENUM)245); //A32(4-7) A33(4-7) A32(4-7) A33(4-7) - const __m512i lhs_mat_01_41_sp2 = _mm512_shuffle_epi32(lhs_mat_01_41, (_MM_PERM_ENUM)245); //A40(12-15) A40(12-15) A41(12-15) A41(12-15) A40(12-15) A40(12-15) A41(12-15) A41(12-15) - const __m512i lhs_mat_23_41_sp2 = _mm512_shuffle_epi32(lhs_mat_23_41, (_MM_PERM_ENUM)245); //A42(12-15) A43(12-15) A42(12-15) A43(12-15) A42(12-15) A43(12-15) A42(12-15) A43(12-15) + const __m512i lhs_mat_01_31_sp2 = _mm512_shuffle_epi32(lhs_mat_01_31, (_MM_PERM_ENUM)245); //A30(12-15) A30(12-15) A31(12-15) A31(12-15) + const __m512i lhs_mat_23_31_sp2 = _mm512_shuffle_epi32(lhs_mat_23_31, (_MM_PERM_ENUM)245); //A32(12-15) A33(12-15) A32(12-15) A33(12-15) - const __m512i lhs_mat_01_50_sp2 = _mm512_shuffle_epi32(lhs_mat_01_50, (_MM_PERM_ENUM)245); //A50(4-7) A50(4-7) A51(4-7) A51(4-7) A50(4-7) A50(4-7) A51(4-7) A51(4-7) - const __m512i lhs_mat_23_50_sp2 = _mm512_shuffle_epi32(lhs_mat_23_50, (_MM_PERM_ENUM)245); //A52(4-7) A53(4-7) A52(4-7) A53(4-7) A52(4-7) A53(4-7) A52(4-7) A53(4-7) + const __m512i lhs_mat_01_40_sp2 = _mm512_shuffle_epi32(lhs_mat_01_40, (_MM_PERM_ENUM)245); //A40(4-7) A40(4-7) A41(4-7) A41(4-7) + const __m512i lhs_mat_23_40_sp2 = _mm512_shuffle_epi32(lhs_mat_23_40, (_MM_PERM_ENUM)245); //A42(4-7) A43(4-7) A42(4-7) A43(4-7) - const __m512i lhs_mat_01_51_sp2 = _mm512_shuffle_epi32(lhs_mat_01_51, (_MM_PERM_ENUM)245); //A50(12-15) A50(12-15) A51(12-15) A51(12-15) A50(12-15) A50(12-15) A51(12-15) A51(12-15) - const __m512i lhs_mat_23_51_sp2 = _mm512_shuffle_epi32(lhs_mat_23_51, (_MM_PERM_ENUM)245); //A52(12-15) A53(12-15) A52(12-15) A53(12-15) A52(12-15) A53(12-15) A52(12-15) A53(12-15) + const __m512i lhs_mat_01_41_sp2 = _mm512_shuffle_epi32(lhs_mat_01_41, (_MM_PERM_ENUM)245); //A40(12-15) A40(12-15) A41(12-15) A41(12-15) + const __m512i lhs_mat_23_41_sp2 = _mm512_shuffle_epi32(lhs_mat_23_41, (_MM_PERM_ENUM)245); //A42(12-15) A43(12-15) A42(12-15) A43(12-15) - const __m512i lhs_mat_01_60_sp2 = _mm512_shuffle_epi32(lhs_mat_01_60, (_MM_PERM_ENUM)245); //A60(4-7) A60(4-7) A61(4-7) A61(4-7) A60(4-7) A60(4-7) A61(4-7) A61(4-7) - const __m512i lhs_mat_23_60_sp2 = _mm512_shuffle_epi32(lhs_mat_23_60, (_MM_PERM_ENUM)245); //A62(4-7) A63(4-7) A62(4-7) A63(4-7) 
A62(4-7) A63(4-7) A62(4-7) A63(4-7) + const __m512i lhs_mat_01_50_sp2 = _mm512_shuffle_epi32(lhs_mat_01_50, (_MM_PERM_ENUM)245); //A50(4-7) A50(4-7) A51(4-7) A51(4-7) + const __m512i lhs_mat_23_50_sp2 = _mm512_shuffle_epi32(lhs_mat_23_50, (_MM_PERM_ENUM)245); //A52(4-7) A53(4-7) A52(4-7) A53(4-7) - const __m512i lhs_mat_01_61_sp2 = _mm512_shuffle_epi32(lhs_mat_01_61, (_MM_PERM_ENUM)245); //A60(12-15) A60(12-15) A61(12-15) A61(12-15) A60(12-15) A60(12-15) A61(12-15) A61(12-15) - const __m512i lhs_mat_23_61_sp2 = _mm512_shuffle_epi32(lhs_mat_23_61, (_MM_PERM_ENUM)245); //A62(12-15) A63(12-15) A62(12-15) A63(12-15) A62(12-15) A63(12-15) A62(12-15) A63(12-15) + const __m512i lhs_mat_01_51_sp2 = _mm512_shuffle_epi32(lhs_mat_01_51, (_MM_PERM_ENUM)245); //A50(12-15) A50(12-15) A51(12-15) A51(12-15) + const __m512i lhs_mat_23_51_sp2 = _mm512_shuffle_epi32(lhs_mat_23_51, (_MM_PERM_ENUM)245); //A52(12-15) A53(12-15) A52(12-15) A53(12-15) - const __m512i lhs_mat_01_70_sp2 = _mm512_shuffle_epi32(lhs_mat_01_70, (_MM_PERM_ENUM)245); //A70(4-7) A70(4-7) A71(4-7) A71(4-7) A70(4-7) A70(4-7) A71(4-7) A71(4-7) - const __m512i lhs_mat_23_70_sp2 = _mm512_shuffle_epi32(lhs_mat_23_70, (_MM_PERM_ENUM)245); //A72(4-7) A73(4-7) A72(4-7) A73(4-7) A72(4-7) A73(4-7) A72(4-7) A73(4-7) + const __m512i lhs_mat_01_60_sp2 = _mm512_shuffle_epi32(lhs_mat_01_60, (_MM_PERM_ENUM)245); //A60(4-7) A60(4-7) A61(4-7) A61(4-7) + const __m512i lhs_mat_23_60_sp2 = _mm512_shuffle_epi32(lhs_mat_23_60, (_MM_PERM_ENUM)245); //A62(4-7) A63(4-7) A62(4-7) A63(4-7) - const __m512i lhs_mat_01_71_sp2 = _mm512_shuffle_epi32(lhs_mat_01_71, (_MM_PERM_ENUM)245); //A70(12-15) A70(12-15) A71(12-15) A71(12-15) A70(12-15) A70(12-15) A71(12-15) A71(12-15) - const __m512i lhs_mat_23_71_sp2 = _mm512_shuffle_epi32(lhs_mat_23_71, (_MM_PERM_ENUM)245); //A72(12-15) A73(12-15) A72(12-15) A73(12-15) A72(12-15) A73(12-15) A72(12-15) A73(12-15) + const __m512i lhs_mat_01_61_sp2 = _mm512_shuffle_epi32(lhs_mat_01_61, (_MM_PERM_ENUM)245); //A60(12-15) A60(12-15) A61(12-15) A61(12-15) + const __m512i lhs_mat_23_61_sp2 = _mm512_shuffle_epi32(lhs_mat_23_61, (_MM_PERM_ENUM)245); //A62(12-15) A63(12-15) A62(12-15) A63(12-15) - // Shuffle pattern one - left side input - const __m512i lhs_mat_s_01_00_sp1 = _mm512_shuffle_epi32(lhs_mat_s_01_00, (_MM_PERM_ENUM)160); //A00(0-3) A00(0-3) A01(0-3) A01(0-3) A00(0-3) A00(0-3) A01(0-3) A01(0-3) - const __m512i lhs_mat_s_23_00_sp1 = _mm512_shuffle_epi32(lhs_mat_s_23_00, (_MM_PERM_ENUM)160); //A02(0-3) A03(0-3) A02(0-3) A03(0-3) A02(0-3) A03(0-3) A02(0-3) A03(0-3) + const __m512i lhs_mat_01_70_sp2 = _mm512_shuffle_epi32(lhs_mat_01_70, (_MM_PERM_ENUM)245); //A70(4-7) A70(4-7) A71(4-7) A71(4-7) + const __m512i lhs_mat_23_70_sp2 = _mm512_shuffle_epi32(lhs_mat_23_70, (_MM_PERM_ENUM)245); //A72(4-7) A73(4-7) A72(4-7) A73(4-7) - const __m512i lhs_mat_s_01_01_sp1 = _mm512_shuffle_epi32(lhs_mat_s_01_01, (_MM_PERM_ENUM)160); //A00(8-11) A00(8-11) A01(8-11) A01(8-11) A00(8-11) A00(8-11) A01(8-11) A01(8-11) - const __m512i lhs_mat_s_23_01_sp1 = _mm512_shuffle_epi32(lhs_mat_s_23_01, (_MM_PERM_ENUM)160); //A02(8-11) A03(8-11) A02(8-11) A03(8-11) A02(8-11) A03(8-11) A02(8-11) A03(8-11) + const __m512i lhs_mat_01_71_sp2 = _mm512_shuffle_epi32(lhs_mat_01_71, (_MM_PERM_ENUM)245); //A70(12-15) A70(12-15) A71(12-15) A71(12-15) + const __m512i lhs_mat_23_71_sp2 = _mm512_shuffle_epi32(lhs_mat_23_71, (_MM_PERM_ENUM)245); //A72(12-15) A73(12-15) A72(12-15) A73(12-15) - const __m512i lhs_mat_s_01_10_sp1 = _mm512_shuffle_epi32(lhs_mat_s_01_10, 
(_MM_PERM_ENUM)160); //A10(0-3) A10(0-3) A11(0-3) A11(0-3) A10(0-3) A10(0-3) A11(0-3) A11(0-3) - const __m512i lhs_mat_s_23_10_sp1 = _mm512_shuffle_epi32(lhs_mat_s_23_10, (_MM_PERM_ENUM)160); //A12(0-3) A13(0-3) A12(0-3) A13(0-3) A12(0-3) A13(0-3) A12(0-3) A13(0-3) + // Shuffle pattern one – left-side input - const __m512i lhs_mat_s_01_11_sp1 = _mm512_shuffle_epi32(lhs_mat_s_01_11, (_MM_PERM_ENUM)160); //A10(8-11) A10(8-11) A11(8-11) A11(8-11) A10(8-11) A10(8-11) A11(8-11) A11(8-11) - const __m512i lhs_mat_s_23_11_sp1 = _mm512_shuffle_epi32(lhs_mat_s_23_11, (_MM_PERM_ENUM)160); //A12(8-11) A13(8-11) A12(8-11) A13(8-11) A12(8-11) A13(8-11) A12(8-11) A13(8-11) + const __m512i lhs_mat_s_01_00_sp1 = _mm512_shuffle_epi32(lhs_mat_s_01_00, (_MM_PERM_ENUM)160); //A00(0-3) A00(0-3) A01(0-3) A01(0-3) + const __m512i lhs_mat_s_23_00_sp1 = _mm512_shuffle_epi32(lhs_mat_s_23_00, (_MM_PERM_ENUM)160); //A02(0-3) A03(0-3) A02(0-3) A03(0-3) - const __m512i lhs_mat_s_01_20_sp1 = _mm512_shuffle_epi32(lhs_mat_s_01_20, (_MM_PERM_ENUM)160); //A20(0-3) A20(0-3) A21(0-3) A21(0-3) A20(0-3) A20(0-3) A21(0-3) A21(0-3) - const __m512i lhs_mat_s_23_20_sp1 = _mm512_shuffle_epi32(lhs_mat_s_23_20, (_MM_PERM_ENUM)160); //A22(0-3) A23(0-3) A22(0-3) A23(0-3) A22(0-3) A23(0-3) A22(0-3) A23(0-3) + const __m512i lhs_mat_s_01_01_sp1 = _mm512_shuffle_epi32(lhs_mat_s_01_01, (_MM_PERM_ENUM)160); //A00(8-11) A00(8-11) A01(8-11) A01(8-11) + const __m512i lhs_mat_s_23_01_sp1 = _mm512_shuffle_epi32(lhs_mat_s_23_01, (_MM_PERM_ENUM)160); //A02(8-11) A03(8-11) A02(8-11) A03(8-11) - const __m512i lhs_mat_s_01_21_sp1 = _mm512_shuffle_epi32(lhs_mat_s_01_21, (_MM_PERM_ENUM)160); //A20(8-11) A20(8-11) A21(8-11) A21(8-11) A20(8-11) A20(8-11) A21(8-11) A21(8-11) - const __m512i lhs_mat_s_23_21_sp1 = _mm512_shuffle_epi32(lhs_mat_s_23_21, (_MM_PERM_ENUM)160); //A22(8-11) A23(8-11) A22(8-11) A23(8-11) A22(8-11) A23(8-11) A22(8-11) A23(8-11) + const __m512i lhs_mat_s_01_10_sp1 = _mm512_shuffle_epi32(lhs_mat_s_01_10, (_MM_PERM_ENUM)160); //A10(0-3) A10(0-3) A11(0-3) A11(0-3) + const __m512i lhs_mat_s_23_10_sp1 = _mm512_shuffle_epi32(lhs_mat_s_23_10, (_MM_PERM_ENUM)160); //A12(0-3) A13(0-3) A12(0-3) A13(0-3) - const __m512i lhs_mat_s_01_30_sp1 = _mm512_shuffle_epi32(lhs_mat_s_01_30, (_MM_PERM_ENUM)160); //A30(0-3) A30(0-3) A31(0-3) A31(0-3) A30(0-3) A30(0-3) A31(0-3) A31(0-3) - const __m512i lhs_mat_s_23_30_sp1 = _mm512_shuffle_epi32(lhs_mat_s_23_30, (_MM_PERM_ENUM)160); //A32(0-3) A33(0-3) A32(0-3) A33(0-3) A32(0-3) A33(0-3) A32(0-3) A33(0-3) + const __m512i lhs_mat_s_01_11_sp1 = _mm512_shuffle_epi32(lhs_mat_s_01_11, (_MM_PERM_ENUM)160); //A10(8-11) A10(8-11) A11(8-11) A11(8-11) + const __m512i lhs_mat_s_23_11_sp1 = _mm512_shuffle_epi32(lhs_mat_s_23_11, (_MM_PERM_ENUM)160); //A12(8-11) A13(8-11) A12(8-11) A13(8-11) - const __m512i lhs_mat_s_01_31_sp1 = _mm512_shuffle_epi32(lhs_mat_s_01_31, (_MM_PERM_ENUM)160); //A30(8-11) A30(8-11) A31(8-11) A31(8-11) A30(8-11) A30(8-11) A31(8-11) A31(8-11) - const __m512i lhs_mat_s_23_31_sp1 = _mm512_shuffle_epi32(lhs_mat_s_23_31, (_MM_PERM_ENUM)160); //A32(8-11) A33(8-11) A32(8-11) A33(8-11) A32(8-11) A33(8-11) A32(8-11) A33(8-11) + const __m512i lhs_mat_s_01_20_sp1 = _mm512_shuffle_epi32(lhs_mat_s_01_20, (_MM_PERM_ENUM)160); //A20(0-3) A20(0-3) A21(0-3) A21(0-3) + const __m512i lhs_mat_s_23_20_sp1 = _mm512_shuffle_epi32(lhs_mat_s_23_20, (_MM_PERM_ENUM)160); //A22(0-3) A23(0-3) A22(0-3) A23(0-3) - const __m512i lhs_mat_s_01_40_sp1 = _mm512_shuffle_epi32(lhs_mat_s_01_40, (_MM_PERM_ENUM)160); //A40(0-3) A40(0-3) 
A41(0-3) A41(0-3) A40(0-3) A40(0-3) A41(0-3) A41(0-3) - const __m512i lhs_mat_s_23_40_sp1 = _mm512_shuffle_epi32(lhs_mat_s_23_40, (_MM_PERM_ENUM)160); //A42(0-3) A43(0-3) A42(0-3) A43(0-3) A42(0-3) A43(0-3) A42(0-3) A43(0-3) + const __m512i lhs_mat_s_01_21_sp1 = _mm512_shuffle_epi32(lhs_mat_s_01_21, (_MM_PERM_ENUM)160); //A20(8-11) A20(8-11) A21(8-11) A21(8-11) + const __m512i lhs_mat_s_23_21_sp1 = _mm512_shuffle_epi32(lhs_mat_s_23_21, (_MM_PERM_ENUM)160); //A22(8-11) A23(8-11) A22(8-11) A23(8-11) - const __m512i lhs_mat_s_01_41_sp1 = _mm512_shuffle_epi32(lhs_mat_s_01_41, (_MM_PERM_ENUM)160); //A40(8-11) A40(8-11) A41(8-11) A41(8-11) A40(8-11) A40(8-11) A41(8-11) A41(8-11) - const __m512i lhs_mat_s_23_41_sp1 = _mm512_shuffle_epi32(lhs_mat_s_23_41, (_MM_PERM_ENUM)160); //A42(8-11) A43(8-11) A42(8-11) A43(8-11) A42(8-11) A43(8-11) A42(8-11) A43(8-11) + const __m512i lhs_mat_s_01_30_sp1 = _mm512_shuffle_epi32(lhs_mat_s_01_30, (_MM_PERM_ENUM)160); //A30(0-3) A30(0-3) A31(0-3) A31(0-3) + const __m512i lhs_mat_s_23_30_sp1 = _mm512_shuffle_epi32(lhs_mat_s_23_30, (_MM_PERM_ENUM)160); //A32(0-3) A33(0-3) A32(0-3) A33(0-3) - const __m512i lhs_mat_s_01_50_sp1 = _mm512_shuffle_epi32(lhs_mat_s_01_50, (_MM_PERM_ENUM)160); //A50(0-3) A50(0-3) A51(0-3) A51(0-3) A50(0-3) A50(0-3) A51(0-3) A51(0-3) - const __m512i lhs_mat_s_23_50_sp1 = _mm512_shuffle_epi32(lhs_mat_s_23_50, (_MM_PERM_ENUM)160); //A52(0-3) A53(0-3) A52(0-3) A53(0-3) A52(0-3) A53(0-3) A52(0-3) A53(0-3) + const __m512i lhs_mat_s_01_31_sp1 = _mm512_shuffle_epi32(lhs_mat_s_01_31, (_MM_PERM_ENUM)160); //A30(8-11) A30(8-11) A31(8-11) A31(8-11) + const __m512i lhs_mat_s_23_31_sp1 = _mm512_shuffle_epi32(lhs_mat_s_23_31, (_MM_PERM_ENUM)160); //A32(8-11) A33(8-11) A32(8-11) A33(8-11) - const __m512i lhs_mat_s_01_51_sp1 = _mm512_shuffle_epi32(lhs_mat_s_01_51, (_MM_PERM_ENUM)160); //A50(8-11) A50(8-11) A51(8-11) A51(8-11) A50(8-11) A50(8-11) A51(8-11) A51(8-11) - const __m512i lhs_mat_s_23_51_sp1 = _mm512_shuffle_epi32(lhs_mat_s_23_51, (_MM_PERM_ENUM)160); //A52(8-11) A53(8-11) A52(8-11) A53(8-11) A52(8-11) A53(8-11) A52(8-11) A53(8-11) + const __m512i lhs_mat_s_01_40_sp1 = _mm512_shuffle_epi32(lhs_mat_s_01_40, (_MM_PERM_ENUM)160); //A40(0-3) A40(0-3) A41(0-3) A41(0-3) + const __m512i lhs_mat_s_23_40_sp1 = _mm512_shuffle_epi32(lhs_mat_s_23_40, (_MM_PERM_ENUM)160); //A42(0-3) A43(0-3) A42(0-3) A43(0-3) - const __m512i lhs_mat_s_01_60_sp1 = _mm512_shuffle_epi32(lhs_mat_s_01_60, (_MM_PERM_ENUM)160); //A60(0-3) A60(0-3) A61(0-3) A61(0-3) A60(0-3) A60(0-3) A61(0-3) A61(0-3) - const __m512i lhs_mat_s_23_60_sp1 = _mm512_shuffle_epi32(lhs_mat_s_23_60, (_MM_PERM_ENUM)160); //A62(0-3) A63(0-3) A62(0-3) A63(0-3) A62(0-3) A63(0-3) A62(0-3) A63(0-3) + const __m512i lhs_mat_s_01_41_sp1 = _mm512_shuffle_epi32(lhs_mat_s_01_41, (_MM_PERM_ENUM)160); //A40(8-11) A40(8-11) A41(8-11) A41(8-11) + const __m512i lhs_mat_s_23_41_sp1 = _mm512_shuffle_epi32(lhs_mat_s_23_41, (_MM_PERM_ENUM)160); //A42(8-11) A43(8-11) A42(8-11) A43(8-11) - const __m512i lhs_mat_s_01_61_sp1 = _mm512_shuffle_epi32(lhs_mat_s_01_61, (_MM_PERM_ENUM)160); //A60(8-11) A60(8-11) A61(8-11) A61(8-11) A60(8-11) A60(8-11) A61(8-11) A61(8-11) - const __m512i lhs_mat_s_23_61_sp1 = _mm512_shuffle_epi32(lhs_mat_s_23_61, (_MM_PERM_ENUM)160); //A62(8-11) A63(8-11) A62(8-11) A63(8-11) A62(8-11) A63(8-11) A62(8-11) A63(8-11) + const __m512i lhs_mat_s_01_50_sp1 = _mm512_shuffle_epi32(lhs_mat_s_01_50, (_MM_PERM_ENUM)160); //A50(0-3) A50(0-3) A51(0-3) A51(0-3) + const __m512i lhs_mat_s_23_50_sp1 = 
_mm512_shuffle_epi32(lhs_mat_s_23_50, (_MM_PERM_ENUM)160); //A52(0-3) A53(0-3) A52(0-3) A53(0-3) - const __m512i lhs_mat_s_01_70_sp1 = _mm512_shuffle_epi32(lhs_mat_s_01_70, (_MM_PERM_ENUM)160); //A70(0-3) A70(0-3) A71(0-3) A71(0-3) A70(0-3) A70(0-3) A71(0-3) A71(0-3) - const __m512i lhs_mat_s_23_70_sp1 = _mm512_shuffle_epi32(lhs_mat_s_23_70, (_MM_PERM_ENUM)160); //A72(0-3) A73(0-3) A72(0-3) A73(0-3) A72(0-3) A73(0-3) A72(0-3) A73(0-3) + const __m512i lhs_mat_s_01_51_sp1 = _mm512_shuffle_epi32(lhs_mat_s_01_51, (_MM_PERM_ENUM)160); //A50(8-11) A50(8-11) A51(8-11) A51(8-11) + const __m512i lhs_mat_s_23_51_sp1 = _mm512_shuffle_epi32(lhs_mat_s_23_51, (_MM_PERM_ENUM)160); //A52(8-11) A53(8-11) A52(8-11) A53(8-11) - const __m512i lhs_mat_s_01_71_sp1 = _mm512_shuffle_epi32(lhs_mat_s_01_71, (_MM_PERM_ENUM)160); //A70(8-11) A70(8-11) A71(8-11) A71(8-11) A70(8-11) A70(8-11) A71(8-11) A71(8-11) - const __m512i lhs_mat_s_23_71_sp1 = _mm512_shuffle_epi32(lhs_mat_s_23_71, (_MM_PERM_ENUM)160); //A72(8-11) A73(8-11) A72(8-11) A73(8-11) A72(8-11) A73(8-11) A72(8-11) A73(8-11) + const __m512i lhs_mat_s_01_60_sp1 = _mm512_shuffle_epi32(lhs_mat_s_01_60, (_MM_PERM_ENUM)160); //A60(0-3) A60(0-3) A61(0-3) A61(0-3) + const __m512i lhs_mat_s_23_60_sp1 = _mm512_shuffle_epi32(lhs_mat_s_23_60, (_MM_PERM_ENUM)160); //A62(0-3) A63(0-3) A62(0-3) A63(0-3) - // Shuffle pattern two- left side input - const __m512i lhs_mat_s_01_00_sp2 = _mm512_shuffle_epi32(lhs_mat_s_01_00, (_MM_PERM_ENUM)245); //A00(4-7) A00(4-7) A01(4-7) A01(4-7) A00(4-7) A00(4-7) A01(4-7) A01(4-7) - const __m512i lhs_mat_s_23_00_sp2 = _mm512_shuffle_epi32(lhs_mat_s_23_00, (_MM_PERM_ENUM)245); //A02(4-7) A03(4-7) A02(4-7) A03(4-7) A02(4-7) A03(4-7) A02(4-7) A03(4-7) + const __m512i lhs_mat_s_01_61_sp1 = _mm512_shuffle_epi32(lhs_mat_s_01_61, (_MM_PERM_ENUM)160); //A60(8-11) A60(8-11) A61(8-11) A61(8-11) + const __m512i lhs_mat_s_23_61_sp1 = _mm512_shuffle_epi32(lhs_mat_s_23_61, (_MM_PERM_ENUM)160); //A62(8-11) A63(8-11) A62(8-11) A63(8-11) - const __m512i lhs_mat_s_01_01_sp2 = _mm512_shuffle_epi32(lhs_mat_s_01_01, (_MM_PERM_ENUM)245); //A00(12-15) A00(12-15) A01(12-15) A01(12-15) A00(12-15) A00(12-15) A01(12-15) A01(12-15) - const __m512i lhs_mat_s_23_01_sp2 = _mm512_shuffle_epi32(lhs_mat_s_23_01, (_MM_PERM_ENUM)245); //A02(12-15) A03(12-15) A02(12-15) A03(12-15) A02(12-15) A03(12-15) A02(12-15) A03(12-15) + const __m512i lhs_mat_s_01_70_sp1 = _mm512_shuffle_epi32(lhs_mat_s_01_70, (_MM_PERM_ENUM)160); //A70(0-3) A70(0-3) A71(0-3) A71(0-3) + const __m512i lhs_mat_s_23_70_sp1 = _mm512_shuffle_epi32(lhs_mat_s_23_70, (_MM_PERM_ENUM)160); //A72(0-3) A73(0-3) A72(0-3) A73(0-3) - const __m512i lhs_mat_s_01_10_sp2 = _mm512_shuffle_epi32(lhs_mat_s_01_10, (_MM_PERM_ENUM)245); //A10(4-7) A10(4-7) A11(4-7) A11(4-7) A10(4-7) A10(4-7) A11(4-7) A11(4-7) - const __m512i lhs_mat_s_23_10_sp2 = _mm512_shuffle_epi32(lhs_mat_s_23_10, (_MM_PERM_ENUM)245); //A12(4-7) A13(4-7) A12(4-7) A13(4-7) A12(4-7) A13(4-7) A12(4-7) A13(4-7) + const __m512i lhs_mat_s_01_71_sp1 = _mm512_shuffle_epi32(lhs_mat_s_01_71, (_MM_PERM_ENUM)160); //A70(8-11) A70(8-11) A71(8-11) A71(8-11) + const __m512i lhs_mat_s_23_71_sp1 = _mm512_shuffle_epi32(lhs_mat_s_23_71, (_MM_PERM_ENUM)160); //A72(8-11) A73(8-11) A72(8-11) A73(8-11) - const __m512i lhs_mat_s_01_11_sp2 = _mm512_shuffle_epi32(lhs_mat_s_01_11, (_MM_PERM_ENUM)245); //A10(12-15) A10(12-15) A11(12-15) A11(12-15) A10(12-15) A10(12-15) A11(12-15) A11(12-15) - const __m512i lhs_mat_s_23_11_sp2 = _mm512_shuffle_epi32(lhs_mat_s_23_11, (_MM_PERM_ENUM)245); 
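// [Editorial sketch - not part of the patch] The shuffle immediates used above are 160
// (0b10100000 -> dwords {0,0,2,2}) and 245 (0b11110101 -> dwords {1,1,3,3}): pattern one
// replicates bytes 0-3 / 8-11 of each row, pattern two bytes 4-7 / 12-15, so the two
// maddubs passes together cover all 16 bytes of a chunk. The lhs_mat_s_* registers hold the
// precomputed maddubs(m32s, lhs) terms that are subtracted to apply the Q6_K -32 offset.
// A minimal scalar model of one maddubs-and-adjust step, assuming unsigned 6-bit RHS quants
// (0..63) and signed Q8 LHS quants; the function name is illustrative only:
#include <stdint.h>
static inline int16_t q6k_pair_dot_adjusted(const uint8_t q[2], const int8_t a[2]) {
    // maddubs(rhs, lhs): unsigned x signed byte products summed in pairs into 16 bits
    int32_t qa  = q[0] * (int32_t) a[0] + q[1] * (int32_t) a[1];
    // maddubs(m32s, lhs): the 32 * Q8 term subtracted later as the 6-bit offset adjustment
    int32_t s32 = 32 * (int32_t) a[0] + 32 * (int32_t) a[1];
    return (int16_t)(qa - s32); // equals (q[0]-32)*a[0] + (q[1]-32)*a[1]
}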
//A12(12-15) A13(12-15) A12(12-15) A13(12-15) A12(12-15) A13(12-15) A12(12-15) A13(12-15) + // Shuffle pattern two – left-side input - const __m512i lhs_mat_s_01_20_sp2 = _mm512_shuffle_epi32(lhs_mat_s_01_20, (_MM_PERM_ENUM)245); //A20(4-7) A20(4-7) A21(4-7) A21(4-7) A20(4-7) A20(4-7) A21(4-7) A21(4-7) - const __m512i lhs_mat_s_23_20_sp2 = _mm512_shuffle_epi32(lhs_mat_s_23_20, (_MM_PERM_ENUM)245); //A22(4-7) A23(4-7) A22(4-7) A23(4-7) A22(4-7) A23(4-7) A22(4-7) A23(4-7) + const __m512i lhs_mat_s_01_00_sp2 = _mm512_shuffle_epi32(lhs_mat_s_01_00, (_MM_PERM_ENUM)245); //A00(4-7) A00(4-7) A01(4-7) A01(4-7) + const __m512i lhs_mat_s_23_00_sp2 = _mm512_shuffle_epi32(lhs_mat_s_23_00, (_MM_PERM_ENUM)245); //A02(4-7) A03(4-7) A02(4-7) A03(4-7) - const __m512i lhs_mat_s_01_21_sp2 = _mm512_shuffle_epi32(lhs_mat_s_01_21, (_MM_PERM_ENUM)245); //A20(12-15) A20(12-15) A21(12-15) A21(12-15) A20(12-15) A20(12-15) A21(12-15) A21(12-15) - const __m512i lhs_mat_s_23_21_sp2 = _mm512_shuffle_epi32(lhs_mat_s_23_21, (_MM_PERM_ENUM)245); //A22(12-15) A23(12-15) A22(12-15) A23(12-15) A22(12-15) A23(12-15) A22(12-15) A23(12-15) + const __m512i lhs_mat_s_01_01_sp2 = _mm512_shuffle_epi32(lhs_mat_s_01_01, (_MM_PERM_ENUM)245); //A00(12-15) A00(12-15) A01(12-15) A01(12-15) + const __m512i lhs_mat_s_23_01_sp2 = _mm512_shuffle_epi32(lhs_mat_s_23_01, (_MM_PERM_ENUM)245); //A02(12-15) A03(12-15) A02(12-15) A03(12-15) - const __m512i lhs_mat_s_01_30_sp2 = _mm512_shuffle_epi32(lhs_mat_s_01_30, (_MM_PERM_ENUM)245); //A30(4-7) A30(4-7) A31(4-7) A31(4-7) A30(4-7) A30(4-7) A31(4-7) A31(4-7) - const __m512i lhs_mat_s_23_30_sp2 = _mm512_shuffle_epi32(lhs_mat_s_23_30, (_MM_PERM_ENUM)245); //A32(4-7) A33(4-7) A32(4-7) A33(4-7) A32(4-7) A33(4-7) A32(4-7) A33(4-7) + const __m512i lhs_mat_s_01_10_sp2 = _mm512_shuffle_epi32(lhs_mat_s_01_10, (_MM_PERM_ENUM)245); //A10(4-7) A10(4-7) A11(4-7) A11(4-7) + const __m512i lhs_mat_s_23_10_sp2 = _mm512_shuffle_epi32(lhs_mat_s_23_10, (_MM_PERM_ENUM)245); //A12(4-7) A13(4-7) A12(4-7) A13(4-7) - const __m512i lhs_mat_s_01_31_sp2 = _mm512_shuffle_epi32(lhs_mat_s_01_31, (_MM_PERM_ENUM)245); //A30(12-15) A30(12-15) A31(12-15) A31(12-15) A30(12-15) A30(12-15) A31(12-15) A31(12-15) - const __m512i lhs_mat_s_23_31_sp2 = _mm512_shuffle_epi32(lhs_mat_s_23_31, (_MM_PERM_ENUM)245); //A32(12-15) A33(12-15) A32(12-15) A33(12-15) A32(12-15) A33(12-15) A32(12-15) A33(12-15) + const __m512i lhs_mat_s_01_11_sp2 = _mm512_shuffle_epi32(lhs_mat_s_01_11, (_MM_PERM_ENUM)245); //A10(12-15) A10(12-15) A11(12-15) A11(12-15) + const __m512i lhs_mat_s_23_11_sp2 = _mm512_shuffle_epi32(lhs_mat_s_23_11, (_MM_PERM_ENUM)245); //A12(12-15) A13(12-15) A12(12-15) A13(12-15) - const __m512i lhs_mat_s_01_40_sp2 = _mm512_shuffle_epi32(lhs_mat_s_01_40, (_MM_PERM_ENUM)245); //A40(4-7) A40(4-7) A41(4-7) A41(4-7) A40(4-7) A40(4-7) A41(4-7) A41(4-7) - const __m512i lhs_mat_s_23_40_sp2 = _mm512_shuffle_epi32(lhs_mat_s_23_40, (_MM_PERM_ENUM)245); //A42(4-7) A43(4-7) A42(4-7) A43(4-7) A42(4-7) A43(4-7) A42(4-7) A43(4-7) + const __m512i lhs_mat_s_01_20_sp2 = _mm512_shuffle_epi32(lhs_mat_s_01_20, (_MM_PERM_ENUM)245); //A20(4-7) A20(4-7) A21(4-7) A21(4-7) + const __m512i lhs_mat_s_23_20_sp2 = _mm512_shuffle_epi32(lhs_mat_s_23_20, (_MM_PERM_ENUM)245); //A22(4-7) A23(4-7) A22(4-7) A23(4-7) - const __m512i lhs_mat_s_01_41_sp2 = _mm512_shuffle_epi32(lhs_mat_s_01_41, (_MM_PERM_ENUM)245); //A40(12-15) A40(12-15) A41(12-15) A41(12-15) A40(12-15) A40(12-15) A41(12-15) A41(12-15) - const __m512i lhs_mat_s_23_41_sp2 = _mm512_shuffle_epi32(lhs_mat_s_23_41, 
(_MM_PERM_ENUM)245); //A42(12-15) A43(12-15) A42(12-15) A43(12-15) A42(12-15) A43(12-15) A42(12-15) A43(12-15) + const __m512i lhs_mat_s_01_21_sp2 = _mm512_shuffle_epi32(lhs_mat_s_01_21, (_MM_PERM_ENUM)245); //A20(12-15) A20(12-15) A21(12-15) A21(12-15) + const __m512i lhs_mat_s_23_21_sp2 = _mm512_shuffle_epi32(lhs_mat_s_23_21, (_MM_PERM_ENUM)245); //A22(12-15) A23(12-15) A22(12-15) A23(12-15) - const __m512i lhs_mat_s_01_50_sp2 = _mm512_shuffle_epi32(lhs_mat_s_01_50, (_MM_PERM_ENUM)245); //A50(4-7) A50(4-7) A51(4-7) A51(4-7) A50(4-7) A50(4-7) A51(4-7) A51(4-7) - const __m512i lhs_mat_s_23_50_sp2 = _mm512_shuffle_epi32(lhs_mat_s_23_50, (_MM_PERM_ENUM)245); //A52(4-7) A53(4-7) A52(4-7) A53(4-7) A52(4-7) A53(4-7) A52(4-7) A53(4-7) + const __m512i lhs_mat_s_01_30_sp2 = _mm512_shuffle_epi32(lhs_mat_s_01_30, (_MM_PERM_ENUM)245); //A30(4-7) A30(4-7) A31(4-7) A31(4-7) + const __m512i lhs_mat_s_23_30_sp2 = _mm512_shuffle_epi32(lhs_mat_s_23_30, (_MM_PERM_ENUM)245); //A32(4-7) A33(4-7) A32(4-7) A33(4-7) - const __m512i lhs_mat_s_01_51_sp2 = _mm512_shuffle_epi32(lhs_mat_s_01_51, (_MM_PERM_ENUM)245); //A50(12-15) A50(12-15) A51(12-15) A51(12-15) A50(12-15) A50(12-15) A51(12-15) A51(12-15) - const __m512i lhs_mat_s_23_51_sp2 = _mm512_shuffle_epi32(lhs_mat_s_23_51, (_MM_PERM_ENUM)245); //A52(12-15) A53(12-15) A52(12-15) A53(12-15) A52(12-15) A53(12-15) A52(12-15) A53(12-15) + const __m512i lhs_mat_s_01_31_sp2 = _mm512_shuffle_epi32(lhs_mat_s_01_31, (_MM_PERM_ENUM)245); //A30(12-15) A30(12-15) A31(12-15) A31(12-15) + const __m512i lhs_mat_s_23_31_sp2 = _mm512_shuffle_epi32(lhs_mat_s_23_31, (_MM_PERM_ENUM)245); //A32(12-15) A33(12-15) A32(12-15) A33(12-15) - const __m512i lhs_mat_s_01_60_sp2 = _mm512_shuffle_epi32(lhs_mat_s_01_60, (_MM_PERM_ENUM)245); //A60(4-7) A60(4-7) A61(4-7) A61(4-7) A60(4-7) A60(4-7) A61(4-7) A61(4-7) - const __m512i lhs_mat_s_23_60_sp2 = _mm512_shuffle_epi32(lhs_mat_s_23_60, (_MM_PERM_ENUM)245); //A62(4-7) A63(4-7) A62(4-7) A63(4-7) A62(4-7) A63(4-7) A62(4-7) A63(4-7) + const __m512i lhs_mat_s_01_40_sp2 = _mm512_shuffle_epi32(lhs_mat_s_01_40, (_MM_PERM_ENUM)245); //A40(4-7) A40(4-7) A41(4-7) A41(4-7) + const __m512i lhs_mat_s_23_40_sp2 = _mm512_shuffle_epi32(lhs_mat_s_23_40, (_MM_PERM_ENUM)245); //A42(4-7) A43(4-7) A42(4-7) A43(4-7) - const __m512i lhs_mat_s_01_61_sp2 = _mm512_shuffle_epi32(lhs_mat_s_01_61, (_MM_PERM_ENUM)245); //A60(12-15) A60(12-15) A61(12-15) A61(12-15) A60(12-15) A60(12-15) A61(12-15) A61(12-15) - const __m512i lhs_mat_s_23_61_sp2 = _mm512_shuffle_epi32(lhs_mat_s_23_61, (_MM_PERM_ENUM)245); //A62(12-15) A63(12-15) A62(12-15) A63(12-15) A62(12-15) A63(12-15) A62(12-15) A63(12-15) + const __m512i lhs_mat_s_01_41_sp2 = _mm512_shuffle_epi32(lhs_mat_s_01_41, (_MM_PERM_ENUM)245); //A40(12-15) A40(12-15) A41(12-15) A41(12-15) + const __m512i lhs_mat_s_23_41_sp2 = _mm512_shuffle_epi32(lhs_mat_s_23_41, (_MM_PERM_ENUM)245); //A42(12-15) A43(12-15) A42(12-15) A43(12-15) - const __m512i lhs_mat_s_01_70_sp2 = _mm512_shuffle_epi32(lhs_mat_s_01_70, (_MM_PERM_ENUM)245); //A70(4-7) A70(4-7) A71(4-7) A71(4-7) A70(4-7) A70(4-7) A71(4-7) A71(4-7) - const __m512i lhs_mat_s_23_70_sp2 = _mm512_shuffle_epi32(lhs_mat_s_23_70, (_MM_PERM_ENUM)245); //A72(4-7) A73(4-7) A72(4-7) A73(4-7) A72(4-7) A73(4-7) A72(4-7) A73(4-7) + const __m512i lhs_mat_s_01_50_sp2 = _mm512_shuffle_epi32(lhs_mat_s_01_50, (_MM_PERM_ENUM)245); //A50(4-7) A50(4-7) A51(4-7) A51(4-7) + const __m512i lhs_mat_s_23_50_sp2 = _mm512_shuffle_epi32(lhs_mat_s_23_50, (_MM_PERM_ENUM)245); //A52(4-7) A53(4-7) A52(4-7) A53(4-7) - 
const __m512i lhs_mat_s_01_71_sp2 = _mm512_shuffle_epi32(lhs_mat_s_01_71, (_MM_PERM_ENUM)245); //A70(12-15) A70(12-15) A71(12-15) A71(12-15) A70(12-15) A70(12-15) A71(12-15) A71(12-15) - const __m512i lhs_mat_s_23_71_sp2 = _mm512_shuffle_epi32(lhs_mat_s_23_71, (_MM_PERM_ENUM)245); //A72(12-15) A73(12-15) A72(12-15) A73(12-15) A72(12-15) A73(12-15) A72(12-15) A73(12-15) + const __m512i lhs_mat_s_01_51_sp2 = _mm512_shuffle_epi32(lhs_mat_s_01_51, (_MM_PERM_ENUM)245); //A50(12-15) A50(12-15) A51(12-15) A51(12-15) + const __m512i lhs_mat_s_23_51_sp2 = _mm512_shuffle_epi32(lhs_mat_s_23_51, (_MM_PERM_ENUM)245); //A52(12-15) A53(12-15) A52(12-15) A53(12-15) - // The values arranged in shuffle patterns are operated with dot product operation within 32 bit lane i.e corresponding bytes and multiplied and added into 32 bit integers within 32 bit lane + const __m512i lhs_mat_s_01_60_sp2 = _mm512_shuffle_epi32(lhs_mat_s_01_60, (_MM_PERM_ENUM)245); //A60(4-7) A60(4-7) A61(4-7) A61(4-7) + const __m512i lhs_mat_s_23_60_sp2 = _mm512_shuffle_epi32(lhs_mat_s_23_60, (_MM_PERM_ENUM)245); //A62(4-7) A63(4-7) A62(4-7) A63(4-7) + + const __m512i lhs_mat_s_01_61_sp2 = _mm512_shuffle_epi32(lhs_mat_s_01_61, (_MM_PERM_ENUM)245); //A60(12-15) A60(12-15) A61(12-15) A61(12-15) + const __m512i lhs_mat_s_23_61_sp2 = _mm512_shuffle_epi32(lhs_mat_s_23_61, (_MM_PERM_ENUM)245); //A62(12-15) A63(12-15) A62(12-15) A63(12-15) + + const __m512i lhs_mat_s_01_70_sp2 = _mm512_shuffle_epi32(lhs_mat_s_01_70, (_MM_PERM_ENUM)245); //A70(4-7) A70(4-7) A71(4-7) A71(4-7) + const __m512i lhs_mat_s_23_70_sp2 = _mm512_shuffle_epi32(lhs_mat_s_23_70, (_MM_PERM_ENUM)245); //A72(4-7) A73(4-7) A72(4-7) A73(4-7) + + const __m512i lhs_mat_s_01_71_sp2 = _mm512_shuffle_epi32(lhs_mat_s_01_71, (_MM_PERM_ENUM)245); //A70(12-15) A70(12-15) A71(12-15) A71(12-15) + const __m512i lhs_mat_s_23_71_sp2 = _mm512_shuffle_epi32(lhs_mat_s_23_71, (_MM_PERM_ENUM)245); //A72(12-15) A73(12-15) A72(12-15) A73(12-15) + + // The values arranged in shuffle patterns are operated with dot product operation within 16 bit lane i.e corresponding bytes and multiplied and added into 16 bit integers within 16 bit lane __m512i iacc_mat_00_0_sp1 = _mm512_add_epi16(_mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_00_sp1, lhs_mat_01_00_sp1), lhs_mat_s_01_00_sp1), _mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_01_sp1, lhs_mat_01_01_sp1), lhs_mat_s_01_01_sp1)); __m512i iacc_mat_01_0_sp1 = _mm512_add_epi16(_mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_00_sp1, lhs_mat_01_00_sp1), lhs_mat_s_01_00_sp1), _mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_01_sp1, lhs_mat_01_01_sp1), lhs_mat_s_01_01_sp1)); @@ -8577,7 +8581,7 @@ void ggml_gemm_q6_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo __m512i iacc_mat_10_7 = _mm512_add_epi16(iacc_mat_10_7_sp1, iacc_mat_10_7_sp2); __m512i iacc_mat_11_7 = _mm512_add_epi16(iacc_mat_11_7_sp1, iacc_mat_11_7_sp2); - // Output of both shuffle patterns are added in order to sum dot product outputs of all 32 values in block + // Multiply madd of quants with scales iacc_mat_00_0 = _mm512_madd_epi16(iacc_mat_00_0, scale_014589CD_0); iacc_mat_01_0 = _mm512_madd_epi16(iacc_mat_01_0, scale_2367ABEF_0); iacc_mat_10_0 = _mm512_madd_epi16(iacc_mat_10_0, scale_014589CD_0); @@ -8618,7 +8622,6 @@ void ggml_gemm_q6_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo iacc_mat_10_7 = _mm512_madd_epi16(iacc_mat_10_7, scale_014589CD_7); iacc_mat_11_7 = _mm512_madd_epi16(iacc_mat_11_7, scale_2367ABEF_7); - __m512i iacc_mat_00 
= _mm512_add_epi32(_mm512_add_epi32(_mm512_add_epi32(iacc_mat_00_0, iacc_mat_00_1), _mm512_add_epi32(iacc_mat_00_2, iacc_mat_00_3)), _mm512_add_epi32(_mm512_add_epi32(iacc_mat_00_4, iacc_mat_00_5), _mm512_add_epi32(iacc_mat_00_6, iacc_mat_00_7))); __m512i iacc_mat_01 = _mm512_add_epi32(_mm512_add_epi32(_mm512_add_epi32(iacc_mat_01_0, iacc_mat_01_1), _mm512_add_epi32(iacc_mat_01_2, iacc_mat_01_3)), _mm512_add_epi32(_mm512_add_epi32(iacc_mat_01_4, iacc_mat_01_5), _mm512_add_epi32(iacc_mat_01_6, iacc_mat_01_7))); __m512i iacc_mat_10 = _mm512_add_epi32(_mm512_add_epi32(_mm512_add_epi32(iacc_mat_10_0, iacc_mat_10_1), _mm512_add_epi32(iacc_mat_10_2, iacc_mat_10_3)), _mm512_add_epi32(_mm512_add_epi32(iacc_mat_10_4, iacc_mat_10_5), _mm512_add_epi32(iacc_mat_10_6, iacc_mat_10_7))); @@ -9259,7 +9262,7 @@ void ggml_gemm_q6_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo const __m256i lhs_mat_s_01_71_sp2 = _mm256_shuffle_epi32(lhs_mat_s_01_71, 245); //A70(12-15) A70(12-15) A71(12-15) A71(12-15) A70(12-15) A70(12-15) A71(12-15) A71(12-15) const __m256i lhs_mat_s_23_71_sp2 = _mm256_shuffle_epi32(lhs_mat_s_23_71, 245); //A72(12-15) A73(12-15) A72(12-15) A73(12-15) A72(12-15) A73(12-15) A72(12-15) A73(12-15) - // The values arranged in shuffle patterns are operated with dot product operation within 32 bit lane i.e corresponding bytes and multiplied and added into 32 bit integers within 32 bit lane + // The values arranged in shuffle patterns are operated with dot product operation within 16 bit lane i.e corresponding bytes and multiplied and added into 16 bit integers within 16 bit lane __m256i iacc_mat_00_0_sp1 = _mm256_add_epi16(_mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_0145_00_sp1, lhs_mat_01_00_sp1), lhs_mat_s_01_00_sp1), _mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_0145_01_sp1, lhs_mat_01_01_sp1), lhs_mat_s_01_01_sp1)); __m256i iacc_mat_01_0_sp1 = _mm256_add_epi16(_mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_2367_00_sp1, lhs_mat_01_00_sp1), lhs_mat_s_01_00_sp1), _mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_2367_01_sp1, lhs_mat_01_01_sp1), lhs_mat_s_01_01_sp1)); @@ -9397,7 +9400,7 @@ void ggml_gemm_q6_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo __m256i iacc_mat_10_7 = _mm256_add_epi16(iacc_mat_10_7_sp1, iacc_mat_10_7_sp2); __m256i iacc_mat_11_7 = _mm256_add_epi16(iacc_mat_11_7_sp1, iacc_mat_11_7_sp2); - // Output of both shuffle patterns are added in order to sum dot product outputs of all 32 values in block + // Multiply madd of quants with scales iacc_mat_00_0 = _mm256_madd_epi16(iacc_mat_00_0, scale_0145_0); iacc_mat_01_0 = _mm256_madd_epi16(iacc_mat_01_0, scale_2367_0); iacc_mat_10_0 = _mm256_madd_epi16(iacc_mat_10_0, scale_0145_0); @@ -10066,7 +10069,7 @@ void ggml_gemm_q6_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo const __m256i lhs_mat_s_01_71_sp2 = _mm256_shuffle_epi32(lhs_mat_s_01_71, 245); //A70(12-15) A70(12-15) A71(12-15) A71(12-15) A70(12-15) A70(12-15) A71(12-15) A71(12-15) const __m256i lhs_mat_s_23_71_sp2 = _mm256_shuffle_epi32(lhs_mat_s_23_71, 245); //A72(12-15) A73(12-15) A72(12-15) A73(12-15) A72(12-15) A73(12-15) A72(12-15) A73(12-15) - // The values arranged in shuffle patterns are operated with dot product operation within 32 bit lane i.e corresponding bytes and multiplied and added into 32 bit integers within 32 bit lane + // The values arranged in shuffle patterns are operated with dot product operation within 16 bit lane i.e corresponding bytes and multiplied and added into 16 bit integers within 16 bit lane __m256i 
iacc_mat_00_0_sp1 = _mm256_add_epi16(_mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_0145_00_sp1, lhs_mat_01_00_sp1), lhs_mat_s_01_00_sp1), _mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_0145_01_sp1, lhs_mat_01_01_sp1), lhs_mat_s_01_01_sp1)); __m256i iacc_mat_01_0_sp1 = _mm256_add_epi16(_mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_2367_00_sp1, lhs_mat_01_00_sp1), lhs_mat_s_01_00_sp1), _mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_2367_01_sp1, lhs_mat_01_01_sp1), lhs_mat_s_01_01_sp1)); @@ -10204,7 +10207,7 @@ void ggml_gemm_q6_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo __m256i iacc_mat_10_7 = _mm256_add_epi16(iacc_mat_10_7_sp1, iacc_mat_10_7_sp2); __m256i iacc_mat_11_7 = _mm256_add_epi16(iacc_mat_11_7_sp1, iacc_mat_11_7_sp2); - // Output of both shuffle patterns are added in order to sum dot product outputs of all 32 values in block + // Multiply madd of quants with scales iacc_mat_00_0 = _mm256_madd_epi16(iacc_mat_00_0, scale_0145_0); iacc_mat_01_0 = _mm256_madd_epi16(iacc_mat_01_0, scale_2367_0); iacc_mat_10_0 = _mm256_madd_epi16(iacc_mat_10_0, scale_0145_0); From d6fb079cb5af3f90724c3659a99d22459debd4a2 Mon Sep 17 00:00:00 2001 From: Srihari-mcw Date: Tue, 12 Aug 2025 18:03:00 +0530 Subject: [PATCH 10/23] Cleanup commit for AVX2 GEMM bigger loop --- ggml/src/ggml-cpu/arch/x86/repack.cpp | 352 +++++++++++++------------- 1 file changed, 181 insertions(+), 171 deletions(-) diff --git a/ggml/src/ggml-cpu/arch/x86/repack.cpp b/ggml/src/ggml-cpu/arch/x86/repack.cpp index 52aa99f2bd..7a18d42939 100644 --- a/ggml/src/ggml-cpu/arch/x86/repack.cpp +++ b/ggml/src/ggml-cpu/arch/x86/repack.cpp @@ -7256,6 +7256,7 @@ void ggml_gemm_q6_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo __m512i lhs_mat_01_71 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_01_71), lhs_mat_ymm_01_71, 1); __m512i lhs_mat_23_71 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_23_71), lhs_mat_ymm_23_71, 1); + // Multiply Q8 quants with bytes valued 32 - Subtracted later as an adjustment for 6 bit quantization __m512i lhs_mat_s_01_00 = _mm512_maddubs_epi16(m32s_expanded, lhs_mat_01_00); __m512i lhs_mat_s_23_00 = _mm512_maddubs_epi16(m32s_expanded, lhs_mat_23_00); __m512i lhs_mat_s_01_01 = _mm512_maddubs_epi16(m32s_expanded, lhs_mat_01_01); @@ -8210,6 +8211,7 @@ void ggml_gemm_q6_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo __m512i lhs_mat_01_71 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_01_71), lhs_mat_ymm_01_71, 1); __m512i lhs_mat_23_71 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_23_71), lhs_mat_ymm_23_71, 1); + // Multiply Q8 quants with bytes valued 32 - Subtracted later as an adjustment for 6 bit quantization __m512i lhs_mat_s_01_00 = _mm512_maddubs_epi16(m32s_expanded, lhs_mat_01_00); __m512i lhs_mat_s_23_00 = _mm512_maddubs_epi16(m32s_expanded, lhs_mat_23_00); __m512i lhs_mat_s_01_01 = _mm512_maddubs_epi16(m32s_expanded, lhs_mat_01_01); @@ -8660,6 +8662,7 @@ void ggml_gemm_q6_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo #endif + //Take group of four block_q8_Kx4 structures at each pass of the loop and perform dot product operation for (; y < anr / 4; y += 4){ const block_q8_Kx4 * a_ptrs[4]; @@ -8685,23 +8688,27 @@ void ggml_gemm_q6_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo const __m256 col_scale_f32 = GGML_F32Cx8_LOAD(b_ptr[b].d); for (int sb = 0; sb < QK_K / 128; sb++) { - const __m256i rhs_raw_mat_0123_0 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].ql + sb * 512)); - const __m256i 
rhs_raw_mat_4567_0 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].ql + 32 + sb * 512)); - const __m256i rhs_raw_mat_0123_1 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].ql + 64 + sb * 512)); - const __m256i rhs_raw_mat_4567_1 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].ql + 96 + sb * 512)); - const __m256i rhs_raw_mat_0123_2 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].ql + 128 + sb * 512)); - const __m256i rhs_raw_mat_4567_2 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].ql + 160 + sb * 512)); - const __m256i rhs_raw_mat_0123_3 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].ql + 192 + sb * 512)); - const __m256i rhs_raw_mat_4567_3 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].ql + 224 + sb * 512)); - const __m256i rhs_raw_mat_0123_4 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].ql + 256 + sb * 512)); - const __m256i rhs_raw_mat_4567_4 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].ql + 288 + sb * 512)); - const __m256i rhs_raw_mat_0123_5 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].ql + 320 + sb * 512)); - const __m256i rhs_raw_mat_4567_5 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].ql + 352 + sb * 512)); - const __m256i rhs_raw_mat_0123_6 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].ql + 384 + sb * 512)); - const __m256i rhs_raw_mat_4567_6 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].ql + 416 + sb * 512)); - const __m256i rhs_raw_mat_0123_7 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].ql + 448 + sb * 512)); - const __m256i rhs_raw_mat_4567_7 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].ql + 480 + sb * 512)); + // Load the eight block_q6_k for eight sub blocks quantized values interleaved with each other in chunks of eight bytes - B0,B1 ....B6,B7 + // The lower and higher packed bits are loaded, unpacked and individual bytes representing 6 bits each are formed from the same + // They are blended/permuted for further mul mat operations within the pipeline + const __m256i rhs_raw_lbit_0123_0 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].ql + sb * 512)); + const __m256i rhs_raw_lbit_4567_0 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].ql + 32 + sb * 512)); + const __m256i rhs_raw_lbit_0123_1 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].ql + 64 + sb * 512)); + const __m256i rhs_raw_lbit_4567_1 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].ql + 96 + sb * 512)); + const __m256i rhs_raw_lbit_0123_2 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].ql + 128 + sb * 512)); + const __m256i rhs_raw_lbit_4567_2 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].ql + 160 + sb * 512)); + const __m256i rhs_raw_lbit_0123_3 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].ql + 192 + sb * 512)); + const __m256i rhs_raw_lbit_4567_3 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].ql + 224 + sb * 512)); + + const __m256i rhs_raw_lbit_0123_4 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].ql + 256 + sb * 512)); + const __m256i rhs_raw_lbit_4567_4 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].ql + 288 + sb * 512)); + const __m256i rhs_raw_lbit_0123_5 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].ql + 320 + sb * 512)); + const __m256i rhs_raw_lbit_4567_5 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].ql + 352 + sb * 512)); + const __m256i rhs_raw_lbit_0123_6 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].ql + 384 + sb * 512)); + const __m256i rhs_raw_lbit_4567_6 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].ql + 416 + sb * 512)); + const __m256i rhs_raw_lbit_0123_7 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].ql + 448 + sb * 512)); + const __m256i 
rhs_raw_lbit_4567_7 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].ql + 480 + sb * 512)); const __m256i rhs_raw_hbit_0123_0 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qh + sb * 256)); const __m256i rhs_raw_hbit_4567_0 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qh + 32 + sb * 256)); @@ -8713,24 +8720,24 @@ void ggml_gemm_q6_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo const __m256i rhs_raw_hbit_4567_3 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qh + 224 + sb * 256)); // Indices 0 through 7 (first block): - const __m256i rhs_raw_mat_0145_0 = _mm256_blend_epi32(rhs_raw_mat_0123_0, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_0, requiredOrder), 240); - const __m256i rhs_raw_mat_2367_0 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_0, requiredOrder), rhs_raw_mat_4567_0, 240); - const __m256i rhs_raw_mat_0145_1 = _mm256_blend_epi32(rhs_raw_mat_0123_1, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_1, requiredOrder), 240); - const __m256i rhs_raw_mat_2367_1 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_1, requiredOrder), rhs_raw_mat_4567_1, 240); - const __m256i rhs_raw_mat_0145_2 = _mm256_blend_epi32(rhs_raw_mat_0123_2, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_2, requiredOrder), 240); - const __m256i rhs_raw_mat_2367_2 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_2, requiredOrder), rhs_raw_mat_4567_2, 240); - const __m256i rhs_raw_mat_0145_3 = _mm256_blend_epi32(rhs_raw_mat_0123_3, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_3, requiredOrder), 240); - const __m256i rhs_raw_mat_2367_3 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_3, requiredOrder), rhs_raw_mat_4567_3, 240); + const __m256i rhs_raw_lbit_0145_0 = _mm256_blend_epi32(rhs_raw_lbit_0123_0, _mm256_permutevar8x32_epi32(rhs_raw_lbit_4567_0, requiredOrder), 240); + const __m256i rhs_raw_lbit_2367_0 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_lbit_0123_0, requiredOrder), rhs_raw_lbit_4567_0, 240); + const __m256i rhs_raw_lbit_0145_1 = _mm256_blend_epi32(rhs_raw_lbit_0123_1, _mm256_permutevar8x32_epi32(rhs_raw_lbit_4567_1, requiredOrder), 240); + const __m256i rhs_raw_lbit_2367_1 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_lbit_0123_1, requiredOrder), rhs_raw_lbit_4567_1, 240); + const __m256i rhs_raw_lbit_0145_2 = _mm256_blend_epi32(rhs_raw_lbit_0123_2, _mm256_permutevar8x32_epi32(rhs_raw_lbit_4567_2, requiredOrder), 240); + const __m256i rhs_raw_lbit_2367_2 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_lbit_0123_2, requiredOrder), rhs_raw_lbit_4567_2, 240); + const __m256i rhs_raw_lbit_0145_3 = _mm256_blend_epi32(rhs_raw_lbit_0123_3, _mm256_permutevar8x32_epi32(rhs_raw_lbit_4567_3, requiredOrder), 240); + const __m256i rhs_raw_lbit_2367_3 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_lbit_0123_3, requiredOrder), rhs_raw_lbit_4567_3, 240); // Indices 4 through 7 (second block): - const __m256i rhs_raw_mat_0145_4 = _mm256_blend_epi32(rhs_raw_mat_0123_4, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_4, requiredOrder), 240); - const __m256i rhs_raw_mat_2367_4 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_4, requiredOrder), rhs_raw_mat_4567_4, 240); - const __m256i rhs_raw_mat_0145_5 = _mm256_blend_epi32(rhs_raw_mat_0123_5, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_5, requiredOrder), 240); - const __m256i rhs_raw_mat_2367_5 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_5, requiredOrder), rhs_raw_mat_4567_5, 240); - const __m256i 
rhs_raw_mat_0145_6 = _mm256_blend_epi32(rhs_raw_mat_0123_6, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_6, requiredOrder), 240); - const __m256i rhs_raw_mat_2367_6 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_6, requiredOrder), rhs_raw_mat_4567_6, 240); - const __m256i rhs_raw_mat_0145_7 = _mm256_blend_epi32(rhs_raw_mat_0123_7, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_7, requiredOrder), 240); - const __m256i rhs_raw_mat_2367_7 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_7, requiredOrder), rhs_raw_mat_4567_7, 240); + const __m256i rhs_raw_lbit_0145_4 = _mm256_blend_epi32(rhs_raw_lbit_0123_4, _mm256_permutevar8x32_epi32(rhs_raw_lbit_4567_4, requiredOrder), 240); + const __m256i rhs_raw_lbit_2367_4 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_lbit_0123_4, requiredOrder), rhs_raw_lbit_4567_4, 240); + const __m256i rhs_raw_lbit_0145_5 = _mm256_blend_epi32(rhs_raw_lbit_0123_5, _mm256_permutevar8x32_epi32(rhs_raw_lbit_4567_5, requiredOrder), 240); + const __m256i rhs_raw_lbit_2367_5 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_lbit_0123_5, requiredOrder), rhs_raw_lbit_4567_5, 240); + const __m256i rhs_raw_lbit_0145_6 = _mm256_blend_epi32(rhs_raw_lbit_0123_6, _mm256_permutevar8x32_epi32(rhs_raw_lbit_4567_6, requiredOrder), 240); + const __m256i rhs_raw_lbit_2367_6 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_lbit_0123_6, requiredOrder), rhs_raw_lbit_4567_6, 240); + const __m256i rhs_raw_lbit_0145_7 = _mm256_blend_epi32(rhs_raw_lbit_0123_7, _mm256_permutevar8x32_epi32(rhs_raw_lbit_4567_7, requiredOrder), 240); + const __m256i rhs_raw_lbit_2367_7 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_lbit_0123_7, requiredOrder), rhs_raw_lbit_4567_7, 240); const __m256i rhs_raw_hbit_0145_0 = _mm256_blend_epi32(rhs_raw_hbit_0123_0, _mm256_permutevar8x32_epi32(rhs_raw_hbit_4567_0, requiredOrder), 240); const __m256i rhs_raw_hbit_2367_0 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_hbit_0123_0, requiredOrder), rhs_raw_hbit_4567_0, 240); @@ -8784,56 +8791,59 @@ void ggml_gemm_q6_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo const __m256i rhs_hbit_2367_51 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_2367_3, 4), m2), 4); //Index : 88 - 95 const __m256i rhs_hbit_2367_71 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_2367_3, 6), m2), 4); //Index : 120 - 127 - // 0 -7, 64 - 71 + // 4 bit values are unpacked/denibbled and bitwise or-ed with the hbit values to form the 6 bit quantized values + + // Comments indicate the indices of elements from individual super block in non interleaved fashion + // Index : 0 -7, 64 - 71 const __m256i rhs_mat_0145_00 = _mm256_or_si256(_mm256_and_si256(rhs_raw_mat_0145_0, m4), rhs_hbit_0145_00); const __m256i rhs_mat_0145_40 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_0145_0, 4), m4), rhs_hbit_0145_40); const __m256i rhs_mat_2367_00 = _mm256_or_si256(_mm256_and_si256(rhs_raw_mat_2367_0, m4), rhs_hbit_2367_00); const __m256i rhs_mat_2367_40 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_2367_0, 4), m4), rhs_hbit_2367_40); - // 8 - 15, 72 - 79 + // Index : 8 - 15, 72 - 79 const __m256i rhs_mat_0145_01 = _mm256_or_si256(_mm256_and_si256(rhs_raw_mat_0145_1, m4), rhs_hbit_0145_01); const __m256i rhs_mat_0145_41 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_0145_1, 4), m4), rhs_hbit_0145_41); const __m256i rhs_mat_2367_01 = 
_mm256_or_si256(_mm256_and_si256(rhs_raw_mat_2367_1, m4), rhs_hbit_2367_01); const __m256i rhs_mat_2367_41 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_2367_1, 4), m4), rhs_hbit_2367_41); - // 16 - 23, 80 - 87 + // Index : 16 - 23, 80 - 87 const __m256i rhs_mat_0145_10 = _mm256_or_si256(_mm256_and_si256(rhs_raw_mat_0145_2, m4), rhs_hbit_0145_10); const __m256i rhs_mat_0145_50 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_0145_2, 4), m4), rhs_hbit_0145_50); const __m256i rhs_mat_2367_10 = _mm256_or_si256(_mm256_and_si256(rhs_raw_mat_2367_2, m4), rhs_hbit_2367_10); const __m256i rhs_mat_2367_50 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_2367_2, 4), m4), rhs_hbit_2367_50); - // 24 - 31, 88 - 95 + // Index : 24 - 31, 88 - 95 const __m256i rhs_mat_0145_11 = _mm256_or_si256(_mm256_and_si256(rhs_raw_mat_0145_3, m4), rhs_hbit_0145_11); const __m256i rhs_mat_0145_51 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_0145_3, 4), m4), rhs_hbit_0145_51); const __m256i rhs_mat_2367_11 = _mm256_or_si256(_mm256_and_si256(rhs_raw_mat_2367_3, m4), rhs_hbit_2367_11); const __m256i rhs_mat_2367_51 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_2367_3, 4), m4), rhs_hbit_2367_51); - // 32 - 39, 96 - 103 + // Index : 32 - 39, 96 - 103 const __m256i rhs_mat_0145_20 = _mm256_or_si256(_mm256_and_si256(rhs_raw_mat_0145_4, m4), rhs_hbit_0145_20); const __m256i rhs_mat_0145_60 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_0145_4, 4), m4), rhs_hbit_0145_60); const __m256i rhs_mat_2367_20 = _mm256_or_si256(_mm256_and_si256(rhs_raw_mat_2367_4, m4), rhs_hbit_2367_20); const __m256i rhs_mat_2367_60 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_2367_4, 4), m4), rhs_hbit_2367_60); - // 40 - 47, 104 - 111 + // Index : 40 - 47, 104 - 111 const __m256i rhs_mat_0145_21 = _mm256_or_si256(_mm256_and_si256(rhs_raw_mat_0145_5, m4), rhs_hbit_0145_21); const __m256i rhs_mat_0145_61 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_0145_5, 4), m4), rhs_hbit_0145_61); const __m256i rhs_mat_2367_21 = _mm256_or_si256(_mm256_and_si256(rhs_raw_mat_2367_5, m4), rhs_hbit_2367_21); const __m256i rhs_mat_2367_61 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_2367_5, 4), m4), rhs_hbit_2367_61); - // 48 - 55, 112 - 119 + // Index : 48 - 55, 112 - 119 const __m256i rhs_mat_0145_30 = _mm256_or_si256(_mm256_and_si256(rhs_raw_mat_0145_6, m4), rhs_hbit_0145_30); const __m256i rhs_mat_0145_70 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_0145_6, 4), m4), rhs_hbit_0145_70); const __m256i rhs_mat_2367_30 = _mm256_or_si256(_mm256_and_si256(rhs_raw_mat_2367_6, m4), rhs_hbit_2367_30); const __m256i rhs_mat_2367_70 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_2367_6, 4), m4), rhs_hbit_2367_70); - // 56 - 63, 120 - 127 + // Index : 56 - 63, 120 - 127 const __m256i rhs_mat_0145_31 = _mm256_or_si256(_mm256_and_si256(rhs_raw_mat_0145_7, m4), rhs_hbit_0145_31); const __m256i rhs_mat_0145_71 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_0145_7, 4), m4), rhs_hbit_0145_71); @@ -8889,7 +8899,6 @@ void ggml_gemm_q6_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo const __m256i rhs_mat_0145_71_sp1 = _mm256_shuffle_epi32(rhs_mat_0145_71, 136); //B70(8-11) B71(8-11) B70(8-11) B71(8-11) B74(8-11) B75(8-11) B74(8-11) B75(8-11) const __m256i rhs_mat_2367_71_sp1 = _mm256_shuffle_epi32(rhs_mat_2367_71, 136); //B72(8-11) B73(8-11) B72(8-11) 
B73(8-11) B76(8-11) B77(8-11) B76(8-11) B77(8-11) - // Shuffle pattern two - right side input const __m256i rhs_mat_0145_00_sp2 = _mm256_shuffle_epi32(rhs_mat_0145_00, 221); //B00(4-7) B01(4-7) B00(4-7) B01(4-7) B04(4-7) B05(4-7) B04(4-7) B05(4-7) const __m256i rhs_mat_2367_00_sp2 = _mm256_shuffle_epi32(rhs_mat_2367_00, 221); //B02(4-7) B03(4-7) B02(4-7) B03(4-7) B06(4-7) B07(4-7) B06(4-7) B07(4-7) @@ -8941,7 +8950,6 @@ void ggml_gemm_q6_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo //Scales of corresponding sub blocks from different Q6_K structures are stored together //s00 s01 s10 s11 s20 s21 ...... s70 s71 - // Combine mins and scales for sub-blocks: 0-1, 2-3, 4-5, 6-7 in the sb loop const __m128i scales_01 = _mm_loadu_si128((const __m128i *)(b_ptr[b].scales + sb * 64)); const __m128i scales_23 = _mm_loadu_si128((const __m128i *)(b_ptr[b].scales + 16 + sb * 64)); const __m128i scales_45 = _mm_loadu_si128((const __m128i *)(b_ptr[b].scales + 32 + sb * 64)); @@ -8982,7 +8990,7 @@ void ggml_gemm_q6_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo for (int rp = 0; rp < 4; rp++) { // Load the four block_q8_k quantized values interleaved with each other in chunks of eight bytes - A0,A1,A2,A3 - // Loaded as set of 128 bit vectors and repeated into a 256 bit vector + // Loaded as set of 128 bit vectors and repeated and stored into a 256 bit vector __m256i lhs_mat_0123_00 = _mm256_loadu_si256((const __m256i * )((a_ptrs[rp][b].qs + 512 * sb))); __m256i lhs_mat_01_00 = _mm256_permute2f128_si256(lhs_mat_0123_00, lhs_mat_0123_00, 0); __m256i lhs_mat_23_00 = _mm256_permute2f128_si256(lhs_mat_0123_00, lhs_mat_0123_00, 17); @@ -9033,6 +9041,7 @@ void ggml_gemm_q6_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo __m256i lhs_mat_01_71 = _mm256_permute2f128_si256(lhs_mat_0123_71, lhs_mat_0123_71, 0); __m256i lhs_mat_23_71 = _mm256_permute2f128_si256(lhs_mat_0123_71, lhs_mat_0123_71, 17); + // Multiply Q8 quants with bytes valued 32 - Subtracted later as an adjustment for 6 bit quantization __m256i lhs_mat_s_01_00 = _mm256_maddubs_epi16(m32s, lhs_mat_01_00); __m256i lhs_mat_s_23_00 = _mm256_maddubs_epi16(m32s, lhs_mat_23_00); __m256i lhs_mat_s_01_01 = _mm256_maddubs_epi16(m32s, lhs_mat_01_01); @@ -9067,200 +9076,200 @@ void ggml_gemm_q6_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo __m256i lhs_mat_s_23_71 = _mm256_maddubs_epi16(m32s, lhs_mat_23_71); // Shuffle pattern one - left side input - const __m256i lhs_mat_01_00_sp1 = _mm256_shuffle_epi32(lhs_mat_01_00, 160); //A00(0-3) A00(0-3) A01(0-3) A01(0-3) A00(0-3) A00(0-3) A01(0-3) A01(0-3) - const __m256i lhs_mat_23_00_sp1 = _mm256_shuffle_epi32(lhs_mat_23_00, 160); //A02(0-3) A03(0-3) A02(0-3) A03(0-3) A02(0-3) A03(0-3) A02(0-3) A03(0-3) + const __m256i lhs_mat_01_00_sp1 = _mm256_shuffle_epi32(lhs_mat_01_00, 160); //A00(0-3) A00(0-3) A01(0-3) A01(0-3) + const __m256i lhs_mat_23_00_sp1 = _mm256_shuffle_epi32(lhs_mat_23_00, 160); //A02(0-3) A03(0-3) A02(0-3) A03(0-3) - const __m256i lhs_mat_01_01_sp1 = _mm256_shuffle_epi32(lhs_mat_01_01, 160); //A00(8-11) A00(8-11) A01(8-11) A01(8-11) A00(8-11) A00(8-11) A01(8-11) A01(8-11) - const __m256i lhs_mat_23_01_sp1 = _mm256_shuffle_epi32(lhs_mat_23_01, 160); //A02(8-11) A03(8-11) A02(8-11) A03(8-11) A02(8-11) A03(8-11) A02(8-11) A03(8-11) + const __m256i lhs_mat_01_01_sp1 = _mm256_shuffle_epi32(lhs_mat_01_01, 160); //A00(8-11) A00(8-11) A01(8-11) A01(8-11) + const __m256i lhs_mat_23_01_sp1 = _mm256_shuffle_epi32(lhs_mat_23_01, 160); //A02(8-11) A03(8-11) 
A02(8-11) A03(8-11) - const __m256i lhs_mat_01_10_sp1 = _mm256_shuffle_epi32(lhs_mat_01_10, 160); //A10(0-3) A10(0-3) A11(0-3) A11(0-3) A10(0-3) A10(0-3) A11(0-3) A11(0-3) - const __m256i lhs_mat_23_10_sp1 = _mm256_shuffle_epi32(lhs_mat_23_10, 160); //A12(0-3) A13(0-3) A12(0-3) A13(0-3) A12(0-3) A13(0-3) A12(0-3) A13(0-3) + const __m256i lhs_mat_01_10_sp1 = _mm256_shuffle_epi32(lhs_mat_01_10, 160); //A10(0-3) A10(0-3) A11(0-3) A11(0-3) + const __m256i lhs_mat_23_10_sp1 = _mm256_shuffle_epi32(lhs_mat_23_10, 160); //A12(0-3) A13(0-3) A12(0-3) A13(0-3) - const __m256i lhs_mat_01_11_sp1 = _mm256_shuffle_epi32(lhs_mat_01_11, 160); //A10(8-11) A10(8-11) A11(8-11) A11(8-11) A10(8-11) A10(8-11) A11(8-11) A11(8-11) - const __m256i lhs_mat_23_11_sp1 = _mm256_shuffle_epi32(lhs_mat_23_11, 160); //A12(8-11) A13(8-11) A12(8-11) A13(8-11) A12(8-11) A13(8-11) A12(8-11) A13(8-11) + const __m256i lhs_mat_01_11_sp1 = _mm256_shuffle_epi32(lhs_mat_01_11, 160); //A10(8-11) A10(8-11) A11(8-11) A11(8-11) + const __m256i lhs_mat_23_11_sp1 = _mm256_shuffle_epi32(lhs_mat_23_11, 160); //A12(8-11) A13(8-11) A12(8-11) A13(8-11) - const __m256i lhs_mat_01_20_sp1 = _mm256_shuffle_epi32(lhs_mat_01_20, 160); //A20(0-3) A20(0-3) A21(0-3) A21(0-3) A20(0-3) A20(0-3) A21(0-3) A21(0-3) - const __m256i lhs_mat_23_20_sp1 = _mm256_shuffle_epi32(lhs_mat_23_20, 160); //A22(0-3) A23(0-3) A22(0-3) A23(0-3) A22(0-3) A23(0-3) A22(0-3) A23(0-3) + const __m256i lhs_mat_01_20_sp1 = _mm256_shuffle_epi32(lhs_mat_01_20, 160); //A20(0-3) A20(0-3) A21(0-3) A21(0-3) + const __m256i lhs_mat_23_20_sp1 = _mm256_shuffle_epi32(lhs_mat_23_20, 160); //A22(0-3) A23(0-3) A22(0-3) A23(0-3) - const __m256i lhs_mat_01_21_sp1 = _mm256_shuffle_epi32(lhs_mat_01_21, 160); //A20(8-11) A20(8-11) A21(8-11) A21(8-11) A20(8-11) A20(8-11) A21(8-11) A21(8-11) - const __m256i lhs_mat_23_21_sp1 = _mm256_shuffle_epi32(lhs_mat_23_21, 160); //A22(8-11) A23(8-11) A22(8-11) A23(8-11) A22(8-11) A23(8-11) A22(8-11) A23(8-11) + const __m256i lhs_mat_01_21_sp1 = _mm256_shuffle_epi32(lhs_mat_01_21, 160); //A20(8-11) A20(8-11) A21(8-11) A21(8-11) + const __m256i lhs_mat_23_21_sp1 = _mm256_shuffle_epi32(lhs_mat_23_21, 160); //A22(8-11) A23(8-11) A22(8-11) A23(8-11) - const __m256i lhs_mat_01_30_sp1 = _mm256_shuffle_epi32(lhs_mat_01_30, 160); //A30(0-3) A30(0-3) A31(0-3) A31(0-3) A30(0-3) A30(0-3) A31(0-3) A31(0-3) - const __m256i lhs_mat_23_30_sp1 = _mm256_shuffle_epi32(lhs_mat_23_30, 160); //A32(0-3) A33(0-3) A32(0-3) A33(0-3) A32(0-3) A33(0-3) A32(0-3) A33(0-3) + const __m256i lhs_mat_01_30_sp1 = _mm256_shuffle_epi32(lhs_mat_01_30, 160); //A30(0-3) A30(0-3) A31(0-3) A31(0-3) + const __m256i lhs_mat_23_30_sp1 = _mm256_shuffle_epi32(lhs_mat_23_30, 160); //A32(0-3) A33(0-3) A32(0-3) A33(0-3) - const __m256i lhs_mat_01_31_sp1 = _mm256_shuffle_epi32(lhs_mat_01_31, 160); //A30(8-11) A30(8-11) A31(8-11) A31(8-11) A30(8-11) A30(8-11) A31(8-11) A31(8-11) - const __m256i lhs_mat_23_31_sp1 = _mm256_shuffle_epi32(lhs_mat_23_31, 160); //A32(8-11) A33(8-11) A32(8-11) A33(8-11) A32(8-11) A33(8-11) A32(8-11) A33(8-11) + const __m256i lhs_mat_01_31_sp1 = _mm256_shuffle_epi32(lhs_mat_01_31, 160); //A30(8-11) A30(8-11) A31(8-11) A31(8-11) + const __m256i lhs_mat_23_31_sp1 = _mm256_shuffle_epi32(lhs_mat_23_31, 160); //A32(8-11) A33(8-11) A32(8-11) A33(8-11) - const __m256i lhs_mat_01_40_sp1 = _mm256_shuffle_epi32(lhs_mat_01_40, 160); //A40(0-3) A40(0-3) A41(0-3) A41(0-3) A40(0-3) A40(0-3) A41(0-3) A41(0-3) - const __m256i lhs_mat_23_40_sp1 = _mm256_shuffle_epi32(lhs_mat_23_40, 160); //A42(0-3) 
A43(0-3) A42(0-3) A43(0-3) A42(0-3) A43(0-3) A42(0-3) A43(0-3) + const __m256i lhs_mat_01_40_sp1 = _mm256_shuffle_epi32(lhs_mat_01_40, 160); //A40(0-3) A40(0-3) A41(0-3) A41(0-3) + const __m256i lhs_mat_23_40_sp1 = _mm256_shuffle_epi32(lhs_mat_23_40, 160); //A42(0-3) A43(0-3) A42(0-3) A43(0-3) - const __m256i lhs_mat_01_41_sp1 = _mm256_shuffle_epi32(lhs_mat_01_41, 160); //A40(8-11) A40(8-11) A41(8-11) A41(8-11) A40(8-11) A40(8-11) A41(8-11) A41(8-11) - const __m256i lhs_mat_23_41_sp1 = _mm256_shuffle_epi32(lhs_mat_23_41, 160); //A42(8-11) A43(8-11) A42(8-11) A43(8-11) A42(8-11) A43(8-11) A42(8-11) A43(8-11) + const __m256i lhs_mat_01_41_sp1 = _mm256_shuffle_epi32(lhs_mat_01_41, 160); //A40(8-11) A40(8-11) A41(8-11) A41(8-11) + const __m256i lhs_mat_23_41_sp1 = _mm256_shuffle_epi32(lhs_mat_23_41, 160); //A42(8-11) A43(8-11) A42(8-11) A43(8-11) - const __m256i lhs_mat_01_50_sp1 = _mm256_shuffle_epi32(lhs_mat_01_50, 160); //A50(0-3) A50(0-3) A51(0-3) A51(0-3) A50(0-3) A50(0-3) A51(0-3) A51(0-3) - const __m256i lhs_mat_23_50_sp1 = _mm256_shuffle_epi32(lhs_mat_23_50, 160); //A52(0-3) A53(0-3) A52(0-3) A53(0-3) A52(0-3) A53(0-3) A52(0-3) A53(0-3) + const __m256i lhs_mat_01_50_sp1 = _mm256_shuffle_epi32(lhs_mat_01_50, 160); //A50(0-3) A50(0-3) A51(0-3) A51(0-3) + const __m256i lhs_mat_23_50_sp1 = _mm256_shuffle_epi32(lhs_mat_23_50, 160); //A52(0-3) A53(0-3) A52(0-3) A53(0-3) - const __m256i lhs_mat_01_51_sp1 = _mm256_shuffle_epi32(lhs_mat_01_51, 160); //A50(8-11) A50(8-11) A51(8-11) A51(8-11) A50(8-11) A50(8-11) A51(8-11) A51(8-11) - const __m256i lhs_mat_23_51_sp1 = _mm256_shuffle_epi32(lhs_mat_23_51, 160); //A52(8-11) A53(8-11) A52(8-11) A53(8-11) A52(8-11) A53(8-11) A52(8-11) A53(8-11) + const __m256i lhs_mat_01_51_sp1 = _mm256_shuffle_epi32(lhs_mat_01_51, 160); //A50(8-11) A50(8-11) A51(8-11) A51(8-11) + const __m256i lhs_mat_23_51_sp1 = _mm256_shuffle_epi32(lhs_mat_23_51, 160); //A52(8-11) A53(8-11) A52(8-11) A53(8-11) - const __m256i lhs_mat_01_60_sp1 = _mm256_shuffle_epi32(lhs_mat_01_60, 160); //A60(0-3) A60(0-3) A61(0-3) A61(0-3) A60(0-3) A60(0-3) A61(0-3) A61(0-3) - const __m256i lhs_mat_23_60_sp1 = _mm256_shuffle_epi32(lhs_mat_23_60, 160); //A62(0-3) A63(0-3) A62(0-3) A63(0-3) A62(0-3) A63(0-3) A62(0-3) A63(0-3) + const __m256i lhs_mat_01_60_sp1 = _mm256_shuffle_epi32(lhs_mat_01_60, 160); //A60(0-3) A60(0-3) A61(0-3) A61(0-3) + const __m256i lhs_mat_23_60_sp1 = _mm256_shuffle_epi32(lhs_mat_23_60, 160); //A62(0-3) A63(0-3) A62(0-3) A63(0-3) - const __m256i lhs_mat_01_61_sp1 = _mm256_shuffle_epi32(lhs_mat_01_61, 160); //A60(8-11) A60(8-11) A61(8-11) A61(8-11) A60(8-11) A60(8-11) A61(8-11) A61(8-11) - const __m256i lhs_mat_23_61_sp1 = _mm256_shuffle_epi32(lhs_mat_23_61, 160); //A62(8-11) A63(8-11) A62(8-11) A63(8-11) A62(8-11) A63(8-11) A62(8-11) A63(8-11) + const __m256i lhs_mat_01_61_sp1 = _mm256_shuffle_epi32(lhs_mat_01_61, 160); //A60(8-11) A60(8-11) A61(8-11) A61(8-11) + const __m256i lhs_mat_23_61_sp1 = _mm256_shuffle_epi32(lhs_mat_23_61, 160); //A62(8-11) A63(8-11) A62(8-11) A63(8-11) - const __m256i lhs_mat_01_70_sp1 = _mm256_shuffle_epi32(lhs_mat_01_70, 160); //A70(0-3) A70(0-3) A71(0-3) A71(0-3) A70(0-3) A70(0-3) A71(0-3) A71(0-3) - const __m256i lhs_mat_23_70_sp1 = _mm256_shuffle_epi32(lhs_mat_23_70, 160); //A72(0-3) A73(0-3) A72(0-3) A73(0-3) A72(0-3) A73(0-3) A72(0-3) A73(0-3) + const __m256i lhs_mat_01_70_sp1 = _mm256_shuffle_epi32(lhs_mat_01_70, 160); //A70(0-3) A70(0-3) A71(0-3) A71(0-3) + const __m256i lhs_mat_23_70_sp1 = _mm256_shuffle_epi32(lhs_mat_23_70, 160); 
//A72(0-3) A73(0-3) A72(0-3) A73(0-3) - const __m256i lhs_mat_01_71_sp1 = _mm256_shuffle_epi32(lhs_mat_01_71, 160); //A70(8-11) A70(8-11) A71(8-11) A71(8-11) A70(8-11) A70(8-11) A71(8-11) A71(8-11) - const __m256i lhs_mat_23_71_sp1 = _mm256_shuffle_epi32(lhs_mat_23_71, 160); //A72(8-11) A73(8-11) A72(8-11) A73(8-11) A72(8-11) A73(8-11) A72(8-11) A73(8-11) + const __m256i lhs_mat_01_71_sp1 = _mm256_shuffle_epi32(lhs_mat_01_71, 160); //A70(8-11) A70(8-11) A71(8-11) A71(8-11) + const __m256i lhs_mat_23_71_sp1 = _mm256_shuffle_epi32(lhs_mat_23_71, 160); //A72(8-11) A73(8-11) A72(8-11) A73(8-11) // Shuffle pattern two- left side input - const __m256i lhs_mat_01_00_sp2 = _mm256_shuffle_epi32(lhs_mat_01_00, 245); //A00(4-7) A00(4-7) A01(4-7) A01(4-7) A00(4-7) A00(4-7) A01(4-7) A01(4-7) - const __m256i lhs_mat_23_00_sp2 = _mm256_shuffle_epi32(lhs_mat_23_00, 245); //A02(4-7) A03(4-7) A02(4-7) A03(4-7) A02(4-7) A03(4-7) A02(4-7) A03(4-7) + const __m256i lhs_mat_01_00_sp2 = _mm256_shuffle_epi32(lhs_mat_01_00, 245); //A00(4-7) A00(4-7) A01(4-7) A01(4-7) + const __m256i lhs_mat_23_00_sp2 = _mm256_shuffle_epi32(lhs_mat_23_00, 245); //A02(4-7) A03(4-7) A02(4-7) A03(4-7) - const __m256i lhs_mat_01_01_sp2 = _mm256_shuffle_epi32(lhs_mat_01_01, 245); //A00(12-15) A00(12-15) A01(12-15) A01(12-15) A00(12-15) A00(12-15) A01(12-15) A01(12-15) - const __m256i lhs_mat_23_01_sp2 = _mm256_shuffle_epi32(lhs_mat_23_01, 245); //A02(12-15) A03(12-15) A02(12-15) A03(12-15) A02(12-15) A03(12-15) A02(12-15) A03(12-15) + const __m256i lhs_mat_01_01_sp2 = _mm256_shuffle_epi32(lhs_mat_01_01, 245); //A00(12-15) A00(12-15) A01(12-15) A01(12-15) + const __m256i lhs_mat_23_01_sp2 = _mm256_shuffle_epi32(lhs_mat_23_01, 245); //A02(12-15) A03(12-15) A02(12-15) A03(12-15) - const __m256i lhs_mat_01_10_sp2 = _mm256_shuffle_epi32(lhs_mat_01_10, 245); //A10(4-7) A10(4-7) A11(4-7) A11(4-7) A10(4-7) A10(4-7) A11(4-7) A11(4-7) - const __m256i lhs_mat_23_10_sp2 = _mm256_shuffle_epi32(lhs_mat_23_10, 245); //A12(4-7) A13(4-7) A12(4-7) A13(4-7) A12(4-7) A13(4-7) A12(4-7) A13(4-7) + const __m256i lhs_mat_01_10_sp2 = _mm256_shuffle_epi32(lhs_mat_01_10, 245); //A10(4-7) A10(4-7) A11(4-7) A11(4-7) + const __m256i lhs_mat_23_10_sp2 = _mm256_shuffle_epi32(lhs_mat_23_10, 245); //A12(4-7) A13(4-7) A12(4-7) A13(4-7) - const __m256i lhs_mat_01_11_sp2 = _mm256_shuffle_epi32(lhs_mat_01_11, 245); //A10(12-15) A10(12-15) A11(12-15) A11(12-15) A10(12-15) A10(12-15) A11(12-15) A11(12-15) - const __m256i lhs_mat_23_11_sp2 = _mm256_shuffle_epi32(lhs_mat_23_11, 245); //A12(12-15) A13(12-15) A12(12-15) A13(12-15) A12(12-15) A13(12-15) A12(12-15) A13(12-15) + const __m256i lhs_mat_01_11_sp2 = _mm256_shuffle_epi32(lhs_mat_01_11, 245); //A10(12-15) A10(12-15) A11(12-15) A11(12-15) + const __m256i lhs_mat_23_11_sp2 = _mm256_shuffle_epi32(lhs_mat_23_11, 245); //A12(12-15) A13(12-15) A12(12-15) A13(12-15) - const __m256i lhs_mat_01_20_sp2 = _mm256_shuffle_epi32(lhs_mat_01_20, 245); //A20(4-7) A20(4-7) A21(4-7) A21(4-7) A20(4-7) A20(4-7) A21(4-7) A21(4-7) - const __m256i lhs_mat_23_20_sp2 = _mm256_shuffle_epi32(lhs_mat_23_20, 245); //A22(4-7) A23(4-7) A22(4-7) A23(4-7) A22(4-7) A23(4-7) A22(4-7) A23(4-7) + const __m256i lhs_mat_01_20_sp2 = _mm256_shuffle_epi32(lhs_mat_01_20, 245); //A20(4-7) A20(4-7) A21(4-7) A21(4-7) + const __m256i lhs_mat_23_20_sp2 = _mm256_shuffle_epi32(lhs_mat_23_20, 245); //A22(4-7) A23(4-7) A22(4-7) A23(4-7) - const __m256i lhs_mat_01_21_sp2 = _mm256_shuffle_epi32(lhs_mat_01_21, 245); //A20(12-15) A20(12-15) A21(12-15) A21(12-15) A20(12-15) 
A20(12-15) A21(12-15) A21(12-15) - const __m256i lhs_mat_23_21_sp2 = _mm256_shuffle_epi32(lhs_mat_23_21, 245); //A22(12-15) A23(12-15) A22(12-15) A23(12-15) A22(12-15) A23(12-15) A22(12-15) A23(12-15) + const __m256i lhs_mat_01_21_sp2 = _mm256_shuffle_epi32(lhs_mat_01_21, 245); //A20(12-15) A20(12-15) A21(12-15) A21(12-15) + const __m256i lhs_mat_23_21_sp2 = _mm256_shuffle_epi32(lhs_mat_23_21, 245); //A22(12-15) A23(12-15) A22(12-15) A23(12-15) - const __m256i lhs_mat_01_30_sp2 = _mm256_shuffle_epi32(lhs_mat_01_30, 245); //A30(4-7) A30(4-7) A31(4-7) A31(4-7) A30(4-7) A30(4-7) A31(4-7) A31(4-7) - const __m256i lhs_mat_23_30_sp2 = _mm256_shuffle_epi32(lhs_mat_23_30, 245); //A32(4-7) A33(4-7) A32(4-7) A33(4-7) A32(4-7) A33(4-7) A32(4-7) A33(4-7) + const __m256i lhs_mat_01_30_sp2 = _mm256_shuffle_epi32(lhs_mat_01_30, 245); //A30(4-7) A30(4-7) A31(4-7) A31(4-7) + const __m256i lhs_mat_23_30_sp2 = _mm256_shuffle_epi32(lhs_mat_23_30, 245); //A32(4-7) A33(4-7) A32(4-7) A33(4-7) - const __m256i lhs_mat_01_31_sp2 = _mm256_shuffle_epi32(lhs_mat_01_31, 245); //A30(12-15) A30(12-15) A31(12-15) A31(12-15) A30(12-15) A30(12-15) A31(12-15) A31(12-15) - const __m256i lhs_mat_23_31_sp2 = _mm256_shuffle_epi32(lhs_mat_23_31, 245); //A32(12-15) A33(12-15) A32(12-15) A33(12-15) A32(12-15) A33(12-15) A32(12-15) A33(12-15) + const __m256i lhs_mat_01_31_sp2 = _mm256_shuffle_epi32(lhs_mat_01_31, 245); //A30(12-15) A30(12-15) A31(12-15) A31(12-15) + const __m256i lhs_mat_23_31_sp2 = _mm256_shuffle_epi32(lhs_mat_23_31, 245); //A32(12-15) A33(12-15) A32(12-15) A33(12-15) - const __m256i lhs_mat_01_40_sp2 = _mm256_shuffle_epi32(lhs_mat_01_40, 245); //A40(4-7) A40(4-7) A41(4-7) A41(4-7) A40(4-7) A40(4-7) A41(4-7) A41(4-7) - const __m256i lhs_mat_23_40_sp2 = _mm256_shuffle_epi32(lhs_mat_23_40, 245); //A42(4-7) A43(4-7) A42(4-7) A43(4-7) A42(4-7) A43(4-7) A42(4-7) A43(4-7) + const __m256i lhs_mat_01_40_sp2 = _mm256_shuffle_epi32(lhs_mat_01_40, 245); //A40(4-7) A40(4-7) A41(4-7) A41(4-7) + const __m256i lhs_mat_23_40_sp2 = _mm256_shuffle_epi32(lhs_mat_23_40, 245); //A42(4-7) A43(4-7) A42(4-7) A43(4-7) - const __m256i lhs_mat_01_41_sp2 = _mm256_shuffle_epi32(lhs_mat_01_41, 245); //A40(12-15) A40(12-15) A41(12-15) A41(12-15) A40(12-15) A40(12-15) A41(12-15) A41(12-15) - const __m256i lhs_mat_23_41_sp2 = _mm256_shuffle_epi32(lhs_mat_23_41, 245); //A42(12-15) A43(12-15) A42(12-15) A43(12-15) A42(12-15) A43(12-15) A42(12-15) A43(12-15) + const __m256i lhs_mat_01_41_sp2 = _mm256_shuffle_epi32(lhs_mat_01_41, 245); //A40(12-15) A40(12-15) A41(12-15) A41(12-15) + const __m256i lhs_mat_23_41_sp2 = _mm256_shuffle_epi32(lhs_mat_23_41, 245); //A42(12-15) A43(12-15) A42(12-15) A43(12-15) - const __m256i lhs_mat_01_50_sp2 = _mm256_shuffle_epi32(lhs_mat_01_50, 245); //A50(4-7) A50(4-7) A51(4-7) A51(4-7) A50(4-7) A50(4-7) A51(4-7) A51(4-7) - const __m256i lhs_mat_23_50_sp2 = _mm256_shuffle_epi32(lhs_mat_23_50, 245); //A52(4-7) A53(4-7) A52(4-7) A53(4-7) A52(4-7) A53(4-7) A52(4-7) A53(4-7) + const __m256i lhs_mat_01_50_sp2 = _mm256_shuffle_epi32(lhs_mat_01_50, 245); //A50(4-7) A50(4-7) A51(4-7) A51(4-7) + const __m256i lhs_mat_23_50_sp2 = _mm256_shuffle_epi32(lhs_mat_23_50, 245); //A52(4-7) A53(4-7) A52(4-7) A53(4-7) - const __m256i lhs_mat_01_51_sp2 = _mm256_shuffle_epi32(lhs_mat_01_51, 245); //A50(12-15) A50(12-15) A51(12-15) A51(12-15) A50(12-15) A50(12-15) A51(12-15) A51(12-15) - const __m256i lhs_mat_23_51_sp2 = _mm256_shuffle_epi32(lhs_mat_23_51, 245); //A52(12-15) A53(12-15) A52(12-15) A53(12-15) A52(12-15) A53(12-15) A52(12-15) 
A53(12-15) + const __m256i lhs_mat_01_51_sp2 = _mm256_shuffle_epi32(lhs_mat_01_51, 245); //A50(12-15) A50(12-15) A51(12-15) A51(12-15) + const __m256i lhs_mat_23_51_sp2 = _mm256_shuffle_epi32(lhs_mat_23_51, 245); //A52(12-15) A53(12-15) A52(12-15) A53(12-15) - const __m256i lhs_mat_01_60_sp2 = _mm256_shuffle_epi32(lhs_mat_01_60, 245); //A60(4-7) A60(4-7) A61(4-7) A61(4-7) A60(4-7) A60(4-7) A61(4-7) A61(4-7) - const __m256i lhs_mat_23_60_sp2 = _mm256_shuffle_epi32(lhs_mat_23_60, 245); //A62(4-7) A63(4-7) A62(4-7) A63(4-7) A62(4-7) A63(4-7) A62(4-7) A63(4-7) + const __m256i lhs_mat_01_60_sp2 = _mm256_shuffle_epi32(lhs_mat_01_60, 245); //A60(4-7) A60(4-7) A61(4-7) A61(4-7) + const __m256i lhs_mat_23_60_sp2 = _mm256_shuffle_epi32(lhs_mat_23_60, 245); //A62(4-7) A63(4-7) A62(4-7) A63(4-7) - const __m256i lhs_mat_01_61_sp2 = _mm256_shuffle_epi32(lhs_mat_01_61, 245); //A60(12-15) A60(12-15) A61(12-15) A61(12-15) A60(12-15) A60(12-15) A61(12-15) A61(12-15) - const __m256i lhs_mat_23_61_sp2 = _mm256_shuffle_epi32(lhs_mat_23_61, 245); //A62(12-15) A63(12-15) A62(12-15) A63(12-15) A62(12-15) A63(12-15) A62(12-15) A63(12-15) + const __m256i lhs_mat_01_61_sp2 = _mm256_shuffle_epi32(lhs_mat_01_61, 245); //A60(12-15) A60(12-15) A61(12-15) A61(12-15) + const __m256i lhs_mat_23_61_sp2 = _mm256_shuffle_epi32(lhs_mat_23_61, 245); //A62(12-15) A63(12-15) A62(12-15) A63(12-15) - const __m256i lhs_mat_01_70_sp2 = _mm256_shuffle_epi32(lhs_mat_01_70, 245); //A70(4-7) A70(4-7) A71(4-7) A71(4-7) A70(4-7) A70(4-7) A71(4-7) A71(4-7) - const __m256i lhs_mat_23_70_sp2 = _mm256_shuffle_epi32(lhs_mat_23_70, 245); //A72(4-7) A73(4-7) A72(4-7) A73(4-7) A72(4-7) A73(4-7) A72(4-7) A73(4-7) + const __m256i lhs_mat_01_70_sp2 = _mm256_shuffle_epi32(lhs_mat_01_70, 245); //A70(4-7) A70(4-7) A71(4-7) A71(4-7) + const __m256i lhs_mat_23_70_sp2 = _mm256_shuffle_epi32(lhs_mat_23_70, 245); //A72(4-7) A73(4-7) A72(4-7) A73(4-7) - const __m256i lhs_mat_01_71_sp2 = _mm256_shuffle_epi32(lhs_mat_01_71, 245); //A70(12-15) A70(12-15) A71(12-15) A71(12-15) A70(12-15) A70(12-15) A71(12-15) A71(12-15) - const __m256i lhs_mat_23_71_sp2 = _mm256_shuffle_epi32(lhs_mat_23_71, 245); //A72(12-15) A73(12-15) A72(12-15) A73(12-15) A72(12-15) A73(12-15) A72(12-15) A73(12-15) + const __m256i lhs_mat_01_71_sp2 = _mm256_shuffle_epi32(lhs_mat_01_71, 245); //A70(12-15) A70(12-15) A71(12-15) A71(12-15) + const __m256i lhs_mat_23_71_sp2 = _mm256_shuffle_epi32(lhs_mat_23_71, 245); //A72(12-15) A73(12-15) A72(12-15) A73(12-15) // Shuffle pattern one - left side input - const __m256i lhs_mat_s_01_00_sp1 = _mm256_shuffle_epi32(lhs_mat_s_01_00, 160); //A00(0-3) A00(0-3) A01(0-3) A01(0-3) A00(0-3) A00(0-3) A01(0-3) A01(0-3) - const __m256i lhs_mat_s_23_00_sp1 = _mm256_shuffle_epi32(lhs_mat_s_23_00, 160); //A02(0-3) A03(0-3) A02(0-3) A03(0-3) A02(0-3) A03(0-3) A02(0-3) A03(0-3) + const __m256i lhs_mat_s_01_00_sp1 = _mm256_shuffle_epi32(lhs_mat_s_01_00, 160); //A00(0-3) A00(0-3) A01(0-3) A01(0-3) + const __m256i lhs_mat_s_23_00_sp1 = _mm256_shuffle_epi32(lhs_mat_s_23_00, 160); //A02(0-3) A03(0-3) A02(0-3) A03(0-3) - const __m256i lhs_mat_s_01_01_sp1 = _mm256_shuffle_epi32(lhs_mat_s_01_01, 160); //A00(8-11) A00(8-11) A01(8-11) A01(8-11) A00(8-11) A00(8-11) A01(8-11) A01(8-11) - const __m256i lhs_mat_s_23_01_sp1 = _mm256_shuffle_epi32(lhs_mat_s_23_01, 160); //A02(8-11) A03(8-11) A02(8-11) A03(8-11) A02(8-11) A03(8-11) A02(8-11) A03(8-11) + const __m256i lhs_mat_s_01_01_sp1 = _mm256_shuffle_epi32(lhs_mat_s_01_01, 160); //A00(8-11) A00(8-11) A01(8-11) A01(8-11) + const 
__m256i lhs_mat_s_23_01_sp1 = _mm256_shuffle_epi32(lhs_mat_s_23_01, 160); //A02(8-11) A03(8-11) A02(8-11) A03(8-11) - const __m256i lhs_mat_s_01_10_sp1 = _mm256_shuffle_epi32(lhs_mat_s_01_10, 160); //A10(0-3) A10(0-3) A11(0-3) A11(0-3) A10(0-3) A10(0-3) A11(0-3) A11(0-3) - const __m256i lhs_mat_s_23_10_sp1 = _mm256_shuffle_epi32(lhs_mat_s_23_10, 160); //A12(0-3) A13(0-3) A12(0-3) A13(0-3) A12(0-3) A13(0-3) A12(0-3) A13(0-3) + const __m256i lhs_mat_s_01_10_sp1 = _mm256_shuffle_epi32(lhs_mat_s_01_10, 160); //A10(0-3) A10(0-3) A11(0-3) A11(0-3) + const __m256i lhs_mat_s_23_10_sp1 = _mm256_shuffle_epi32(lhs_mat_s_23_10, 160); //A12(0-3) A13(0-3) A12(0-3) A13(0-3) - const __m256i lhs_mat_s_01_11_sp1 = _mm256_shuffle_epi32(lhs_mat_s_01_11, 160); //A10(8-11) A10(8-11) A11(8-11) A11(8-11) A10(8-11) A10(8-11) A11(8-11) A11(8-11) - const __m256i lhs_mat_s_23_11_sp1 = _mm256_shuffle_epi32(lhs_mat_s_23_11, 160); //A12(8-11) A13(8-11) A12(8-11) A13(8-11) A12(8-11) A13(8-11) A12(8-11) A13(8-11) + const __m256i lhs_mat_s_01_11_sp1 = _mm256_shuffle_epi32(lhs_mat_s_01_11, 160); //A10(8-11) A10(8-11) A11(8-11) A11(8-11) + const __m256i lhs_mat_s_23_11_sp1 = _mm256_shuffle_epi32(lhs_mat_s_23_11, 160); //A12(8-11) A13(8-11) A12(8-11) A13(8-11) - const __m256i lhs_mat_s_01_20_sp1 = _mm256_shuffle_epi32(lhs_mat_s_01_20, 160); //A20(0-3) A20(0-3) A21(0-3) A21(0-3) A20(0-3) A20(0-3) A21(0-3) A21(0-3) - const __m256i lhs_mat_s_23_20_sp1 = _mm256_shuffle_epi32(lhs_mat_s_23_20, 160); //A22(0-3) A23(0-3) A22(0-3) A23(0-3) A22(0-3) A23(0-3) A22(0-3) A23(0-3) + const __m256i lhs_mat_s_01_20_sp1 = _mm256_shuffle_epi32(lhs_mat_s_01_20, 160); //A20(0-3) A20(0-3) A21(0-3) A21(0-3) + const __m256i lhs_mat_s_23_20_sp1 = _mm256_shuffle_epi32(lhs_mat_s_23_20, 160); //A22(0-3) A23(0-3) A22(0-3) A23(0-3) - const __m256i lhs_mat_s_01_21_sp1 = _mm256_shuffle_epi32(lhs_mat_s_01_21, 160); //A20(8-11) A20(8-11) A21(8-11) A21(8-11) A20(8-11) A20(8-11) A21(8-11) A21(8-11) - const __m256i lhs_mat_s_23_21_sp1 = _mm256_shuffle_epi32(lhs_mat_s_23_21, 160); //A22(8-11) A23(8-11) A22(8-11) A23(8-11) A22(8-11) A23(8-11) A22(8-11) A23(8-11) + const __m256i lhs_mat_s_01_21_sp1 = _mm256_shuffle_epi32(lhs_mat_s_01_21, 160); //A20(8-11) A20(8-11) A21(8-11) A21(8-11) + const __m256i lhs_mat_s_23_21_sp1 = _mm256_shuffle_epi32(lhs_mat_s_23_21, 160); //A22(8-11) A23(8-11) A22(8-11) A23(8-11) - const __m256i lhs_mat_s_01_30_sp1 = _mm256_shuffle_epi32(lhs_mat_s_01_30, 160); //A30(0-3) A30(0-3) A31(0-3) A31(0-3) A30(0-3) A30(0-3) A31(0-3) A31(0-3) - const __m256i lhs_mat_s_23_30_sp1 = _mm256_shuffle_epi32(lhs_mat_s_23_30, 160); //A32(0-3) A33(0-3) A32(0-3) A33(0-3) A32(0-3) A33(0-3) A32(0-3) A33(0-3) + const __m256i lhs_mat_s_01_30_sp1 = _mm256_shuffle_epi32(lhs_mat_s_01_30, 160); //A30(0-3) A30(0-3) A31(0-3) A31(0-3) + const __m256i lhs_mat_s_23_30_sp1 = _mm256_shuffle_epi32(lhs_mat_s_23_30, 160); //A32(0-3) A33(0-3) A32(0-3) A33(0-3) - const __m256i lhs_mat_s_01_31_sp1 = _mm256_shuffle_epi32(lhs_mat_s_01_31, 160); //A30(8-11) A30(8-11) A31(8-11) A31(8-11) A30(8-11) A30(8-11) A31(8-11) A31(8-11) - const __m256i lhs_mat_s_23_31_sp1 = _mm256_shuffle_epi32(lhs_mat_s_23_31, 160); //A32(8-11) A33(8-11) A32(8-11) A33(8-11) A32(8-11) A33(8-11) A32(8-11) A33(8-11) + const __m256i lhs_mat_s_01_31_sp1 = _mm256_shuffle_epi32(lhs_mat_s_01_31, 160); //A30(8-11) A30(8-11) A31(8-11) A31(8-11) + const __m256i lhs_mat_s_23_31_sp1 = _mm256_shuffle_epi32(lhs_mat_s_23_31, 160); //A32(8-11) A33(8-11) A32(8-11) A33(8-11) - const __m256i lhs_mat_s_01_40_sp1 = 
_mm256_shuffle_epi32(lhs_mat_s_01_40, 160); //A40(0-3) A40(0-3) A41(0-3) A41(0-3) A40(0-3) A40(0-3) A41(0-3) A41(0-3) - const __m256i lhs_mat_s_23_40_sp1 = _mm256_shuffle_epi32(lhs_mat_s_23_40, 160); //A42(0-3) A43(0-3) A42(0-3) A43(0-3) A42(0-3) A43(0-3) A42(0-3) A43(0-3) + const __m256i lhs_mat_s_01_40_sp1 = _mm256_shuffle_epi32(lhs_mat_s_01_40, 160); //A40(0-3) A40(0-3) A41(0-3) A41(0-3) + const __m256i lhs_mat_s_23_40_sp1 = _mm256_shuffle_epi32(lhs_mat_s_23_40, 160); //A42(0-3) A43(0-3) A42(0-3) A43(0-3) - const __m256i lhs_mat_s_01_41_sp1 = _mm256_shuffle_epi32(lhs_mat_s_01_41, 160); //A40(8-11) A40(8-11) A41(8-11) A41(8-11) A40(8-11) A40(8-11) A41(8-11) A41(8-11) - const __m256i lhs_mat_s_23_41_sp1 = _mm256_shuffle_epi32(lhs_mat_s_23_41, 160); //A42(8-11) A43(8-11) A42(8-11) A43(8-11) A42(8-11) A43(8-11) A42(8-11) A43(8-11) + const __m256i lhs_mat_s_01_41_sp1 = _mm256_shuffle_epi32(lhs_mat_s_01_41, 160); //A40(8-11) A40(8-11) A41(8-11) A41(8-11) + const __m256i lhs_mat_s_23_41_sp1 = _mm256_shuffle_epi32(lhs_mat_s_23_41, 160); //A42(8-11) A43(8-11) A42(8-11) A43(8-11) - const __m256i lhs_mat_s_01_50_sp1 = _mm256_shuffle_epi32(lhs_mat_s_01_50, 160); //A50(0-3) A50(0-3) A51(0-3) A51(0-3) A50(0-3) A50(0-3) A51(0-3) A51(0-3) - const __m256i lhs_mat_s_23_50_sp1 = _mm256_shuffle_epi32(lhs_mat_s_23_50, 160); //A52(0-3) A53(0-3) A52(0-3) A53(0-3) A52(0-3) A53(0-3) A52(0-3) A53(0-3) + const __m256i lhs_mat_s_01_50_sp1 = _mm256_shuffle_epi32(lhs_mat_s_01_50, 160); //A50(0-3) A50(0-3) A51(0-3) A51(0-3) + const __m256i lhs_mat_s_23_50_sp1 = _mm256_shuffle_epi32(lhs_mat_s_23_50, 160); //A52(0-3) A53(0-3) A52(0-3) A53(0-3) - const __m256i lhs_mat_s_01_51_sp1 = _mm256_shuffle_epi32(lhs_mat_s_01_51, 160); //A50(8-11) A50(8-11) A51(8-11) A51(8-11) A50(8-11) A50(8-11) A51(8-11) A51(8-11) - const __m256i lhs_mat_s_23_51_sp1 = _mm256_shuffle_epi32(lhs_mat_s_23_51, 160); //A52(8-11) A53(8-11) A52(8-11) A53(8-11) A52(8-11) A53(8-11) A52(8-11) A53(8-11) + const __m256i lhs_mat_s_01_51_sp1 = _mm256_shuffle_epi32(lhs_mat_s_01_51, 160); //A50(8-11) A50(8-11) A51(8-11) A51(8-11) + const __m256i lhs_mat_s_23_51_sp1 = _mm256_shuffle_epi32(lhs_mat_s_23_51, 160); //A52(8-11) A53(8-11) A52(8-11) A53(8-11) - const __m256i lhs_mat_s_01_60_sp1 = _mm256_shuffle_epi32(lhs_mat_s_01_60, 160); //A60(0-3) A60(0-3) A61(0-3) A61(0-3) A60(0-3) A60(0-3) A61(0-3) A61(0-3) - const __m256i lhs_mat_s_23_60_sp1 = _mm256_shuffle_epi32(lhs_mat_s_23_60, 160); //A62(0-3) A63(0-3) A62(0-3) A63(0-3) A62(0-3) A63(0-3) A62(0-3) A63(0-3) + const __m256i lhs_mat_s_01_60_sp1 = _mm256_shuffle_epi32(lhs_mat_s_01_60, 160); //A60(0-3) A60(0-3) A61(0-3) A61(0-3) + const __m256i lhs_mat_s_23_60_sp1 = _mm256_shuffle_epi32(lhs_mat_s_23_60, 160); //A62(0-3) A63(0-3) A62(0-3) A63(0-3) - const __m256i lhs_mat_s_01_61_sp1 = _mm256_shuffle_epi32(lhs_mat_s_01_61, 160); //A60(8-11) A60(8-11) A61(8-11) A61(8-11) A60(8-11) A60(8-11) A61(8-11) A61(8-11) - const __m256i lhs_mat_s_23_61_sp1 = _mm256_shuffle_epi32(lhs_mat_s_23_61, 160); //A62(8-11) A63(8-11) A62(8-11) A63(8-11) A62(8-11) A63(8-11) A62(8-11) A63(8-11) + const __m256i lhs_mat_s_01_61_sp1 = _mm256_shuffle_epi32(lhs_mat_s_01_61, 160); //A60(8-11) A60(8-11) A61(8-11) A61(8-11) + const __m256i lhs_mat_s_23_61_sp1 = _mm256_shuffle_epi32(lhs_mat_s_23_61, 160); //A62(8-11) A63(8-11) A62(8-11) A63(8-11) - const __m256i lhs_mat_s_01_70_sp1 = _mm256_shuffle_epi32(lhs_mat_s_01_70, 160); //A70(0-3) A70(0-3) A71(0-3) A71(0-3) A70(0-3) A70(0-3) A71(0-3) A71(0-3) - const __m256i lhs_mat_s_23_70_sp1 = 
_mm256_shuffle_epi32(lhs_mat_s_23_70, 160); //A72(0-3) A73(0-3) A72(0-3) A73(0-3) A72(0-3) A73(0-3) A72(0-3) A73(0-3) + const __m256i lhs_mat_s_01_70_sp1 = _mm256_shuffle_epi32(lhs_mat_s_01_70, 160); //A70(0-3) A70(0-3) A71(0-3) A71(0-3) + const __m256i lhs_mat_s_23_70_sp1 = _mm256_shuffle_epi32(lhs_mat_s_23_70, 160); //A72(0-3) A73(0-3) A72(0-3) A73(0-3) - const __m256i lhs_mat_s_01_71_sp1 = _mm256_shuffle_epi32(lhs_mat_s_01_71, 160); //A70(8-11) A70(8-11) A71(8-11) A71(8-11) A70(8-11) A70(8-11) A71(8-11) A71(8-11) - const __m256i lhs_mat_s_23_71_sp1 = _mm256_shuffle_epi32(lhs_mat_s_23_71, 160); //A72(8-11) A73(8-11) A72(8-11) A73(8-11) A72(8-11) A73(8-11) A72(8-11) A73(8-11) + const __m256i lhs_mat_s_01_71_sp1 = _mm256_shuffle_epi32(lhs_mat_s_01_71, 160); //A70(8-11) A70(8-11) A71(8-11) A71(8-11) + const __m256i lhs_mat_s_23_71_sp1 = _mm256_shuffle_epi32(lhs_mat_s_23_71, 160); //A72(8-11) A73(8-11) A72(8-11) A73(8-11) // Shuffle pattern two- left side input - const __m256i lhs_mat_s_01_00_sp2 = _mm256_shuffle_epi32(lhs_mat_s_01_00, 245); //A00(4-7) A00(4-7) A01(4-7) A01(4-7) A00(4-7) A00(4-7) A01(4-7) A01(4-7) - const __m256i lhs_mat_s_23_00_sp2 = _mm256_shuffle_epi32(lhs_mat_s_23_00, 245); //A02(4-7) A03(4-7) A02(4-7) A03(4-7) A02(4-7) A03(4-7) A02(4-7) A03(4-7) + const __m256i lhs_mat_s_01_00_sp2 = _mm256_shuffle_epi32(lhs_mat_s_01_00, 245); //A00(4-7) A00(4-7) A01(4-7) A01(4-7) + const __m256i lhs_mat_s_23_00_sp2 = _mm256_shuffle_epi32(lhs_mat_s_23_00, 245); //A02(4-7) A03(4-7) A02(4-7) A03(4-7) - const __m256i lhs_mat_s_01_01_sp2 = _mm256_shuffle_epi32(lhs_mat_s_01_01, 245); //A00(12-15) A00(12-15) A01(12-15) A01(12-15) A00(12-15) A00(12-15) A01(12-15) A01(12-15) - const __m256i lhs_mat_s_23_01_sp2 = _mm256_shuffle_epi32(lhs_mat_s_23_01, 245); //A02(12-15) A03(12-15) A02(12-15) A03(12-15) A02(12-15) A03(12-15) A02(12-15) A03(12-15) + const __m256i lhs_mat_s_01_01_sp2 = _mm256_shuffle_epi32(lhs_mat_s_01_01, 245); //A00(12-15) A00(12-15) A01(12-15) A01(12-15) + const __m256i lhs_mat_s_23_01_sp2 = _mm256_shuffle_epi32(lhs_mat_s_23_01, 245); //A02(12-15) A03(12-15) A02(12-15) A03(12-15) - const __m256i lhs_mat_s_01_10_sp2 = _mm256_shuffle_epi32(lhs_mat_s_01_10, 245); //A10(4-7) A10(4-7) A11(4-7) A11(4-7) A10(4-7) A10(4-7) A11(4-7) A11(4-7) - const __m256i lhs_mat_s_23_10_sp2 = _mm256_shuffle_epi32(lhs_mat_s_23_10, 245); //A12(4-7) A13(4-7) A12(4-7) A13(4-7) A12(4-7) A13(4-7) A12(4-7) A13(4-7) + const __m256i lhs_mat_s_01_10_sp2 = _mm256_shuffle_epi32(lhs_mat_s_01_10, 245); //A10(4-7) A10(4-7) A11(4-7) A11(4-7) + const __m256i lhs_mat_s_23_10_sp2 = _mm256_shuffle_epi32(lhs_mat_s_23_10, 245); //A12(4-7) A13(4-7) A12(4-7) A13(4-7) - const __m256i lhs_mat_s_01_11_sp2 = _mm256_shuffle_epi32(lhs_mat_s_01_11, 245); //A10(12-15) A10(12-15) A11(12-15) A11(12-15) A10(12-15) A10(12-15) A11(12-15) A11(12-15) - const __m256i lhs_mat_s_23_11_sp2 = _mm256_shuffle_epi32(lhs_mat_s_23_11, 245); //A12(12-15) A13(12-15) A12(12-15) A13(12-15) A12(12-15) A13(12-15) A12(12-15) A13(12-15) + const __m256i lhs_mat_s_01_11_sp2 = _mm256_shuffle_epi32(lhs_mat_s_01_11, 245); //A10(12-15) A10(12-15) A11(12-15) A11(12-15) + const __m256i lhs_mat_s_23_11_sp2 = _mm256_shuffle_epi32(lhs_mat_s_23_11, 245); //A12(12-15) A13(12-15) A12(12-15) A13(12-15) - const __m256i lhs_mat_s_01_20_sp2 = _mm256_shuffle_epi32(lhs_mat_s_01_20, 245); //A20(4-7) A20(4-7) A21(4-7) A21(4-7) A20(4-7) A20(4-7) A21(4-7) A21(4-7) - const __m256i lhs_mat_s_23_20_sp2 = _mm256_shuffle_epi32(lhs_mat_s_23_20, 245); //A22(4-7) A23(4-7) A22(4-7) A23(4-7) 
A22(4-7) A23(4-7) A22(4-7) A23(4-7) + const __m256i lhs_mat_s_01_20_sp2 = _mm256_shuffle_epi32(lhs_mat_s_01_20, 245); //A20(4-7) A20(4-7) A21(4-7) A21(4-7) + const __m256i lhs_mat_s_23_20_sp2 = _mm256_shuffle_epi32(lhs_mat_s_23_20, 245); //A22(4-7) A23(4-7) A22(4-7) A23(4-7) - const __m256i lhs_mat_s_01_21_sp2 = _mm256_shuffle_epi32(lhs_mat_s_01_21, 245); //A20(12-15) A20(12-15) A21(12-15) A21(12-15) A20(12-15) A20(12-15) A21(12-15) A21(12-15) - const __m256i lhs_mat_s_23_21_sp2 = _mm256_shuffle_epi32(lhs_mat_s_23_21, 245); //A22(12-15) A23(12-15) A22(12-15) A23(12-15) A22(12-15) A23(12-15) A22(12-15) A23(12-15) + const __m256i lhs_mat_s_01_21_sp2 = _mm256_shuffle_epi32(lhs_mat_s_01_21, 245); //A20(12-15) A20(12-15) A21(12-15) A21(12-15) + const __m256i lhs_mat_s_23_21_sp2 = _mm256_shuffle_epi32(lhs_mat_s_23_21, 245); //A22(12-15) A23(12-15) A22(12-15) A23(12-15) - const __m256i lhs_mat_s_01_30_sp2 = _mm256_shuffle_epi32(lhs_mat_s_01_30, 245); //A30(4-7) A30(4-7) A31(4-7) A31(4-7) A30(4-7) A30(4-7) A31(4-7) A31(4-7) - const __m256i lhs_mat_s_23_30_sp2 = _mm256_shuffle_epi32(lhs_mat_s_23_30, 245); //A32(4-7) A33(4-7) A32(4-7) A33(4-7) A32(4-7) A33(4-7) A32(4-7) A33(4-7) + const __m256i lhs_mat_s_01_30_sp2 = _mm256_shuffle_epi32(lhs_mat_s_01_30, 245); //A30(4-7) A30(4-7) A31(4-7) A31(4-7) + const __m256i lhs_mat_s_23_30_sp2 = _mm256_shuffle_epi32(lhs_mat_s_23_30, 245); //A32(4-7) A33(4-7) A32(4-7) A33(4-7) - const __m256i lhs_mat_s_01_31_sp2 = _mm256_shuffle_epi32(lhs_mat_s_01_31, 245); //A30(12-15) A30(12-15) A31(12-15) A31(12-15) A30(12-15) A30(12-15) A31(12-15) A31(12-15) - const __m256i lhs_mat_s_23_31_sp2 = _mm256_shuffle_epi32(lhs_mat_s_23_31, 245); //A32(12-15) A33(12-15) A32(12-15) A33(12-15) A32(12-15) A33(12-15) A32(12-15) A33(12-15) + const __m256i lhs_mat_s_01_31_sp2 = _mm256_shuffle_epi32(lhs_mat_s_01_31, 245); //A30(12-15) A30(12-15) A31(12-15) A31(12-15) + const __m256i lhs_mat_s_23_31_sp2 = _mm256_shuffle_epi32(lhs_mat_s_23_31, 245); //A32(12-15) A33(12-15) A32(12-15) A33(12-15) - const __m256i lhs_mat_s_01_40_sp2 = _mm256_shuffle_epi32(lhs_mat_s_01_40, 245); //A40(4-7) A40(4-7) A41(4-7) A41(4-7) A40(4-7) A40(4-7) A41(4-7) A41(4-7) - const __m256i lhs_mat_s_23_40_sp2 = _mm256_shuffle_epi32(lhs_mat_s_23_40, 245); //A42(4-7) A43(4-7) A42(4-7) A43(4-7) A42(4-7) A43(4-7) A42(4-7) A43(4-7) + const __m256i lhs_mat_s_01_40_sp2 = _mm256_shuffle_epi32(lhs_mat_s_01_40, 245); //A40(4-7) A40(4-7) A41(4-7) A41(4-7) + const __m256i lhs_mat_s_23_40_sp2 = _mm256_shuffle_epi32(lhs_mat_s_23_40, 245); //A42(4-7) A43(4-7) A42(4-7) A43(4-7) - const __m256i lhs_mat_s_01_41_sp2 = _mm256_shuffle_epi32(lhs_mat_s_01_41, 245); //A40(12-15) A40(12-15) A41(12-15) A41(12-15) A40(12-15) A40(12-15) A41(12-15) A41(12-15) - const __m256i lhs_mat_s_23_41_sp2 = _mm256_shuffle_epi32(lhs_mat_s_23_41, 245); //A42(12-15) A43(12-15) A42(12-15) A43(12-15) A42(12-15) A43(12-15) A42(12-15) A43(12-15) + const __m256i lhs_mat_s_01_41_sp2 = _mm256_shuffle_epi32(lhs_mat_s_01_41, 245); //A40(12-15) A40(12-15) A41(12-15) A41(12-15) + const __m256i lhs_mat_s_23_41_sp2 = _mm256_shuffle_epi32(lhs_mat_s_23_41, 245); //A42(12-15) A43(12-15) A42(12-15) A43(12-15) - const __m256i lhs_mat_s_01_50_sp2 = _mm256_shuffle_epi32(lhs_mat_s_01_50, 245); //A50(4-7) A50(4-7) A51(4-7) A51(4-7) A50(4-7) A50(4-7) A51(4-7) A51(4-7) - const __m256i lhs_mat_s_23_50_sp2 = _mm256_shuffle_epi32(lhs_mat_s_23_50, 245); //A52(4-7) A53(4-7) A52(4-7) A53(4-7) A52(4-7) A53(4-7) A52(4-7) A53(4-7) + const __m256i lhs_mat_s_01_50_sp2 = 
_mm256_shuffle_epi32(lhs_mat_s_01_50, 245); //A50(4-7) A50(4-7) A51(4-7) A51(4-7) + const __m256i lhs_mat_s_23_50_sp2 = _mm256_shuffle_epi32(lhs_mat_s_23_50, 245); //A52(4-7) A53(4-7) A52(4-7) A53(4-7) - const __m256i lhs_mat_s_01_51_sp2 = _mm256_shuffle_epi32(lhs_mat_s_01_51, 245); //A50(12-15) A50(12-15) A51(12-15) A51(12-15) A50(12-15) A50(12-15) A51(12-15) A51(12-15) - const __m256i lhs_mat_s_23_51_sp2 = _mm256_shuffle_epi32(lhs_mat_s_23_51, 245); //A52(12-15) A53(12-15) A52(12-15) A53(12-15) A52(12-15) A53(12-15) A52(12-15) A53(12-15) + const __m256i lhs_mat_s_01_51_sp2 = _mm256_shuffle_epi32(lhs_mat_s_01_51, 245); //A50(12-15) A50(12-15) A51(12-15) A51(12-15) + const __m256i lhs_mat_s_23_51_sp2 = _mm256_shuffle_epi32(lhs_mat_s_23_51, 245); //A52(12-15) A53(12-15) A52(12-15) A53(12-15) - const __m256i lhs_mat_s_01_60_sp2 = _mm256_shuffle_epi32(lhs_mat_s_01_60, 245); //A60(4-7) A60(4-7) A61(4-7) A61(4-7) A60(4-7) A60(4-7) A61(4-7) A61(4-7) - const __m256i lhs_mat_s_23_60_sp2 = _mm256_shuffle_epi32(lhs_mat_s_23_60, 245); //A62(4-7) A63(4-7) A62(4-7) A63(4-7) A62(4-7) A63(4-7) A62(4-7) A63(4-7) + const __m256i lhs_mat_s_01_60_sp2 = _mm256_shuffle_epi32(lhs_mat_s_01_60, 245); //A60(4-7) A60(4-7) A61(4-7) A61(4-7) + const __m256i lhs_mat_s_23_60_sp2 = _mm256_shuffle_epi32(lhs_mat_s_23_60, 245); //A62(4-7) A63(4-7) A62(4-7) A63(4-7) - const __m256i lhs_mat_s_01_61_sp2 = _mm256_shuffle_epi32(lhs_mat_s_01_61, 245); //A60(12-15) A60(12-15) A61(12-15) A61(12-15) A60(12-15) A60(12-15) A61(12-15) A61(12-15) - const __m256i lhs_mat_s_23_61_sp2 = _mm256_shuffle_epi32(lhs_mat_s_23_61, 245); //A62(12-15) A63(12-15) A62(12-15) A63(12-15) A62(12-15) A63(12-15) A62(12-15) A63(12-15) + const __m256i lhs_mat_s_01_61_sp2 = _mm256_shuffle_epi32(lhs_mat_s_01_61, 245); //A60(12-15) A60(12-15) A61(12-15) A61(12-15) + const __m256i lhs_mat_s_23_61_sp2 = _mm256_shuffle_epi32(lhs_mat_s_23_61, 245); //A62(12-15) A63(12-15) A62(12-15) A63(12-15) - const __m256i lhs_mat_s_01_70_sp2 = _mm256_shuffle_epi32(lhs_mat_s_01_70, 245); //A70(4-7) A70(4-7) A71(4-7) A71(4-7) A70(4-7) A70(4-7) A71(4-7) A71(4-7) - const __m256i lhs_mat_s_23_70_sp2 = _mm256_shuffle_epi32(lhs_mat_s_23_70, 245); //A72(4-7) A73(4-7) A72(4-7) A73(4-7) A72(4-7) A73(4-7) A72(4-7) A73(4-7) + const __m256i lhs_mat_s_01_70_sp2 = _mm256_shuffle_epi32(lhs_mat_s_01_70, 245); //A70(4-7) A70(4-7) A71(4-7) A71(4-7) + const __m256i lhs_mat_s_23_70_sp2 = _mm256_shuffle_epi32(lhs_mat_s_23_70, 245); //A72(4-7) A73(4-7) A72(4-7) A73(4-7) - const __m256i lhs_mat_s_01_71_sp2 = _mm256_shuffle_epi32(lhs_mat_s_01_71, 245); //A70(12-15) A70(12-15) A71(12-15) A71(12-15) A70(12-15) A70(12-15) A71(12-15) A71(12-15) - const __m256i lhs_mat_s_23_71_sp2 = _mm256_shuffle_epi32(lhs_mat_s_23_71, 245); //A72(12-15) A73(12-15) A72(12-15) A73(12-15) A72(12-15) A73(12-15) A72(12-15) A73(12-15) + const __m256i lhs_mat_s_01_71_sp2 = _mm256_shuffle_epi32(lhs_mat_s_01_71, 245); //A70(12-15) A70(12-15) A71(12-15) A71(12-15) + const __m256i lhs_mat_s_23_71_sp2 = _mm256_shuffle_epi32(lhs_mat_s_23_71, 245); //A72(12-15) A73(12-15) A72(12-15) A73(12-15) // The values arranged in shuffle patterns are operated with dot product operation within 16 bit lane i.e corresponding bytes and multiplied and added into 16 bit integers within 16 bit lane __m256i iacc_mat_00_0_sp1 = _mm256_add_epi16(_mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_0145_00_sp1, lhs_mat_01_00_sp1), lhs_mat_s_01_00_sp1), _mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_0145_01_sp1, lhs_mat_01_01_sp1), lhs_mat_s_01_01_sp1)); @@ 
-9840,6 +9849,7 @@ void ggml_gemm_q6_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo __m256i lhs_mat_23_71 = _mm256_permute2f128_si256(lhs_mat_0123_71, lhs_mat_0123_71, 17); + // Multiply Q8 quants with bytes valued 32 - Subtracted later as an adjustment for 6 bit quantization __m256i lhs_mat_s_01_00 = _mm256_maddubs_epi16(m32s, lhs_mat_01_00); __m256i lhs_mat_s_23_00 = _mm256_maddubs_epi16(m32s, lhs_mat_23_00); __m256i lhs_mat_s_01_01 = _mm256_maddubs_epi16(m32s, lhs_mat_01_01); From 266fa8002085b555b9fd72e63396df3c0bedf9c7 Mon Sep 17 00:00:00 2001 From: Srihari-mcw Date: Tue, 12 Aug 2025 18:14:26 +0530 Subject: [PATCH 11/23] Cleanup of smaller loop of AVX2' --- ggml/src/ggml-cpu/arch/x86/repack.cpp | 340 +++++++++++++------------- 1 file changed, 172 insertions(+), 168 deletions(-) diff --git a/ggml/src/ggml-cpu/arch/x86/repack.cpp b/ggml/src/ggml-cpu/arch/x86/repack.cpp index 7a18d42939..668240791e 100644 --- a/ggml/src/ggml-cpu/arch/x86/repack.cpp +++ b/ggml/src/ggml-cpu/arch/x86/repack.cpp @@ -9501,23 +9501,26 @@ void ggml_gemm_q6_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo // Loop to iterate over the sixteen sub blocks of a super block - eight sub blocks are processed per iteration for (int sb = 0; sb < QK_K / 128; sb++) { - const __m256i rhs_raw_mat_0123_0 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].ql + sb * 512)); - const __m256i rhs_raw_mat_4567_0 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].ql + 32 + sb * 512)); - const __m256i rhs_raw_mat_0123_1 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].ql + 64 + sb * 512)); - const __m256i rhs_raw_mat_4567_1 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].ql + 96 + sb * 512)); - const __m256i rhs_raw_mat_0123_2 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].ql + 128 + sb * 512)); - const __m256i rhs_raw_mat_4567_2 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].ql + 160 + sb * 512)); - const __m256i rhs_raw_mat_0123_3 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].ql + 192 + sb * 512)); - const __m256i rhs_raw_mat_4567_3 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].ql + 224 + sb * 512)); + // Load the eight block_q6_k for eight sub blocks quantized values interleaved with each other in chunks of eight bytes - B0,B1 ....B6,B7 + // The lower and higher packed bits are loaded, unpacked and individual bytes representing 6 bits each are formed from the same + // They are blended/permuted for further mul mat operations within the pipeline + const __m256i rhs_raw_lbit_0123_0 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].ql + sb * 512)); + const __m256i rhs_raw_lbit_4567_0 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].ql + 32 + sb * 512)); + const __m256i rhs_raw_lbit_0123_1 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].ql + 64 + sb * 512)); + const __m256i rhs_raw_lbit_4567_1 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].ql + 96 + sb * 512)); + const __m256i rhs_raw_lbit_0123_2 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].ql + 128 + sb * 512)); + const __m256i rhs_raw_lbit_4567_2 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].ql + 160 + sb * 512)); + const __m256i rhs_raw_lbit_0123_3 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].ql + 192 + sb * 512)); + const __m256i rhs_raw_lbit_4567_3 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].ql + 224 + sb * 512)); - const __m256i rhs_raw_mat_0123_4 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].ql + 256 + sb * 512)); - const __m256i rhs_raw_mat_4567_4 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].ql + 288 + sb * 512)); - const __m256i 
rhs_raw_mat_0123_5 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].ql + 320 + sb * 512)); - const __m256i rhs_raw_mat_4567_5 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].ql + 352 + sb * 512)); - const __m256i rhs_raw_mat_0123_6 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].ql + 384 + sb * 512)); - const __m256i rhs_raw_mat_4567_6 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].ql + 416 + sb * 512)); - const __m256i rhs_raw_mat_0123_7 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].ql + 448 + sb * 512)); - const __m256i rhs_raw_mat_4567_7 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].ql + 480 + sb * 512)); + const __m256i rhs_raw_lbit_0123_4 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].ql + 256 + sb * 512)); + const __m256i rhs_raw_lbit_4567_4 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].ql + 288 + sb * 512)); + const __m256i rhs_raw_lbit_0123_5 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].ql + 320 + sb * 512)); + const __m256i rhs_raw_lbit_4567_5 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].ql + 352 + sb * 512)); + const __m256i rhs_raw_lbit_0123_6 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].ql + 384 + sb * 512)); + const __m256i rhs_raw_lbit_4567_6 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].ql + 416 + sb * 512)); + const __m256i rhs_raw_lbit_0123_7 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].ql + 448 + sb * 512)); + const __m256i rhs_raw_lbit_4567_7 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].ql + 480 + sb * 512)); const __m256i rhs_raw_hbit_0123_0 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qh + sb * 256)); const __m256i rhs_raw_hbit_4567_0 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qh + 32 + sb * 256)); @@ -9529,24 +9532,24 @@ void ggml_gemm_q6_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo const __m256i rhs_raw_hbit_4567_3 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qh + 224 + sb * 256)); // Indices 0 through 7 (first block): - const __m256i rhs_raw_mat_0145_0 = _mm256_blend_epi32(rhs_raw_mat_0123_0, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_0, requiredOrder), 240); - const __m256i rhs_raw_mat_2367_0 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_0, requiredOrder), rhs_raw_mat_4567_0, 240); - const __m256i rhs_raw_mat_0145_1 = _mm256_blend_epi32(rhs_raw_mat_0123_1, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_1, requiredOrder), 240); - const __m256i rhs_raw_mat_2367_1 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_1, requiredOrder), rhs_raw_mat_4567_1, 240); - const __m256i rhs_raw_mat_0145_2 = _mm256_blend_epi32(rhs_raw_mat_0123_2, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_2, requiredOrder), 240); - const __m256i rhs_raw_mat_2367_2 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_2, requiredOrder), rhs_raw_mat_4567_2, 240); - const __m256i rhs_raw_mat_0145_3 = _mm256_blend_epi32(rhs_raw_mat_0123_3, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_3, requiredOrder), 240); - const __m256i rhs_raw_mat_2367_3 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_3, requiredOrder), rhs_raw_mat_4567_3, 240); + const __m256i rhs_raw_lbit_0145_0 = _mm256_blend_epi32(rhs_raw_lbit_0123_0, _mm256_permutevar8x32_epi32(rhs_raw_lbit_4567_0, requiredOrder), 240); + const __m256i rhs_raw_lbit_2367_0 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_lbit_0123_0, requiredOrder), rhs_raw_lbit_4567_0, 240); + const __m256i rhs_raw_lbit_0145_1 = _mm256_blend_epi32(rhs_raw_lbit_0123_1, _mm256_permutevar8x32_epi32(rhs_raw_lbit_4567_1, requiredOrder), 240); + const 
__m256i rhs_raw_lbit_2367_1 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_lbit_0123_1, requiredOrder), rhs_raw_lbit_4567_1, 240); + const __m256i rhs_raw_lbit_0145_2 = _mm256_blend_epi32(rhs_raw_lbit_0123_2, _mm256_permutevar8x32_epi32(rhs_raw_lbit_4567_2, requiredOrder), 240); + const __m256i rhs_raw_lbit_2367_2 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_lbit_0123_2, requiredOrder), rhs_raw_lbit_4567_2, 240); + const __m256i rhs_raw_lbit_0145_3 = _mm256_blend_epi32(rhs_raw_lbit_0123_3, _mm256_permutevar8x32_epi32(rhs_raw_lbit_4567_3, requiredOrder), 240); + const __m256i rhs_raw_lbit_2367_3 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_lbit_0123_3, requiredOrder), rhs_raw_lbit_4567_3, 240); // Indices 4 through 7 (second block): - const __m256i rhs_raw_mat_0145_4 = _mm256_blend_epi32(rhs_raw_mat_0123_4, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_4, requiredOrder), 240); - const __m256i rhs_raw_mat_2367_4 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_4, requiredOrder), rhs_raw_mat_4567_4, 240); - const __m256i rhs_raw_mat_0145_5 = _mm256_blend_epi32(rhs_raw_mat_0123_5, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_5, requiredOrder), 240); - const __m256i rhs_raw_mat_2367_5 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_5, requiredOrder), rhs_raw_mat_4567_5, 240); - const __m256i rhs_raw_mat_0145_6 = _mm256_blend_epi32(rhs_raw_mat_0123_6, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_6, requiredOrder), 240); - const __m256i rhs_raw_mat_2367_6 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_6, requiredOrder), rhs_raw_mat_4567_6, 240); - const __m256i rhs_raw_mat_0145_7 = _mm256_blend_epi32(rhs_raw_mat_0123_7, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_7, requiredOrder), 240); - const __m256i rhs_raw_mat_2367_7 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_7, requiredOrder), rhs_raw_mat_4567_7, 240); + const __m256i rhs_raw_lbit_0145_4 = _mm256_blend_epi32(rhs_raw_lbit_0123_4, _mm256_permutevar8x32_epi32(rhs_raw_lbit_4567_4, requiredOrder), 240); + const __m256i rhs_raw_lbit_2367_4 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_lbit_0123_4, requiredOrder), rhs_raw_lbit_4567_4, 240); + const __m256i rhs_raw_lbit_0145_5 = _mm256_blend_epi32(rhs_raw_lbit_0123_5, _mm256_permutevar8x32_epi32(rhs_raw_lbit_4567_5, requiredOrder), 240); + const __m256i rhs_raw_lbit_2367_5 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_lbit_0123_5, requiredOrder), rhs_raw_lbit_4567_5, 240); + const __m256i rhs_raw_lbit_0145_6 = _mm256_blend_epi32(rhs_raw_lbit_0123_6, _mm256_permutevar8x32_epi32(rhs_raw_lbit_4567_6, requiredOrder), 240); + const __m256i rhs_raw_lbit_2367_6 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_lbit_0123_6, requiredOrder), rhs_raw_lbit_4567_6, 240); + const __m256i rhs_raw_lbit_0145_7 = _mm256_blend_epi32(rhs_raw_lbit_0123_7, _mm256_permutevar8x32_epi32(rhs_raw_lbit_4567_7, requiredOrder), 240); + const __m256i rhs_raw_lbit_2367_7 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_lbit_0123_7, requiredOrder), rhs_raw_lbit_4567_7, 240); const __m256i rhs_raw_hbit_0145_0 = _mm256_blend_epi32(rhs_raw_hbit_0123_0, _mm256_permutevar8x32_epi32(rhs_raw_hbit_4567_0, requiredOrder), 240); const __m256i rhs_raw_hbit_2367_0 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_hbit_0123_0, requiredOrder), rhs_raw_hbit_4567_0, 240); @@ -9600,56 +9603,57 @@ void ggml_gemm_q6_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo const __m256i 
rhs_hbit_2367_51 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_2367_3, 4), m2), 4); //Index : 88 - 95 const __m256i rhs_hbit_2367_71 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_2367_3, 6), m2), 4); //Index : 120 - 127 - // 0 -7, 64 - 71 + // Comments indicate the indices of elements from individual super block in non interleaved fashion + // Index : 0 -7, 64 - 71 const __m256i rhs_mat_0145_00 = _mm256_or_si256(_mm256_and_si256(rhs_raw_mat_0145_0, m4), rhs_hbit_0145_00); const __m256i rhs_mat_0145_40 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_0145_0, 4), m4), rhs_hbit_0145_40); const __m256i rhs_mat_2367_00 = _mm256_or_si256(_mm256_and_si256(rhs_raw_mat_2367_0, m4), rhs_hbit_2367_00); const __m256i rhs_mat_2367_40 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_2367_0, 4), m4), rhs_hbit_2367_40); - // 8 - 15, 72 - 79 + // Index : 8 - 15, 72 - 79 const __m256i rhs_mat_0145_01 = _mm256_or_si256(_mm256_and_si256(rhs_raw_mat_0145_1, m4), rhs_hbit_0145_01); const __m256i rhs_mat_0145_41 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_0145_1, 4), m4), rhs_hbit_0145_41); const __m256i rhs_mat_2367_01 = _mm256_or_si256(_mm256_and_si256(rhs_raw_mat_2367_1, m4), rhs_hbit_2367_01); const __m256i rhs_mat_2367_41 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_2367_1, 4), m4), rhs_hbit_2367_41); - // 16 - 23, 80 - 87 + // Index : 16 - 23, 80 - 87 const __m256i rhs_mat_0145_10 = _mm256_or_si256(_mm256_and_si256(rhs_raw_mat_0145_2, m4), rhs_hbit_0145_10); const __m256i rhs_mat_0145_50 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_0145_2, 4), m4), rhs_hbit_0145_50); const __m256i rhs_mat_2367_10 = _mm256_or_si256(_mm256_and_si256(rhs_raw_mat_2367_2, m4), rhs_hbit_2367_10); const __m256i rhs_mat_2367_50 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_2367_2, 4), m4), rhs_hbit_2367_50); - // 24 - 31, 88 - 95 + // Index : 24 - 31, 88 - 95 const __m256i rhs_mat_0145_11 = _mm256_or_si256(_mm256_and_si256(rhs_raw_mat_0145_3, m4), rhs_hbit_0145_11); const __m256i rhs_mat_0145_51 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_0145_3, 4), m4), rhs_hbit_0145_51); const __m256i rhs_mat_2367_11 = _mm256_or_si256(_mm256_and_si256(rhs_raw_mat_2367_3, m4), rhs_hbit_2367_11); const __m256i rhs_mat_2367_51 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_2367_3, 4), m4), rhs_hbit_2367_51); - // 32 - 39, 96 - 103 + // Index : 32 - 39, 96 - 103 const __m256i rhs_mat_0145_20 = _mm256_or_si256(_mm256_and_si256(rhs_raw_mat_0145_4, m4), rhs_hbit_0145_20); const __m256i rhs_mat_0145_60 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_0145_4, 4), m4), rhs_hbit_0145_60); const __m256i rhs_mat_2367_20 = _mm256_or_si256(_mm256_and_si256(rhs_raw_mat_2367_4, m4), rhs_hbit_2367_20); const __m256i rhs_mat_2367_60 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_2367_4, 4), m4), rhs_hbit_2367_60); - // 40 - 47, 104 - 111 + // Index : 40 - 47, 104 - 111 const __m256i rhs_mat_0145_21 = _mm256_or_si256(_mm256_and_si256(rhs_raw_mat_0145_5, m4), rhs_hbit_0145_21); const __m256i rhs_mat_0145_61 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_0145_5, 4), m4), rhs_hbit_0145_61); const __m256i rhs_mat_2367_21 = _mm256_or_si256(_mm256_and_si256(rhs_raw_mat_2367_5, m4), rhs_hbit_2367_21); const __m256i rhs_mat_2367_61 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_2367_5, 4), m4), 
rhs_hbit_2367_61); - // 48 - 55, 112 - 119 + // Index : 48 - 55, 112 - 119 const __m256i rhs_mat_0145_30 = _mm256_or_si256(_mm256_and_si256(rhs_raw_mat_0145_6, m4), rhs_hbit_0145_30); const __m256i rhs_mat_0145_70 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_0145_6, 4), m4), rhs_hbit_0145_70); const __m256i rhs_mat_2367_30 = _mm256_or_si256(_mm256_and_si256(rhs_raw_mat_2367_6, m4), rhs_hbit_2367_30); const __m256i rhs_mat_2367_70 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_2367_6, 4), m4), rhs_hbit_2367_70); - // 56 - 63, 120 - 127 + // Index : 56 - 63, 120 - 127 const __m256i rhs_mat_0145_31 = _mm256_or_si256(_mm256_and_si256(rhs_raw_mat_0145_7, m4), rhs_hbit_0145_31); const __m256i rhs_mat_0145_71 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_0145_7, 4), m4), rhs_hbit_0145_71); @@ -9884,200 +9888,200 @@ void ggml_gemm_q6_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo __m256i lhs_mat_s_23_71 = _mm256_maddubs_epi16(m32s, lhs_mat_23_71); // Shuffle pattern one - left side input - const __m256i lhs_mat_01_00_sp1 = _mm256_shuffle_epi32(lhs_mat_01_00, 160); //A00(0-3) A00(0-3) A01(0-3) A01(0-3) A00(0-3) A00(0-3) A01(0-3) A01(0-3) - const __m256i lhs_mat_23_00_sp1 = _mm256_shuffle_epi32(lhs_mat_23_00, 160); //A02(0-3) A03(0-3) A02(0-3) A03(0-3) A02(0-3) A03(0-3) A02(0-3) A03(0-3) + const __m256i lhs_mat_01_00_sp1 = _mm256_shuffle_epi32(lhs_mat_01_00, 160); //A00(0-3) A00(0-3) A01(0-3) A01(0-3) + const __m256i lhs_mat_23_00_sp1 = _mm256_shuffle_epi32(lhs_mat_23_00, 160); //A02(0-3) A03(0-3) A02(0-3) A03(0-3) - const __m256i lhs_mat_01_01_sp1 = _mm256_shuffle_epi32(lhs_mat_01_01, 160); //A00(8-11) A00(8-11) A01(8-11) A01(8-11) A00(8-11) A00(8-11) A01(8-11) A01(8-11) - const __m256i lhs_mat_23_01_sp1 = _mm256_shuffle_epi32(lhs_mat_23_01, 160); //A02(8-11) A03(8-11) A02(8-11) A03(8-11) A02(8-11) A03(8-11) A02(8-11) A03(8-11) + const __m256i lhs_mat_01_01_sp1 = _mm256_shuffle_epi32(lhs_mat_01_01, 160); //A00(8-11) A00(8-11) A01(8-11) A01(8-11) + const __m256i lhs_mat_23_01_sp1 = _mm256_shuffle_epi32(lhs_mat_23_01, 160); //A02(8-11) A03(8-11) A02(8-11) A03(8-11) - const __m256i lhs_mat_01_10_sp1 = _mm256_shuffle_epi32(lhs_mat_01_10, 160); //A10(0-3) A10(0-3) A11(0-3) A11(0-3) A10(0-3) A10(0-3) A11(0-3) A11(0-3) - const __m256i lhs_mat_23_10_sp1 = _mm256_shuffle_epi32(lhs_mat_23_10, 160); //A12(0-3) A13(0-3) A12(0-3) A13(0-3) A12(0-3) A13(0-3) A12(0-3) A13(0-3) + const __m256i lhs_mat_01_10_sp1 = _mm256_shuffle_epi32(lhs_mat_01_10, 160); //A10(0-3) A10(0-3) A11(0-3) A11(0-3) + const __m256i lhs_mat_23_10_sp1 = _mm256_shuffle_epi32(lhs_mat_23_10, 160); //A12(0-3) A13(0-3) A12(0-3) A13(0-3) - const __m256i lhs_mat_01_11_sp1 = _mm256_shuffle_epi32(lhs_mat_01_11, 160); //A10(8-11) A10(8-11) A11(8-11) A11(8-11) A10(8-11) A10(8-11) A11(8-11) A11(8-11) - const __m256i lhs_mat_23_11_sp1 = _mm256_shuffle_epi32(lhs_mat_23_11, 160); //A12(8-11) A13(8-11) A12(8-11) A13(8-11) A12(8-11) A13(8-11) A12(8-11) A13(8-11) + const __m256i lhs_mat_01_11_sp1 = _mm256_shuffle_epi32(lhs_mat_01_11, 160); //A10(8-11) A10(8-11) A11(8-11) A11(8-11) + const __m256i lhs_mat_23_11_sp1 = _mm256_shuffle_epi32(lhs_mat_23_11, 160); //A12(8-11) A13(8-11) A12(8-11) A13(8-11) - const __m256i lhs_mat_01_20_sp1 = _mm256_shuffle_epi32(lhs_mat_01_20, 160); //A20(0-3) A20(0-3) A21(0-3) A21(0-3) A20(0-3) A20(0-3) A21(0-3) A21(0-3) - const __m256i lhs_mat_23_20_sp1 = _mm256_shuffle_epi32(lhs_mat_23_20, 160); //A22(0-3) A23(0-3) A22(0-3) A23(0-3) A22(0-3) A23(0-3) A22(0-3) 
A23(0-3) + const __m256i lhs_mat_01_20_sp1 = _mm256_shuffle_epi32(lhs_mat_01_20, 160); //A20(0-3) A20(0-3) A21(0-3) A21(0-3) + const __m256i lhs_mat_23_20_sp1 = _mm256_shuffle_epi32(lhs_mat_23_20, 160); //A22(0-3) A23(0-3) A22(0-3) A23(0-3) - const __m256i lhs_mat_01_21_sp1 = _mm256_shuffle_epi32(lhs_mat_01_21, 160); //A20(8-11) A20(8-11) A21(8-11) A21(8-11) A20(8-11) A20(8-11) A21(8-11) A21(8-11) - const __m256i lhs_mat_23_21_sp1 = _mm256_shuffle_epi32(lhs_mat_23_21, 160); //A22(8-11) A23(8-11) A22(8-11) A23(8-11) A22(8-11) A23(8-11) A22(8-11) A23(8-11) + const __m256i lhs_mat_01_21_sp1 = _mm256_shuffle_epi32(lhs_mat_01_21, 160); //A20(8-11) A20(8-11) A21(8-11) A21(8-11) + const __m256i lhs_mat_23_21_sp1 = _mm256_shuffle_epi32(lhs_mat_23_21, 160); //A22(8-11) A23(8-11) A22(8-11) A23(8-11) - const __m256i lhs_mat_01_30_sp1 = _mm256_shuffle_epi32(lhs_mat_01_30, 160); //A30(0-3) A30(0-3) A31(0-3) A31(0-3) A30(0-3) A30(0-3) A31(0-3) A31(0-3) - const __m256i lhs_mat_23_30_sp1 = _mm256_shuffle_epi32(lhs_mat_23_30, 160); //A32(0-3) A33(0-3) A32(0-3) A33(0-3) A32(0-3) A33(0-3) A32(0-3) A33(0-3) + const __m256i lhs_mat_01_30_sp1 = _mm256_shuffle_epi32(lhs_mat_01_30, 160); //A30(0-3) A30(0-3) A31(0-3) A31(0-3) + const __m256i lhs_mat_23_30_sp1 = _mm256_shuffle_epi32(lhs_mat_23_30, 160); //A32(0-3) A33(0-3) A32(0-3) A33(0-3) - const __m256i lhs_mat_01_31_sp1 = _mm256_shuffle_epi32(lhs_mat_01_31, 160); //A30(8-11) A30(8-11) A31(8-11) A31(8-11) A30(8-11) A30(8-11) A31(8-11) A31(8-11) - const __m256i lhs_mat_23_31_sp1 = _mm256_shuffle_epi32(lhs_mat_23_31, 160); //A32(8-11) A33(8-11) A32(8-11) A33(8-11) A32(8-11) A33(8-11) A32(8-11) A33(8-11) + const __m256i lhs_mat_01_31_sp1 = _mm256_shuffle_epi32(lhs_mat_01_31, 160); //A30(8-11) A30(8-11) A31(8-11) A31(8-11) + const __m256i lhs_mat_23_31_sp1 = _mm256_shuffle_epi32(lhs_mat_23_31, 160); //A32(8-11) A33(8-11) A32(8-11) A33(8-11) - const __m256i lhs_mat_01_40_sp1 = _mm256_shuffle_epi32(lhs_mat_01_40, 160); //A40(0-3) A40(0-3) A41(0-3) A41(0-3) A40(0-3) A40(0-3) A41(0-3) A41(0-3) - const __m256i lhs_mat_23_40_sp1 = _mm256_shuffle_epi32(lhs_mat_23_40, 160); //A42(0-3) A43(0-3) A42(0-3) A43(0-3) A42(0-3) A43(0-3) A42(0-3) A43(0-3) + const __m256i lhs_mat_01_40_sp1 = _mm256_shuffle_epi32(lhs_mat_01_40, 160); //A40(0-3) A40(0-3) A41(0-3) A41(0-3) + const __m256i lhs_mat_23_40_sp1 = _mm256_shuffle_epi32(lhs_mat_23_40, 160); //A42(0-3) A43(0-3) A42(0-3) A43(0-3) - const __m256i lhs_mat_01_41_sp1 = _mm256_shuffle_epi32(lhs_mat_01_41, 160); //A40(8-11) A40(8-11) A41(8-11) A41(8-11) A40(8-11) A40(8-11) A41(8-11) A41(8-11) - const __m256i lhs_mat_23_41_sp1 = _mm256_shuffle_epi32(lhs_mat_23_41, 160); //A42(8-11) A43(8-11) A42(8-11) A43(8-11) A42(8-11) A43(8-11) A42(8-11) A43(8-11) + const __m256i lhs_mat_01_41_sp1 = _mm256_shuffle_epi32(lhs_mat_01_41, 160); //A40(8-11) A40(8-11) A41(8-11) A41(8-11) + const __m256i lhs_mat_23_41_sp1 = _mm256_shuffle_epi32(lhs_mat_23_41, 160); //A42(8-11) A43(8-11) A42(8-11) A43(8-11) - const __m256i lhs_mat_01_50_sp1 = _mm256_shuffle_epi32(lhs_mat_01_50, 160); //A50(0-3) A50(0-3) A51(0-3) A51(0-3) A50(0-3) A50(0-3) A51(0-3) A51(0-3) - const __m256i lhs_mat_23_50_sp1 = _mm256_shuffle_epi32(lhs_mat_23_50, 160); //A52(0-3) A53(0-3) A52(0-3) A53(0-3) A52(0-3) A53(0-3) A52(0-3) A53(0-3) + const __m256i lhs_mat_01_50_sp1 = _mm256_shuffle_epi32(lhs_mat_01_50, 160); //A50(0-3) A50(0-3) A51(0-3) A51(0-3) + const __m256i lhs_mat_23_50_sp1 = _mm256_shuffle_epi32(lhs_mat_23_50, 160); //A52(0-3) A53(0-3) A52(0-3) A53(0-3) - const __m256i 
lhs_mat_01_51_sp1 = _mm256_shuffle_epi32(lhs_mat_01_51, 160); //A50(8-11) A50(8-11) A51(8-11) A51(8-11) A50(8-11) A50(8-11) A51(8-11) A51(8-11) - const __m256i lhs_mat_23_51_sp1 = _mm256_shuffle_epi32(lhs_mat_23_51, 160); //A52(8-11) A53(8-11) A52(8-11) A53(8-11) A52(8-11) A53(8-11) A52(8-11) A53(8-11) + const __m256i lhs_mat_01_51_sp1 = _mm256_shuffle_epi32(lhs_mat_01_51, 160); //A50(8-11) A50(8-11) A51(8-11) A51(8-11) + const __m256i lhs_mat_23_51_sp1 = _mm256_shuffle_epi32(lhs_mat_23_51, 160); //A52(8-11) A53(8-11) A52(8-11) A53(8-11) - const __m256i lhs_mat_01_60_sp1 = _mm256_shuffle_epi32(lhs_mat_01_60, 160); //A60(0-3) A60(0-3) A61(0-3) A61(0-3) A60(0-3) A60(0-3) A61(0-3) A61(0-3) - const __m256i lhs_mat_23_60_sp1 = _mm256_shuffle_epi32(lhs_mat_23_60, 160); //A62(0-3) A63(0-3) A62(0-3) A63(0-3) A62(0-3) A63(0-3) A62(0-3) A63(0-3) + const __m256i lhs_mat_01_60_sp1 = _mm256_shuffle_epi32(lhs_mat_01_60, 160); //A60(0-3) A60(0-3) A61(0-3) A61(0-3) + const __m256i lhs_mat_23_60_sp1 = _mm256_shuffle_epi32(lhs_mat_23_60, 160); //A62(0-3) A63(0-3) A62(0-3) A63(0-3) - const __m256i lhs_mat_01_61_sp1 = _mm256_shuffle_epi32(lhs_mat_01_61, 160); //A60(8-11) A60(8-11) A61(8-11) A61(8-11) A60(8-11) A60(8-11) A61(8-11) A61(8-11) - const __m256i lhs_mat_23_61_sp1 = _mm256_shuffle_epi32(lhs_mat_23_61, 160); //A62(8-11) A63(8-11) A62(8-11) A63(8-11) A62(8-11) A63(8-11) A62(8-11) A63(8-11) + const __m256i lhs_mat_01_61_sp1 = _mm256_shuffle_epi32(lhs_mat_01_61, 160); //A60(8-11) A60(8-11) A61(8-11) A61(8-11) + const __m256i lhs_mat_23_61_sp1 = _mm256_shuffle_epi32(lhs_mat_23_61, 160); //A62(8-11) A63(8-11) A62(8-11) A63(8-11) - const __m256i lhs_mat_01_70_sp1 = _mm256_shuffle_epi32(lhs_mat_01_70, 160); //A70(0-3) A70(0-3) A71(0-3) A71(0-3) A70(0-3) A70(0-3) A71(0-3) A71(0-3) - const __m256i lhs_mat_23_70_sp1 = _mm256_shuffle_epi32(lhs_mat_23_70, 160); //A72(0-3) A73(0-3) A72(0-3) A73(0-3) A72(0-3) A73(0-3) A72(0-3) A73(0-3) + const __m256i lhs_mat_01_70_sp1 = _mm256_shuffle_epi32(lhs_mat_01_70, 160); //A70(0-3) A70(0-3) A71(0-3) A71(0-3) + const __m256i lhs_mat_23_70_sp1 = _mm256_shuffle_epi32(lhs_mat_23_70, 160); //A72(0-3) A73(0-3) A72(0-3) A73(0-3) - const __m256i lhs_mat_01_71_sp1 = _mm256_shuffle_epi32(lhs_mat_01_71, 160); //A70(8-11) A70(8-11) A71(8-11) A71(8-11) A70(8-11) A70(8-11) A71(8-11) A71(8-11) - const __m256i lhs_mat_23_71_sp1 = _mm256_shuffle_epi32(lhs_mat_23_71, 160); //A72(8-11) A73(8-11) A72(8-11) A73(8-11) A72(8-11) A73(8-11) A72(8-11) A73(8-11) + const __m256i lhs_mat_01_71_sp1 = _mm256_shuffle_epi32(lhs_mat_01_71, 160); //A70(8-11) A70(8-11) A71(8-11) A71(8-11) + const __m256i lhs_mat_23_71_sp1 = _mm256_shuffle_epi32(lhs_mat_23_71, 160); //A72(8-11) A73(8-11) A72(8-11) A73(8-11) // Shuffle pattern two- left side input - const __m256i lhs_mat_01_00_sp2 = _mm256_shuffle_epi32(lhs_mat_01_00, 245); //A00(4-7) A00(4-7) A01(4-7) A01(4-7) A00(4-7) A00(4-7) A01(4-7) A01(4-7) - const __m256i lhs_mat_23_00_sp2 = _mm256_shuffle_epi32(lhs_mat_23_00, 245); //A02(4-7) A03(4-7) A02(4-7) A03(4-7) A02(4-7) A03(4-7) A02(4-7) A03(4-7) + const __m256i lhs_mat_01_00_sp2 = _mm256_shuffle_epi32(lhs_mat_01_00, 245); //A00(4-7) A00(4-7) A01(4-7) A01(4-7) + const __m256i lhs_mat_23_00_sp2 = _mm256_shuffle_epi32(lhs_mat_23_00, 245); //A02(4-7) A03(4-7) A02(4-7) A03(4-7) - const __m256i lhs_mat_01_01_sp2 = _mm256_shuffle_epi32(lhs_mat_01_01, 245); //A00(12-15) A00(12-15) A01(12-15) A01(12-15) A00(12-15) A00(12-15) A01(12-15) A01(12-15) - const __m256i lhs_mat_23_01_sp2 = _mm256_shuffle_epi32(lhs_mat_23_01, 
245); //A02(12-15) A03(12-15) A02(12-15) A03(12-15) A02(12-15) A03(12-15) A02(12-15) A03(12-15) + const __m256i lhs_mat_01_01_sp2 = _mm256_shuffle_epi32(lhs_mat_01_01, 245); //A00(12-15) A00(12-15) A01(12-15) A01(12-15) + const __m256i lhs_mat_23_01_sp2 = _mm256_shuffle_epi32(lhs_mat_23_01, 245); //A02(12-15) A03(12-15) A02(12-15) A03(12-15) - const __m256i lhs_mat_01_10_sp2 = _mm256_shuffle_epi32(lhs_mat_01_10, 245); //A10(4-7) A10(4-7) A11(4-7) A11(4-7) A10(4-7) A10(4-7) A11(4-7) A11(4-7) - const __m256i lhs_mat_23_10_sp2 = _mm256_shuffle_epi32(lhs_mat_23_10, 245); //A12(4-7) A13(4-7) A12(4-7) A13(4-7) A12(4-7) A13(4-7) A12(4-7) A13(4-7) + const __m256i lhs_mat_01_10_sp2 = _mm256_shuffle_epi32(lhs_mat_01_10, 245); //A10(4-7) A10(4-7) A11(4-7) A11(4-7) + const __m256i lhs_mat_23_10_sp2 = _mm256_shuffle_epi32(lhs_mat_23_10, 245); //A12(4-7) A13(4-7) A12(4-7) A13(4-7) - const __m256i lhs_mat_01_11_sp2 = _mm256_shuffle_epi32(lhs_mat_01_11, 245); //A10(12-15) A10(12-15) A11(12-15) A11(12-15) A10(12-15) A10(12-15) A11(12-15) A11(12-15) - const __m256i lhs_mat_23_11_sp2 = _mm256_shuffle_epi32(lhs_mat_23_11, 245); //A12(12-15) A13(12-15) A12(12-15) A13(12-15) A12(12-15) A13(12-15) A12(12-15) A13(12-15) + const __m256i lhs_mat_01_11_sp2 = _mm256_shuffle_epi32(lhs_mat_01_11, 245); //A10(12-15) A10(12-15) A11(12-15) A11(12-15) + const __m256i lhs_mat_23_11_sp2 = _mm256_shuffle_epi32(lhs_mat_23_11, 245); //A12(12-15) A13(12-15) A12(12-15) A13(12-15) - const __m256i lhs_mat_01_20_sp2 = _mm256_shuffle_epi32(lhs_mat_01_20, 245); //A20(4-7) A20(4-7) A21(4-7) A21(4-7) A20(4-7) A20(4-7) A21(4-7) A21(4-7) - const __m256i lhs_mat_23_20_sp2 = _mm256_shuffle_epi32(lhs_mat_23_20, 245); //A22(4-7) A23(4-7) A22(4-7) A23(4-7) A22(4-7) A23(4-7) A22(4-7) A23(4-7) + const __m256i lhs_mat_01_20_sp2 = _mm256_shuffle_epi32(lhs_mat_01_20, 245); //A20(4-7) A20(4-7) A21(4-7) A21(4-7) + const __m256i lhs_mat_23_20_sp2 = _mm256_shuffle_epi32(lhs_mat_23_20, 245); //A22(4-7) A23(4-7) A22(4-7) A23(4-7) - const __m256i lhs_mat_01_21_sp2 = _mm256_shuffle_epi32(lhs_mat_01_21, 245); //A20(12-15) A20(12-15) A21(12-15) A21(12-15) A20(12-15) A20(12-15) A21(12-15) A21(12-15) - const __m256i lhs_mat_23_21_sp2 = _mm256_shuffle_epi32(lhs_mat_23_21, 245); //A22(12-15) A23(12-15) A22(12-15) A23(12-15) A22(12-15) A23(12-15) A22(12-15) A23(12-15) + const __m256i lhs_mat_01_21_sp2 = _mm256_shuffle_epi32(lhs_mat_01_21, 245); //A20(12-15) A20(12-15) A21(12-15) A21(12-15) + const __m256i lhs_mat_23_21_sp2 = _mm256_shuffle_epi32(lhs_mat_23_21, 245); //A22(12-15) A23(12-15) A22(12-15) A23(12-15) - const __m256i lhs_mat_01_30_sp2 = _mm256_shuffle_epi32(lhs_mat_01_30, 245); //A30(4-7) A30(4-7) A31(4-7) A31(4-7) A30(4-7) A30(4-7) A31(4-7) A31(4-7) - const __m256i lhs_mat_23_30_sp2 = _mm256_shuffle_epi32(lhs_mat_23_30, 245); //A32(4-7) A33(4-7) A32(4-7) A33(4-7) A32(4-7) A33(4-7) A32(4-7) A33(4-7) + const __m256i lhs_mat_01_30_sp2 = _mm256_shuffle_epi32(lhs_mat_01_30, 245); //A30(4-7) A30(4-7) A31(4-7) A31(4-7) + const __m256i lhs_mat_23_30_sp2 = _mm256_shuffle_epi32(lhs_mat_23_30, 245); //A32(4-7) A33(4-7) A32(4-7) A33(4-7) - const __m256i lhs_mat_01_31_sp2 = _mm256_shuffle_epi32(lhs_mat_01_31, 245); //A30(12-15) A30(12-15) A31(12-15) A31(12-15) A30(12-15) A30(12-15) A31(12-15) A31(12-15) - const __m256i lhs_mat_23_31_sp2 = _mm256_shuffle_epi32(lhs_mat_23_31, 245); //A32(12-15) A33(12-15) A32(12-15) A33(12-15) A32(12-15) A33(12-15) A32(12-15) A33(12-15) + const __m256i lhs_mat_01_31_sp2 = _mm256_shuffle_epi32(lhs_mat_01_31, 245); //A30(12-15) 
A30(12-15) A31(12-15) A31(12-15) + const __m256i lhs_mat_23_31_sp2 = _mm256_shuffle_epi32(lhs_mat_23_31, 245); //A32(12-15) A33(12-15) A32(12-15) A33(12-15) - const __m256i lhs_mat_01_40_sp2 = _mm256_shuffle_epi32(lhs_mat_01_40, 245); //A40(4-7) A40(4-7) A41(4-7) A41(4-7) A40(4-7) A40(4-7) A41(4-7) A41(4-7) - const __m256i lhs_mat_23_40_sp2 = _mm256_shuffle_epi32(lhs_mat_23_40, 245); //A42(4-7) A43(4-7) A42(4-7) A43(4-7) A42(4-7) A43(4-7) A42(4-7) A43(4-7) + const __m256i lhs_mat_01_40_sp2 = _mm256_shuffle_epi32(lhs_mat_01_40, 245); //A40(4-7) A40(4-7) A41(4-7) A41(4-7) + const __m256i lhs_mat_23_40_sp2 = _mm256_shuffle_epi32(lhs_mat_23_40, 245); //A42(4-7) A43(4-7) A42(4-7) A43(4-7) - const __m256i lhs_mat_01_41_sp2 = _mm256_shuffle_epi32(lhs_mat_01_41, 245); //A40(12-15) A40(12-15) A41(12-15) A41(12-15) A40(12-15) A40(12-15) A41(12-15) A41(12-15) - const __m256i lhs_mat_23_41_sp2 = _mm256_shuffle_epi32(lhs_mat_23_41, 245); //A42(12-15) A43(12-15) A42(12-15) A43(12-15) A42(12-15) A43(12-15) A42(12-15) A43(12-15) + const __m256i lhs_mat_01_41_sp2 = _mm256_shuffle_epi32(lhs_mat_01_41, 245); //A40(12-15) A40(12-15) A41(12-15) A41(12-15) + const __m256i lhs_mat_23_41_sp2 = _mm256_shuffle_epi32(lhs_mat_23_41, 245); //A42(12-15) A43(12-15) A42(12-15) A43(12-15) - const __m256i lhs_mat_01_50_sp2 = _mm256_shuffle_epi32(lhs_mat_01_50, 245); //A50(4-7) A50(4-7) A51(4-7) A51(4-7) A50(4-7) A50(4-7) A51(4-7) A51(4-7) - const __m256i lhs_mat_23_50_sp2 = _mm256_shuffle_epi32(lhs_mat_23_50, 245); //A52(4-7) A53(4-7) A52(4-7) A53(4-7) A52(4-7) A53(4-7) A52(4-7) A53(4-7) + const __m256i lhs_mat_01_50_sp2 = _mm256_shuffle_epi32(lhs_mat_01_50, 245); //A50(4-7) A50(4-7) A51(4-7) A51(4-7) + const __m256i lhs_mat_23_50_sp2 = _mm256_shuffle_epi32(lhs_mat_23_50, 245); //A52(4-7) A53(4-7) A52(4-7) A53(4-7) - const __m256i lhs_mat_01_51_sp2 = _mm256_shuffle_epi32(lhs_mat_01_51, 245); //A50(12-15) A50(12-15) A51(12-15) A51(12-15) A50(12-15) A50(12-15) A51(12-15) A51(12-15) - const __m256i lhs_mat_23_51_sp2 = _mm256_shuffle_epi32(lhs_mat_23_51, 245); //A52(12-15) A53(12-15) A52(12-15) A53(12-15) A52(12-15) A53(12-15) A52(12-15) A53(12-15) + const __m256i lhs_mat_01_51_sp2 = _mm256_shuffle_epi32(lhs_mat_01_51, 245); //A50(12-15) A50(12-15) A51(12-15) A51(12-15) + const __m256i lhs_mat_23_51_sp2 = _mm256_shuffle_epi32(lhs_mat_23_51, 245); //A52(12-15) A53(12-15) A52(12-15) A53(12-15) - const __m256i lhs_mat_01_60_sp2 = _mm256_shuffle_epi32(lhs_mat_01_60, 245); //A60(4-7) A60(4-7) A61(4-7) A61(4-7) A60(4-7) A60(4-7) A61(4-7) A61(4-7) - const __m256i lhs_mat_23_60_sp2 = _mm256_shuffle_epi32(lhs_mat_23_60, 245); //A62(4-7) A63(4-7) A62(4-7) A63(4-7) A62(4-7) A63(4-7) A62(4-7) A63(4-7) + const __m256i lhs_mat_01_60_sp2 = _mm256_shuffle_epi32(lhs_mat_01_60, 245); //A60(4-7) A60(4-7) A61(4-7) A61(4-7) + const __m256i lhs_mat_23_60_sp2 = _mm256_shuffle_epi32(lhs_mat_23_60, 245); //A62(4-7) A63(4-7) A62(4-7) A63(4-7) - const __m256i lhs_mat_01_61_sp2 = _mm256_shuffle_epi32(lhs_mat_01_61, 245); //A60(12-15) A60(12-15) A61(12-15) A61(12-15) A60(12-15) A60(12-15) A61(12-15) A61(12-15) - const __m256i lhs_mat_23_61_sp2 = _mm256_shuffle_epi32(lhs_mat_23_61, 245); //A62(12-15) A63(12-15) A62(12-15) A63(12-15) A62(12-15) A63(12-15) A62(12-15) A63(12-15) + const __m256i lhs_mat_01_61_sp2 = _mm256_shuffle_epi32(lhs_mat_01_61, 245); //A60(12-15) A60(12-15) A61(12-15) A61(12-15) + const __m256i lhs_mat_23_61_sp2 = _mm256_shuffle_epi32(lhs_mat_23_61, 245); //A62(12-15) A63(12-15) A62(12-15) A63(12-15) - const __m256i lhs_mat_01_70_sp2 = 
_mm256_shuffle_epi32(lhs_mat_01_70, 245); //A70(4-7) A70(4-7) A71(4-7) A71(4-7) A70(4-7) A70(4-7) A71(4-7) A71(4-7) - const __m256i lhs_mat_23_70_sp2 = _mm256_shuffle_epi32(lhs_mat_23_70, 245); //A72(4-7) A73(4-7) A72(4-7) A73(4-7) A72(4-7) A73(4-7) A72(4-7) A73(4-7) + const __m256i lhs_mat_01_70_sp2 = _mm256_shuffle_epi32(lhs_mat_01_70, 245); //A70(4-7) A70(4-7) A71(4-7) A71(4-7) + const __m256i lhs_mat_23_70_sp2 = _mm256_shuffle_epi32(lhs_mat_23_70, 245); //A72(4-7) A73(4-7) A72(4-7) A73(4-7) - const __m256i lhs_mat_01_71_sp2 = _mm256_shuffle_epi32(lhs_mat_01_71, 245); //A70(12-15) A70(12-15) A71(12-15) A71(12-15) A70(12-15) A70(12-15) A71(12-15) A71(12-15) - const __m256i lhs_mat_23_71_sp2 = _mm256_shuffle_epi32(lhs_mat_23_71, 245); //A72(12-15) A73(12-15) A72(12-15) A73(12-15) A72(12-15) A73(12-15) A72(12-15) A73(12-15) + const __m256i lhs_mat_01_71_sp2 = _mm256_shuffle_epi32(lhs_mat_01_71, 245); //A70(12-15) A70(12-15) A71(12-15) A71(12-15) + const __m256i lhs_mat_23_71_sp2 = _mm256_shuffle_epi32(lhs_mat_23_71, 245); //A72(12-15) A73(12-15) A72(12-15) A73(12-15) // Shuffle pattern one - left side input - const __m256i lhs_mat_s_01_00_sp1 = _mm256_shuffle_epi32(lhs_mat_s_01_00, 160); //A00(0-3) A00(0-3) A01(0-3) A01(0-3) A00(0-3) A00(0-3) A01(0-3) A01(0-3) - const __m256i lhs_mat_s_23_00_sp1 = _mm256_shuffle_epi32(lhs_mat_s_23_00, 160); //A02(0-3) A03(0-3) A02(0-3) A03(0-3) A02(0-3) A03(0-3) A02(0-3) A03(0-3) + const __m256i lhs_mat_s_01_00_sp1 = _mm256_shuffle_epi32(lhs_mat_s_01_00, 160); //A00(0-3) A00(0-3) A01(0-3) A01(0-3) + const __m256i lhs_mat_s_23_00_sp1 = _mm256_shuffle_epi32(lhs_mat_s_23_00, 160); //A02(0-3) A03(0-3) A02(0-3) A03(0-3) - const __m256i lhs_mat_s_01_01_sp1 = _mm256_shuffle_epi32(lhs_mat_s_01_01, 160); //A00(8-11) A00(8-11) A01(8-11) A01(8-11) A00(8-11) A00(8-11) A01(8-11) A01(8-11) - const __m256i lhs_mat_s_23_01_sp1 = _mm256_shuffle_epi32(lhs_mat_s_23_01, 160); //A02(8-11) A03(8-11) A02(8-11) A03(8-11) A02(8-11) A03(8-11) A02(8-11) A03(8-11) + const __m256i lhs_mat_s_01_01_sp1 = _mm256_shuffle_epi32(lhs_mat_s_01_01, 160); //A00(8-11) A00(8-11) A01(8-11) A01(8-11) + const __m256i lhs_mat_s_23_01_sp1 = _mm256_shuffle_epi32(lhs_mat_s_23_01, 160); //A02(8-11) A03(8-11) A02(8-11) A03(8-11) - const __m256i lhs_mat_s_01_10_sp1 = _mm256_shuffle_epi32(lhs_mat_s_01_10, 160); //A10(0-3) A10(0-3) A11(0-3) A11(0-3) A10(0-3) A10(0-3) A11(0-3) A11(0-3) - const __m256i lhs_mat_s_23_10_sp1 = _mm256_shuffle_epi32(lhs_mat_s_23_10, 160); //A12(0-3) A13(0-3) A12(0-3) A13(0-3) A12(0-3) A13(0-3) A12(0-3) A13(0-3) + const __m256i lhs_mat_s_01_10_sp1 = _mm256_shuffle_epi32(lhs_mat_s_01_10, 160); //A10(0-3) A10(0-3) A11(0-3) A11(0-3) + const __m256i lhs_mat_s_23_10_sp1 = _mm256_shuffle_epi32(lhs_mat_s_23_10, 160); //A12(0-3) A13(0-3) A12(0-3) A13(0-3) - const __m256i lhs_mat_s_01_11_sp1 = _mm256_shuffle_epi32(lhs_mat_s_01_11, 160); //A10(8-11) A10(8-11) A11(8-11) A11(8-11) A10(8-11) A10(8-11) A11(8-11) A11(8-11) - const __m256i lhs_mat_s_23_11_sp1 = _mm256_shuffle_epi32(lhs_mat_s_23_11, 160); //A12(8-11) A13(8-11) A12(8-11) A13(8-11) A12(8-11) A13(8-11) A12(8-11) A13(8-11) + const __m256i lhs_mat_s_01_11_sp1 = _mm256_shuffle_epi32(lhs_mat_s_01_11, 160); //A10(8-11) A10(8-11) A11(8-11) A11(8-11) + const __m256i lhs_mat_s_23_11_sp1 = _mm256_shuffle_epi32(lhs_mat_s_23_11, 160); //A12(8-11) A13(8-11) A12(8-11) A13(8-11) - const __m256i lhs_mat_s_01_20_sp1 = _mm256_shuffle_epi32(lhs_mat_s_01_20, 160); //A20(0-3) A20(0-3) A21(0-3) A21(0-3) A20(0-3) A20(0-3) A21(0-3) A21(0-3) - const __m256i 
lhs_mat_s_23_20_sp1 = _mm256_shuffle_epi32(lhs_mat_s_23_20, 160); //A22(0-3) A23(0-3) A22(0-3) A23(0-3) A22(0-3) A23(0-3) A22(0-3) A23(0-3) + const __m256i lhs_mat_s_01_20_sp1 = _mm256_shuffle_epi32(lhs_mat_s_01_20, 160); //A20(0-3) A20(0-3) A21(0-3) A21(0-3) + const __m256i lhs_mat_s_23_20_sp1 = _mm256_shuffle_epi32(lhs_mat_s_23_20, 160); //A22(0-3) A23(0-3) A22(0-3) A23(0-3) - const __m256i lhs_mat_s_01_21_sp1 = _mm256_shuffle_epi32(lhs_mat_s_01_21, 160); //A20(8-11) A20(8-11) A21(8-11) A21(8-11) A20(8-11) A20(8-11) A21(8-11) A21(8-11) - const __m256i lhs_mat_s_23_21_sp1 = _mm256_shuffle_epi32(lhs_mat_s_23_21, 160); //A22(8-11) A23(8-11) A22(8-11) A23(8-11) A22(8-11) A23(8-11) A22(8-11) A23(8-11) + const __m256i lhs_mat_s_01_21_sp1 = _mm256_shuffle_epi32(lhs_mat_s_01_21, 160); //A20(8-11) A20(8-11) A21(8-11) A21(8-11) + const __m256i lhs_mat_s_23_21_sp1 = _mm256_shuffle_epi32(lhs_mat_s_23_21, 160); //A22(8-11) A23(8-11) A22(8-11) A23(8-11) - const __m256i lhs_mat_s_01_30_sp1 = _mm256_shuffle_epi32(lhs_mat_s_01_30, 160); //A30(0-3) A30(0-3) A31(0-3) A31(0-3) A30(0-3) A30(0-3) A31(0-3) A31(0-3) - const __m256i lhs_mat_s_23_30_sp1 = _mm256_shuffle_epi32(lhs_mat_s_23_30, 160); //A32(0-3) A33(0-3) A32(0-3) A33(0-3) A32(0-3) A33(0-3) A32(0-3) A33(0-3) + const __m256i lhs_mat_s_01_30_sp1 = _mm256_shuffle_epi32(lhs_mat_s_01_30, 160); //A30(0-3) A30(0-3) A31(0-3) A31(0-3) + const __m256i lhs_mat_s_23_30_sp1 = _mm256_shuffle_epi32(lhs_mat_s_23_30, 160); //A32(0-3) A33(0-3) A32(0-3) A33(0-3) - const __m256i lhs_mat_s_01_31_sp1 = _mm256_shuffle_epi32(lhs_mat_s_01_31, 160); //A30(8-11) A30(8-11) A31(8-11) A31(8-11) A30(8-11) A30(8-11) A31(8-11) A31(8-11) - const __m256i lhs_mat_s_23_31_sp1 = _mm256_shuffle_epi32(lhs_mat_s_23_31, 160); //A32(8-11) A33(8-11) A32(8-11) A33(8-11) A32(8-11) A33(8-11) A32(8-11) A33(8-11) + const __m256i lhs_mat_s_01_31_sp1 = _mm256_shuffle_epi32(lhs_mat_s_01_31, 160); //A30(8-11) A30(8-11) A31(8-11) A31(8-11) + const __m256i lhs_mat_s_23_31_sp1 = _mm256_shuffle_epi32(lhs_mat_s_23_31, 160); //A32(8-11) A33(8-11) A32(8-11) A33(8-11) - const __m256i lhs_mat_s_01_40_sp1 = _mm256_shuffle_epi32(lhs_mat_s_01_40, 160); //A40(0-3) A40(0-3) A41(0-3) A41(0-3) A40(0-3) A40(0-3) A41(0-3) A41(0-3) - const __m256i lhs_mat_s_23_40_sp1 = _mm256_shuffle_epi32(lhs_mat_s_23_40, 160); //A42(0-3) A43(0-3) A42(0-3) A43(0-3) A42(0-3) A43(0-3) A42(0-3) A43(0-3) + const __m256i lhs_mat_s_01_40_sp1 = _mm256_shuffle_epi32(lhs_mat_s_01_40, 160); //A40(0-3) A40(0-3) A41(0-3) A41(0-3) + const __m256i lhs_mat_s_23_40_sp1 = _mm256_shuffle_epi32(lhs_mat_s_23_40, 160); //A42(0-3) A43(0-3) A42(0-3) A43(0-3) - const __m256i lhs_mat_s_01_41_sp1 = _mm256_shuffle_epi32(lhs_mat_s_01_41, 160); //A40(8-11) A40(8-11) A41(8-11) A41(8-11) A40(8-11) A40(8-11) A41(8-11) A41(8-11) - const __m256i lhs_mat_s_23_41_sp1 = _mm256_shuffle_epi32(lhs_mat_s_23_41, 160); //A42(8-11) A43(8-11) A42(8-11) A43(8-11) A42(8-11) A43(8-11) A42(8-11) A43(8-11) + const __m256i lhs_mat_s_01_41_sp1 = _mm256_shuffle_epi32(lhs_mat_s_01_41, 160); //A40(8-11) A40(8-11) A41(8-11) A41(8-11) + const __m256i lhs_mat_s_23_41_sp1 = _mm256_shuffle_epi32(lhs_mat_s_23_41, 160); //A42(8-11) A43(8-11) A42(8-11) A43(8-11) - const __m256i lhs_mat_s_01_50_sp1 = _mm256_shuffle_epi32(lhs_mat_s_01_50, 160); //A50(0-3) A50(0-3) A51(0-3) A51(0-3) A50(0-3) A50(0-3) A51(0-3) A51(0-3) - const __m256i lhs_mat_s_23_50_sp1 = _mm256_shuffle_epi32(lhs_mat_s_23_50, 160); //A52(0-3) A53(0-3) A52(0-3) A53(0-3) A52(0-3) A53(0-3) A52(0-3) A53(0-3) + const __m256i 
lhs_mat_s_01_50_sp1 = _mm256_shuffle_epi32(lhs_mat_s_01_50, 160); //A50(0-3) A50(0-3) A51(0-3) A51(0-3) + const __m256i lhs_mat_s_23_50_sp1 = _mm256_shuffle_epi32(lhs_mat_s_23_50, 160); //A52(0-3) A53(0-3) A52(0-3) A53(0-3) - const __m256i lhs_mat_s_01_51_sp1 = _mm256_shuffle_epi32(lhs_mat_s_01_51, 160); //A50(8-11) A50(8-11) A51(8-11) A51(8-11) A50(8-11) A50(8-11) A51(8-11) A51(8-11) - const __m256i lhs_mat_s_23_51_sp1 = _mm256_shuffle_epi32(lhs_mat_s_23_51, 160); //A52(8-11) A53(8-11) A52(8-11) A53(8-11) A52(8-11) A53(8-11) A52(8-11) A53(8-11) + const __m256i lhs_mat_s_01_51_sp1 = _mm256_shuffle_epi32(lhs_mat_s_01_51, 160); //A50(8-11) A50(8-11) A51(8-11) A51(8-11) + const __m256i lhs_mat_s_23_51_sp1 = _mm256_shuffle_epi32(lhs_mat_s_23_51, 160); //A52(8-11) A53(8-11) A52(8-11) A53(8-11) - const __m256i lhs_mat_s_01_60_sp1 = _mm256_shuffle_epi32(lhs_mat_s_01_60, 160); //A60(0-3) A60(0-3) A61(0-3) A61(0-3) A60(0-3) A60(0-3) A61(0-3) A61(0-3) - const __m256i lhs_mat_s_23_60_sp1 = _mm256_shuffle_epi32(lhs_mat_s_23_60, 160); //A62(0-3) A63(0-3) A62(0-3) A63(0-3) A62(0-3) A63(0-3) A62(0-3) A63(0-3) + const __m256i lhs_mat_s_01_60_sp1 = _mm256_shuffle_epi32(lhs_mat_s_01_60, 160); //A60(0-3) A60(0-3) A61(0-3) A61(0-3) + const __m256i lhs_mat_s_23_60_sp1 = _mm256_shuffle_epi32(lhs_mat_s_23_60, 160); //A62(0-3) A63(0-3) A62(0-3) A63(0-3) - const __m256i lhs_mat_s_01_61_sp1 = _mm256_shuffle_epi32(lhs_mat_s_01_61, 160); //A60(8-11) A60(8-11) A61(8-11) A61(8-11) A60(8-11) A60(8-11) A61(8-11) A61(8-11) - const __m256i lhs_mat_s_23_61_sp1 = _mm256_shuffle_epi32(lhs_mat_s_23_61, 160); //A62(8-11) A63(8-11) A62(8-11) A63(8-11) A62(8-11) A63(8-11) A62(8-11) A63(8-11) + const __m256i lhs_mat_s_01_61_sp1 = _mm256_shuffle_epi32(lhs_mat_s_01_61, 160); //A60(8-11) A60(8-11) A61(8-11) A61(8-11) + const __m256i lhs_mat_s_23_61_sp1 = _mm256_shuffle_epi32(lhs_mat_s_23_61, 160); //A62(8-11) A63(8-11) A62(8-11) A63(8-11) - const __m256i lhs_mat_s_01_70_sp1 = _mm256_shuffle_epi32(lhs_mat_s_01_70, 160); //A70(0-3) A70(0-3) A71(0-3) A71(0-3) A70(0-3) A70(0-3) A71(0-3) A71(0-3) - const __m256i lhs_mat_s_23_70_sp1 = _mm256_shuffle_epi32(lhs_mat_s_23_70, 160); //A72(0-3) A73(0-3) A72(0-3) A73(0-3) A72(0-3) A73(0-3) A72(0-3) A73(0-3) + const __m256i lhs_mat_s_01_70_sp1 = _mm256_shuffle_epi32(lhs_mat_s_01_70, 160); //A70(0-3) A70(0-3) A71(0-3) A71(0-3) + const __m256i lhs_mat_s_23_70_sp1 = _mm256_shuffle_epi32(lhs_mat_s_23_70, 160); //A72(0-3) A73(0-3) A72(0-3) A73(0-3) - const __m256i lhs_mat_s_01_71_sp1 = _mm256_shuffle_epi32(lhs_mat_s_01_71, 160); //A70(8-11) A70(8-11) A71(8-11) A71(8-11) A70(8-11) A70(8-11) A71(8-11) A71(8-11) - const __m256i lhs_mat_s_23_71_sp1 = _mm256_shuffle_epi32(lhs_mat_s_23_71, 160); //A72(8-11) A73(8-11) A72(8-11) A73(8-11) A72(8-11) A73(8-11) A72(8-11) A73(8-11) + const __m256i lhs_mat_s_01_71_sp1 = _mm256_shuffle_epi32(lhs_mat_s_01_71, 160); //A70(8-11) A70(8-11) A71(8-11) A71(8-11) + const __m256i lhs_mat_s_23_71_sp1 = _mm256_shuffle_epi32(lhs_mat_s_23_71, 160); //A72(8-11) A73(8-11) A72(8-11) A73(8-11) // Shuffle pattern two- left side input - const __m256i lhs_mat_s_01_00_sp2 = _mm256_shuffle_epi32(lhs_mat_s_01_00, 245); //A00(4-7) A00(4-7) A01(4-7) A01(4-7) A00(4-7) A00(4-7) A01(4-7) A01(4-7) - const __m256i lhs_mat_s_23_00_sp2 = _mm256_shuffle_epi32(lhs_mat_s_23_00, 245); //A02(4-7) A03(4-7) A02(4-7) A03(4-7) A02(4-7) A03(4-7) A02(4-7) A03(4-7) + const __m256i lhs_mat_s_01_00_sp2 = _mm256_shuffle_epi32(lhs_mat_s_01_00, 245); //A00(4-7) A00(4-7) A01(4-7) A01(4-7) + const __m256i 
lhs_mat_s_23_00_sp2 = _mm256_shuffle_epi32(lhs_mat_s_23_00, 245); //A02(4-7) A03(4-7) A02(4-7) A03(4-7) - const __m256i lhs_mat_s_01_01_sp2 = _mm256_shuffle_epi32(lhs_mat_s_01_01, 245); //A00(12-15) A00(12-15) A01(12-15) A01(12-15) A00(12-15) A00(12-15) A01(12-15) A01(12-15) - const __m256i lhs_mat_s_23_01_sp2 = _mm256_shuffle_epi32(lhs_mat_s_23_01, 245); //A02(12-15) A03(12-15) A02(12-15) A03(12-15) A02(12-15) A03(12-15) A02(12-15) A03(12-15) + const __m256i lhs_mat_s_01_01_sp2 = _mm256_shuffle_epi32(lhs_mat_s_01_01, 245); //A00(12-15) A00(12-15) A01(12-15) A01(12-15) + const __m256i lhs_mat_s_23_01_sp2 = _mm256_shuffle_epi32(lhs_mat_s_23_01, 245); //A02(12-15) A03(12-15) A02(12-15) A03(12-15) - const __m256i lhs_mat_s_01_10_sp2 = _mm256_shuffle_epi32(lhs_mat_s_01_10, 245); //A10(4-7) A10(4-7) A11(4-7) A11(4-7) A10(4-7) A10(4-7) A11(4-7) A11(4-7) - const __m256i lhs_mat_s_23_10_sp2 = _mm256_shuffle_epi32(lhs_mat_s_23_10, 245); //A12(4-7) A13(4-7) A12(4-7) A13(4-7) A12(4-7) A13(4-7) A12(4-7) A13(4-7) + const __m256i lhs_mat_s_01_10_sp2 = _mm256_shuffle_epi32(lhs_mat_s_01_10, 245); //A10(4-7) A10(4-7) A11(4-7) A11(4-7) + const __m256i lhs_mat_s_23_10_sp2 = _mm256_shuffle_epi32(lhs_mat_s_23_10, 245); //A12(4-7) A13(4-7) A12(4-7) A13(4-7) - const __m256i lhs_mat_s_01_11_sp2 = _mm256_shuffle_epi32(lhs_mat_s_01_11, 245); //A10(12-15) A10(12-15) A11(12-15) A11(12-15) A10(12-15) A10(12-15) A11(12-15) A11(12-15) - const __m256i lhs_mat_s_23_11_sp2 = _mm256_shuffle_epi32(lhs_mat_s_23_11, 245); //A12(12-15) A13(12-15) A12(12-15) A13(12-15) A12(12-15) A13(12-15) A12(12-15) A13(12-15) + const __m256i lhs_mat_s_01_11_sp2 = _mm256_shuffle_epi32(lhs_mat_s_01_11, 245); //A10(12-15) A10(12-15) A11(12-15) A11(12-15) + const __m256i lhs_mat_s_23_11_sp2 = _mm256_shuffle_epi32(lhs_mat_s_23_11, 245); //A12(12-15) A13(12-15) A12(12-15) A13(12-15) - const __m256i lhs_mat_s_01_20_sp2 = _mm256_shuffle_epi32(lhs_mat_s_01_20, 245); //A20(4-7) A20(4-7) A21(4-7) A21(4-7) A20(4-7) A20(4-7) A21(4-7) A21(4-7) - const __m256i lhs_mat_s_23_20_sp2 = _mm256_shuffle_epi32(lhs_mat_s_23_20, 245); //A22(4-7) A23(4-7) A22(4-7) A23(4-7) A22(4-7) A23(4-7) A22(4-7) A23(4-7) + const __m256i lhs_mat_s_01_20_sp2 = _mm256_shuffle_epi32(lhs_mat_s_01_20, 245); //A20(4-7) A20(4-7) A21(4-7) A21(4-7) + const __m256i lhs_mat_s_23_20_sp2 = _mm256_shuffle_epi32(lhs_mat_s_23_20, 245); //A22(4-7) A23(4-7) A22(4-7) A23(4-7) - const __m256i lhs_mat_s_01_21_sp2 = _mm256_shuffle_epi32(lhs_mat_s_01_21, 245); //A20(12-15) A20(12-15) A21(12-15) A21(12-15) A20(12-15) A20(12-15) A21(12-15) A21(12-15) - const __m256i lhs_mat_s_23_21_sp2 = _mm256_shuffle_epi32(lhs_mat_s_23_21, 245); //A22(12-15) A23(12-15) A22(12-15) A23(12-15) A22(12-15) A23(12-15) A22(12-15) A23(12-15) + const __m256i lhs_mat_s_01_21_sp2 = _mm256_shuffle_epi32(lhs_mat_s_01_21, 245); //A20(12-15) A20(12-15) A21(12-15) A21(12-15) + const __m256i lhs_mat_s_23_21_sp2 = _mm256_shuffle_epi32(lhs_mat_s_23_21, 245); //A22(12-15) A23(12-15) A22(12-15) A23(12-15) - const __m256i lhs_mat_s_01_30_sp2 = _mm256_shuffle_epi32(lhs_mat_s_01_30, 245); //A30(4-7) A30(4-7) A31(4-7) A31(4-7) A30(4-7) A30(4-7) A31(4-7) A31(4-7) - const __m256i lhs_mat_s_23_30_sp2 = _mm256_shuffle_epi32(lhs_mat_s_23_30, 245); //A32(4-7) A33(4-7) A32(4-7) A33(4-7) A32(4-7) A33(4-7) A32(4-7) A33(4-7) + const __m256i lhs_mat_s_01_30_sp2 = _mm256_shuffle_epi32(lhs_mat_s_01_30, 245); //A30(4-7) A30(4-7) A31(4-7) A31(4-7) + const __m256i lhs_mat_s_23_30_sp2 = _mm256_shuffle_epi32(lhs_mat_s_23_30, 245); //A32(4-7) A33(4-7) A32(4-7) 
A33(4-7) - const __m256i lhs_mat_s_01_31_sp2 = _mm256_shuffle_epi32(lhs_mat_s_01_31, 245); //A30(12-15) A30(12-15) A31(12-15) A31(12-15) A30(12-15) A30(12-15) A31(12-15) A31(12-15) - const __m256i lhs_mat_s_23_31_sp2 = _mm256_shuffle_epi32(lhs_mat_s_23_31, 245); //A32(12-15) A33(12-15) A32(12-15) A33(12-15) A32(12-15) A33(12-15) A32(12-15) A33(12-15) + const __m256i lhs_mat_s_01_31_sp2 = _mm256_shuffle_epi32(lhs_mat_s_01_31, 245); //A30(12-15) A30(12-15) A31(12-15) A31(12-15) + const __m256i lhs_mat_s_23_31_sp2 = _mm256_shuffle_epi32(lhs_mat_s_23_31, 245); //A32(12-15) A33(12-15) A32(12-15) A33(12-15) - const __m256i lhs_mat_s_01_40_sp2 = _mm256_shuffle_epi32(lhs_mat_s_01_40, 245); //A40(4-7) A40(4-7) A41(4-7) A41(4-7) A40(4-7) A40(4-7) A41(4-7) A41(4-7) - const __m256i lhs_mat_s_23_40_sp2 = _mm256_shuffle_epi32(lhs_mat_s_23_40, 245); //A42(4-7) A43(4-7) A42(4-7) A43(4-7) A42(4-7) A43(4-7) A42(4-7) A43(4-7) + const __m256i lhs_mat_s_01_40_sp2 = _mm256_shuffle_epi32(lhs_mat_s_01_40, 245); //A40(4-7) A40(4-7) A41(4-7) A41(4-7) + const __m256i lhs_mat_s_23_40_sp2 = _mm256_shuffle_epi32(lhs_mat_s_23_40, 245); //A42(4-7) A43(4-7) A42(4-7) A43(4-7) - const __m256i lhs_mat_s_01_41_sp2 = _mm256_shuffle_epi32(lhs_mat_s_01_41, 245); //A40(12-15) A40(12-15) A41(12-15) A41(12-15) A40(12-15) A40(12-15) A41(12-15) A41(12-15) - const __m256i lhs_mat_s_23_41_sp2 = _mm256_shuffle_epi32(lhs_mat_s_23_41, 245); //A42(12-15) A43(12-15) A42(12-15) A43(12-15) A42(12-15) A43(12-15) A42(12-15) A43(12-15) + const __m256i lhs_mat_s_01_41_sp2 = _mm256_shuffle_epi32(lhs_mat_s_01_41, 245); //A40(12-15) A40(12-15) A41(12-15) A41(12-15) + const __m256i lhs_mat_s_23_41_sp2 = _mm256_shuffle_epi32(lhs_mat_s_23_41, 245); //A42(12-15) A43(12-15) A42(12-15) A43(12-15) - const __m256i lhs_mat_s_01_50_sp2 = _mm256_shuffle_epi32(lhs_mat_s_01_50, 245); //A50(4-7) A50(4-7) A51(4-7) A51(4-7) A50(4-7) A50(4-7) A51(4-7) A51(4-7) - const __m256i lhs_mat_s_23_50_sp2 = _mm256_shuffle_epi32(lhs_mat_s_23_50, 245); //A52(4-7) A53(4-7) A52(4-7) A53(4-7) A52(4-7) A53(4-7) A52(4-7) A53(4-7) + const __m256i lhs_mat_s_01_50_sp2 = _mm256_shuffle_epi32(lhs_mat_s_01_50, 245); //A50(4-7) A50(4-7) A51(4-7) A51(4-7) + const __m256i lhs_mat_s_23_50_sp2 = _mm256_shuffle_epi32(lhs_mat_s_23_50, 245); //A52(4-7) A53(4-7) A52(4-7) A53(4-7) - const __m256i lhs_mat_s_01_51_sp2 = _mm256_shuffle_epi32(lhs_mat_s_01_51, 245); //A50(12-15) A50(12-15) A51(12-15) A51(12-15) A50(12-15) A50(12-15) A51(12-15) A51(12-15) - const __m256i lhs_mat_s_23_51_sp2 = _mm256_shuffle_epi32(lhs_mat_s_23_51, 245); //A52(12-15) A53(12-15) A52(12-15) A53(12-15) A52(12-15) A53(12-15) A52(12-15) A53(12-15) + const __m256i lhs_mat_s_01_51_sp2 = _mm256_shuffle_epi32(lhs_mat_s_01_51, 245); //A50(12-15) A50(12-15) A51(12-15) A51(12-15) + const __m256i lhs_mat_s_23_51_sp2 = _mm256_shuffle_epi32(lhs_mat_s_23_51, 245); //A52(12-15) A53(12-15) A52(12-15) A53(12-15) - const __m256i lhs_mat_s_01_60_sp2 = _mm256_shuffle_epi32(lhs_mat_s_01_60, 245); //A60(4-7) A60(4-7) A61(4-7) A61(4-7) A60(4-7) A60(4-7) A61(4-7) A61(4-7) - const __m256i lhs_mat_s_23_60_sp2 = _mm256_shuffle_epi32(lhs_mat_s_23_60, 245); //A62(4-7) A63(4-7) A62(4-7) A63(4-7) A62(4-7) A63(4-7) A62(4-7) A63(4-7) + const __m256i lhs_mat_s_01_60_sp2 = _mm256_shuffle_epi32(lhs_mat_s_01_60, 245); //A60(4-7) A60(4-7) A61(4-7) A61(4-7) + const __m256i lhs_mat_s_23_60_sp2 = _mm256_shuffle_epi32(lhs_mat_s_23_60, 245); //A62(4-7) A63(4-7) A62(4-7) A63(4-7) - const __m256i lhs_mat_s_01_61_sp2 = _mm256_shuffle_epi32(lhs_mat_s_01_61, 245); 
//A60(12-15) A60(12-15) A61(12-15) A61(12-15) A60(12-15) A60(12-15) A61(12-15) A61(12-15)
- const __m256i lhs_mat_s_23_61_sp2 = _mm256_shuffle_epi32(lhs_mat_s_23_61, 245); //A62(12-15) A63(12-15) A62(12-15) A63(12-15) A62(12-15) A63(12-15) A62(12-15) A63(12-15)
+ const __m256i lhs_mat_s_01_61_sp2 = _mm256_shuffle_epi32(lhs_mat_s_01_61, 245); //A60(12-15) A60(12-15) A61(12-15) A61(12-15)
+ const __m256i lhs_mat_s_23_61_sp2 = _mm256_shuffle_epi32(lhs_mat_s_23_61, 245); //A62(12-15) A63(12-15) A62(12-15) A63(12-15)
- const __m256i lhs_mat_s_01_70_sp2 = _mm256_shuffle_epi32(lhs_mat_s_01_70, 245); //A70(4-7) A70(4-7) A71(4-7) A71(4-7) A70(4-7) A70(4-7) A71(4-7) A71(4-7)
- const __m256i lhs_mat_s_23_70_sp2 = _mm256_shuffle_epi32(lhs_mat_s_23_70, 245); //A72(4-7) A73(4-7) A72(4-7) A73(4-7) A72(4-7) A73(4-7) A72(4-7) A73(4-7)
+ const __m256i lhs_mat_s_01_70_sp2 = _mm256_shuffle_epi32(lhs_mat_s_01_70, 245); //A70(4-7) A70(4-7) A71(4-7) A71(4-7)
+ const __m256i lhs_mat_s_23_70_sp2 = _mm256_shuffle_epi32(lhs_mat_s_23_70, 245); //A72(4-7) A73(4-7) A72(4-7) A73(4-7)
- const __m256i lhs_mat_s_01_71_sp2 = _mm256_shuffle_epi32(lhs_mat_s_01_71, 245); //A70(12-15) A70(12-15) A71(12-15) A71(12-15) A70(12-15) A70(12-15) A71(12-15) A71(12-15)
- const __m256i lhs_mat_s_23_71_sp2 = _mm256_shuffle_epi32(lhs_mat_s_23_71, 245); //A72(12-15) A73(12-15) A72(12-15) A73(12-15) A72(12-15) A73(12-15) A72(12-15) A73(12-15)
+ const __m256i lhs_mat_s_01_71_sp2 = _mm256_shuffle_epi32(lhs_mat_s_01_71, 245); //A70(12-15) A70(12-15) A71(12-15) A71(12-15)
+ const __m256i lhs_mat_s_23_71_sp2 = _mm256_shuffle_epi32(lhs_mat_s_23_71, 245); //A72(12-15) A73(12-15) A72(12-15) A73(12-15)
 // The values arranged in shuffle patterns are operated with dot product operation within 16 bit lane i.e corresponding bytes and multiplied and added into 16 bit integers within 16 bit lane
 __m256i iacc_mat_00_0_sp1 = _mm256_add_epi16(_mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_0145_00_sp1, lhs_mat_01_00_sp1), lhs_mat_s_01_00_sp1), _mm256_sub_epi16(_mm256_maddubs_epi16(rhs_mat_0145_01_sp1, lhs_mat_01_01_sp1), lhs_mat_s_01_01_sp1));

From c29ac56955f37d55081d9a5986f124651d4c006a Mon Sep 17 00:00:00 2001
From: Srihari-mcw
Date: Tue, 12 Aug 2025 18:22:15 +0530
Subject: [PATCH 12/23] Further cleanup

---
 ggml/src/ggml-cpu/arch/x86/repack.cpp | 132 +++++++++++++-------------
 1 file changed, 68 insertions(+), 64 deletions(-)

diff --git a/ggml/src/ggml-cpu/arch/x86/repack.cpp b/ggml/src/ggml-cpu/arch/x86/repack.cpp
index 668240791e..1b196d14df 100644
--- a/ggml/src/ggml-cpu/arch/x86/repack.cpp
+++ b/ggml/src/ggml-cpu/arch/x86/repack.cpp
@@ -8795,60 +8795,62 @@ void ggml_gemm_q6_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
 // Comments indicate the indices of elements from individual super block in non interleaved fashion
 // Index : 0 -7, 64 - 71
- const __m256i rhs_mat_0145_00 = _mm256_or_si256(_mm256_and_si256(rhs_raw_mat_0145_0, m4), rhs_hbit_0145_00);
- const __m256i rhs_mat_0145_40 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_0145_0, 4), m4), rhs_hbit_0145_40);
+ // Comments indicate the indices of elements from individual super block in non interleaved fashion
+ // Index : 0 -7, 64 - 71
+ const __m256i rhs_mat_0145_00 = _mm256_or_si256(_mm256_and_si256(rhs_lbit_mat_0145_0, m4), rhs_hbit_0145_00);
+ const __m256i rhs_mat_0145_40 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_lbit_mat_0145_0, 4), m4), rhs_hbit_0145_40);
- const __m256i rhs_mat_2367_00 = _mm256_or_si256(_mm256_and_si256(rhs_raw_mat_2367_0,
m4), rhs_hbit_2367_00); - const __m256i rhs_mat_2367_40 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_2367_0, 4), m4), rhs_hbit_2367_40); + const __m256i rhs_mat_2367_00 = _mm256_or_si256(_mm256_and_si256(rhs_lbit_mat_2367_0, m4), rhs_hbit_2367_00); + const __m256i rhs_mat_2367_40 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_lbit_mat_2367_0, 4), m4), rhs_hbit_2367_40); // Index : 8 - 15, 72 - 79 - const __m256i rhs_mat_0145_01 = _mm256_or_si256(_mm256_and_si256(rhs_raw_mat_0145_1, m4), rhs_hbit_0145_01); - const __m256i rhs_mat_0145_41 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_0145_1, 4), m4), rhs_hbit_0145_41); + const __m256i rhs_mat_0145_01 = _mm256_or_si256(_mm256_and_si256(rhs_lbit_mat_0145_1, m4), rhs_hbit_0145_01); + const __m256i rhs_mat_0145_41 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_lbit_mat_0145_1, 4), m4), rhs_hbit_0145_41); - const __m256i rhs_mat_2367_01 = _mm256_or_si256(_mm256_and_si256(rhs_raw_mat_2367_1, m4), rhs_hbit_2367_01); - const __m256i rhs_mat_2367_41 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_2367_1, 4), m4), rhs_hbit_2367_41); + const __m256i rhs_mat_2367_01 = _mm256_or_si256(_mm256_and_si256(rhs_lbit_mat_2367_1, m4), rhs_hbit_2367_01); + const __m256i rhs_mat_2367_41 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_lbit_mat_2367_1, 4), m4), rhs_hbit_2367_41); // Index : 16 - 23, 80 - 87 - const __m256i rhs_mat_0145_10 = _mm256_or_si256(_mm256_and_si256(rhs_raw_mat_0145_2, m4), rhs_hbit_0145_10); - const __m256i rhs_mat_0145_50 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_0145_2, 4), m4), rhs_hbit_0145_50); + const __m256i rhs_mat_0145_10 = _mm256_or_si256(_mm256_and_si256(rhs_lbit_mat_0145_2, m4), rhs_hbit_0145_10); + const __m256i rhs_mat_0145_50 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_lbit_mat_0145_2, 4), m4), rhs_hbit_0145_50); - const __m256i rhs_mat_2367_10 = _mm256_or_si256(_mm256_and_si256(rhs_raw_mat_2367_2, m4), rhs_hbit_2367_10); - const __m256i rhs_mat_2367_50 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_2367_2, 4), m4), rhs_hbit_2367_50); + const __m256i rhs_mat_2367_10 = _mm256_or_si256(_mm256_and_si256(rhs_lbit_mat_2367_2, m4), rhs_hbit_2367_10); + const __m256i rhs_mat_2367_50 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_lbit_mat_2367_2, 4), m4), rhs_hbit_2367_50); // Index : 24 - 31, 88 - 95 - const __m256i rhs_mat_0145_11 = _mm256_or_si256(_mm256_and_si256(rhs_raw_mat_0145_3, m4), rhs_hbit_0145_11); - const __m256i rhs_mat_0145_51 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_0145_3, 4), m4), rhs_hbit_0145_51); + const __m256i rhs_mat_0145_11 = _mm256_or_si256(_mm256_and_si256(rhs_lbit_mat_0145_3, m4), rhs_hbit_0145_11); + const __m256i rhs_mat_0145_51 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_lbit_mat_0145_3, 4), m4), rhs_hbit_0145_51); - const __m256i rhs_mat_2367_11 = _mm256_or_si256(_mm256_and_si256(rhs_raw_mat_2367_3, m4), rhs_hbit_2367_11); - const __m256i rhs_mat_2367_51 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_2367_3, 4), m4), rhs_hbit_2367_51); + const __m256i rhs_mat_2367_11 = _mm256_or_si256(_mm256_and_si256(rhs_lbit_mat_2367_3, m4), rhs_hbit_2367_11); + const __m256i rhs_mat_2367_51 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_lbit_mat_2367_3, 4), m4), rhs_hbit_2367_51); // Index : 32 - 39, 96 - 103 - const __m256i rhs_mat_0145_20 = _mm256_or_si256(_mm256_and_si256(rhs_raw_mat_0145_4, m4), 
rhs_hbit_0145_20); - const __m256i rhs_mat_0145_60 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_0145_4, 4), m4), rhs_hbit_0145_60); + const __m256i rhs_mat_0145_20 = _mm256_or_si256(_mm256_and_si256(rhs_lbit_mat_0145_4, m4), rhs_hbit_0145_20); + const __m256i rhs_mat_0145_60 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_lbit_mat_0145_4, 4), m4), rhs_hbit_0145_60); - const __m256i rhs_mat_2367_20 = _mm256_or_si256(_mm256_and_si256(rhs_raw_mat_2367_4, m4), rhs_hbit_2367_20); - const __m256i rhs_mat_2367_60 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_2367_4, 4), m4), rhs_hbit_2367_60); + const __m256i rhs_mat_2367_20 = _mm256_or_si256(_mm256_and_si256(rhs_lbit_mat_2367_4, m4), rhs_hbit_2367_20); + const __m256i rhs_mat_2367_60 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_lbit_mat_2367_4, 4), m4), rhs_hbit_2367_60); // Index : 40 - 47, 104 - 111 - const __m256i rhs_mat_0145_21 = _mm256_or_si256(_mm256_and_si256(rhs_raw_mat_0145_5, m4), rhs_hbit_0145_21); - const __m256i rhs_mat_0145_61 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_0145_5, 4), m4), rhs_hbit_0145_61); + const __m256i rhs_mat_0145_21 = _mm256_or_si256(_mm256_and_si256(rhs_lbit_mat_0145_5, m4), rhs_hbit_0145_21); + const __m256i rhs_mat_0145_61 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_lbit_mat_0145_5, 4), m4), rhs_hbit_0145_61); - const __m256i rhs_mat_2367_21 = _mm256_or_si256(_mm256_and_si256(rhs_raw_mat_2367_5, m4), rhs_hbit_2367_21); - const __m256i rhs_mat_2367_61 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_2367_5, 4), m4), rhs_hbit_2367_61); + const __m256i rhs_mat_2367_21 = _mm256_or_si256(_mm256_and_si256(rhs_lbit_mat_2367_5, m4), rhs_hbit_2367_21); + const __m256i rhs_mat_2367_61 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_lbit_mat_2367_5, 4), m4), rhs_hbit_2367_61); // Index : 48 - 55, 112 - 119 - const __m256i rhs_mat_0145_30 = _mm256_or_si256(_mm256_and_si256(rhs_raw_mat_0145_6, m4), rhs_hbit_0145_30); - const __m256i rhs_mat_0145_70 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_0145_6, 4), m4), rhs_hbit_0145_70); + const __m256i rhs_mat_0145_30 = _mm256_or_si256(_mm256_and_si256(rhs_lbit_mat_0145_6, m4), rhs_hbit_0145_30); + const __m256i rhs_mat_0145_70 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_lbit_mat_0145_6, 4), m4), rhs_hbit_0145_70); - const __m256i rhs_mat_2367_30 = _mm256_or_si256(_mm256_and_si256(rhs_raw_mat_2367_6, m4), rhs_hbit_2367_30); - const __m256i rhs_mat_2367_70 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_2367_6, 4), m4), rhs_hbit_2367_70); + const __m256i rhs_mat_2367_30 = _mm256_or_si256(_mm256_and_si256(rhs_lbit_mat_2367_6, m4), rhs_hbit_2367_30); + const __m256i rhs_mat_2367_70 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_lbit_mat_2367_6, 4), m4), rhs_hbit_2367_70); // Index : 56 - 63, 120 - 127 - const __m256i rhs_mat_0145_31 = _mm256_or_si256(_mm256_and_si256(rhs_raw_mat_0145_7, m4), rhs_hbit_0145_31); - const __m256i rhs_mat_0145_71 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_0145_7, 4), m4), rhs_hbit_0145_71); + const __m256i rhs_mat_0145_31 = _mm256_or_si256(_mm256_and_si256(rhs_lbit_mat_0145_7, m4), rhs_hbit_0145_31); + const __m256i rhs_mat_0145_71 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_lbit_mat_0145_7, 4), m4), rhs_hbit_0145_71); - const __m256i rhs_mat_2367_31 = _mm256_or_si256(_mm256_and_si256(rhs_raw_mat_2367_7, m4), rhs_hbit_2367_31); - const __m256i 
rhs_mat_2367_71 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_2367_7, 4), m4), rhs_hbit_2367_71); + const __m256i rhs_mat_2367_31 = _mm256_or_si256(_mm256_and_si256(rhs_lbit_mat_2367_7, m4), rhs_hbit_2367_31); + const __m256i rhs_mat_2367_71 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_lbit_mat_2367_7, 4), m4), rhs_hbit_2367_71); // Shuffle pattern one - right side input const __m256i rhs_mat_0145_00_sp1 = _mm256_shuffle_epi32(rhs_mat_0145_00, 136); //B00(0-3) B01(0-3) B00(0-3) B01(0-3) B04(0-3) B05(0-3) B04(0-3) B05(0-3) @@ -9605,60 +9607,62 @@ void ggml_gemm_q6_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo // Comments indicate the indices of elements from individual super block in non interleaved fashion // Index : 0 -7, 64 - 71 - const __m256i rhs_mat_0145_00 = _mm256_or_si256(_mm256_and_si256(rhs_raw_mat_0145_0, m4), rhs_hbit_0145_00); - const __m256i rhs_mat_0145_40 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_0145_0, 4), m4), rhs_hbit_0145_40); + // Comments indicate the indices of elements from individual super block in non interleaved fashion + // Index : 0 -7, 64 - 71 + const __m256i rhs_mat_0145_00 = _mm256_or_si256(_mm256_and_si256(rhs_lbit_mat_0145_0, m4), rhs_hbit_0145_00); + const __m256i rhs_mat_0145_40 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_lbit_mat_0145_0, 4), m4), rhs_hbit_0145_40); - const __m256i rhs_mat_2367_00 = _mm256_or_si256(_mm256_and_si256(rhs_raw_mat_2367_0, m4), rhs_hbit_2367_00); - const __m256i rhs_mat_2367_40 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_2367_0, 4), m4), rhs_hbit_2367_40); + const __m256i rhs_mat_2367_00 = _mm256_or_si256(_mm256_and_si256(rhs_lbit_mat_2367_0, m4), rhs_hbit_2367_00); + const __m256i rhs_mat_2367_40 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_lbit_mat_2367_0, 4), m4), rhs_hbit_2367_40); // Index : 8 - 15, 72 - 79 - const __m256i rhs_mat_0145_01 = _mm256_or_si256(_mm256_and_si256(rhs_raw_mat_0145_1, m4), rhs_hbit_0145_01); - const __m256i rhs_mat_0145_41 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_0145_1, 4), m4), rhs_hbit_0145_41); + const __m256i rhs_mat_0145_01 = _mm256_or_si256(_mm256_and_si256(rhs_lbit_mat_0145_1, m4), rhs_hbit_0145_01); + const __m256i rhs_mat_0145_41 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_lbit_mat_0145_1, 4), m4), rhs_hbit_0145_41); - const __m256i rhs_mat_2367_01 = _mm256_or_si256(_mm256_and_si256(rhs_raw_mat_2367_1, m4), rhs_hbit_2367_01); - const __m256i rhs_mat_2367_41 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_2367_1, 4), m4), rhs_hbit_2367_41); + const __m256i rhs_mat_2367_01 = _mm256_or_si256(_mm256_and_si256(rhs_lbit_mat_2367_1, m4), rhs_hbit_2367_01); + const __m256i rhs_mat_2367_41 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_lbit_mat_2367_1, 4), m4), rhs_hbit_2367_41); // Index : 16 - 23, 80 - 87 - const __m256i rhs_mat_0145_10 = _mm256_or_si256(_mm256_and_si256(rhs_raw_mat_0145_2, m4), rhs_hbit_0145_10); - const __m256i rhs_mat_0145_50 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_0145_2, 4), m4), rhs_hbit_0145_50); + const __m256i rhs_mat_0145_10 = _mm256_or_si256(_mm256_and_si256(rhs_lbit_mat_0145_2, m4), rhs_hbit_0145_10); + const __m256i rhs_mat_0145_50 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_lbit_mat_0145_2, 4), m4), rhs_hbit_0145_50); - const __m256i rhs_mat_2367_10 = _mm256_or_si256(_mm256_and_si256(rhs_raw_mat_2367_2, m4), rhs_hbit_2367_10); - const __m256i 
rhs_mat_2367_50 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_2367_2, 4), m4), rhs_hbit_2367_50); + const __m256i rhs_mat_2367_10 = _mm256_or_si256(_mm256_and_si256(rhs_lbit_mat_2367_2, m4), rhs_hbit_2367_10); + const __m256i rhs_mat_2367_50 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_lbit_mat_2367_2, 4), m4), rhs_hbit_2367_50); // Index : 24 - 31, 88 - 95 - const __m256i rhs_mat_0145_11 = _mm256_or_si256(_mm256_and_si256(rhs_raw_mat_0145_3, m4), rhs_hbit_0145_11); - const __m256i rhs_mat_0145_51 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_0145_3, 4), m4), rhs_hbit_0145_51); + const __m256i rhs_mat_0145_11 = _mm256_or_si256(_mm256_and_si256(rhs_lbit_mat_0145_3, m4), rhs_hbit_0145_11); + const __m256i rhs_mat_0145_51 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_lbit_mat_0145_3, 4), m4), rhs_hbit_0145_51); - const __m256i rhs_mat_2367_11 = _mm256_or_si256(_mm256_and_si256(rhs_raw_mat_2367_3, m4), rhs_hbit_2367_11); - const __m256i rhs_mat_2367_51 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_2367_3, 4), m4), rhs_hbit_2367_51); + const __m256i rhs_mat_2367_11 = _mm256_or_si256(_mm256_and_si256(rhs_lbit_mat_2367_3, m4), rhs_hbit_2367_11); + const __m256i rhs_mat_2367_51 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_lbit_mat_2367_3, 4), m4), rhs_hbit_2367_51); // Index : 32 - 39, 96 - 103 - const __m256i rhs_mat_0145_20 = _mm256_or_si256(_mm256_and_si256(rhs_raw_mat_0145_4, m4), rhs_hbit_0145_20); - const __m256i rhs_mat_0145_60 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_0145_4, 4), m4), rhs_hbit_0145_60); + const __m256i rhs_mat_0145_20 = _mm256_or_si256(_mm256_and_si256(rhs_lbit_mat_0145_4, m4), rhs_hbit_0145_20); + const __m256i rhs_mat_0145_60 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_lbit_mat_0145_4, 4), m4), rhs_hbit_0145_60); - const __m256i rhs_mat_2367_20 = _mm256_or_si256(_mm256_and_si256(rhs_raw_mat_2367_4, m4), rhs_hbit_2367_20); - const __m256i rhs_mat_2367_60 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_2367_4, 4), m4), rhs_hbit_2367_60); + const __m256i rhs_mat_2367_20 = _mm256_or_si256(_mm256_and_si256(rhs_lbit_mat_2367_4, m4), rhs_hbit_2367_20); + const __m256i rhs_mat_2367_60 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_lbit_mat_2367_4, 4), m4), rhs_hbit_2367_60); // Index : 40 - 47, 104 - 111 - const __m256i rhs_mat_0145_21 = _mm256_or_si256(_mm256_and_si256(rhs_raw_mat_0145_5, m4), rhs_hbit_0145_21); - const __m256i rhs_mat_0145_61 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_0145_5, 4), m4), rhs_hbit_0145_61); + const __m256i rhs_mat_0145_21 = _mm256_or_si256(_mm256_and_si256(rhs_lbit_mat_0145_5, m4), rhs_hbit_0145_21); + const __m256i rhs_mat_0145_61 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_lbit_mat_0145_5, 4), m4), rhs_hbit_0145_61); - const __m256i rhs_mat_2367_21 = _mm256_or_si256(_mm256_and_si256(rhs_raw_mat_2367_5, m4), rhs_hbit_2367_21); - const __m256i rhs_mat_2367_61 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_2367_5, 4), m4), rhs_hbit_2367_61); + const __m256i rhs_mat_2367_21 = _mm256_or_si256(_mm256_and_si256(rhs_lbit_mat_2367_5, m4), rhs_hbit_2367_21); + const __m256i rhs_mat_2367_61 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_lbit_mat_2367_5, 4), m4), rhs_hbit_2367_61); // Index : 48 - 55, 112 - 119 - const __m256i rhs_mat_0145_30 = _mm256_or_si256(_mm256_and_si256(rhs_raw_mat_0145_6, m4), rhs_hbit_0145_30); - const __m256i 
rhs_mat_0145_70 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_0145_6, 4), m4), rhs_hbit_0145_70);
+ const __m256i rhs_mat_0145_30 = _mm256_or_si256(_mm256_and_si256(rhs_lbit_mat_0145_6, m4), rhs_hbit_0145_30);
+ const __m256i rhs_mat_0145_70 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_lbit_mat_0145_6, 4), m4), rhs_hbit_0145_70);
- const __m256i rhs_mat_2367_30 = _mm256_or_si256(_mm256_and_si256(rhs_raw_mat_2367_6, m4), rhs_hbit_2367_30);
- const __m256i rhs_mat_2367_70 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_2367_6, 4), m4), rhs_hbit_2367_70);
+ const __m256i rhs_mat_2367_30 = _mm256_or_si256(_mm256_and_si256(rhs_lbit_mat_2367_6, m4), rhs_hbit_2367_30);
+ const __m256i rhs_mat_2367_70 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_lbit_mat_2367_6, 4), m4), rhs_hbit_2367_70);
 // Index : 56 - 63, 120 - 127
- const __m256i rhs_mat_0145_31 = _mm256_or_si256(_mm256_and_si256(rhs_raw_mat_0145_7, m4), rhs_hbit_0145_31);
- const __m256i rhs_mat_0145_71 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_0145_7, 4), m4), rhs_hbit_0145_71);
+ const __m256i rhs_mat_0145_31 = _mm256_or_si256(_mm256_and_si256(rhs_lbit_mat_0145_7, m4), rhs_hbit_0145_31);
+ const __m256i rhs_mat_0145_71 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_lbit_mat_0145_7, 4), m4), rhs_hbit_0145_71);
- const __m256i rhs_mat_2367_31 = _mm256_or_si256(_mm256_and_si256(rhs_raw_mat_2367_7, m4), rhs_hbit_2367_31);
- const __m256i rhs_mat_2367_71 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_2367_7, 4), m4), rhs_hbit_2367_71);
+ const __m256i rhs_mat_2367_31 = _mm256_or_si256(_mm256_and_si256(rhs_lbit_mat_2367_7, m4), rhs_hbit_2367_31);
+ const __m256i rhs_mat_2367_71 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_lbit_mat_2367_7, 4), m4), rhs_hbit_2367_71);
 // Shuffle pattern one - right side input
 const __m256i rhs_mat_0145_00_sp1 = _mm256_shuffle_epi32(rhs_mat_0145_00, 136); //B00(0-3) B01(0-3) B00(0-3) B01(0-3) B04(0-3) B05(0-3) B04(0-3) B05(0-3)

From 4806d6a8fef66af2b62eb90c7845f6f7743051cb Mon Sep 17 00:00:00 2001
From: Srihari-mcw
Date: Tue, 12 Aug 2025 18:53:34 +0530
Subject: [PATCH 13/23] Add further fixes and updates to scalar code

---
 ggml/src/ggml-cpu/arch/x86/repack.cpp | 130 +++++++++++++-------------
 ggml/src/ggml-cpu/repack.cpp          |  56 ++++-------
 2 files changed, 82 insertions(+), 104 deletions(-)

diff --git a/ggml/src/ggml-cpu/arch/x86/repack.cpp b/ggml/src/ggml-cpu/arch/x86/repack.cpp
index 1b196d14df..7e6a375e89 100644
--- a/ggml/src/ggml-cpu/arch/x86/repack.cpp
+++ b/ggml/src/ggml-cpu/arch/x86/repack.cpp
@@ -6702,7 +6702,7 @@ void ggml_gemm_q6_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
 UNUSED(ncols_interleaved);
 UNUSED(blocklen);

-#if defined(__AVX2__)
+#if defined(__AVX2__) || defined(__AVX512F__)
 const block_q6_Kx8 * b_ptr_start = (const block_q6_Kx8 * ) vx;
 const block_q8_Kx4 * a_ptr_start = (const block_q8_Kx4 * ) vy;
 int64_t b_nb = n / QK_K;
@@ -8797,60 +8797,60 @@ void ggml_gemm_q6_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
 // Index : 0 -7, 64 - 71
 // Comments indicate the indices of elements from individual super block in non interleaved fashion
 // Index : 0 -7, 64 - 71
- const __m256i rhs_mat_0145_00 = _mm256_or_si256(_mm256_and_si256(rhs_lbit_mat_0145_0, m4), rhs_hbit_0145_00);
- const __m256i rhs_mat_0145_40 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_lbit_mat_0145_0, 4), m4), rhs_hbit_0145_40);
+ const __m256i rhs_mat_0145_00 =
_mm256_or_si256(_mm256_and_si256(rhs_raw_lbit_0145_0, m4), rhs_hbit_0145_00); + const __m256i rhs_mat_0145_40 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_lbit_0145_0, 4), m4), rhs_hbit_0145_40); - const __m256i rhs_mat_2367_00 = _mm256_or_si256(_mm256_and_si256(rhs_lbit_mat_2367_0, m4), rhs_hbit_2367_00); - const __m256i rhs_mat_2367_40 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_lbit_mat_2367_0, 4), m4), rhs_hbit_2367_40); + const __m256i rhs_mat_2367_00 = _mm256_or_si256(_mm256_and_si256(rhs_raw_lbit_2367_0, m4), rhs_hbit_2367_00); + const __m256i rhs_mat_2367_40 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_lbit_2367_0, 4), m4), rhs_hbit_2367_40); // Index : 8 - 15, 72 - 79 - const __m256i rhs_mat_0145_01 = _mm256_or_si256(_mm256_and_si256(rhs_lbit_mat_0145_1, m4), rhs_hbit_0145_01); - const __m256i rhs_mat_0145_41 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_lbit_mat_0145_1, 4), m4), rhs_hbit_0145_41); + const __m256i rhs_mat_0145_01 = _mm256_or_si256(_mm256_and_si256(rhs_raw_lbit_0145_1, m4), rhs_hbit_0145_01); + const __m256i rhs_mat_0145_41 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_lbit_0145_1, 4), m4), rhs_hbit_0145_41); - const __m256i rhs_mat_2367_01 = _mm256_or_si256(_mm256_and_si256(rhs_lbit_mat_2367_1, m4), rhs_hbit_2367_01); - const __m256i rhs_mat_2367_41 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_lbit_mat_2367_1, 4), m4), rhs_hbit_2367_41); + const __m256i rhs_mat_2367_01 = _mm256_or_si256(_mm256_and_si256(rhs_raw_lbit_2367_1, m4), rhs_hbit_2367_01); + const __m256i rhs_mat_2367_41 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_lbit_2367_1, 4), m4), rhs_hbit_2367_41); // Index : 16 - 23, 80 - 87 - const __m256i rhs_mat_0145_10 = _mm256_or_si256(_mm256_and_si256(rhs_lbit_mat_0145_2, m4), rhs_hbit_0145_10); - const __m256i rhs_mat_0145_50 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_lbit_mat_0145_2, 4), m4), rhs_hbit_0145_50); + const __m256i rhs_mat_0145_10 = _mm256_or_si256(_mm256_and_si256(rhs_raw_lbit_0145_2, m4), rhs_hbit_0145_10); + const __m256i rhs_mat_0145_50 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_lbit_0145_2, 4), m4), rhs_hbit_0145_50); - const __m256i rhs_mat_2367_10 = _mm256_or_si256(_mm256_and_si256(rhs_lbit_mat_2367_2, m4), rhs_hbit_2367_10); - const __m256i rhs_mat_2367_50 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_lbit_mat_2367_2, 4), m4), rhs_hbit_2367_50); + const __m256i rhs_mat_2367_10 = _mm256_or_si256(_mm256_and_si256(rhs_raw_lbit_2367_2, m4), rhs_hbit_2367_10); + const __m256i rhs_mat_2367_50 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_lbit_2367_2, 4), m4), rhs_hbit_2367_50); // Index : 24 - 31, 88 - 95 - const __m256i rhs_mat_0145_11 = _mm256_or_si256(_mm256_and_si256(rhs_lbit_mat_0145_3, m4), rhs_hbit_0145_11); - const __m256i rhs_mat_0145_51 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_lbit_mat_0145_3, 4), m4), rhs_hbit_0145_51); + const __m256i rhs_mat_0145_11 = _mm256_or_si256(_mm256_and_si256(rhs_raw_lbit_0145_3, m4), rhs_hbit_0145_11); + const __m256i rhs_mat_0145_51 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_lbit_0145_3, 4), m4), rhs_hbit_0145_51); - const __m256i rhs_mat_2367_11 = _mm256_or_si256(_mm256_and_si256(rhs_lbit_mat_2367_3, m4), rhs_hbit_2367_11); - const __m256i rhs_mat_2367_51 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_lbit_mat_2367_3, 4), m4), rhs_hbit_2367_51); + const __m256i rhs_mat_2367_11 = 
_mm256_or_si256(_mm256_and_si256(rhs_raw_lbit_2367_3, m4), rhs_hbit_2367_11); + const __m256i rhs_mat_2367_51 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_lbit_2367_3, 4), m4), rhs_hbit_2367_51); // Index : 32 - 39, 96 - 103 - const __m256i rhs_mat_0145_20 = _mm256_or_si256(_mm256_and_si256(rhs_lbit_mat_0145_4, m4), rhs_hbit_0145_20); - const __m256i rhs_mat_0145_60 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_lbit_mat_0145_4, 4), m4), rhs_hbit_0145_60); + const __m256i rhs_mat_0145_20 = _mm256_or_si256(_mm256_and_si256(rhs_raw_lbit_0145_4, m4), rhs_hbit_0145_20); + const __m256i rhs_mat_0145_60 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_lbit_0145_4, 4), m4), rhs_hbit_0145_60); - const __m256i rhs_mat_2367_20 = _mm256_or_si256(_mm256_and_si256(rhs_lbit_mat_2367_4, m4), rhs_hbit_2367_20); - const __m256i rhs_mat_2367_60 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_lbit_mat_2367_4, 4), m4), rhs_hbit_2367_60); + const __m256i rhs_mat_2367_20 = _mm256_or_si256(_mm256_and_si256(rhs_raw_lbit_2367_4, m4), rhs_hbit_2367_20); + const __m256i rhs_mat_2367_60 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_lbit_2367_4, 4), m4), rhs_hbit_2367_60); // Index : 40 - 47, 104 - 111 - const __m256i rhs_mat_0145_21 = _mm256_or_si256(_mm256_and_si256(rhs_lbit_mat_0145_5, m4), rhs_hbit_0145_21); - const __m256i rhs_mat_0145_61 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_lbit_mat_0145_5, 4), m4), rhs_hbit_0145_61); + const __m256i rhs_mat_0145_21 = _mm256_or_si256(_mm256_and_si256(rhs_raw_lbit_0145_5, m4), rhs_hbit_0145_21); + const __m256i rhs_mat_0145_61 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_lbit_0145_5, 4), m4), rhs_hbit_0145_61); - const __m256i rhs_mat_2367_21 = _mm256_or_si256(_mm256_and_si256(rhs_lbit_mat_2367_5, m4), rhs_hbit_2367_21); - const __m256i rhs_mat_2367_61 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_lbit_mat_2367_5, 4), m4), rhs_hbit_2367_61); + const __m256i rhs_mat_2367_21 = _mm256_or_si256(_mm256_and_si256(rhs_raw_lbit_2367_5, m4), rhs_hbit_2367_21); + const __m256i rhs_mat_2367_61 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_lbit_2367_5, 4), m4), rhs_hbit_2367_61); // Index : 48 - 55, 112 - 119 - const __m256i rhs_mat_0145_30 = _mm256_or_si256(_mm256_and_si256(rhs_lbit_mat_0145_6, m4), rhs_hbit_0145_30); - const __m256i rhs_mat_0145_70 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_lbit_mat_0145_6, 4), m4), rhs_hbit_0145_70); + const __m256i rhs_mat_0145_30 = _mm256_or_si256(_mm256_and_si256(rhs_raw_lbit_0145_6, m4), rhs_hbit_0145_30); + const __m256i rhs_mat_0145_70 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_lbit_0145_6, 4), m4), rhs_hbit_0145_70); - const __m256i rhs_mat_2367_30 = _mm256_or_si256(_mm256_and_si256(rhs_lbit_mat_2367_6, m4), rhs_hbit_2367_30); - const __m256i rhs_mat_2367_70 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_lbit_mat_2367_6, 4), m4), rhs_hbit_2367_70); + const __m256i rhs_mat_2367_30 = _mm256_or_si256(_mm256_and_si256(rhs_raw_lbit_2367_6, m4), rhs_hbit_2367_30); + const __m256i rhs_mat_2367_70 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_lbit_2367_6, 4), m4), rhs_hbit_2367_70); // Index : 56 - 63, 120 - 127 - const __m256i rhs_mat_0145_31 = _mm256_or_si256(_mm256_and_si256(rhs_lbit_mat_0145_7, m4), rhs_hbit_0145_31); - const __m256i rhs_mat_0145_71 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_lbit_mat_0145_7, 4), m4), rhs_hbit_0145_71); + const __m256i 
rhs_mat_0145_31 = _mm256_or_si256(_mm256_and_si256(rhs_raw_lbit_0145_7, m4), rhs_hbit_0145_31); + const __m256i rhs_mat_0145_71 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_lbit_0145_7, 4), m4), rhs_hbit_0145_71); - const __m256i rhs_mat_2367_31 = _mm256_or_si256(_mm256_and_si256(rhs_lbit_mat_2367_7, m4), rhs_hbit_2367_31); - const __m256i rhs_mat_2367_71 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_lbit_mat_2367_7, 4), m4), rhs_hbit_2367_71); + const __m256i rhs_mat_2367_31 = _mm256_or_si256(_mm256_and_si256(rhs_raw_lbit_2367_7, m4), rhs_hbit_2367_31); + const __m256i rhs_mat_2367_71 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_lbit_2367_7, 4), m4), rhs_hbit_2367_71); // Shuffle pattern one - right side input const __m256i rhs_mat_0145_00_sp1 = _mm256_shuffle_epi32(rhs_mat_0145_00, 136); //B00(0-3) B01(0-3) B00(0-3) B01(0-3) B04(0-3) B05(0-3) B04(0-3) B05(0-3) @@ -9609,60 +9609,60 @@ void ggml_gemm_q6_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo // Index : 0 -7, 64 - 71 // Comments indicate the indices of elements from individual super block in non interleaved fashion // Index : 0 -7, 64 - 71 - const __m256i rhs_mat_0145_00 = _mm256_or_si256(_mm256_and_si256(rhs_lbit_mat_0145_0, m4), rhs_hbit_0145_00); - const __m256i rhs_mat_0145_40 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_lbit_mat_0145_0, 4), m4), rhs_hbit_0145_40); + const __m256i rhs_mat_0145_00 = _mm256_or_si256(_mm256_and_si256(rhs_raw_lbit_0145_0, m4), rhs_hbit_0145_00); + const __m256i rhs_mat_0145_40 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_lbit_0145_0, 4), m4), rhs_hbit_0145_40); - const __m256i rhs_mat_2367_00 = _mm256_or_si256(_mm256_and_si256(rhs_lbit_mat_2367_0, m4), rhs_hbit_2367_00); - const __m256i rhs_mat_2367_40 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_lbit_mat_2367_0, 4), m4), rhs_hbit_2367_40); + const __m256i rhs_mat_2367_00 = _mm256_or_si256(_mm256_and_si256(rhs_raw_lbit_2367_0, m4), rhs_hbit_2367_00); + const __m256i rhs_mat_2367_40 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_lbit_2367_0, 4), m4), rhs_hbit_2367_40); // Index : 8 - 15, 72 - 79 - const __m256i rhs_mat_0145_01 = _mm256_or_si256(_mm256_and_si256(rhs_lbit_mat_0145_1, m4), rhs_hbit_0145_01); - const __m256i rhs_mat_0145_41 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_lbit_mat_0145_1, 4), m4), rhs_hbit_0145_41); + const __m256i rhs_mat_0145_01 = _mm256_or_si256(_mm256_and_si256(rhs_raw_lbit_0145_1, m4), rhs_hbit_0145_01); + const __m256i rhs_mat_0145_41 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_lbit_0145_1, 4), m4), rhs_hbit_0145_41); - const __m256i rhs_mat_2367_01 = _mm256_or_si256(_mm256_and_si256(rhs_lbit_mat_2367_1, m4), rhs_hbit_2367_01); - const __m256i rhs_mat_2367_41 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_lbit_mat_2367_1, 4), m4), rhs_hbit_2367_41); + const __m256i rhs_mat_2367_01 = _mm256_or_si256(_mm256_and_si256(rhs_raw_lbit_2367_1, m4), rhs_hbit_2367_01); + const __m256i rhs_mat_2367_41 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_lbit_2367_1, 4), m4), rhs_hbit_2367_41); // Index : 16 - 23, 80 - 87 - const __m256i rhs_mat_0145_10 = _mm256_or_si256(_mm256_and_si256(rhs_lbit_mat_0145_2, m4), rhs_hbit_0145_10); - const __m256i rhs_mat_0145_50 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_lbit_mat_0145_2, 4), m4), rhs_hbit_0145_50); + const __m256i rhs_mat_0145_10 = _mm256_or_si256(_mm256_and_si256(rhs_raw_lbit_0145_2, m4), rhs_hbit_0145_10); 
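// Aside: each OR above rebuilds one 6-bit quant from its two pieces. A minimal scalar
// sketch of the same step, assuming m4 is the 0x0F nibble mask and the rhs_hbit_* vectors
// already carry the two high bits shifted to bit positions 4..5:
//
//     uint8_t lo = ql_byte & 0x0F;         // low nibble from the interleaved ql stream
//     uint8_t hi = (qh_bits & 0x03) << 4;  // two high bits from the qh stream, moved above the nibble
//     uint8_t q6 = hi | lo;                // 0..63; the usual Q6_K -32 offset is applied separately, not in this OR
//
// ql_byte, qh_bits and q6 are illustrative names, not variables from this kernel.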
+ const __m256i rhs_mat_0145_50 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_lbit_0145_2, 4), m4), rhs_hbit_0145_50); - const __m256i rhs_mat_2367_10 = _mm256_or_si256(_mm256_and_si256(rhs_lbit_mat_2367_2, m4), rhs_hbit_2367_10); - const __m256i rhs_mat_2367_50 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_lbit_mat_2367_2, 4), m4), rhs_hbit_2367_50); + const __m256i rhs_mat_2367_10 = _mm256_or_si256(_mm256_and_si256(rhs_raw_lbit_2367_2, m4), rhs_hbit_2367_10); + const __m256i rhs_mat_2367_50 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_lbit_2367_2, 4), m4), rhs_hbit_2367_50); // Index : 24 - 31, 88 - 95 - const __m256i rhs_mat_0145_11 = _mm256_or_si256(_mm256_and_si256(rhs_lbit_mat_0145_3, m4), rhs_hbit_0145_11); - const __m256i rhs_mat_0145_51 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_lbit_mat_0145_3, 4), m4), rhs_hbit_0145_51); + const __m256i rhs_mat_0145_11 = _mm256_or_si256(_mm256_and_si256(rhs_raw_lbit_0145_3, m4), rhs_hbit_0145_11); + const __m256i rhs_mat_0145_51 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_lbit_0145_3, 4), m4), rhs_hbit_0145_51); - const __m256i rhs_mat_2367_11 = _mm256_or_si256(_mm256_and_si256(rhs_lbit_mat_2367_3, m4), rhs_hbit_2367_11); - const __m256i rhs_mat_2367_51 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_lbit_mat_2367_3, 4), m4), rhs_hbit_2367_51); + const __m256i rhs_mat_2367_11 = _mm256_or_si256(_mm256_and_si256(rhs_raw_lbit_2367_3, m4), rhs_hbit_2367_11); + const __m256i rhs_mat_2367_51 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_lbit_2367_3, 4), m4), rhs_hbit_2367_51); // Index : 32 - 39, 96 - 103 - const __m256i rhs_mat_0145_20 = _mm256_or_si256(_mm256_and_si256(rhs_lbit_mat_0145_4, m4), rhs_hbit_0145_20); - const __m256i rhs_mat_0145_60 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_lbit_mat_0145_4, 4), m4), rhs_hbit_0145_60); + const __m256i rhs_mat_0145_20 = _mm256_or_si256(_mm256_and_si256(rhs_raw_lbit_0145_4, m4), rhs_hbit_0145_20); + const __m256i rhs_mat_0145_60 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_lbit_0145_4, 4), m4), rhs_hbit_0145_60); - const __m256i rhs_mat_2367_20 = _mm256_or_si256(_mm256_and_si256(rhs_lbit_mat_2367_4, m4), rhs_hbit_2367_20); - const __m256i rhs_mat_2367_60 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_lbit_mat_2367_4, 4), m4), rhs_hbit_2367_60); + const __m256i rhs_mat_2367_20 = _mm256_or_si256(_mm256_and_si256(rhs_raw_lbit_2367_4, m4), rhs_hbit_2367_20); + const __m256i rhs_mat_2367_60 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_lbit_2367_4, 4), m4), rhs_hbit_2367_60); // Index : 40 - 47, 104 - 111 - const __m256i rhs_mat_0145_21 = _mm256_or_si256(_mm256_and_si256(rhs_lbit_mat_0145_5, m4), rhs_hbit_0145_21); - const __m256i rhs_mat_0145_61 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_lbit_mat_0145_5, 4), m4), rhs_hbit_0145_61); + const __m256i rhs_mat_0145_21 = _mm256_or_si256(_mm256_and_si256(rhs_raw_lbit_0145_5, m4), rhs_hbit_0145_21); + const __m256i rhs_mat_0145_61 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_lbit_0145_5, 4), m4), rhs_hbit_0145_61); - const __m256i rhs_mat_2367_21 = _mm256_or_si256(_mm256_and_si256(rhs_lbit_mat_2367_5, m4), rhs_hbit_2367_21); - const __m256i rhs_mat_2367_61 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_lbit_mat_2367_5, 4), m4), rhs_hbit_2367_61); + const __m256i rhs_mat_2367_21 = _mm256_or_si256(_mm256_and_si256(rhs_raw_lbit_2367_5, m4), rhs_hbit_2367_21); + const __m256i 
rhs_mat_2367_61 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_lbit_2367_5, 4), m4), rhs_hbit_2367_61); // Index : 48 - 55, 112 - 119 - const __m256i rhs_mat_0145_30 = _mm256_or_si256(_mm256_and_si256(rhs_lbit_mat_0145_6, m4), rhs_hbit_0145_30); - const __m256i rhs_mat_0145_70 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_lbit_mat_0145_6, 4), m4), rhs_hbit_0145_70); + const __m256i rhs_mat_0145_30 = _mm256_or_si256(_mm256_and_si256(rhs_raw_lbit_0145_6, m4), rhs_hbit_0145_30); + const __m256i rhs_mat_0145_70 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_lbit_0145_6, 4), m4), rhs_hbit_0145_70); - const __m256i rhs_mat_2367_30 = _mm256_or_si256(_mm256_and_si256(rhs_lbit_mat_2367_6, m4), rhs_hbit_2367_30); - const __m256i rhs_mat_2367_70 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_lbit_mat_2367_6, 4), m4), rhs_hbit_2367_70); + const __m256i rhs_mat_2367_30 = _mm256_or_si256(_mm256_and_si256(rhs_raw_lbit_2367_6, m4), rhs_hbit_2367_30); + const __m256i rhs_mat_2367_70 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_lbit_2367_6, 4), m4), rhs_hbit_2367_70); // Index : 56 - 63, 120 - 127 - const __m256i rhs_mat_0145_31 = _mm256_or_si256(_mm256_and_si256(rhs_lbit_mat_0145_7, m4), rhs_hbit_0145_31); - const __m256i rhs_mat_0145_71 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_lbit_mat_0145_7, 4), m4), rhs_hbit_0145_71); + const __m256i rhs_mat_0145_31 = _mm256_or_si256(_mm256_and_si256(rhs_raw_lbit_0145_7, m4), rhs_hbit_0145_31); + const __m256i rhs_mat_0145_71 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_lbit_0145_7, 4), m4), rhs_hbit_0145_71); - const __m256i rhs_mat_2367_31 = _mm256_or_si256(_mm256_and_si256(rhs_lbit_mat_2367_7, m4), rhs_hbit_2367_31); - const __m256i rhs_mat_2367_71 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_lbit_mat_2367_7, 4), m4), rhs_hbit_2367_71); + const __m256i rhs_mat_2367_31 = _mm256_or_si256(_mm256_and_si256(rhs_raw_lbit_2367_7, m4), rhs_hbit_2367_31); + const __m256i rhs_mat_2367_71 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_lbit_2367_7, 4), m4), rhs_hbit_2367_71); // Shuffle pattern one - right side input const __m256i rhs_mat_0145_00_sp1 = _mm256_shuffle_epi32(rhs_mat_0145_00, 136); //B00(0-3) B01(0-3) B00(0-3) B01(0-3) B04(0-3) B05(0-3) B04(0-3) B05(0-3) diff --git a/ggml/src/ggml-cpu/repack.cpp b/ggml/src/ggml-cpu/repack.cpp index 7182479f61..2a3fe71150 100644 --- a/ggml/src/ggml-cpu/repack.cpp +++ b/ggml/src/ggml-cpu/repack.cpp @@ -647,10 +647,10 @@ void ggml_gemv_q6_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, } for (int l = 0; l < nb; l++) { for (int k = 0; k < (qk / (4 * blocklen)); k++) { - const uint8_t *scales_0 = b_ptr[l].scales + (k / 4) * 64 ; - const uint8_t *scales_1 = b_ptr[l].scales + (k / 4) * 64 + 16; - const uint8_t *scales_2 = b_ptr[l].scales + (k / 4) * 64 + 32; - const uint8_t *scales_3 = b_ptr[l].scales + (k / 4) * 64 + 48; + const int8_t *scales_0 = b_ptr[l].scales + (k / 4) * 64; + const int8_t *scales_1 = b_ptr[l].scales + (k / 4) * 64 + 16; + const int8_t *scales_2 = b_ptr[l].scales + (k / 4) * 64 + 32; + const int8_t *scales_3 = b_ptr[l].scales + (k / 4) * 64 + 48; for (int j = 0; j < ncols_interleaved; j++) { sumi1 = 0; sumi2 = 0; @@ -659,22 +659,10 @@ void ggml_gemv_q6_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, sumi = 0; int offset = ((k / 2) % 2) + j * 2; for (int i = 0; i < blocklen; ++i) { - const int hbits_index = k * ncols_interleaved * blocklen + j * blocklen + i; - const 
int lbits_index = (hbits_index / 32) * 64 + (hbits_index % 32); - const int v0_hbits = (int8_t) ((b_ptr[l].qh[hbits_index] & 3) << 4); - const int v1_hbits = (int8_t) (((b_ptr[l].qh[hbits_index] >> 2 ) & 3) << 4); - const int v2_hbits = (int8_t) (((b_ptr[l].qh[hbits_index] >> 4 ) & 3) << 4); - const int v3_hbits = (int8_t) (((b_ptr[l].qh[hbits_index] >> 6 ) & 3) << 4); - - const int v0_lbits = (int8_t) (b_ptr[l].qh[lbits_index] & 0xF); - const int v1_lbits = (int8_t) (b_ptr[l].qh[lbits_index + 32] & 0xF); - const int v2_lbits = (int8_t) ((b_ptr[l].qh[lbits_index] >> 4) & 0xF); - const int v3_lbits = (int8_t) ((b_ptr[l].qh[lbits_index + 32] >> 4) & 0xF); - - const int v0 = v0_hbits | v0_lbits; - const int v1 = v1_hbits | v1_lbits; - const int v2 = v2_hbits | v2_lbits; - const int v3 = v3_hbits | v3_lbits; + int8_t v0 = (int8_t)((b_ptr[l].qh[hbits_index] & 3) << 4) | (b_ptr[l].ql[lbits_index] & 0xF) - 32; + int8_t v1 = (int8_t)(((b_ptr[l].qh[hbits_index] >> 2 ) & 3) << 4) | (b_ptr[l].ql[lbits_index + 32] & 0xF) - 32; + int8_t v2 = (int8_t)(((b_ptr[l].qh[hbits_index] >> 4 ) & 3) << 4) | ((b_ptr[l].ql[lbits_index] >> 4) & 0xF) - 32; + int8_t v3 = (int8_t)(((b_ptr[l].qh[hbits_index] >> 6 ) & 3) << 4) | ((b_ptr[l].ql[lbits_index + 32] >> 4) & 0xF) - 32; sumi1 = (v0 * a_ptr[l].qs[(k >> 2) * 128 + (k % 4) * blocklen + i]); sumi2 = (v1 * a_ptr[l].qs[(k >> 2) * 128 + (k % 4) * blocklen + i + 32]); @@ -1226,20 +1214,19 @@ void ggml_gemm_q6_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, for (int y = 0; y < nr / 4; y++) { const block_q8_Kx4 * a_ptr = (const block_q8_Kx4 *) vy + (y * nb); for (int x = 0; x < nc / ncols_interleaved; x++) { - const block_q2_Kx8 * b_ptr = (const block_q2_Kx8 *) vx + (x * nb); + const block_q6_Kx8 * b_ptr = (const block_q6_Kx8 *) vx + (x * nb); for (int m = 0; m < 4; m++) { for (int j = 0; j < ncols_interleaved; j++) { sumf[m][j] = 0.0; - sum_minf[m][j] = 0.0; } } for (int l = 0; l < nb; l++) { for (int k = 0; k < (qk / (4 * blocklen)); k++) { - const uint8_t *scales_0 = b_ptr[l].scales + (k / 4) * 64 ; - const uint8_t *scales_1 = b_ptr[l].scales + (k / 4) * 64 + 16; - const uint8_t *scales_2 = b_ptr[l].scales + (k / 4) * 64 + 32; - const uint8_t *scales_3 = b_ptr[l].scales + (k / 4) * 64 + 48; + const int8_t *scales_0 = b_ptr[l].scales + (k / 4) * 64; + const int8_t *scales_1 = b_ptr[l].scales + (k / 4) * 64 + 16; + const int8_t *scales_2 = b_ptr[l].scales + (k / 4) * 64 + 32; + const int8_t *scales_3 = b_ptr[l].scales + (k / 4) * 64 + 48; for (int m = 0; m < 4; m++) { for (int j = 0; j < ncols_interleaved; j++) { sumi1 = 0; @@ -1251,20 +1238,11 @@ void ggml_gemm_q6_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, for (int i = 0; i < blocklen; ++i){ const int hbits_index = k * ncols_interleaved * blocklen + j * blocklen + i; const int lbits_index = (hbits_index / 32) * 64 + (hbits_index % 32); - const int v0_hbits = (int8_t) ((b_ptr[l].qh[hbits_index] & 3) << 4); - const int v1_hbits = (int8_t) (((b_ptr[l].qh[hbits_index] >> 2 ) & 3) << 4); - const int v2_hbits = (int8_t) (((b_ptr[l].qh[hbits_index] >> 4 ) & 3) << 4); - const int v3_hbits = (int8_t) (((b_ptr[l].qh[hbits_index] >> 6 ) & 3) << 4); - const int v0_lbits = (int8_t) (b_ptr[l].qh[lbits_index] & 0xF); - const int v1_lbits = (int8_t) (b_ptr[l].qh[lbits_index + 32] & 0xF); - const int v2_lbits = (int8_t) ((b_ptr[l].qh[lbits_index] >> 4) & 0xF); - const int v3_lbits = (int8_t) ((b_ptr[l].qh[lbits_index + 32] >> 4) & 0xF); - - const int v0 = v0_hbits | v0_lbits; - const int v1 = v1_hbits | 
v1_lbits; - const int v2 = v2_hbits | v2_lbits; - const int v3 = v3_hbits | v3_lbits; + int8_t v0 = (int8_t)((b_ptr[l].qh[hbits_index] & 3) << 4) | (b_ptr[l].ql[lbits_index] & 0xF) - 32; + int8_t v1 = (int8_t)(((b_ptr[l].qh[hbits_index] >> 2 ) & 3) << 4) | (b_ptr[l].ql[lbits_index + 32] & 0xF) - 32; + int8_t v2 = (int8_t)(((b_ptr[l].qh[hbits_index] >> 4 ) & 3) << 4) | ((b_ptr[l].ql[lbits_index] >> 4) & 0xF) - 32; + int8_t v3 = (int8_t)(((b_ptr[l].qh[hbits_index] >> 6 ) & 3) << 4) | ((b_ptr[l].ql[lbits_index + 32] >> 4) & 0xF) - 32; sumi1 = (v0 * a_ptr[l].qs[(k >> 2) * 512 + (k % 4) * 4 * blocklen + m * blocklen + i]); sumi2 = (v1 * a_ptr[l].qs[(k >> 2) * 512 + (k % 4) * 4 * blocklen + m * blocklen + i + 128]); From 5c851ca7bd5d816310fe3fc4c0fe57cb0abae96d Mon Sep 17 00:00:00 2001 From: Srihari-mcw Date: Tue, 12 Aug 2025 22:10:29 +0530 Subject: [PATCH 14/23] Cleanup GEMV Code --- ggml/src/ggml-cpu/arch/x86/repack.cpp | 218 +++++++++++++------------- 1 file changed, 107 insertions(+), 111 deletions(-) diff --git a/ggml/src/ggml-cpu/arch/x86/repack.cpp b/ggml/src/ggml-cpu/arch/x86/repack.cpp index 7e6a375e89..0d93534524 100644 --- a/ggml/src/ggml-cpu/arch/x86/repack.cpp +++ b/ggml/src/ggml-cpu/arch/x86/repack.cpp @@ -1971,8 +1971,11 @@ void ggml_gemv_q6_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo // Permute mask used for easier vector processing at later stages __m256i finalpermutemask = _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0); + // Mask to extract 2 bits from packed bytes const __m256i m3b = _mm256_set1_epi8(3); + // Mask to extract nibbles from packed bytes const __m256i m4b = _mm256_set1_epi8(0xF); + // Vector with each byte value 32 - Used as an subtract offset for 6 bit quantized values const __m256i m32s = _mm256_set1_epi8(32); //Mask to get appropriate scales @@ -1998,7 +2001,6 @@ void ggml_gemv_q6_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo // Master FP accumulators __m256 acc_row = _mm256_setzero_ps(); - __m256 acc_min_rows = _mm256_setzero_ps(); for (int64_t b = 0; b < nb; b++) { @@ -2015,135 +2017,137 @@ void ggml_gemv_q6_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo for(int sb = 0; sb < QK_K / 128; sb++) { // Load the high bits(bit 5, 6) of eight block_q6_K for eight sub blocks quantized values interleaved with each other in chunks of eight - B0,B1 ....B6,B7 - const __m256i rhs_raw_vec_qh_0123_0 = _mm256_loadu_si256((const __m256i * )(b_ptr[b].qh + sb * 256)); - const __m256i rhs_raw_vec_qh_4567_0 = _mm256_loadu_si256((const __m256i * )(b_ptr[b].qh + 32 + sb * 256)); - const __m256i rhs_raw_vec_qh_0123_1 = _mm256_loadu_si256((const __m256i * )(b_ptr[b].qh + 64 + sb * 256)); - const __m256i rhs_raw_vec_qh_4567_1 = _mm256_loadu_si256((const __m256i * )(b_ptr[b].qh + 96 + sb * 256)); - const __m256i rhs_raw_vec_qh_0123_2 = _mm256_loadu_si256((const __m256i * )(b_ptr[b].qh + 128 + sb * 256)); - const __m256i rhs_raw_vec_qh_4567_2 = _mm256_loadu_si256((const __m256i * )(b_ptr[b].qh + 160 + sb * 256)); - const __m256i rhs_raw_vec_qh_0123_3 = _mm256_loadu_si256((const __m256i * )(b_ptr[b].qh + 192 + sb * 256)); - const __m256i rhs_raw_vec_qh_4567_3 = _mm256_loadu_si256((const __m256i * )(b_ptr[b].qh + 224 + sb * 256)); + const __m256i rhs_raw_hbit_0123_0 = _mm256_loadu_si256((const __m256i * )(b_ptr[b].qh + sb * 256)); + const __m256i rhs_raw_hbit_4567_0 = _mm256_loadu_si256((const __m256i * )(b_ptr[b].qh + 32 + sb * 256)); + const __m256i rhs_raw_hbit_0123_1 = _mm256_loadu_si256((const __m256i * )(b_ptr[b].qh + 64 + sb * 256)); + 
const __m256i rhs_raw_hbit_4567_1 = _mm256_loadu_si256((const __m256i * )(b_ptr[b].qh + 96 + sb * 256)); + const __m256i rhs_raw_hbit_0123_2 = _mm256_loadu_si256((const __m256i * )(b_ptr[b].qh + 128 + sb * 256)); + const __m256i rhs_raw_hbit_4567_2 = _mm256_loadu_si256((const __m256i * )(b_ptr[b].qh + 160 + sb * 256)); + const __m256i rhs_raw_hbit_0123_3 = _mm256_loadu_si256((const __m256i * )(b_ptr[b].qh + 192 + sb * 256)); + const __m256i rhs_raw_hbit_4567_3 = _mm256_loadu_si256((const __m256i * )(b_ptr[b].qh + 224 + sb * 256)); // 2-bit -> 8-bit - // Values of the 0th,2nd,4th,6th sub blocks of eight block_q6_K structures for the sb loop - const __m256i rhs_vec_qh_0123_00 = _mm256_slli_epi16(_mm256_and_si256(rhs_raw_vec_qh_0123_0, m3b), 4); //B00(0-7) B01(0-7) B02(0-7) B03(0-7) - const __m256i rhs_vec_qh_0123_20 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_qh_0123_0, 2), m3b), 4); //B20(0-7) B21(0-7) B22(0-7) B23(0-7) - const __m256i rhs_vec_qh_0123_40 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_qh_0123_0, 4), m3b), 4); //B40(0-7) B41(0-7) B42(0-7) B43(0-7) - const __m256i rhs_vec_qh_0123_60 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_qh_0123_0, 6), m3b), 4); //B60(0-7) B61(0-7) B62(0-7) B63(0-7) + // hbit Values of the 0th,2nd,4th,6th sub blocks of eight block_q6_K structures for the sb loop + const __m256i rhs_hbit_0123_00 = _mm256_slli_epi16(_mm256_and_si256(rhs_raw_hbit_0123_0, m3b), 4); //B00(0-7) B01(0-7) B02(0-7) B03(0-7) + const __m256i rhs_hbit_0123_20 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0123_0, 2), m3b), 4); //B20(0-7) B21(0-7) B22(0-7) B23(0-7) + const __m256i rhs_hbit_0123_40 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0123_0, 4), m3b), 4); //B40(0-7) B41(0-7) B42(0-7) B43(0-7) + const __m256i rhs_hbit_0123_60 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0123_0, 6), m3b), 4); //B60(0-7) B61(0-7) B62(0-7) B63(0-7) - const __m256i rhs_vec_qh_4567_00 = _mm256_slli_epi16(_mm256_and_si256(rhs_raw_vec_qh_4567_0, m3b), 4); //B04(0-7) B05(0-7) B06(0-7) B07(0-7) - const __m256i rhs_vec_qh_4567_20 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_qh_4567_0, 2), m3b), 4); //B24(0-7) B25(0-7) B26(0-7) B27(0-7) - const __m256i rhs_vec_qh_4567_40 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_qh_4567_0, 4), m3b), 4); //B44(0-7) B45(0-7) B46(0-7) B47(0-7) - const __m256i rhs_vec_qh_4567_60 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_qh_4567_0, 6), m3b), 4); //B64(0-7) B65(0-7) B66(0-7) B67(0-7) + const __m256i rhs_hbit_4567_00 = _mm256_slli_epi16(_mm256_and_si256(rhs_raw_hbit_4567_0, m3b), 4); //B04(0-7) B05(0-7) B06(0-7) B07(0-7) + const __m256i rhs_hbit_4567_20 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_4567_0, 2), m3b), 4); //B24(0-7) B25(0-7) B26(0-7) B27(0-7) + const __m256i rhs_hbit_4567_40 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_4567_0, 4), m3b), 4); //B44(0-7) B45(0-7) B46(0-7) B47(0-7) + const __m256i rhs_hbit_4567_60 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_4567_0, 6), m3b), 4); //B64(0-7) B65(0-7) B66(0-7) B67(0-7) - const __m256i rhs_vec_qh_0123_01 = _mm256_slli_epi16(_mm256_and_si256(rhs_raw_vec_qh_0123_1, m3b), 4); //B00(8-15) B01(8-15) B02(8-15) B03(8-15) - const __m256i rhs_vec_qh_0123_21 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_qh_0123_1, 2), m3b), 4); //B20(8-15) 
B21(8-15) B22(8-15) B23(8-15) - const __m256i rhs_vec_qh_0123_41 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_qh_0123_1, 4), m3b), 4); //B40(8-15) B41(8-15) B42(8-15) B43(8-15) - const __m256i rhs_vec_qh_0123_61 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_qh_0123_1, 6), m3b), 4); //B60(8-15) B61(8-15) B62(8-15) B63(8-15) + const __m256i rhs_hbit_0123_01 = _mm256_slli_epi16(_mm256_and_si256(rhs_raw_hbit_0123_1, m3b), 4); //B00(8-15) B01(8-15) B02(8-15) B03(8-15) + const __m256i rhs_hbit_0123_21 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0123_1, 2), m3b), 4); //B20(8-15) B21(8-15) B22(8-15) B23(8-15) + const __m256i rhs_hbit_0123_41 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0123_1, 4), m3b), 4); //B40(8-15) B41(8-15) B42(8-15) B43(8-15) + const __m256i rhs_hbit_0123_61 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0123_1, 6), m3b), 4); //B60(8-15) B61(8-15) B62(8-15) B63(8-15) - const __m256i rhs_vec_qh_4567_01 = _mm256_slli_epi16(_mm256_and_si256(rhs_raw_vec_qh_4567_1, m3b), 4); //B04(8-15) B05(8-15) B06(8-15) B07(8-15) - const __m256i rhs_vec_qh_4567_21 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_qh_4567_1, 2), m3b), 4); //B24(8-15) B25(8-15) B26(8-15) B27(8-15) - const __m256i rhs_vec_qh_4567_41 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_qh_4567_1, 4), m3b), 4); //B44(8-15) B45(8-15) B46(8-15) B47(8-15) - const __m256i rhs_vec_qh_4567_61 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_qh_4567_1, 6), m3b), 4); //B64(8-15) B65(8-15) B66(8-15) B67(8-15) + const __m256i rhs_hbit_4567_01 = _mm256_slli_epi16(_mm256_and_si256(rhs_raw_hbit_4567_1, m3b), 4); //B04(8-15) B05(8-15) B06(8-15) B07(8-15) + const __m256i rhs_hbit_4567_21 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_4567_1, 2), m3b), 4); //B24(8-15) B25(8-15) B26(8-15) B27(8-15) + const __m256i rhs_hbit_4567_41 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_4567_1, 4), m3b), 4); //B44(8-15) B45(8-15) B46(8-15) B47(8-15) + const __m256i rhs_hbit_4567_61 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_4567_1, 6), m3b), 4); //B64(8-15) B65(8-15) B66(8-15) B67(8-15) - // Values of the 1st,3rd,5th,7th sub blocks of eight block_q6_K structures for the sb loop - const __m256i rhs_vec_qh_0123_10 = _mm256_slli_epi16(_mm256_and_si256(rhs_raw_vec_qh_0123_2, m3b), 4); //B10(0-7) B11(0-7) B12(0-7) B13(0-7) - const __m256i rhs_vec_qh_0123_30 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_qh_0123_2, 2), m3b), 4); //B30(0-7) B31(0-7) B32(0-7) B33(0-7) - const __m256i rhs_vec_qh_0123_50 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_qh_0123_2, 4), m3b), 4); //B50(0-7) B51(0-7) B52(0-7) B53(0-7) - const __m256i rhs_vec_qh_0123_70 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_qh_0123_2, 6), m3b), 4); //B70(0-7) B71(0-7) B72(0-7) B73(0-7) + // hbit Values of the 1st,3rd,5th,7th sub blocks of eight block_q6_K structures for the sb loop + const __m256i rhs_hbit_0123_10 = _mm256_slli_epi16(_mm256_and_si256(rhs_raw_hbit_0123_2, m3b), 4); //B10(0-7) B11(0-7) B12(0-7) B13(0-7) + const __m256i rhs_hbit_0123_30 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0123_2, 2), m3b), 4); //B30(0-7) B31(0-7) B32(0-7) B33(0-7) + const __m256i rhs_hbit_0123_50 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0123_2, 4), m3b), 4); 
//B50(0-7) B51(0-7) B52(0-7) B53(0-7) + const __m256i rhs_hbit_0123_70 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0123_2, 6), m3b), 4); //B70(0-7) B71(0-7) B72(0-7) B73(0-7) - const __m256i rhs_vec_qh_4567_10 = _mm256_slli_epi16(_mm256_and_si256(rhs_raw_vec_qh_4567_2, m3b), 4); //B14(0-7) B15(0-7) B16(0-7) B17(0-7) - const __m256i rhs_vec_qh_4567_30 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_qh_4567_2, 2), m3b), 4); //B34(0-7) B35(0-7) B36(0-7) B37(0-7) - const __m256i rhs_vec_qh_4567_50 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_qh_4567_2, 4), m3b), 4); //B54(0-7) B55(0-7) B56(0-7) B57(0-7) - const __m256i rhs_vec_qh_4567_70 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_qh_4567_2, 6), m3b), 4); //B74(0-7) B75(0-7) B76(0-7) B77(0-7) + const __m256i rhs_hbit_4567_10 = _mm256_slli_epi16(_mm256_and_si256(rhs_raw_hbit_4567_2, m3b), 4); //B14(0-7) B15(0-7) B16(0-7) B17(0-7) + const __m256i rhs_hbit_4567_30 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_4567_2, 2), m3b), 4); //B34(0-7) B35(0-7) B36(0-7) B37(0-7) + const __m256i rhs_hbit_4567_50 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_4567_2, 4), m3b), 4); //B54(0-7) B55(0-7) B56(0-7) B57(0-7) + const __m256i rhs_hbit_4567_70 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_4567_2, 6), m3b), 4); //B74(0-7) B75(0-7) B76(0-7) B77(0-7) - const __m256i rhs_vec_qh_0123_11 = _mm256_slli_epi16(_mm256_and_si256(rhs_raw_vec_qh_0123_3, m3b), 4); //B10(8-15) B11(8-15) B12(8-15) B13(8-15) - const __m256i rhs_vec_qh_0123_31 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_qh_0123_3, 2), m3b), 4); //B30(8-15) B31(8-15) B32(8-15) B33(8-15) - const __m256i rhs_vec_qh_0123_51 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_qh_0123_3, 4), m3b), 4); //B50(8-15) B51(8-15) B52(8-15) B53(8-15) - const __m256i rhs_vec_qh_0123_71 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_qh_0123_3, 6), m3b), 4); //B70(8-15) B71(8-15) B72(8-15) B73(8-15) + const __m256i rhs_hbit_0123_11 = _mm256_slli_epi16(_mm256_and_si256(rhs_raw_hbit_0123_3, m3b), 4); //B10(8-15) B11(8-15) B12(8-15) B13(8-15) + const __m256i rhs_hbit_0123_31 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0123_3, 2), m3b), 4); //B30(8-15) B31(8-15) B32(8-15) B33(8-15) + const __m256i rhs_hbit_0123_51 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0123_3, 4), m3b), 4); //B50(8-15) B51(8-15) B52(8-15) B53(8-15) + const __m256i rhs_hbit_0123_71 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0123_3, 6), m3b), 4); //B70(8-15) B71(8-15) B72(8-15) B73(8-15) - const __m256i rhs_vec_qh_4567_11 = _mm256_slli_epi16(_mm256_and_si256(rhs_raw_vec_qh_4567_3, m3b), 4); //B14(8-15) B15(8-15) B16(8-15) B17(8-15) - const __m256i rhs_vec_qh_4567_31 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_qh_4567_3, 2), m3b), 4); //B34(8-15) B35(8-15) B36(8-15) B37(8-15) - const __m256i rhs_vec_qh_4567_51 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_qh_4567_3, 4), m3b), 4); //B54(8-15) B55(8-15) B56(8-15) B57(8-15) - const __m256i rhs_vec_qh_4567_71 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_qh_4567_3, 6), m3b), 4); //B74(8-15) B75(8-15) B76(8-15) B77(8-15) + const __m256i rhs_hbit_4567_11 = _mm256_slli_epi16(_mm256_and_si256(rhs_raw_hbit_4567_3, m3b), 4); //B14(8-15) B15(8-15) B16(8-15) B17(8-15) + 
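/*
 * Note (illustrative, not part of the patch) on the deferred bias handling in this
 * function: each Q6_K weight is kept here as the biased byte u = q + 32, so for a
 * sub block dotted against Q8 activation bytes a,
 *
 *     sum(u * a) = sum((q + 32) * a) = sum(q * a) + 32 * sum(a)
 *
 * and the signed result is recovered as sum(u * a) - 32 * sum(a). The
 * _mm256_maddubs_epi16(m32s, lhs_vec_*) products further down compute exactly that
 * 32 * sum(a) correction, which is what the "subtracted later" comments around m32s
 * refer to.
 */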
const __m256i rhs_hbit_4567_31 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_4567_3, 2), m3b), 4); //B34(8-15) B35(8-15) B36(8-15) B37(8-15) + const __m256i rhs_hbit_4567_51 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_4567_3, 4), m3b), 4); //B54(8-15) B55(8-15) B56(8-15) B57(8-15) + const __m256i rhs_hbit_4567_71 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_4567_3, 6), m3b), 4); //B74(8-15) B75(8-15) B76(8-15) B77(8-15) - // Load the lower bits(bits 0 - 3) of eight block_q6_K for eight sub blocks quantized values interleaved with each other in chunks of eight - B0,B1 ....B6,B7 - const __m256i rhs_raw_vec_ql_0123_0 = _mm256_loadu_si256((const __m256i * )(b_ptr[b].ql + sb * 512)); // 0 - 8, +64 - const __m256i rhs_raw_vec_ql_4567_0 = _mm256_loadu_si256((const __m256i * )(b_ptr[b].ql + 32 + sb * 512)); // 0 - 8 - const __m256i rhs_raw_vec_ql_0123_1 = _mm256_loadu_si256((const __m256i * )(b_ptr[b].ql + 64 + sb * 512)); // 8 - 15 - const __m256i rhs_raw_vec_ql_4567_1 = _mm256_loadu_si256((const __m256i * )(b_ptr[b].ql + 96 + sb * 512)); // 8 - 15 - const __m256i rhs_raw_vec_ql_0123_2 = _mm256_loadu_si256((const __m256i * )(b_ptr[b].ql + 128 + sb * 512)); // 16 - 23 - const __m256i rhs_raw_vec_ql_4567_2 = _mm256_loadu_si256((const __m256i * )(b_ptr[b].ql + 160 + sb * 512)); // 16 - 23 - const __m256i rhs_raw_vec_ql_0123_3 = _mm256_loadu_si256((const __m256i * )(b_ptr[b].ql + 192 + sb * 512)); // 24 - 31 - const __m256i rhs_raw_vec_ql_4567_3 = _mm256_loadu_si256((const __m256i * )(b_ptr[b].ql + 224 + sb * 512)); // 24 - 31 - const __m256i rhs_raw_vec_ql_0123_4 = _mm256_loadu_si256((const __m256i * )(b_ptr[b].ql + 256 + sb * 512)); - const __m256i rhs_raw_vec_ql_4567_4 = _mm256_loadu_si256((const __m256i * )(b_ptr[b].ql + 288 + sb * 512)); - const __m256i rhs_raw_vec_ql_0123_5 = _mm256_loadu_si256((const __m256i * )(b_ptr[b].ql + 320 + sb * 512)); - const __m256i rhs_raw_vec_ql_4567_5 = _mm256_loadu_si256((const __m256i * )(b_ptr[b].ql + 352 + sb * 512)); - const __m256i rhs_raw_vec_ql_0123_6 = _mm256_loadu_si256((const __m256i * )(b_ptr[b].ql + 384 + sb * 512)); - const __m256i rhs_raw_vec_ql_4567_6 = _mm256_loadu_si256((const __m256i * )(b_ptr[b].ql + 416 + sb * 512)); - const __m256i rhs_raw_vec_ql_0123_7 = _mm256_loadu_si256((const __m256i * )(b_ptr[b].ql + 448 + sb * 512)); - const __m256i rhs_raw_vec_ql_4567_7 = _mm256_loadu_si256((const __m256i * )(b_ptr[b].ql + 480 + sb * 512)); + // Load the eight block_q6_k for eight sub blocks quantized values interleaved with each other in chunks of eight bytes - B0,B1 ....B6,B7 + // The lower and higher packed bits are loaded, unpacked and individual bytes representing 6 bits each are formed from the same + // They are blended/permuted for further mul mat operations within the pipeline + const __m256i rhs_raw_lbit_0123_0 = _mm256_loadu_si256((const __m256i * )(b_ptr[b].ql + sb * 512)); + const __m256i rhs_raw_lbit_4567_0 = _mm256_loadu_si256((const __m256i * )(b_ptr[b].ql + 32 + sb * 512)); + const __m256i rhs_raw_lbit_0123_1 = _mm256_loadu_si256((const __m256i * )(b_ptr[b].ql + 64 + sb * 512)); + const __m256i rhs_raw_lbit_4567_1 = _mm256_loadu_si256((const __m256i * )(b_ptr[b].ql + 96 + sb * 512)); + const __m256i rhs_raw_lbit_0123_2 = _mm256_loadu_si256((const __m256i * )(b_ptr[b].ql + 128 + sb * 512)); + const __m256i rhs_raw_lbit_4567_2 = _mm256_loadu_si256((const __m256i * )(b_ptr[b].ql + 160 + sb * 512)); + const __m256i rhs_raw_lbit_0123_3 = _mm256_loadu_si256((const __m256i 
* )(b_ptr[b].ql + 192 + sb * 512)); + const __m256i rhs_raw_lbit_4567_3 = _mm256_loadu_si256((const __m256i * )(b_ptr[b].ql + 224 + sb * 512)); + const __m256i rhs_raw_lbit_0123_4 = _mm256_loadu_si256((const __m256i * )(b_ptr[b].ql + 256 + sb * 512)); + const __m256i rhs_raw_lbit_4567_4 = _mm256_loadu_si256((const __m256i * )(b_ptr[b].ql + 288 + sb * 512)); + const __m256i rhs_raw_lbit_0123_5 = _mm256_loadu_si256((const __m256i * )(b_ptr[b].ql + 320 + sb * 512)); + const __m256i rhs_raw_lbit_4567_5 = _mm256_loadu_si256((const __m256i * )(b_ptr[b].ql + 352 + sb * 512)); + const __m256i rhs_raw_lbit_0123_6 = _mm256_loadu_si256((const __m256i * )(b_ptr[b].ql + 384 + sb * 512)); + const __m256i rhs_raw_lbit_4567_6 = _mm256_loadu_si256((const __m256i * )(b_ptr[b].ql + 416 + sb * 512)); + const __m256i rhs_raw_lbit_0123_7 = _mm256_loadu_si256((const __m256i * )(b_ptr[b].ql + 448 + sb * 512)); + const __m256i rhs_raw_lbit_4567_7 = _mm256_loadu_si256((const __m256i * )(b_ptr[b].ql + 480 + sb * 512)); - // 0 -7, 64 - 71 - const __m256i rhs_vec_0123_00 = _mm256_or_si256(_mm256_and_si256(rhs_raw_vec_ql_0123_0, m4b), rhs_vec_qh_0123_00); - const __m256i rhs_vec_0123_40 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_ql_0123_0, 4), m4b), rhs_vec_qh_0123_40); + // Comments indicate the indices of elements from individual super block in non interleaved fashion + // Index : 0 -7, 64 - 71 + const __m256i rhs_vec_0123_00 = _mm256_or_si256(_mm256_and_si256(rhs_raw_lbit_0123_0, m4b), rhs_hbit_0123_00); + const __m256i rhs_vec_0123_40 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_lbit_0123_0, 4), m4b), rhs_hbit_0123_40); - const __m256i rhs_vec_4567_00 = _mm256_or_si256(_mm256_and_si256(rhs_raw_vec_ql_4567_0, m4b), rhs_vec_qh_4567_00); - const __m256i rhs_vec_4567_40 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_ql_4567_0, 4), m4b), rhs_vec_qh_4567_40); + const __m256i rhs_vec_4567_00 = _mm256_or_si256(_mm256_and_si256(rhs_raw_lbit_4567_0, m4b), rhs_hbit_4567_00); + const __m256i rhs_vec_4567_40 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_lbit_4567_0, 4), m4b), rhs_hbit_4567_40); - // 8 - 15, 72 - 79 - const __m256i rhs_vec_0123_01 = _mm256_or_si256(_mm256_and_si256(rhs_raw_vec_ql_0123_1, m4b), rhs_vec_qh_0123_01); - const __m256i rhs_vec_0123_41 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_ql_0123_1, 4), m4b), rhs_vec_qh_0123_41); + // Index : 8 - 15, 72 - 79 + const __m256i rhs_vec_0123_01 = _mm256_or_si256(_mm256_and_si256(rhs_raw_lbit_0123_1, m4b), rhs_hbit_0123_01); + const __m256i rhs_vec_0123_41 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_lbit_0123_1, 4), m4b), rhs_hbit_0123_41); - const __m256i rhs_vec_4567_01 = _mm256_or_si256(_mm256_and_si256(rhs_raw_vec_ql_4567_1, m4b), rhs_vec_qh_4567_01); - const __m256i rhs_vec_4567_41 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_ql_4567_1, 4), m4b), rhs_vec_qh_4567_41); + const __m256i rhs_vec_4567_01 = _mm256_or_si256(_mm256_and_si256(rhs_raw_lbit_4567_1, m4b), rhs_hbit_4567_01); + const __m256i rhs_vec_4567_41 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_lbit_4567_1, 4), m4b), rhs_hbit_4567_41); - // 16 - 23, 80 - 87 - const __m256i rhs_vec_0123_10 = _mm256_or_si256(_mm256_and_si256(rhs_raw_vec_ql_0123_2, m4b), rhs_vec_qh_0123_10); - const __m256i rhs_vec_0123_50 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_ql_0123_2, 4), m4b), rhs_vec_qh_0123_50); + // Index : 16 - 23, 80 - 87 + const __m256i 
rhs_vec_0123_10 = _mm256_or_si256(_mm256_and_si256(rhs_raw_lbit_0123_2, m4b), rhs_hbit_0123_10); + const __m256i rhs_vec_0123_50 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_lbit_0123_2, 4), m4b), rhs_hbit_0123_50); - const __m256i rhs_vec_4567_10 = _mm256_or_si256(_mm256_and_si256(rhs_raw_vec_ql_4567_2, m4b), rhs_vec_qh_4567_10); - const __m256i rhs_vec_4567_50 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_ql_4567_2, 4), m4b), rhs_vec_qh_4567_50); + const __m256i rhs_vec_4567_10 = _mm256_or_si256(_mm256_and_si256(rhs_raw_lbit_4567_2, m4b), rhs_hbit_4567_10); + const __m256i rhs_vec_4567_50 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_lbit_4567_2, 4), m4b), rhs_hbit_4567_50); - // 24 - 31, 88 - 95 - const __m256i rhs_vec_0123_11 = _mm256_or_si256(_mm256_and_si256(rhs_raw_vec_ql_0123_3, m4b), rhs_vec_qh_0123_11); - const __m256i rhs_vec_0123_51 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_ql_0123_3, 4), m4b), rhs_vec_qh_0123_51); + // Index : 24 - 31, 88 - 95 + const __m256i rhs_vec_0123_11 = _mm256_or_si256(_mm256_and_si256(rhs_raw_lbit_0123_3, m4b), rhs_hbit_0123_11); + const __m256i rhs_vec_0123_51 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_lbit_0123_3, 4), m4b), rhs_hbit_0123_51); - const __m256i rhs_vec_4567_11 = _mm256_or_si256(_mm256_and_si256(rhs_raw_vec_ql_4567_3, m4b), rhs_vec_qh_4567_11); - const __m256i rhs_vec_4567_51 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_ql_4567_3, 4), m4b), rhs_vec_qh_4567_51); + const __m256i rhs_vec_4567_11 = _mm256_or_si256(_mm256_and_si256(rhs_raw_lbit_4567_3, m4b), rhs_hbit_4567_11); + const __m256i rhs_vec_4567_51 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_lbit_4567_3, 4), m4b), rhs_hbit_4567_51); - // 32 - 39, 96 - 103 - const __m256i rhs_vec_0123_20 = _mm256_or_si256(_mm256_and_si256(rhs_raw_vec_ql_0123_4, m4b), rhs_vec_qh_0123_20); - const __m256i rhs_vec_0123_60 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_ql_0123_4, 4), m4b), rhs_vec_qh_0123_60); + // Index : 32 - 39, 96 - 103 + const __m256i rhs_vec_0123_20 = _mm256_or_si256(_mm256_and_si256(rhs_raw_lbit_0123_4, m4b), rhs_hbit_0123_20); + const __m256i rhs_vec_0123_60 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_lbit_0123_4, 4), m4b), rhs_hbit_0123_60); - const __m256i rhs_vec_4567_20 = _mm256_or_si256(_mm256_and_si256(rhs_raw_vec_ql_4567_4, m4b), rhs_vec_qh_4567_20); - const __m256i rhs_vec_4567_60 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_ql_4567_4, 4), m4b), rhs_vec_qh_4567_60); + const __m256i rhs_vec_4567_20 = _mm256_or_si256(_mm256_and_si256(rhs_raw_lbit_4567_4, m4b), rhs_hbit_4567_20); + const __m256i rhs_vec_4567_60 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_lbit_4567_4, 4), m4b), rhs_hbit_4567_60); - // 40 - 47, 104 - 111 - const __m256i rhs_vec_0123_21 = _mm256_or_si256(_mm256_and_si256(rhs_raw_vec_ql_0123_5, m4b), rhs_vec_qh_0123_21); - const __m256i rhs_vec_0123_61 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_ql_0123_5, 4), m4b), rhs_vec_qh_0123_61); + // Index : 40 - 47, 104 - 111 + const __m256i rhs_vec_0123_21 = _mm256_or_si256(_mm256_and_si256(rhs_raw_lbit_0123_5, m4b), rhs_hbit_0123_21); + const __m256i rhs_vec_0123_61 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_lbit_0123_5, 4), m4b), rhs_hbit_0123_61); - const __m256i rhs_vec_4567_21 = _mm256_or_si256(_mm256_and_si256(rhs_raw_vec_ql_4567_5, m4b), rhs_vec_qh_4567_21); - const __m256i 
rhs_vec_4567_61 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_ql_4567_5, 4), m4b), rhs_vec_qh_4567_61); + const __m256i rhs_vec_4567_21 = _mm256_or_si256(_mm256_and_si256(rhs_raw_lbit_4567_5, m4b), rhs_hbit_4567_21); + const __m256i rhs_vec_4567_61 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_lbit_4567_5, 4), m4b), rhs_hbit_4567_61); - // 48 - 55, 112 - 119 - const __m256i rhs_vec_0123_30 = _mm256_or_si256(_mm256_and_si256(rhs_raw_vec_ql_0123_6, m4b), rhs_vec_qh_0123_30); - const __m256i rhs_vec_0123_70 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_ql_0123_6, 4), m4b), rhs_vec_qh_0123_70); + // Index : 48 - 55, 112 - 119 + const __m256i rhs_vec_0123_30 = _mm256_or_si256(_mm256_and_si256(rhs_raw_lbit_0123_6, m4b), rhs_hbit_0123_30); + const __m256i rhs_vec_0123_70 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_lbit_0123_6, 4), m4b), rhs_hbit_0123_70); - const __m256i rhs_vec_4567_30 = _mm256_or_si256(_mm256_and_si256(rhs_raw_vec_ql_4567_6, m4b), rhs_vec_qh_4567_30); - const __m256i rhs_vec_4567_70 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_ql_4567_6, 4), m4b), rhs_vec_qh_4567_70); + const __m256i rhs_vec_4567_30 = _mm256_or_si256(_mm256_and_si256(rhs_raw_lbit_4567_6, m4b), rhs_hbit_4567_30); + const __m256i rhs_vec_4567_70 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_lbit_4567_6, 4), m4b), rhs_hbit_4567_70); - // 56 - 63, 120 - 127 - const __m256i rhs_vec_0123_31 = _mm256_or_si256(_mm256_and_si256(rhs_raw_vec_ql_0123_7, m4b), rhs_vec_qh_0123_31); - const __m256i rhs_vec_0123_71 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_ql_0123_7, 4), m4b), rhs_vec_qh_0123_71); + // Index : 56 - 63, 120 - 127 + const __m256i rhs_vec_0123_31 = _mm256_or_si256(_mm256_and_si256(rhs_raw_lbit_0123_7, m4b), rhs_hbit_0123_31); + const __m256i rhs_vec_0123_71 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_lbit_0123_7, 4), m4b), rhs_hbit_0123_71); - const __m256i rhs_vec_4567_31 = _mm256_or_si256(_mm256_and_si256(rhs_raw_vec_ql_4567_7, m4b), rhs_vec_qh_4567_31); - const __m256i rhs_vec_4567_71 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_ql_4567_7, 4), m4b), rhs_vec_qh_4567_71); + const __m256i rhs_vec_4567_31 = _mm256_or_si256(_mm256_and_si256(rhs_raw_lbit_4567_7, m4b), rhs_hbit_4567_31); + const __m256i rhs_vec_4567_71 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_lbit_4567_7, 4), m4b), rhs_hbit_4567_71); //Scales of corresponding sub blocks from different Q6_K structures are stored together - //s00 s01 s10 s11 s20 s21 s30 s31 s40 s41 s50 s51 s60 s61 s70 s71 //s02 s03 //s04 s05 //s06 s07 - + //s00 s01 s10 s11 s20 s21 s30 s31 s40 s41 s50 s51 s60 s61 s70 s71 const __m128i scales_01 = _mm_loadu_si128((const __m128i *)(b_ptr[b].scales + sb * 64)); const __m128i scales_23 = _mm_loadu_si128((const __m128i *)(b_ptr[b].scales + 16 + sb * 64)); const __m128i scales_45 = _mm_loadu_si128((const __m128i *)(b_ptr[b].scales + 32 + sb * 64)); @@ -2201,6 +2205,7 @@ void ggml_gemv_q6_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo lhs_vec_6 = _mm256_permute2f128_si256(lhs_vec_6, lhs_vec_6, 0); lhs_vec_7 = _mm256_permute2f128_si256(lhs_vec_7, lhs_vec_7, 0); + // Multiply Q8 quants with bytes valued 32 - Subtracted later as an adjustment for 6 bit quantization __m256i lhs_vec_s_0 = _mm256_maddubs_epi16(m32s, lhs_vec_0); __m256i lhs_vec_s_1 = _mm256_maddubs_epi16(m32s, lhs_vec_1); __m256i lhs_vec_s_2 = _mm256_maddubs_epi16(m32s, lhs_vec_2); @@ 
-6922,7 +6927,6 @@ void ggml_gemm_q6_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo const __m512i rhs_hbit_2367ABEF_60 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_2367ABEF_0, 6), m2_expanded), 4); //Index : 96 - 103 // Comments indicate the indices of elements from individual super block in non interleaved fashion - // Index : 8 - 15, 72 - 79 const __m512i rhs_hbit_014589CD_01 = _mm512_slli_epi16(_mm512_and_si512(rhs_raw_hbit_014589CD_1, m2_expanded), 4); //Index : 8 - 15 const __m512i rhs_hbit_014589CD_21 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_014589CD_1, 2), m2_expanded), 4); //Index : 40 - 47 const __m512i rhs_hbit_014589CD_41 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_014589CD_1, 4), m2_expanded), 4); //Index : 72 - 79 @@ -8719,7 +8723,6 @@ void ggml_gemm_q6_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo const __m256i rhs_raw_hbit_0123_3 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qh + 192 + sb * 256)); const __m256i rhs_raw_hbit_4567_3 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qh + 224 + sb * 256)); - // Indices 0 through 7 (first block): const __m256i rhs_raw_lbit_0145_0 = _mm256_blend_epi32(rhs_raw_lbit_0123_0, _mm256_permutevar8x32_epi32(rhs_raw_lbit_4567_0, requiredOrder), 240); const __m256i rhs_raw_lbit_2367_0 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_lbit_0123_0, requiredOrder), rhs_raw_lbit_4567_0, 240); const __m256i rhs_raw_lbit_0145_1 = _mm256_blend_epi32(rhs_raw_lbit_0123_1, _mm256_permutevar8x32_epi32(rhs_raw_lbit_4567_1, requiredOrder), 240); @@ -8729,7 +8732,6 @@ void ggml_gemm_q6_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo const __m256i rhs_raw_lbit_0145_3 = _mm256_blend_epi32(rhs_raw_lbit_0123_3, _mm256_permutevar8x32_epi32(rhs_raw_lbit_4567_3, requiredOrder), 240); const __m256i rhs_raw_lbit_2367_3 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_lbit_0123_3, requiredOrder), rhs_raw_lbit_4567_3, 240); - // Indices 4 through 7 (second block): const __m256i rhs_raw_lbit_0145_4 = _mm256_blend_epi32(rhs_raw_lbit_0123_4, _mm256_permutevar8x32_epi32(rhs_raw_lbit_4567_4, requiredOrder), 240); const __m256i rhs_raw_lbit_2367_4 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_lbit_0123_4, requiredOrder), rhs_raw_lbit_4567_4, 240); const __m256i rhs_raw_lbit_0145_5 = _mm256_blend_epi32(rhs_raw_lbit_0123_5, _mm256_permutevar8x32_epi32(rhs_raw_lbit_4567_5, requiredOrder), 240); @@ -8793,8 +8795,6 @@ void ggml_gemm_q6_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo // 4 bit values are unpacked/denibbled and bitwise or-ed with the hbit values to form the 6 bit quantized values - // Comments indicate the indices of elements from individual super block in non interleaved fashion - // Index : 0 -7, 64 - 71 // Comments indicate the indices of elements from individual super block in non interleaved fashion // Index : 0 -7, 64 - 71 const __m256i rhs_mat_0145_00 = _mm256_or_si256(_mm256_and_si256(rhs_raw_lbit_0145_0, m4), rhs_hbit_0145_00); @@ -9533,7 +9533,6 @@ void ggml_gemm_q6_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo const __m256i rhs_raw_hbit_0123_3 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qh + 192 + sb * 256)); const __m256i rhs_raw_hbit_4567_3 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qh + 224 + sb * 256)); - // Indices 0 through 7 (first block): const __m256i rhs_raw_lbit_0145_0 = _mm256_blend_epi32(rhs_raw_lbit_0123_0, 
_mm256_permutevar8x32_epi32(rhs_raw_lbit_4567_0, requiredOrder), 240); const __m256i rhs_raw_lbit_2367_0 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_lbit_0123_0, requiredOrder), rhs_raw_lbit_4567_0, 240); const __m256i rhs_raw_lbit_0145_1 = _mm256_blend_epi32(rhs_raw_lbit_0123_1, _mm256_permutevar8x32_epi32(rhs_raw_lbit_4567_1, requiredOrder), 240); @@ -9543,7 +9542,6 @@ void ggml_gemm_q6_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo const __m256i rhs_raw_lbit_0145_3 = _mm256_blend_epi32(rhs_raw_lbit_0123_3, _mm256_permutevar8x32_epi32(rhs_raw_lbit_4567_3, requiredOrder), 240); const __m256i rhs_raw_lbit_2367_3 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_lbit_0123_3, requiredOrder), rhs_raw_lbit_4567_3, 240); - // Indices 4 through 7 (second block): const __m256i rhs_raw_lbit_0145_4 = _mm256_blend_epi32(rhs_raw_lbit_0123_4, _mm256_permutevar8x32_epi32(rhs_raw_lbit_4567_4, requiredOrder), 240); const __m256i rhs_raw_lbit_2367_4 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_lbit_0123_4, requiredOrder), rhs_raw_lbit_4567_4, 240); const __m256i rhs_raw_lbit_0145_5 = _mm256_blend_epi32(rhs_raw_lbit_0123_5, _mm256_permutevar8x32_epi32(rhs_raw_lbit_4567_5, requiredOrder), 240); @@ -9605,8 +9603,6 @@ void ggml_gemm_q6_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo const __m256i rhs_hbit_2367_51 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_2367_3, 4), m2), 4); //Index : 88 - 95 const __m256i rhs_hbit_2367_71 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_2367_3, 6), m2), 4); //Index : 120 - 127 - // Comments indicate the indices of elements from individual super block in non interleaved fashion - // Index : 0 -7, 64 - 71 // Comments indicate the indices of elements from individual super block in non interleaved fashion // Index : 0 -7, 64 - 71 const __m256i rhs_mat_0145_00 = _mm256_or_si256(_mm256_and_si256(rhs_raw_lbit_0145_0, m4), rhs_hbit_0145_00); From be80640fea566404557329b8f1d5413638d2b52f Mon Sep 17 00:00:00 2001 From: Srihari-mcw Date: Tue, 12 Aug 2025 22:14:16 +0530 Subject: [PATCH 15/23] Fix issues with scalar version --- ggml/src/ggml-cpu/repack.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/ggml/src/ggml-cpu/repack.cpp b/ggml/src/ggml-cpu/repack.cpp index 2a3fe71150..4bc5315220 100644 --- a/ggml/src/ggml-cpu/repack.cpp +++ b/ggml/src/ggml-cpu/repack.cpp @@ -659,6 +659,9 @@ void ggml_gemv_q6_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, sumi = 0; int offset = ((k / 2) % 2) + j * 2; for (int i = 0; i < blocklen; ++i) { + const int hbits_index = k * ncols_interleaved * blocklen + j * blocklen + i; + const int lbits_index = (hbits_index / 32) * 64 + (hbits_index % 32); + int8_t v0 = (int8_t)((b_ptr[l].qh[hbits_index] & 3) << 4) | (b_ptr[l].ql[lbits_index] & 0xF) - 32; int8_t v1 = (int8_t)(((b_ptr[l].qh[hbits_index] >> 2 ) & 3) << 4) | (b_ptr[l].ql[lbits_index + 32] & 0xF) - 32; int8_t v2 = (int8_t)(((b_ptr[l].qh[hbits_index] >> 4 ) & 3) << 4) | ((b_ptr[l].ql[lbits_index] >> 4) & 0xF) - 32; From a3957d11730ed4d7ae88846b09dc5c40dfd34c13 Mon Sep 17 00:00:00 2001 From: Srihari-mcw Date: Tue, 12 Aug 2025 22:28:06 +0530 Subject: [PATCH 16/23] Rename variables to maintain convention in other functions --- ggml/src/ggml-cpu/arch/x86/repack.cpp | 650 +++++++++++++------------- 1 file changed, 325 insertions(+), 325 deletions(-) diff --git a/ggml/src/ggml-cpu/arch/x86/repack.cpp b/ggml/src/ggml-cpu/arch/x86/repack.cpp index 0d93534524..1ead6a30b3 100644 --- 
a/ggml/src/ggml-cpu/arch/x86/repack.cpp +++ b/ggml/src/ggml-cpu/arch/x86/repack.cpp @@ -6719,9 +6719,9 @@ void ggml_gemm_q6_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo int anr = nr - nr % 16;; // Used to align nr with boundary of 16 // Mask to extract nibbles from packed bytes - const __m256i m4 = _mm256_set1_epi8(0xF); + const __m256i m4b = _mm256_set1_epi8(0xF); // Mask to extract 2 bit values from packed bytes - const __m256i m2 = _mm256_set1_epi8(3); + const __m256i m3b = _mm256_set1_epi8(3); // Vector with each byte value 32 - Used as an subtract offset for 6 bit quantized values const __m256i m32s = _mm256_set1_epi8(32); @@ -6739,11 +6739,11 @@ void ggml_gemm_q6_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo scalesmask2 = _mm256_permute2f128_si256(scalesmask2, scalesmask2, 0); // Mask to extract nibbles from packed bytes - const __m512i m4_expanded = _mm512_set1_epi8(0xF); + const __m512i m4bexpanded = _mm512_set1_epi8(0xF); // Mask to extract 2 bit values from packed bytes - const __m512i m2_expanded = _mm512_set1_epi8(3); + const __m512i m3bexpanded = _mm512_set1_epi8(3); // Vector with each byte set to 32 - Used as an subtraction adjustment factor for 6 bit quantization - const __m512i m32s_expanded = _mm512_set1_epi8(32); + const __m512i m32expanded = _mm512_set1_epi8(32); //Take group of four block_q8_Kx4 structures at each pass of the loop and perform dot product operation for (; y < anr / 4; y += 4){ @@ -6916,106 +6916,106 @@ void ggml_gemm_q6_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo // 2-bit -> 8-bit // hbit Values of the 0th,2nd,4th,6th sub blocks of eight block_q6_K structures for the sb loop - const __m512i rhs_hbit_014589CD_00 = _mm512_slli_epi16(_mm512_and_si512(rhs_raw_hbit_014589CD_0, m2_expanded), 4); //Index : 0 - 7 - const __m512i rhs_hbit_014589CD_20 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_014589CD_0, 2), m2_expanded), 4); //Index : 32 - 39 - const __m512i rhs_hbit_014589CD_40 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_014589CD_0, 4), m2_expanded), 4); //Index : 64 - 71 - const __m512i rhs_hbit_014589CD_60 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_014589CD_0, 6), m2_expanded), 4); //Index : 96 - 103 + const __m512i rhs_hbit_014589CD_00 = _mm512_slli_epi16(_mm512_and_si512(rhs_raw_hbit_014589CD_0, m3bexpanded), 4); //Index : 0 - 7 + const __m512i rhs_hbit_014589CD_20 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_014589CD_0, 2), m3bexpanded), 4); //Index : 32 - 39 + const __m512i rhs_hbit_014589CD_40 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_014589CD_0, 4), m3bexpanded), 4); //Index : 64 - 71 + const __m512i rhs_hbit_014589CD_60 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_014589CD_0, 6), m3bexpanded), 4); //Index : 96 - 103 - const __m512i rhs_hbit_2367ABEF_00 = _mm512_slli_epi16(_mm512_and_si512(rhs_raw_hbit_2367ABEF_0, m2_expanded), 4); //Index : 0 - 7 - const __m512i rhs_hbit_2367ABEF_20 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_2367ABEF_0, 2), m2_expanded), 4); //Index : 32 - 39 - const __m512i rhs_hbit_2367ABEF_40 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_2367ABEF_0, 4), m2_expanded), 4); //Index : 64 - 71 - const __m512i rhs_hbit_2367ABEF_60 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_2367ABEF_0, 6), m2_expanded), 4); //Index : 96 - 103 + const __m512i rhs_hbit_2367ABEF_00 = 
_mm512_slli_epi16(_mm512_and_si512(rhs_raw_hbit_2367ABEF_0, m3bexpanded), 4); //Index : 0 - 7 + const __m512i rhs_hbit_2367ABEF_20 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_2367ABEF_0, 2), m3bexpanded), 4); //Index : 32 - 39 + const __m512i rhs_hbit_2367ABEF_40 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_2367ABEF_0, 4), m3bexpanded), 4); //Index : 64 - 71 + const __m512i rhs_hbit_2367ABEF_60 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_2367ABEF_0, 6), m3bexpanded), 4); //Index : 96 - 103 // Comments indicate the indices of elements from individual super block in non interleaved fashion - const __m512i rhs_hbit_014589CD_01 = _mm512_slli_epi16(_mm512_and_si512(rhs_raw_hbit_014589CD_1, m2_expanded), 4); //Index : 8 - 15 - const __m512i rhs_hbit_014589CD_21 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_014589CD_1, 2), m2_expanded), 4); //Index : 40 - 47 - const __m512i rhs_hbit_014589CD_41 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_014589CD_1, 4), m2_expanded), 4); //Index : 72 - 79 - const __m512i rhs_hbit_014589CD_61 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_014589CD_1, 6), m2_expanded), 4); //Index : 104 - 111 + const __m512i rhs_hbit_014589CD_01 = _mm512_slli_epi16(_mm512_and_si512(rhs_raw_hbit_014589CD_1, m3bexpanded), 4); //Index : 8 - 15 + const __m512i rhs_hbit_014589CD_21 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_014589CD_1, 2), m3bexpanded), 4); //Index : 40 - 47 + const __m512i rhs_hbit_014589CD_41 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_014589CD_1, 4), m3bexpanded), 4); //Index : 72 - 79 + const __m512i rhs_hbit_014589CD_61 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_014589CD_1, 6), m3bexpanded), 4); //Index : 104 - 111 - const __m512i rhs_hbit_2367ABEF_01 = _mm512_slli_epi16(_mm512_and_si512(rhs_raw_hbit_2367ABEF_1, m2_expanded), 4); //Index : 8 - 15 - const __m512i rhs_hbit_2367ABEF_21 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_2367ABEF_1, 2), m2_expanded), 4); //Index : 40 - 47 - const __m512i rhs_hbit_2367ABEF_41 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_2367ABEF_1, 4), m2_expanded), 4); //Index : 72 - 79 - const __m512i rhs_hbit_2367ABEF_61 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_2367ABEF_1, 6), m2_expanded), 4); //Index : 104 - 111 + const __m512i rhs_hbit_2367ABEF_01 = _mm512_slli_epi16(_mm512_and_si512(rhs_raw_hbit_2367ABEF_1, m3bexpanded), 4); //Index : 8 - 15 + const __m512i rhs_hbit_2367ABEF_21 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_2367ABEF_1, 2), m3bexpanded), 4); //Index : 40 - 47 + const __m512i rhs_hbit_2367ABEF_41 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_2367ABEF_1, 4), m3bexpanded), 4); //Index : 72 - 79 + const __m512i rhs_hbit_2367ABEF_61 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_2367ABEF_1, 6), m3bexpanded), 4); //Index : 104 - 111 // hbit values of the 1st,3rd,5th,7th sub blocks of eight block_q6_K structures for the sb loop - const __m512i rhs_hbit_014589CD_10 = _mm512_slli_epi16(_mm512_and_si512(rhs_raw_hbit_014589CD_2, m2_expanded), 4); //Index : 16 - 23 - const __m512i rhs_hbit_014589CD_30 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_014589CD_2, 2), m2_expanded), 4); //Index : 48 - 55 - const __m512i rhs_hbit_014589CD_50 = 
_mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_014589CD_2, 4), m2_expanded), 4); //Index : 80 - 87 - const __m512i rhs_hbit_014589CD_70 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_014589CD_2, 6), m2_expanded), 4); //Index : 112 - 119 + const __m512i rhs_hbit_014589CD_10 = _mm512_slli_epi16(_mm512_and_si512(rhs_raw_hbit_014589CD_2, m3bexpanded), 4); //Index : 16 - 23 + const __m512i rhs_hbit_014589CD_30 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_014589CD_2, 2), m3bexpanded), 4); //Index : 48 - 55 + const __m512i rhs_hbit_014589CD_50 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_014589CD_2, 4), m3bexpanded), 4); //Index : 80 - 87 + const __m512i rhs_hbit_014589CD_70 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_014589CD_2, 6), m3bexpanded), 4); //Index : 112 - 119 - const __m512i rhs_hbit_2367ABEF_10 = _mm512_slli_epi16(_mm512_and_si512(rhs_raw_hbit_2367ABEF_2, m2_expanded), 4); //Index : 16 - 23 - const __m512i rhs_hbit_2367ABEF_30 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_2367ABEF_2, 2), m2_expanded), 4); //Index : 48 - 55 - const __m512i rhs_hbit_2367ABEF_50 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_2367ABEF_2, 4), m2_expanded), 4); //Index : 80 - 87 - const __m512i rhs_hbit_2367ABEF_70 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_2367ABEF_2, 6), m2_expanded), 4); //Index : 112 - 119 + const __m512i rhs_hbit_2367ABEF_10 = _mm512_slli_epi16(_mm512_and_si512(rhs_raw_hbit_2367ABEF_2, m3bexpanded), 4); //Index : 16 - 23 + const __m512i rhs_hbit_2367ABEF_30 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_2367ABEF_2, 2), m3bexpanded), 4); //Index : 48 - 55 + const __m512i rhs_hbit_2367ABEF_50 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_2367ABEF_2, 4), m3bexpanded), 4); //Index : 80 - 87 + const __m512i rhs_hbit_2367ABEF_70 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_2367ABEF_2, 6), m3bexpanded), 4); //Index : 112 - 119 - const __m512i rhs_hbit_014589CD_11 = _mm512_slli_epi16(_mm512_and_si512(rhs_raw_hbit_014589CD_3, m2_expanded), 4); //Index : 24 - 31 - const __m512i rhs_hbit_014589CD_31 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_014589CD_3, 2), m2_expanded), 4); //Index : 56 - 63 - const __m512i rhs_hbit_014589CD_51 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_014589CD_3, 4), m2_expanded), 4); //Index : 88 - 95 - const __m512i rhs_hbit_014589CD_71 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_014589CD_3, 6), m2_expanded), 4); //Index : 120 - 127 + const __m512i rhs_hbit_014589CD_11 = _mm512_slli_epi16(_mm512_and_si512(rhs_raw_hbit_014589CD_3, m3bexpanded), 4); //Index : 24 - 31 + const __m512i rhs_hbit_014589CD_31 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_014589CD_3, 2), m3bexpanded), 4); //Index : 56 - 63 + const __m512i rhs_hbit_014589CD_51 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_014589CD_3, 4), m3bexpanded), 4); //Index : 88 - 95 + const __m512i rhs_hbit_014589CD_71 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_014589CD_3, 6), m3bexpanded), 4); //Index : 120 - 127 - const __m512i rhs_hbit_2367ABEF_11 = _mm512_slli_epi16(_mm512_and_si512(rhs_raw_hbit_2367ABEF_3, m2_expanded), 4); //Index : 24 - 31 - const __m512i rhs_hbit_2367ABEF_31 = 
_mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_2367ABEF_3, 2), m2_expanded), 4); //Index : 56 - 63 - const __m512i rhs_hbit_2367ABEF_51 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_2367ABEF_3, 4), m2_expanded), 4); //Index : 88 - 95 - const __m512i rhs_hbit_2367ABEF_71 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_2367ABEF_3, 6), m2_expanded), 4); //Index : 120 - 127 + const __m512i rhs_hbit_2367ABEF_11 = _mm512_slli_epi16(_mm512_and_si512(rhs_raw_hbit_2367ABEF_3, m3bexpanded), 4); //Index : 24 - 31 + const __m512i rhs_hbit_2367ABEF_31 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_2367ABEF_3, 2), m3bexpanded), 4); //Index : 56 - 63 + const __m512i rhs_hbit_2367ABEF_51 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_2367ABEF_3, 4), m3bexpanded), 4); //Index : 88 - 95 + const __m512i rhs_hbit_2367ABEF_71 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_2367ABEF_3, 6), m3bexpanded), 4); //Index : 120 - 127 // 4 bit values are unpacked/denibbled and bitwise or-ed with the hbit values to form the 6 bit quantized values // Comments indicate the indices of elements from individual super block in non interleaved fashion // Index : 0 -7, 64 - 71 - const __m512i rhs_mat_014589CD_00 = _mm512_or_si512(_mm512_and_si512(rhs_raw_lbit_014589CD_0, m4_expanded), rhs_hbit_014589CD_00); - const __m512i rhs_mat_014589CD_40 = _mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_lbit_014589CD_0, 4), m4_expanded), rhs_hbit_014589CD_40); + const __m512i rhs_mat_014589CD_00 = _mm512_or_si512(_mm512_and_si512(rhs_raw_lbit_014589CD_0, m4bexpanded), rhs_hbit_014589CD_00); + const __m512i rhs_mat_014589CD_40 = _mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_lbit_014589CD_0, 4), m4bexpanded), rhs_hbit_014589CD_40); - const __m512i rhs_mat_2367ABEF_00 = _mm512_or_si512(_mm512_and_si512(rhs_raw_lbit_2367ABEF_0, m4_expanded), rhs_hbit_2367ABEF_00); - const __m512i rhs_mat_2367ABEF_40 = _mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_lbit_2367ABEF_0, 4), m4_expanded), rhs_hbit_2367ABEF_40); + const __m512i rhs_mat_2367ABEF_00 = _mm512_or_si512(_mm512_and_si512(rhs_raw_lbit_2367ABEF_0, m4bexpanded), rhs_hbit_2367ABEF_00); + const __m512i rhs_mat_2367ABEF_40 = _mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_lbit_2367ABEF_0, 4), m4bexpanded), rhs_hbit_2367ABEF_40); // Index : 8 - 15, 72 - 79 - const __m512i rhs_mat_014589CD_01 = _mm512_or_si512(_mm512_and_si512(rhs_raw_lbit_014589CD_1, m4_expanded), rhs_hbit_014589CD_01); - const __m512i rhs_mat_014589CD_41 = _mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_lbit_014589CD_1, 4), m4_expanded), rhs_hbit_014589CD_41); + const __m512i rhs_mat_014589CD_01 = _mm512_or_si512(_mm512_and_si512(rhs_raw_lbit_014589CD_1, m4bexpanded), rhs_hbit_014589CD_01); + const __m512i rhs_mat_014589CD_41 = _mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_lbit_014589CD_1, 4), m4bexpanded), rhs_hbit_014589CD_41); - const __m512i rhs_mat_2367ABEF_01 = _mm512_or_si512(_mm512_and_si512(rhs_raw_lbit_2367ABEF_1, m4_expanded), rhs_hbit_2367ABEF_01); - const __m512i rhs_mat_2367ABEF_41 = _mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_lbit_2367ABEF_1, 4), m4_expanded), rhs_hbit_2367ABEF_41); + const __m512i rhs_mat_2367ABEF_01 = _mm512_or_si512(_mm512_and_si512(rhs_raw_lbit_2367ABEF_1, m4bexpanded), rhs_hbit_2367ABEF_01); + const __m512i rhs_mat_2367ABEF_41 = 
_mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_lbit_2367ABEF_1, 4), m4bexpanded), rhs_hbit_2367ABEF_41); // Index : 16 - 23, 80 - 87 - const __m512i rhs_mat_014589CD_10 = _mm512_or_si512(_mm512_and_si512(rhs_raw_lbit_014589CD_2, m4_expanded), rhs_hbit_014589CD_10); - const __m512i rhs_mat_014589CD_50 = _mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_lbit_014589CD_2, 4), m4_expanded), rhs_hbit_014589CD_50); + const __m512i rhs_mat_014589CD_10 = _mm512_or_si512(_mm512_and_si512(rhs_raw_lbit_014589CD_2, m4bexpanded), rhs_hbit_014589CD_10); + const __m512i rhs_mat_014589CD_50 = _mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_lbit_014589CD_2, 4), m4bexpanded), rhs_hbit_014589CD_50); - const __m512i rhs_mat_2367ABEF_10 = _mm512_or_si512(_mm512_and_si512(rhs_raw_lbit_2367ABEF_2, m4_expanded), rhs_hbit_2367ABEF_10); - const __m512i rhs_mat_2367ABEF_50 = _mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_lbit_2367ABEF_2, 4), m4_expanded), rhs_hbit_2367ABEF_50); + const __m512i rhs_mat_2367ABEF_10 = _mm512_or_si512(_mm512_and_si512(rhs_raw_lbit_2367ABEF_2, m4bexpanded), rhs_hbit_2367ABEF_10); + const __m512i rhs_mat_2367ABEF_50 = _mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_lbit_2367ABEF_2, 4), m4bexpanded), rhs_hbit_2367ABEF_50); // Index : 24 - 31, 88 - 95 - const __m512i rhs_mat_014589CD_11 = _mm512_or_si512(_mm512_and_si512(rhs_raw_lbit_014589CD_3, m4_expanded), rhs_hbit_014589CD_11); - const __m512i rhs_mat_014589CD_51 = _mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_lbit_014589CD_3, 4), m4_expanded), rhs_hbit_014589CD_51); + const __m512i rhs_mat_014589CD_11 = _mm512_or_si512(_mm512_and_si512(rhs_raw_lbit_014589CD_3, m4bexpanded), rhs_hbit_014589CD_11); + const __m512i rhs_mat_014589CD_51 = _mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_lbit_014589CD_3, 4), m4bexpanded), rhs_hbit_014589CD_51); - const __m512i rhs_mat_2367ABEF_11 = _mm512_or_si512(_mm512_and_si512(rhs_raw_lbit_2367ABEF_3, m4_expanded), rhs_hbit_2367ABEF_11); - const __m512i rhs_mat_2367ABEF_51 = _mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_lbit_2367ABEF_3, 4), m4_expanded), rhs_hbit_2367ABEF_51); + const __m512i rhs_mat_2367ABEF_11 = _mm512_or_si512(_mm512_and_si512(rhs_raw_lbit_2367ABEF_3, m4bexpanded), rhs_hbit_2367ABEF_11); + const __m512i rhs_mat_2367ABEF_51 = _mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_lbit_2367ABEF_3, 4), m4bexpanded), rhs_hbit_2367ABEF_51); // Index : 32 - 39, 96 - 103 - const __m512i rhs_mat_014589CD_20 = _mm512_or_si512(_mm512_and_si512(rhs_raw_lbit_014589CD_4, m4_expanded), rhs_hbit_014589CD_20); - const __m512i rhs_mat_014589CD_60 = _mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_lbit_014589CD_4, 4), m4_expanded), rhs_hbit_014589CD_60); + const __m512i rhs_mat_014589CD_20 = _mm512_or_si512(_mm512_and_si512(rhs_raw_lbit_014589CD_4, m4bexpanded), rhs_hbit_014589CD_20); + const __m512i rhs_mat_014589CD_60 = _mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_lbit_014589CD_4, 4), m4bexpanded), rhs_hbit_014589CD_60); - const __m512i rhs_mat_2367ABEF_20 = _mm512_or_si512(_mm512_and_si512(rhs_raw_lbit_2367ABEF_4, m4_expanded), rhs_hbit_2367ABEF_20); - const __m512i rhs_mat_2367ABEF_60 = _mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_lbit_2367ABEF_4, 4), m4_expanded), rhs_hbit_2367ABEF_60); + const __m512i rhs_mat_2367ABEF_20 = _mm512_or_si512(_mm512_and_si512(rhs_raw_lbit_2367ABEF_4, m4bexpanded), rhs_hbit_2367ABEF_20); + const __m512i rhs_mat_2367ABEF_60 = 
_mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_lbit_2367ABEF_4, 4), m4bexpanded), rhs_hbit_2367ABEF_60); // Index : 40 - 47, 104 - 111 - const __m512i rhs_mat_014589CD_21 = _mm512_or_si512(_mm512_and_si512(rhs_raw_lbit_014589CD_5, m4_expanded), rhs_hbit_014589CD_21); - const __m512i rhs_mat_014589CD_61 = _mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_lbit_014589CD_5, 4), m4_expanded), rhs_hbit_014589CD_61); + const __m512i rhs_mat_014589CD_21 = _mm512_or_si512(_mm512_and_si512(rhs_raw_lbit_014589CD_5, m4bexpanded), rhs_hbit_014589CD_21); + const __m512i rhs_mat_014589CD_61 = _mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_lbit_014589CD_5, 4), m4bexpanded), rhs_hbit_014589CD_61); - const __m512i rhs_mat_2367ABEF_21 = _mm512_or_si512(_mm512_and_si512(rhs_raw_lbit_2367ABEF_5, m4_expanded), rhs_hbit_2367ABEF_21); - const __m512i rhs_mat_2367ABEF_61 = _mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_lbit_2367ABEF_5, 4), m4_expanded), rhs_hbit_2367ABEF_61); + const __m512i rhs_mat_2367ABEF_21 = _mm512_or_si512(_mm512_and_si512(rhs_raw_lbit_2367ABEF_5, m4bexpanded), rhs_hbit_2367ABEF_21); + const __m512i rhs_mat_2367ABEF_61 = _mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_lbit_2367ABEF_5, 4), m4bexpanded), rhs_hbit_2367ABEF_61); // Index : 48 - 55, 112 - 119 - const __m512i rhs_mat_014589CD_30 = _mm512_or_si512(_mm512_and_si512(rhs_raw_lbit_014589CD_6, m4_expanded), rhs_hbit_014589CD_30); - const __m512i rhs_mat_014589CD_70 = _mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_lbit_014589CD_6, 4), m4_expanded), rhs_hbit_014589CD_70); + const __m512i rhs_mat_014589CD_30 = _mm512_or_si512(_mm512_and_si512(rhs_raw_lbit_014589CD_6, m4bexpanded), rhs_hbit_014589CD_30); + const __m512i rhs_mat_014589CD_70 = _mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_lbit_014589CD_6, 4), m4bexpanded), rhs_hbit_014589CD_70); - const __m512i rhs_mat_2367ABEF_30 = _mm512_or_si512(_mm512_and_si512(rhs_raw_lbit_2367ABEF_6, m4_expanded), rhs_hbit_2367ABEF_30); - const __m512i rhs_mat_2367ABEF_70 = _mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_lbit_2367ABEF_6, 4), m4_expanded), rhs_hbit_2367ABEF_70); + const __m512i rhs_mat_2367ABEF_30 = _mm512_or_si512(_mm512_and_si512(rhs_raw_lbit_2367ABEF_6, m4bexpanded), rhs_hbit_2367ABEF_30); + const __m512i rhs_mat_2367ABEF_70 = _mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_lbit_2367ABEF_6, 4), m4bexpanded), rhs_hbit_2367ABEF_70); // Index : 56 - 63, 120 - 127 - const __m512i rhs_mat_014589CD_31 = _mm512_or_si512(_mm512_and_si512(rhs_raw_lbit_014589CD_7, m4_expanded), rhs_hbit_014589CD_31); - const __m512i rhs_mat_014589CD_71 = _mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_lbit_014589CD_7, 4), m4_expanded), rhs_hbit_014589CD_71); + const __m512i rhs_mat_014589CD_31 = _mm512_or_si512(_mm512_and_si512(rhs_raw_lbit_014589CD_7, m4bexpanded), rhs_hbit_014589CD_31); + const __m512i rhs_mat_014589CD_71 = _mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_lbit_014589CD_7, 4), m4bexpanded), rhs_hbit_014589CD_71); - const __m512i rhs_mat_2367ABEF_31 = _mm512_or_si512(_mm512_and_si512(rhs_raw_lbit_2367ABEF_7, m4_expanded), rhs_hbit_2367ABEF_31); - const __m512i rhs_mat_2367ABEF_71 = _mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_lbit_2367ABEF_7, 4), m4_expanded), rhs_hbit_2367ABEF_71); + const __m512i rhs_mat_2367ABEF_31 = _mm512_or_si512(_mm512_and_si512(rhs_raw_lbit_2367ABEF_7, m4bexpanded), rhs_hbit_2367ABEF_31); + const __m512i rhs_mat_2367ABEF_71 = 
_mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_lbit_2367ABEF_7, 4), m4bexpanded), rhs_hbit_2367ABEF_71); // Shuffle pattern one - right side input const __m512i rhs_mat_014589CD_00_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_00, (_MM_PERM_ENUM)136); //B00(0-3) B01(0-3) B00(0-3) B01(0-3) B04(0-3) B05(0-3) B04(0-3) B05(0-3) B08(0-3) B09(0-3) B08(0-3) B09(0-3) B0C(0-3) B0D(0-3) B0C(0-3) B0D(0-3) @@ -7261,38 +7261,38 @@ void ggml_gemm_q6_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo __m512i lhs_mat_23_71 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_23_71), lhs_mat_ymm_23_71, 1); // Multiply Q8 quants with bytes valued 32 - Subtracted later as an adjustment for 6 bit quantization - __m512i lhs_mat_s_01_00 = _mm512_maddubs_epi16(m32s_expanded, lhs_mat_01_00); - __m512i lhs_mat_s_23_00 = _mm512_maddubs_epi16(m32s_expanded, lhs_mat_23_00); - __m512i lhs_mat_s_01_01 = _mm512_maddubs_epi16(m32s_expanded, lhs_mat_01_01); - __m512i lhs_mat_s_23_01 = _mm512_maddubs_epi16(m32s_expanded, lhs_mat_23_01); - __m512i lhs_mat_s_01_10 = _mm512_maddubs_epi16(m32s_expanded, lhs_mat_01_10); - __m512i lhs_mat_s_23_10 = _mm512_maddubs_epi16(m32s_expanded, lhs_mat_23_10); - __m512i lhs_mat_s_01_11 = _mm512_maddubs_epi16(m32s_expanded, lhs_mat_01_11); - __m512i lhs_mat_s_23_11 = _mm512_maddubs_epi16(m32s_expanded, lhs_mat_23_11); - __m512i lhs_mat_s_01_20 = _mm512_maddubs_epi16(m32s_expanded, lhs_mat_01_20); - __m512i lhs_mat_s_23_20 = _mm512_maddubs_epi16(m32s_expanded, lhs_mat_23_20); - __m512i lhs_mat_s_01_21 = _mm512_maddubs_epi16(m32s_expanded, lhs_mat_01_21); - __m512i lhs_mat_s_23_21 = _mm512_maddubs_epi16(m32s_expanded, lhs_mat_23_21); - __m512i lhs_mat_s_01_30 = _mm512_maddubs_epi16(m32s_expanded, lhs_mat_01_30); - __m512i lhs_mat_s_23_30 = _mm512_maddubs_epi16(m32s_expanded, lhs_mat_23_30); - __m512i lhs_mat_s_01_31 = _mm512_maddubs_epi16(m32s_expanded, lhs_mat_01_31); - __m512i lhs_mat_s_23_31 = _mm512_maddubs_epi16(m32s_expanded, lhs_mat_23_31); - __m512i lhs_mat_s_01_40 = _mm512_maddubs_epi16(m32s_expanded, lhs_mat_01_40); - __m512i lhs_mat_s_23_40 = _mm512_maddubs_epi16(m32s_expanded, lhs_mat_23_40); - __m512i lhs_mat_s_01_41 = _mm512_maddubs_epi16(m32s_expanded, lhs_mat_01_41); - __m512i lhs_mat_s_23_41 = _mm512_maddubs_epi16(m32s_expanded, lhs_mat_23_41); - __m512i lhs_mat_s_01_50 = _mm512_maddubs_epi16(m32s_expanded, lhs_mat_01_50); - __m512i lhs_mat_s_23_50 = _mm512_maddubs_epi16(m32s_expanded, lhs_mat_23_50); - __m512i lhs_mat_s_01_51 = _mm512_maddubs_epi16(m32s_expanded, lhs_mat_01_51); - __m512i lhs_mat_s_23_51 = _mm512_maddubs_epi16(m32s_expanded, lhs_mat_23_51); - __m512i lhs_mat_s_01_60 = _mm512_maddubs_epi16(m32s_expanded, lhs_mat_01_60); - __m512i lhs_mat_s_23_60 = _mm512_maddubs_epi16(m32s_expanded, lhs_mat_23_60); - __m512i lhs_mat_s_01_61 = _mm512_maddubs_epi16(m32s_expanded, lhs_mat_01_61); - __m512i lhs_mat_s_23_61 = _mm512_maddubs_epi16(m32s_expanded, lhs_mat_23_61); - __m512i lhs_mat_s_01_70 = _mm512_maddubs_epi16(m32s_expanded, lhs_mat_01_70); - __m512i lhs_mat_s_23_70 = _mm512_maddubs_epi16(m32s_expanded, lhs_mat_23_70); - __m512i lhs_mat_s_01_71 = _mm512_maddubs_epi16(m32s_expanded, lhs_mat_01_71); - __m512i lhs_mat_s_23_71 = _mm512_maddubs_epi16(m32s_expanded, lhs_mat_23_71); + __m512i lhs_mat_s_01_00 = _mm512_maddubs_epi16(m32expanded, lhs_mat_01_00); + __m512i lhs_mat_s_23_00 = _mm512_maddubs_epi16(m32expanded, lhs_mat_23_00); + __m512i lhs_mat_s_01_01 = _mm512_maddubs_epi16(m32expanded, lhs_mat_01_01); + __m512i lhs_mat_s_23_01 = 
_mm512_maddubs_epi16(m32expanded, lhs_mat_23_01); + __m512i lhs_mat_s_01_10 = _mm512_maddubs_epi16(m32expanded, lhs_mat_01_10); + __m512i lhs_mat_s_23_10 = _mm512_maddubs_epi16(m32expanded, lhs_mat_23_10); + __m512i lhs_mat_s_01_11 = _mm512_maddubs_epi16(m32expanded, lhs_mat_01_11); + __m512i lhs_mat_s_23_11 = _mm512_maddubs_epi16(m32expanded, lhs_mat_23_11); + __m512i lhs_mat_s_01_20 = _mm512_maddubs_epi16(m32expanded, lhs_mat_01_20); + __m512i lhs_mat_s_23_20 = _mm512_maddubs_epi16(m32expanded, lhs_mat_23_20); + __m512i lhs_mat_s_01_21 = _mm512_maddubs_epi16(m32expanded, lhs_mat_01_21); + __m512i lhs_mat_s_23_21 = _mm512_maddubs_epi16(m32expanded, lhs_mat_23_21); + __m512i lhs_mat_s_01_30 = _mm512_maddubs_epi16(m32expanded, lhs_mat_01_30); + __m512i lhs_mat_s_23_30 = _mm512_maddubs_epi16(m32expanded, lhs_mat_23_30); + __m512i lhs_mat_s_01_31 = _mm512_maddubs_epi16(m32expanded, lhs_mat_01_31); + __m512i lhs_mat_s_23_31 = _mm512_maddubs_epi16(m32expanded, lhs_mat_23_31); + __m512i lhs_mat_s_01_40 = _mm512_maddubs_epi16(m32expanded, lhs_mat_01_40); + __m512i lhs_mat_s_23_40 = _mm512_maddubs_epi16(m32expanded, lhs_mat_23_40); + __m512i lhs_mat_s_01_41 = _mm512_maddubs_epi16(m32expanded, lhs_mat_01_41); + __m512i lhs_mat_s_23_41 = _mm512_maddubs_epi16(m32expanded, lhs_mat_23_41); + __m512i lhs_mat_s_01_50 = _mm512_maddubs_epi16(m32expanded, lhs_mat_01_50); + __m512i lhs_mat_s_23_50 = _mm512_maddubs_epi16(m32expanded, lhs_mat_23_50); + __m512i lhs_mat_s_01_51 = _mm512_maddubs_epi16(m32expanded, lhs_mat_01_51); + __m512i lhs_mat_s_23_51 = _mm512_maddubs_epi16(m32expanded, lhs_mat_23_51); + __m512i lhs_mat_s_01_60 = _mm512_maddubs_epi16(m32expanded, lhs_mat_01_60); + __m512i lhs_mat_s_23_60 = _mm512_maddubs_epi16(m32expanded, lhs_mat_23_60); + __m512i lhs_mat_s_01_61 = _mm512_maddubs_epi16(m32expanded, lhs_mat_01_61); + __m512i lhs_mat_s_23_61 = _mm512_maddubs_epi16(m32expanded, lhs_mat_23_61); + __m512i lhs_mat_s_01_70 = _mm512_maddubs_epi16(m32expanded, lhs_mat_01_70); + __m512i lhs_mat_s_23_70 = _mm512_maddubs_epi16(m32expanded, lhs_mat_23_70); + __m512i lhs_mat_s_01_71 = _mm512_maddubs_epi16(m32expanded, lhs_mat_01_71); + __m512i lhs_mat_s_23_71 = _mm512_maddubs_epi16(m32expanded, lhs_mat_23_71); // Shuffle pattern one – left-side input @@ -7872,107 +7872,107 @@ void ggml_gemm_q6_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo // 2-bit -> 8-bit // hbit Values of the 0th,2nd,4th,6th sub blocks of eight block_q6_K structures for the sb loop - const __m512i rhs_hbit_014589CD_00 = _mm512_slli_epi16(_mm512_and_si512(rhs_raw_hbit_014589CD_0, m2_expanded), 4); //Index : 0 - 7 - const __m512i rhs_hbit_014589CD_20 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_014589CD_0, 2), m2_expanded), 4); //Index : 32 - 39 - const __m512i rhs_hbit_014589CD_40 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_014589CD_0, 4), m2_expanded), 4); //Index : 64 - 71 - const __m512i rhs_hbit_014589CD_60 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_014589CD_0, 6), m2_expanded), 4); //Index : 96 - 103 + const __m512i rhs_hbit_014589CD_00 = _mm512_slli_epi16(_mm512_and_si512(rhs_raw_hbit_014589CD_0, m3bexpanded), 4); //Index : 0 - 7 + const __m512i rhs_hbit_014589CD_20 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_014589CD_0, 2), m3bexpanded), 4); //Index : 32 - 39 + const __m512i rhs_hbit_014589CD_40 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_014589CD_0, 4), m3bexpanded), 4); //Index : 64 - 71 + 
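// A scalar sketch of what the rhs_hbit_* / rhs_mat_* statements in this hunk compute
// (illustrative names only, not the kernel's own code): each qh byte packs the 2-bit
// high fields of four sub-blocks, so one field is isolated with a 2-bit mask after a
// shift by 0/2/4/6 and then moved to bit positions 4..5, ready to be OR-ed over the
// low nibble taken from ql.
static inline unsigned char q6_rebuild_scalar(unsigned char ql_byte, unsigned char qh_byte,
                                              int take_high_nibble, int qh_shift) {
    const unsigned char lo = take_high_nibble ? (unsigned char)(ql_byte >> 4)
                                              : (unsigned char)(ql_byte & 0x0F);
    const unsigned char hi = (unsigned char)((qh_byte >> qh_shift) & 0x03); // the two "hbit"s
    return (unsigned char)(lo | (hi << 4));                                 // raw 6-bit quant, 0..63
}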
const __m512i rhs_hbit_014589CD_60 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_014589CD_0, 6), m3bexpanded), 4); //Index : 96 - 103 - const __m512i rhs_hbit_2367ABEF_00 = _mm512_slli_epi16(_mm512_and_si512(rhs_raw_hbit_2367ABEF_0, m2_expanded), 4); //Index : 0 - 7 - const __m512i rhs_hbit_2367ABEF_20 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_2367ABEF_0, 2), m2_expanded), 4); //Index : 32 - 39 - const __m512i rhs_hbit_2367ABEF_40 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_2367ABEF_0, 4), m2_expanded), 4); //Index : 64 - 71 - const __m512i rhs_hbit_2367ABEF_60 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_2367ABEF_0, 6), m2_expanded), 4); //Index : 96 - 103 + const __m512i rhs_hbit_2367ABEF_00 = _mm512_slli_epi16(_mm512_and_si512(rhs_raw_hbit_2367ABEF_0, m3bexpanded), 4); //Index : 0 - 7 + const __m512i rhs_hbit_2367ABEF_20 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_2367ABEF_0, 2), m3bexpanded), 4); //Index : 32 - 39 + const __m512i rhs_hbit_2367ABEF_40 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_2367ABEF_0, 4), m3bexpanded), 4); //Index : 64 - 71 + const __m512i rhs_hbit_2367ABEF_60 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_2367ABEF_0, 6), m3bexpanded), 4); //Index : 96 - 103 // Comments indicate the indices of elements from individual super block in non interleaved fashion // Index : 8 - 15, 72 - 79 - const __m512i rhs_hbit_014589CD_01 = _mm512_slli_epi16(_mm512_and_si512(rhs_raw_hbit_014589CD_1, m2_expanded), 4); //Index : 8 - 15 - const __m512i rhs_hbit_014589CD_21 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_014589CD_1, 2), m2_expanded), 4); //Index : 40 - 47 - const __m512i rhs_hbit_014589CD_41 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_014589CD_1, 4), m2_expanded), 4); //Index : 72 - 79 - const __m512i rhs_hbit_014589CD_61 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_014589CD_1, 6), m2_expanded), 4); //Index : 104 - 111 + const __m512i rhs_hbit_014589CD_01 = _mm512_slli_epi16(_mm512_and_si512(rhs_raw_hbit_014589CD_1, m3bexpanded), 4); //Index : 8 - 15 + const __m512i rhs_hbit_014589CD_21 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_014589CD_1, 2), m3bexpanded), 4); //Index : 40 - 47 + const __m512i rhs_hbit_014589CD_41 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_014589CD_1, 4), m3bexpanded), 4); //Index : 72 - 79 + const __m512i rhs_hbit_014589CD_61 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_014589CD_1, 6), m3bexpanded), 4); //Index : 104 - 111 - const __m512i rhs_hbit_2367ABEF_01 = _mm512_slli_epi16(_mm512_and_si512(rhs_raw_hbit_2367ABEF_1, m2_expanded), 4); //Index : 8 - 15 - const __m512i rhs_hbit_2367ABEF_21 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_2367ABEF_1, 2), m2_expanded), 4); //Index : 40 - 47 - const __m512i rhs_hbit_2367ABEF_41 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_2367ABEF_1, 4), m2_expanded), 4); //Index : 72 - 79 - const __m512i rhs_hbit_2367ABEF_61 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_2367ABEF_1, 6), m2_expanded), 4); //Index : 104 - 111 + const __m512i rhs_hbit_2367ABEF_01 = _mm512_slli_epi16(_mm512_and_si512(rhs_raw_hbit_2367ABEF_1, m3bexpanded), 4); //Index : 8 - 15 + const __m512i rhs_hbit_2367ABEF_21 = 
_mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_2367ABEF_1, 2), m3bexpanded), 4); //Index : 40 - 47 + const __m512i rhs_hbit_2367ABEF_41 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_2367ABEF_1, 4), m3bexpanded), 4); //Index : 72 - 79 + const __m512i rhs_hbit_2367ABEF_61 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_2367ABEF_1, 6), m3bexpanded), 4); //Index : 104 - 111 // hbit values of the 1st,3rd,5th,7th sub blocks of eight block_q6_K structures for the sb loop - const __m512i rhs_hbit_014589CD_10 = _mm512_slli_epi16(_mm512_and_si512(rhs_raw_hbit_014589CD_2, m2_expanded), 4); //Index : 16 - 23 - const __m512i rhs_hbit_014589CD_30 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_014589CD_2, 2), m2_expanded), 4); //Index : 48 - 55 - const __m512i rhs_hbit_014589CD_50 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_014589CD_2, 4), m2_expanded), 4); //Index : 80 - 87 - const __m512i rhs_hbit_014589CD_70 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_014589CD_2, 6), m2_expanded), 4); //Index : 112 - 119 + const __m512i rhs_hbit_014589CD_10 = _mm512_slli_epi16(_mm512_and_si512(rhs_raw_hbit_014589CD_2, m3bexpanded), 4); //Index : 16 - 23 + const __m512i rhs_hbit_014589CD_30 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_014589CD_2, 2), m3bexpanded), 4); //Index : 48 - 55 + const __m512i rhs_hbit_014589CD_50 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_014589CD_2, 4), m3bexpanded), 4); //Index : 80 - 87 + const __m512i rhs_hbit_014589CD_70 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_014589CD_2, 6), m3bexpanded), 4); //Index : 112 - 119 - const __m512i rhs_hbit_2367ABEF_10 = _mm512_slli_epi16(_mm512_and_si512(rhs_raw_hbit_2367ABEF_2, m2_expanded), 4); //Index : 16 - 23 - const __m512i rhs_hbit_2367ABEF_30 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_2367ABEF_2, 2), m2_expanded), 4); //Index : 48 - 55 - const __m512i rhs_hbit_2367ABEF_50 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_2367ABEF_2, 4), m2_expanded), 4); //Index : 80 - 87 - const __m512i rhs_hbit_2367ABEF_70 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_2367ABEF_2, 6), m2_expanded), 4); //Index : 112 - 119 + const __m512i rhs_hbit_2367ABEF_10 = _mm512_slli_epi16(_mm512_and_si512(rhs_raw_hbit_2367ABEF_2, m3bexpanded), 4); //Index : 16 - 23 + const __m512i rhs_hbit_2367ABEF_30 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_2367ABEF_2, 2), m3bexpanded), 4); //Index : 48 - 55 + const __m512i rhs_hbit_2367ABEF_50 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_2367ABEF_2, 4), m3bexpanded), 4); //Index : 80 - 87 + const __m512i rhs_hbit_2367ABEF_70 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_2367ABEF_2, 6), m3bexpanded), 4); //Index : 112 - 119 - const __m512i rhs_hbit_014589CD_11 = _mm512_slli_epi16(_mm512_and_si512(rhs_raw_hbit_014589CD_3, m2_expanded), 4); //Index : 24 - 31 - const __m512i rhs_hbit_014589CD_31 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_014589CD_3, 2), m2_expanded), 4); //Index : 56 - 63 - const __m512i rhs_hbit_014589CD_51 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_014589CD_3, 4), m2_expanded), 4); //Index : 88 - 95 - const __m512i rhs_hbit_014589CD_71 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_014589CD_3, 6), m2_expanded), 4); 
//Index : 120 - 127 + const __m512i rhs_hbit_014589CD_11 = _mm512_slli_epi16(_mm512_and_si512(rhs_raw_hbit_014589CD_3, m3bexpanded), 4); //Index : 24 - 31 + const __m512i rhs_hbit_014589CD_31 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_014589CD_3, 2), m3bexpanded), 4); //Index : 56 - 63 + const __m512i rhs_hbit_014589CD_51 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_014589CD_3, 4), m3bexpanded), 4); //Index : 88 - 95 + const __m512i rhs_hbit_014589CD_71 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_014589CD_3, 6), m3bexpanded), 4); //Index : 120 - 127 - const __m512i rhs_hbit_2367ABEF_11 = _mm512_slli_epi16(_mm512_and_si512(rhs_raw_hbit_2367ABEF_3, m2_expanded), 4); //Index : 24 - 31 - const __m512i rhs_hbit_2367ABEF_31 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_2367ABEF_3, 2), m2_expanded), 4); //Index : 56 - 63 - const __m512i rhs_hbit_2367ABEF_51 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_2367ABEF_3, 4), m2_expanded), 4); //Index : 88 - 95 - const __m512i rhs_hbit_2367ABEF_71 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_2367ABEF_3, 6), m2_expanded), 4); //Index : 120 - 127 + const __m512i rhs_hbit_2367ABEF_11 = _mm512_slli_epi16(_mm512_and_si512(rhs_raw_hbit_2367ABEF_3, m3bexpanded), 4); //Index : 24 - 31 + const __m512i rhs_hbit_2367ABEF_31 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_2367ABEF_3, 2), m3bexpanded), 4); //Index : 56 - 63 + const __m512i rhs_hbit_2367ABEF_51 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_2367ABEF_3, 4), m3bexpanded), 4); //Index : 88 - 95 + const __m512i rhs_hbit_2367ABEF_71 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_2367ABEF_3, 6), m3bexpanded), 4); //Index : 120 - 127 // 4 bit values are unpacked/denibbled and bitwise or-ed with the hbit values to form the 6 bit quantized values // Comments indicate the indices of elements from individual super block in non interleaved fashion // Index : 0 -7, 64 - 71 - const __m512i rhs_mat_014589CD_00 = _mm512_or_si512(_mm512_and_si512(rhs_raw_lbit_014589CD_0, m4_expanded), rhs_hbit_014589CD_00); - const __m512i rhs_mat_014589CD_40 = _mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_lbit_014589CD_0, 4), m4_expanded), rhs_hbit_014589CD_40); + const __m512i rhs_mat_014589CD_00 = _mm512_or_si512(_mm512_and_si512(rhs_raw_lbit_014589CD_0, m4bexpanded), rhs_hbit_014589CD_00); + const __m512i rhs_mat_014589CD_40 = _mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_lbit_014589CD_0, 4), m4bexpanded), rhs_hbit_014589CD_40); - const __m512i rhs_mat_2367ABEF_00 = _mm512_or_si512(_mm512_and_si512(rhs_raw_lbit_2367ABEF_0, m4_expanded), rhs_hbit_2367ABEF_00); - const __m512i rhs_mat_2367ABEF_40 = _mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_lbit_2367ABEF_0, 4), m4_expanded), rhs_hbit_2367ABEF_40); + const __m512i rhs_mat_2367ABEF_00 = _mm512_or_si512(_mm512_and_si512(rhs_raw_lbit_2367ABEF_0, m4bexpanded), rhs_hbit_2367ABEF_00); + const __m512i rhs_mat_2367ABEF_40 = _mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_lbit_2367ABEF_0, 4), m4bexpanded), rhs_hbit_2367ABEF_40); // Index : 8 - 15, 72 - 79 - const __m512i rhs_mat_014589CD_01 = _mm512_or_si512(_mm512_and_si512(rhs_raw_lbit_014589CD_1, m4_expanded), rhs_hbit_014589CD_01); - const __m512i rhs_mat_014589CD_41 = _mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_lbit_014589CD_1, 4), m4_expanded), rhs_hbit_014589CD_41); + 
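// Note that the 6-bit values rebuilt here stay in their raw unsigned 0..63 form; the
// -32 offset that Q6_K semantics require is applied through the separate products of
// the q8 quants with a vector of 32s (the m32expanded terms elsewhere in this GEMM),
// which are subtracted from the accumulators later. A scalar sketch of that identity
// (illustrative names, assuming plain char types; not the kernel's own code):
//
//   sum_i (q6[i] - 32) * q8[i]  ==  sum_i q6[i] * q8[i]  -  32 * sum_i q8[i]
//
// _mm512_maddubs_epi16 needs an unsigned byte operand, which is why the left term is
// computed on the raw quants and the 32*sum(q8) correction is built separately.
static inline int q6_dot_with_offset(const unsigned char * q6, const signed char * q8, int n) {
    int dot    = 0; // sum of raw 6-bit quants times q8
    int sum_q8 = 0; // plain sum of q8, to be scaled by 32
    for (int i = 0; i < n; i++) {
        dot    += q6[i] * q8[i];
        sum_q8 += q8[i];
    }
    return dot - 32 * sum_q8; // equals sum((q6[i] - 32) * q8[i])
}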
const __m512i rhs_mat_014589CD_01 = _mm512_or_si512(_mm512_and_si512(rhs_raw_lbit_014589CD_1, m4bexpanded), rhs_hbit_014589CD_01); + const __m512i rhs_mat_014589CD_41 = _mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_lbit_014589CD_1, 4), m4bexpanded), rhs_hbit_014589CD_41); - const __m512i rhs_mat_2367ABEF_01 = _mm512_or_si512(_mm512_and_si512(rhs_raw_lbit_2367ABEF_1, m4_expanded), rhs_hbit_2367ABEF_01); - const __m512i rhs_mat_2367ABEF_41 = _mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_lbit_2367ABEF_1, 4), m4_expanded), rhs_hbit_2367ABEF_41); + const __m512i rhs_mat_2367ABEF_01 = _mm512_or_si512(_mm512_and_si512(rhs_raw_lbit_2367ABEF_1, m4bexpanded), rhs_hbit_2367ABEF_01); + const __m512i rhs_mat_2367ABEF_41 = _mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_lbit_2367ABEF_1, 4), m4bexpanded), rhs_hbit_2367ABEF_41); // Index : 16 - 23, 80 - 87 - const __m512i rhs_mat_014589CD_10 = _mm512_or_si512(_mm512_and_si512(rhs_raw_lbit_014589CD_2, m4_expanded), rhs_hbit_014589CD_10); - const __m512i rhs_mat_014589CD_50 = _mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_lbit_014589CD_2, 4), m4_expanded), rhs_hbit_014589CD_50); + const __m512i rhs_mat_014589CD_10 = _mm512_or_si512(_mm512_and_si512(rhs_raw_lbit_014589CD_2, m4bexpanded), rhs_hbit_014589CD_10); + const __m512i rhs_mat_014589CD_50 = _mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_lbit_014589CD_2, 4), m4bexpanded), rhs_hbit_014589CD_50); - const __m512i rhs_mat_2367ABEF_10 = _mm512_or_si512(_mm512_and_si512(rhs_raw_lbit_2367ABEF_2, m4_expanded), rhs_hbit_2367ABEF_10); - const __m512i rhs_mat_2367ABEF_50 = _mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_lbit_2367ABEF_2, 4), m4_expanded), rhs_hbit_2367ABEF_50); + const __m512i rhs_mat_2367ABEF_10 = _mm512_or_si512(_mm512_and_si512(rhs_raw_lbit_2367ABEF_2, m4bexpanded), rhs_hbit_2367ABEF_10); + const __m512i rhs_mat_2367ABEF_50 = _mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_lbit_2367ABEF_2, 4), m4bexpanded), rhs_hbit_2367ABEF_50); // Index : 24 - 31, 88 - 95 - const __m512i rhs_mat_014589CD_11 = _mm512_or_si512(_mm512_and_si512(rhs_raw_lbit_014589CD_3, m4_expanded), rhs_hbit_014589CD_11); - const __m512i rhs_mat_014589CD_51 = _mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_lbit_014589CD_3, 4), m4_expanded), rhs_hbit_014589CD_51); + const __m512i rhs_mat_014589CD_11 = _mm512_or_si512(_mm512_and_si512(rhs_raw_lbit_014589CD_3, m4bexpanded), rhs_hbit_014589CD_11); + const __m512i rhs_mat_014589CD_51 = _mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_lbit_014589CD_3, 4), m4bexpanded), rhs_hbit_014589CD_51); - const __m512i rhs_mat_2367ABEF_11 = _mm512_or_si512(_mm512_and_si512(rhs_raw_lbit_2367ABEF_3, m4_expanded), rhs_hbit_2367ABEF_11); - const __m512i rhs_mat_2367ABEF_51 = _mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_lbit_2367ABEF_3, 4), m4_expanded), rhs_hbit_2367ABEF_51); + const __m512i rhs_mat_2367ABEF_11 = _mm512_or_si512(_mm512_and_si512(rhs_raw_lbit_2367ABEF_3, m4bexpanded), rhs_hbit_2367ABEF_11); + const __m512i rhs_mat_2367ABEF_51 = _mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_lbit_2367ABEF_3, 4), m4bexpanded), rhs_hbit_2367ABEF_51); // Index : 32 - 39, 96 - 103 - const __m512i rhs_mat_014589CD_20 = _mm512_or_si512(_mm512_and_si512(rhs_raw_lbit_014589CD_4, m4_expanded), rhs_hbit_014589CD_20); - const __m512i rhs_mat_014589CD_60 = _mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_lbit_014589CD_4, 4), m4_expanded), rhs_hbit_014589CD_60); + const __m512i 
rhs_mat_014589CD_20 = _mm512_or_si512(_mm512_and_si512(rhs_raw_lbit_014589CD_4, m4bexpanded), rhs_hbit_014589CD_20); + const __m512i rhs_mat_014589CD_60 = _mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_lbit_014589CD_4, 4), m4bexpanded), rhs_hbit_014589CD_60); - const __m512i rhs_mat_2367ABEF_20 = _mm512_or_si512(_mm512_and_si512(rhs_raw_lbit_2367ABEF_4, m4_expanded), rhs_hbit_2367ABEF_20); - const __m512i rhs_mat_2367ABEF_60 = _mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_lbit_2367ABEF_4, 4), m4_expanded), rhs_hbit_2367ABEF_60); + const __m512i rhs_mat_2367ABEF_20 = _mm512_or_si512(_mm512_and_si512(rhs_raw_lbit_2367ABEF_4, m4bexpanded), rhs_hbit_2367ABEF_20); + const __m512i rhs_mat_2367ABEF_60 = _mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_lbit_2367ABEF_4, 4), m4bexpanded), rhs_hbit_2367ABEF_60); // Index : 40 - 47, 104 - 111 - const __m512i rhs_mat_014589CD_21 = _mm512_or_si512(_mm512_and_si512(rhs_raw_lbit_014589CD_5, m4_expanded), rhs_hbit_014589CD_21); - const __m512i rhs_mat_014589CD_61 = _mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_lbit_014589CD_5, 4), m4_expanded), rhs_hbit_014589CD_61); + const __m512i rhs_mat_014589CD_21 = _mm512_or_si512(_mm512_and_si512(rhs_raw_lbit_014589CD_5, m4bexpanded), rhs_hbit_014589CD_21); + const __m512i rhs_mat_014589CD_61 = _mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_lbit_014589CD_5, 4), m4bexpanded), rhs_hbit_014589CD_61); - const __m512i rhs_mat_2367ABEF_21 = _mm512_or_si512(_mm512_and_si512(rhs_raw_lbit_2367ABEF_5, m4_expanded), rhs_hbit_2367ABEF_21); - const __m512i rhs_mat_2367ABEF_61 = _mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_lbit_2367ABEF_5, 4), m4_expanded), rhs_hbit_2367ABEF_61); + const __m512i rhs_mat_2367ABEF_21 = _mm512_or_si512(_mm512_and_si512(rhs_raw_lbit_2367ABEF_5, m4bexpanded), rhs_hbit_2367ABEF_21); + const __m512i rhs_mat_2367ABEF_61 = _mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_lbit_2367ABEF_5, 4), m4bexpanded), rhs_hbit_2367ABEF_61); // Index : 48 - 55, 112 - 119 - const __m512i rhs_mat_014589CD_30 = _mm512_or_si512(_mm512_and_si512(rhs_raw_lbit_014589CD_6, m4_expanded), rhs_hbit_014589CD_30); - const __m512i rhs_mat_014589CD_70 = _mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_lbit_014589CD_6, 4), m4_expanded), rhs_hbit_014589CD_70); + const __m512i rhs_mat_014589CD_30 = _mm512_or_si512(_mm512_and_si512(rhs_raw_lbit_014589CD_6, m4bexpanded), rhs_hbit_014589CD_30); + const __m512i rhs_mat_014589CD_70 = _mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_lbit_014589CD_6, 4), m4bexpanded), rhs_hbit_014589CD_70); - const __m512i rhs_mat_2367ABEF_30 = _mm512_or_si512(_mm512_and_si512(rhs_raw_lbit_2367ABEF_6, m4_expanded), rhs_hbit_2367ABEF_30); - const __m512i rhs_mat_2367ABEF_70 = _mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_lbit_2367ABEF_6, 4), m4_expanded), rhs_hbit_2367ABEF_70); + const __m512i rhs_mat_2367ABEF_30 = _mm512_or_si512(_mm512_and_si512(rhs_raw_lbit_2367ABEF_6, m4bexpanded), rhs_hbit_2367ABEF_30); + const __m512i rhs_mat_2367ABEF_70 = _mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_lbit_2367ABEF_6, 4), m4bexpanded), rhs_hbit_2367ABEF_70); // Index : 56 - 63, 120 - 127 - const __m512i rhs_mat_014589CD_31 = _mm512_or_si512(_mm512_and_si512(rhs_raw_lbit_014589CD_7, m4_expanded), rhs_hbit_014589CD_31); - const __m512i rhs_mat_014589CD_71 = _mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_lbit_014589CD_7, 4), m4_expanded), rhs_hbit_014589CD_71); + const __m512i 
rhs_mat_014589CD_31 = _mm512_or_si512(_mm512_and_si512(rhs_raw_lbit_014589CD_7, m4bexpanded), rhs_hbit_014589CD_31); + const __m512i rhs_mat_014589CD_71 = _mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_lbit_014589CD_7, 4), m4bexpanded), rhs_hbit_014589CD_71); - const __m512i rhs_mat_2367ABEF_31 = _mm512_or_si512(_mm512_and_si512(rhs_raw_lbit_2367ABEF_7, m4_expanded), rhs_hbit_2367ABEF_31); - const __m512i rhs_mat_2367ABEF_71 = _mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_lbit_2367ABEF_7, 4), m4_expanded), rhs_hbit_2367ABEF_71); + const __m512i rhs_mat_2367ABEF_31 = _mm512_or_si512(_mm512_and_si512(rhs_raw_lbit_2367ABEF_7, m4bexpanded), rhs_hbit_2367ABEF_31); + const __m512i rhs_mat_2367ABEF_71 = _mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_lbit_2367ABEF_7, 4), m4bexpanded), rhs_hbit_2367ABEF_71); // Shuffle pattern one - right side input const __m512i rhs_mat_014589CD_00_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_00, (_MM_PERM_ENUM)136); //B00(0-3) B01(0-3) B00(0-3) B01(0-3) B04(0-3) B05(0-3) B04(0-3) B05(0-3) B08(0-3) B09(0-3) B08(0-3) B09(0-3) B0C(0-3) B0D(0-3) B0C(0-3) B0D(0-3) @@ -8216,38 +8216,38 @@ void ggml_gemm_q6_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo __m512i lhs_mat_23_71 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_23_71), lhs_mat_ymm_23_71, 1); // Multiply Q8 quants with bytes valued 32 - Subtracted later as an adjustment for 6 bit quantization - __m512i lhs_mat_s_01_00 = _mm512_maddubs_epi16(m32s_expanded, lhs_mat_01_00); - __m512i lhs_mat_s_23_00 = _mm512_maddubs_epi16(m32s_expanded, lhs_mat_23_00); - __m512i lhs_mat_s_01_01 = _mm512_maddubs_epi16(m32s_expanded, lhs_mat_01_01); - __m512i lhs_mat_s_23_01 = _mm512_maddubs_epi16(m32s_expanded, lhs_mat_23_01); - __m512i lhs_mat_s_01_10 = _mm512_maddubs_epi16(m32s_expanded, lhs_mat_01_10); - __m512i lhs_mat_s_23_10 = _mm512_maddubs_epi16(m32s_expanded, lhs_mat_23_10); - __m512i lhs_mat_s_01_11 = _mm512_maddubs_epi16(m32s_expanded, lhs_mat_01_11); - __m512i lhs_mat_s_23_11 = _mm512_maddubs_epi16(m32s_expanded, lhs_mat_23_11); - __m512i lhs_mat_s_01_20 = _mm512_maddubs_epi16(m32s_expanded, lhs_mat_01_20); - __m512i lhs_mat_s_23_20 = _mm512_maddubs_epi16(m32s_expanded, lhs_mat_23_20); - __m512i lhs_mat_s_01_21 = _mm512_maddubs_epi16(m32s_expanded, lhs_mat_01_21); - __m512i lhs_mat_s_23_21 = _mm512_maddubs_epi16(m32s_expanded, lhs_mat_23_21); - __m512i lhs_mat_s_01_30 = _mm512_maddubs_epi16(m32s_expanded, lhs_mat_01_30); - __m512i lhs_mat_s_23_30 = _mm512_maddubs_epi16(m32s_expanded, lhs_mat_23_30); - __m512i lhs_mat_s_01_31 = _mm512_maddubs_epi16(m32s_expanded, lhs_mat_01_31); - __m512i lhs_mat_s_23_31 = _mm512_maddubs_epi16(m32s_expanded, lhs_mat_23_31); - __m512i lhs_mat_s_01_40 = _mm512_maddubs_epi16(m32s_expanded, lhs_mat_01_40); - __m512i lhs_mat_s_23_40 = _mm512_maddubs_epi16(m32s_expanded, lhs_mat_23_40); - __m512i lhs_mat_s_01_41 = _mm512_maddubs_epi16(m32s_expanded, lhs_mat_01_41); - __m512i lhs_mat_s_23_41 = _mm512_maddubs_epi16(m32s_expanded, lhs_mat_23_41); - __m512i lhs_mat_s_01_50 = _mm512_maddubs_epi16(m32s_expanded, lhs_mat_01_50); - __m512i lhs_mat_s_23_50 = _mm512_maddubs_epi16(m32s_expanded, lhs_mat_23_50); - __m512i lhs_mat_s_01_51 = _mm512_maddubs_epi16(m32s_expanded, lhs_mat_01_51); - __m512i lhs_mat_s_23_51 = _mm512_maddubs_epi16(m32s_expanded, lhs_mat_23_51); - __m512i lhs_mat_s_01_60 = _mm512_maddubs_epi16(m32s_expanded, lhs_mat_01_60); - __m512i lhs_mat_s_23_60 = _mm512_maddubs_epi16(m32s_expanded, lhs_mat_23_60); - __m512i 
lhs_mat_s_01_61 = _mm512_maddubs_epi16(m32s_expanded, lhs_mat_01_61); - __m512i lhs_mat_s_23_61 = _mm512_maddubs_epi16(m32s_expanded, lhs_mat_23_61); - __m512i lhs_mat_s_01_70 = _mm512_maddubs_epi16(m32s_expanded, lhs_mat_01_70); - __m512i lhs_mat_s_23_70 = _mm512_maddubs_epi16(m32s_expanded, lhs_mat_23_70); - __m512i lhs_mat_s_01_71 = _mm512_maddubs_epi16(m32s_expanded, lhs_mat_01_71); - __m512i lhs_mat_s_23_71 = _mm512_maddubs_epi16(m32s_expanded, lhs_mat_23_71); + __m512i lhs_mat_s_01_00 = _mm512_maddubs_epi16(m32expanded, lhs_mat_01_00); + __m512i lhs_mat_s_23_00 = _mm512_maddubs_epi16(m32expanded, lhs_mat_23_00); + __m512i lhs_mat_s_01_01 = _mm512_maddubs_epi16(m32expanded, lhs_mat_01_01); + __m512i lhs_mat_s_23_01 = _mm512_maddubs_epi16(m32expanded, lhs_mat_23_01); + __m512i lhs_mat_s_01_10 = _mm512_maddubs_epi16(m32expanded, lhs_mat_01_10); + __m512i lhs_mat_s_23_10 = _mm512_maddubs_epi16(m32expanded, lhs_mat_23_10); + __m512i lhs_mat_s_01_11 = _mm512_maddubs_epi16(m32expanded, lhs_mat_01_11); + __m512i lhs_mat_s_23_11 = _mm512_maddubs_epi16(m32expanded, lhs_mat_23_11); + __m512i lhs_mat_s_01_20 = _mm512_maddubs_epi16(m32expanded, lhs_mat_01_20); + __m512i lhs_mat_s_23_20 = _mm512_maddubs_epi16(m32expanded, lhs_mat_23_20); + __m512i lhs_mat_s_01_21 = _mm512_maddubs_epi16(m32expanded, lhs_mat_01_21); + __m512i lhs_mat_s_23_21 = _mm512_maddubs_epi16(m32expanded, lhs_mat_23_21); + __m512i lhs_mat_s_01_30 = _mm512_maddubs_epi16(m32expanded, lhs_mat_01_30); + __m512i lhs_mat_s_23_30 = _mm512_maddubs_epi16(m32expanded, lhs_mat_23_30); + __m512i lhs_mat_s_01_31 = _mm512_maddubs_epi16(m32expanded, lhs_mat_01_31); + __m512i lhs_mat_s_23_31 = _mm512_maddubs_epi16(m32expanded, lhs_mat_23_31); + __m512i lhs_mat_s_01_40 = _mm512_maddubs_epi16(m32expanded, lhs_mat_01_40); + __m512i lhs_mat_s_23_40 = _mm512_maddubs_epi16(m32expanded, lhs_mat_23_40); + __m512i lhs_mat_s_01_41 = _mm512_maddubs_epi16(m32expanded, lhs_mat_01_41); + __m512i lhs_mat_s_23_41 = _mm512_maddubs_epi16(m32expanded, lhs_mat_23_41); + __m512i lhs_mat_s_01_50 = _mm512_maddubs_epi16(m32expanded, lhs_mat_01_50); + __m512i lhs_mat_s_23_50 = _mm512_maddubs_epi16(m32expanded, lhs_mat_23_50); + __m512i lhs_mat_s_01_51 = _mm512_maddubs_epi16(m32expanded, lhs_mat_01_51); + __m512i lhs_mat_s_23_51 = _mm512_maddubs_epi16(m32expanded, lhs_mat_23_51); + __m512i lhs_mat_s_01_60 = _mm512_maddubs_epi16(m32expanded, lhs_mat_01_60); + __m512i lhs_mat_s_23_60 = _mm512_maddubs_epi16(m32expanded, lhs_mat_23_60); + __m512i lhs_mat_s_01_61 = _mm512_maddubs_epi16(m32expanded, lhs_mat_01_61); + __m512i lhs_mat_s_23_61 = _mm512_maddubs_epi16(m32expanded, lhs_mat_23_61); + __m512i lhs_mat_s_01_70 = _mm512_maddubs_epi16(m32expanded, lhs_mat_01_70); + __m512i lhs_mat_s_23_70 = _mm512_maddubs_epi16(m32expanded, lhs_mat_23_70); + __m512i lhs_mat_s_01_71 = _mm512_maddubs_epi16(m32expanded, lhs_mat_01_71); + __m512i lhs_mat_s_23_71 = _mm512_maddubs_epi16(m32expanded, lhs_mat_23_71); // Shuffle pattern one – left-side input @@ -8752,105 +8752,105 @@ void ggml_gemm_q6_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo // 2-bit -> 8-bit // Values of the 0th,2nd,4th,6th sub blocks of eight block_q6_K structures for the sb loop - const __m256i rhs_hbit_0145_00 = _mm256_slli_epi16(_mm256_and_si256(rhs_raw_hbit_0145_0, m2), 4); //Index : 0 - 7 - const __m256i rhs_hbit_0145_20 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0145_0, 2), m2), 4); //Index : 32 - 39 - const __m256i rhs_hbit_0145_40 = 
_mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0145_0, 4), m2), 4); //Index : 64 - 71 - const __m256i rhs_hbit_0145_60 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0145_0, 6), m2), 4); //Index : 96 - 103 + const __m256i rhs_hbit_0145_00 = _mm256_slli_epi16(_mm256_and_si256(rhs_raw_hbit_0145_0, m3b), 4); //Index : 0 - 7 + const __m256i rhs_hbit_0145_20 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0145_0, 2), m3b), 4); //Index : 32 - 39 + const __m256i rhs_hbit_0145_40 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0145_0, 4), m3b), 4); //Index : 64 - 71 + const __m256i rhs_hbit_0145_60 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0145_0, 6), m3b), 4); //Index : 96 - 103 - const __m256i rhs_hbit_2367_00 = _mm256_slli_epi16(_mm256_and_si256(rhs_raw_hbit_2367_0, m2), 4); //Index : 0 - 7 - const __m256i rhs_hbit_2367_20 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_2367_0, 2), m2), 4); //Index : 32 - 39 - const __m256i rhs_hbit_2367_40 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_2367_0, 4), m2), 4); //Index : 64 - 71 - const __m256i rhs_hbit_2367_60 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_2367_0, 6), m2), 4); //Index : 96 - 103 + const __m256i rhs_hbit_2367_00 = _mm256_slli_epi16(_mm256_and_si256(rhs_raw_hbit_2367_0, m3b), 4); //Index : 0 - 7 + const __m256i rhs_hbit_2367_20 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_2367_0, 2), m3b), 4); //Index : 32 - 39 + const __m256i rhs_hbit_2367_40 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_2367_0, 4), m3b), 4); //Index : 64 - 71 + const __m256i rhs_hbit_2367_60 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_2367_0, 6), m3b), 4); //Index : 96 - 103 - const __m256i rhs_hbit_0145_01 = _mm256_slli_epi16(_mm256_and_si256(rhs_raw_hbit_0145_1, m2), 4); //Index : 8 - 15 - const __m256i rhs_hbit_0145_21 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0145_1, 2), m2), 4); //Index : 40 - 47 - const __m256i rhs_hbit_0145_41 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0145_1, 4), m2), 4); //Index : 72 - 79 - const __m256i rhs_hbit_0145_61 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0145_1, 6), m2), 4); //Index : 104 - 111 + const __m256i rhs_hbit_0145_01 = _mm256_slli_epi16(_mm256_and_si256(rhs_raw_hbit_0145_1, m3b), 4); //Index : 8 - 15 + const __m256i rhs_hbit_0145_21 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0145_1, 2), m3b), 4); //Index : 40 - 47 + const __m256i rhs_hbit_0145_41 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0145_1, 4), m3b), 4); //Index : 72 - 79 + const __m256i rhs_hbit_0145_61 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0145_1, 6), m3b), 4); //Index : 104 - 111 - const __m256i rhs_hbit_2367_01 = _mm256_slli_epi16(_mm256_and_si256(rhs_raw_hbit_2367_1, m2), 4); //Index : 8 - 15 - const __m256i rhs_hbit_2367_21 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_2367_1, 2), m2), 4); //Index : 40 - 47 - const __m256i rhs_hbit_2367_41 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_2367_1, 4), m2), 4); //Index : 72 - 79 - const __m256i rhs_hbit_2367_61 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_2367_1, 6), m2), 4); //Index : 104 - 111 + const __m256i rhs_hbit_2367_01 = 
_mm256_slli_epi16(_mm256_and_si256(rhs_raw_hbit_2367_1, m3b), 4); //Index : 8 - 15 + const __m256i rhs_hbit_2367_21 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_2367_1, 2), m3b), 4); //Index : 40 - 47 + const __m256i rhs_hbit_2367_41 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_2367_1, 4), m3b), 4); //Index : 72 - 79 + const __m256i rhs_hbit_2367_61 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_2367_1, 6), m3b), 4); //Index : 104 - 111 // Values of the 1st,3rd,5th,7th sub blocks of eight block_q6_K structures for the sb loop - const __m256i rhs_hbit_0145_10 = _mm256_slli_epi16(_mm256_and_si256(rhs_raw_hbit_0145_2, m2), 4); //Index : 16 - 23 - const __m256i rhs_hbit_0145_30 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0145_2, 2), m2), 4); //Index : 48 - 55 - const __m256i rhs_hbit_0145_50 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0145_2, 4), m2), 4); //Index : 80 - 87 - const __m256i rhs_hbit_0145_70 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0145_2, 6), m2), 4); //Index : 112 - 119 + const __m256i rhs_hbit_0145_10 = _mm256_slli_epi16(_mm256_and_si256(rhs_raw_hbit_0145_2, m3b), 4); //Index : 16 - 23 + const __m256i rhs_hbit_0145_30 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0145_2, 2), m3b), 4); //Index : 48 - 55 + const __m256i rhs_hbit_0145_50 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0145_2, 4), m3b), 4); //Index : 80 - 87 + const __m256i rhs_hbit_0145_70 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0145_2, 6), m3b), 4); //Index : 112 - 119 - const __m256i rhs_hbit_2367_10 = _mm256_slli_epi16(_mm256_and_si256(rhs_raw_hbit_2367_2, m2), 4); //Index : 16 - 23 - const __m256i rhs_hbit_2367_30 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_2367_2, 2), m2), 4); //Index : 48 - 55 - const __m256i rhs_hbit_2367_50 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_2367_2, 4), m2), 4); //Index : 80 - 87 - const __m256i rhs_hbit_2367_70 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_2367_2, 6), m2), 4); //Index : 112 - 119 + const __m256i rhs_hbit_2367_10 = _mm256_slli_epi16(_mm256_and_si256(rhs_raw_hbit_2367_2, m3b), 4); //Index : 16 - 23 + const __m256i rhs_hbit_2367_30 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_2367_2, 2), m3b), 4); //Index : 48 - 55 + const __m256i rhs_hbit_2367_50 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_2367_2, 4), m3b), 4); //Index : 80 - 87 + const __m256i rhs_hbit_2367_70 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_2367_2, 6), m3b), 4); //Index : 112 - 119 - const __m256i rhs_hbit_0145_11 = _mm256_slli_epi16(_mm256_and_si256(rhs_raw_hbit_0145_3, m2), 4); //Index : 24 - 31 - const __m256i rhs_hbit_0145_31 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0145_3, 2), m2), 4); //Index : 56 - 63 - const __m256i rhs_hbit_0145_51 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0145_3, 4), m2), 4); //Index : 88 - 95 - const __m256i rhs_hbit_0145_71 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0145_3, 6), m2), 4); //Index : 120 - 127 + const __m256i rhs_hbit_0145_11 = _mm256_slli_epi16(_mm256_and_si256(rhs_raw_hbit_0145_3, m3b), 4); //Index : 24 - 31 + const __m256i rhs_hbit_0145_31 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0145_3, 2), m3b), 4); 
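// This __m256i path mirrors the AVX-512 unpacking above with 256-bit registers.
// A minimal sketch of one sub-block's rebuild, assuming the renamed mask constants
// are the usual per-byte masks (their definitions sit outside this hunk, so the
// values below are assumptions, not taken from the patch):
#include <immintrin.h>
static inline __m256i q6_rebuild_avx2(__m256i ql, __m256i qh) {
    const __m256i m4b = _mm256_set1_epi8(0x0F); // assumed: keeps the low nibble of ql
    const __m256i m3b = _mm256_set1_epi8(0x03); // assumed: keeps one 2-bit field of qh
    const __m256i hi  = _mm256_slli_epi16(_mm256_and_si256(qh, m3b), 4); // hbits to bits 4..5
    return _mm256_or_si256(_mm256_and_si256(ql, m4b), hi);               // raw 6-bit quants, 0..63
}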
//Index : 56 - 63 + const __m256i rhs_hbit_0145_51 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0145_3, 4), m3b), 4); //Index : 88 - 95 + const __m256i rhs_hbit_0145_71 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0145_3, 6), m3b), 4); //Index : 120 - 127 - const __m256i rhs_hbit_2367_11 = _mm256_slli_epi16(_mm256_and_si256(rhs_raw_hbit_2367_3, m2), 4); //Index : 24 - 31 - const __m256i rhs_hbit_2367_31 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_2367_3, 2), m2), 4); //Index : 56 - 63 - const __m256i rhs_hbit_2367_51 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_2367_3, 4), m2), 4); //Index : 88 - 95 - const __m256i rhs_hbit_2367_71 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_2367_3, 6), m2), 4); //Index : 120 - 127 + const __m256i rhs_hbit_2367_11 = _mm256_slli_epi16(_mm256_and_si256(rhs_raw_hbit_2367_3, m3b), 4); //Index : 24 - 31 + const __m256i rhs_hbit_2367_31 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_2367_3, 2), m3b), 4); //Index : 56 - 63 + const __m256i rhs_hbit_2367_51 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_2367_3, 4), m3b), 4); //Index : 88 - 95 + const __m256i rhs_hbit_2367_71 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_2367_3, 6), m3b), 4); //Index : 120 - 127 // 4 bit values are unpacked/denibbled and bitwise or-ed with the hbit values to form the 6 bit quantized values // Comments indicate the indices of elements from individual super block in non interleaved fashion // Index : 0 -7, 64 - 71 - const __m256i rhs_mat_0145_00 = _mm256_or_si256(_mm256_and_si256(rhs_raw_lbit_0145_0, m4), rhs_hbit_0145_00); - const __m256i rhs_mat_0145_40 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_lbit_0145_0, 4), m4), rhs_hbit_0145_40); + const __m256i rhs_mat_0145_00 = _mm256_or_si256(_mm256_and_si256(rhs_raw_lbit_0145_0, m4b), rhs_hbit_0145_00); + const __m256i rhs_mat_0145_40 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_lbit_0145_0, 4), m4b), rhs_hbit_0145_40); - const __m256i rhs_mat_2367_00 = _mm256_or_si256(_mm256_and_si256(rhs_raw_lbit_2367_0, m4), rhs_hbit_2367_00); - const __m256i rhs_mat_2367_40 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_lbit_2367_0, 4), m4), rhs_hbit_2367_40); + const __m256i rhs_mat_2367_00 = _mm256_or_si256(_mm256_and_si256(rhs_raw_lbit_2367_0, m4b), rhs_hbit_2367_00); + const __m256i rhs_mat_2367_40 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_lbit_2367_0, 4), m4b), rhs_hbit_2367_40); // Index : 8 - 15, 72 - 79 - const __m256i rhs_mat_0145_01 = _mm256_or_si256(_mm256_and_si256(rhs_raw_lbit_0145_1, m4), rhs_hbit_0145_01); - const __m256i rhs_mat_0145_41 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_lbit_0145_1, 4), m4), rhs_hbit_0145_41); + const __m256i rhs_mat_0145_01 = _mm256_or_si256(_mm256_and_si256(rhs_raw_lbit_0145_1, m4b), rhs_hbit_0145_01); + const __m256i rhs_mat_0145_41 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_lbit_0145_1, 4), m4b), rhs_hbit_0145_41); - const __m256i rhs_mat_2367_01 = _mm256_or_si256(_mm256_and_si256(rhs_raw_lbit_2367_1, m4), rhs_hbit_2367_01); - const __m256i rhs_mat_2367_41 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_lbit_2367_1, 4), m4), rhs_hbit_2367_41); + const __m256i rhs_mat_2367_01 = _mm256_or_si256(_mm256_and_si256(rhs_raw_lbit_2367_1, m4b), rhs_hbit_2367_01); + const __m256i rhs_mat_2367_41 = 
_mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_lbit_2367_1, 4), m4b), rhs_hbit_2367_41); // Index : 16 - 23, 80 - 87 - const __m256i rhs_mat_0145_10 = _mm256_or_si256(_mm256_and_si256(rhs_raw_lbit_0145_2, m4), rhs_hbit_0145_10); - const __m256i rhs_mat_0145_50 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_lbit_0145_2, 4), m4), rhs_hbit_0145_50); + const __m256i rhs_mat_0145_10 = _mm256_or_si256(_mm256_and_si256(rhs_raw_lbit_0145_2, m4b), rhs_hbit_0145_10); + const __m256i rhs_mat_0145_50 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_lbit_0145_2, 4), m4b), rhs_hbit_0145_50); - const __m256i rhs_mat_2367_10 = _mm256_or_si256(_mm256_and_si256(rhs_raw_lbit_2367_2, m4), rhs_hbit_2367_10); - const __m256i rhs_mat_2367_50 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_lbit_2367_2, 4), m4), rhs_hbit_2367_50); + const __m256i rhs_mat_2367_10 = _mm256_or_si256(_mm256_and_si256(rhs_raw_lbit_2367_2, m4b), rhs_hbit_2367_10); + const __m256i rhs_mat_2367_50 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_lbit_2367_2, 4), m4b), rhs_hbit_2367_50); // Index : 24 - 31, 88 - 95 - const __m256i rhs_mat_0145_11 = _mm256_or_si256(_mm256_and_si256(rhs_raw_lbit_0145_3, m4), rhs_hbit_0145_11); - const __m256i rhs_mat_0145_51 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_lbit_0145_3, 4), m4), rhs_hbit_0145_51); + const __m256i rhs_mat_0145_11 = _mm256_or_si256(_mm256_and_si256(rhs_raw_lbit_0145_3, m4b), rhs_hbit_0145_11); + const __m256i rhs_mat_0145_51 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_lbit_0145_3, 4), m4b), rhs_hbit_0145_51); - const __m256i rhs_mat_2367_11 = _mm256_or_si256(_mm256_and_si256(rhs_raw_lbit_2367_3, m4), rhs_hbit_2367_11); - const __m256i rhs_mat_2367_51 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_lbit_2367_3, 4), m4), rhs_hbit_2367_51); + const __m256i rhs_mat_2367_11 = _mm256_or_si256(_mm256_and_si256(rhs_raw_lbit_2367_3, m4b), rhs_hbit_2367_11); + const __m256i rhs_mat_2367_51 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_lbit_2367_3, 4), m4b), rhs_hbit_2367_51); // Index : 32 - 39, 96 - 103 - const __m256i rhs_mat_0145_20 = _mm256_or_si256(_mm256_and_si256(rhs_raw_lbit_0145_4, m4), rhs_hbit_0145_20); - const __m256i rhs_mat_0145_60 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_lbit_0145_4, 4), m4), rhs_hbit_0145_60); + const __m256i rhs_mat_0145_20 = _mm256_or_si256(_mm256_and_si256(rhs_raw_lbit_0145_4, m4b), rhs_hbit_0145_20); + const __m256i rhs_mat_0145_60 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_lbit_0145_4, 4), m4b), rhs_hbit_0145_60); - const __m256i rhs_mat_2367_20 = _mm256_or_si256(_mm256_and_si256(rhs_raw_lbit_2367_4, m4), rhs_hbit_2367_20); - const __m256i rhs_mat_2367_60 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_lbit_2367_4, 4), m4), rhs_hbit_2367_60); + const __m256i rhs_mat_2367_20 = _mm256_or_si256(_mm256_and_si256(rhs_raw_lbit_2367_4, m4b), rhs_hbit_2367_20); + const __m256i rhs_mat_2367_60 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_lbit_2367_4, 4), m4b), rhs_hbit_2367_60); // Index : 40 - 47, 104 - 111 - const __m256i rhs_mat_0145_21 = _mm256_or_si256(_mm256_and_si256(rhs_raw_lbit_0145_5, m4), rhs_hbit_0145_21); - const __m256i rhs_mat_0145_61 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_lbit_0145_5, 4), m4), rhs_hbit_0145_61); + const __m256i rhs_mat_0145_21 = _mm256_or_si256(_mm256_and_si256(rhs_raw_lbit_0145_5, m4b), rhs_hbit_0145_21); + const __m256i 
rhs_mat_0145_61 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_lbit_0145_5, 4), m4b), rhs_hbit_0145_61); - const __m256i rhs_mat_2367_21 = _mm256_or_si256(_mm256_and_si256(rhs_raw_lbit_2367_5, m4), rhs_hbit_2367_21); - const __m256i rhs_mat_2367_61 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_lbit_2367_5, 4), m4), rhs_hbit_2367_61); + const __m256i rhs_mat_2367_21 = _mm256_or_si256(_mm256_and_si256(rhs_raw_lbit_2367_5, m4b), rhs_hbit_2367_21); + const __m256i rhs_mat_2367_61 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_lbit_2367_5, 4), m4b), rhs_hbit_2367_61); // Index : 48 - 55, 112 - 119 - const __m256i rhs_mat_0145_30 = _mm256_or_si256(_mm256_and_si256(rhs_raw_lbit_0145_6, m4), rhs_hbit_0145_30); - const __m256i rhs_mat_0145_70 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_lbit_0145_6, 4), m4), rhs_hbit_0145_70); + const __m256i rhs_mat_0145_30 = _mm256_or_si256(_mm256_and_si256(rhs_raw_lbit_0145_6, m4b), rhs_hbit_0145_30); + const __m256i rhs_mat_0145_70 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_lbit_0145_6, 4), m4b), rhs_hbit_0145_70); - const __m256i rhs_mat_2367_30 = _mm256_or_si256(_mm256_and_si256(rhs_raw_lbit_2367_6, m4), rhs_hbit_2367_30); - const __m256i rhs_mat_2367_70 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_lbit_2367_6, 4), m4), rhs_hbit_2367_70); + const __m256i rhs_mat_2367_30 = _mm256_or_si256(_mm256_and_si256(rhs_raw_lbit_2367_6, m4b), rhs_hbit_2367_30); + const __m256i rhs_mat_2367_70 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_lbit_2367_6, 4), m4b), rhs_hbit_2367_70); // Index : 56 - 63, 120 - 127 - const __m256i rhs_mat_0145_31 = _mm256_or_si256(_mm256_and_si256(rhs_raw_lbit_0145_7, m4), rhs_hbit_0145_31); - const __m256i rhs_mat_0145_71 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_lbit_0145_7, 4), m4), rhs_hbit_0145_71); + const __m256i rhs_mat_0145_31 = _mm256_or_si256(_mm256_and_si256(rhs_raw_lbit_0145_7, m4b), rhs_hbit_0145_31); + const __m256i rhs_mat_0145_71 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_lbit_0145_7, 4), m4b), rhs_hbit_0145_71); - const __m256i rhs_mat_2367_31 = _mm256_or_si256(_mm256_and_si256(rhs_raw_lbit_2367_7, m4), rhs_hbit_2367_31); - const __m256i rhs_mat_2367_71 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_lbit_2367_7, 4), m4), rhs_hbit_2367_71); + const __m256i rhs_mat_2367_31 = _mm256_or_si256(_mm256_and_si256(rhs_raw_lbit_2367_7, m4b), rhs_hbit_2367_31); + const __m256i rhs_mat_2367_71 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_lbit_2367_7, 4), m4b), rhs_hbit_2367_71); // Shuffle pattern one - right side input const __m256i rhs_mat_0145_00_sp1 = _mm256_shuffle_epi32(rhs_mat_0145_00, 136); //B00(0-3) B01(0-3) B00(0-3) B01(0-3) B04(0-3) B05(0-3) B04(0-3) B05(0-3) @@ -9562,103 +9562,103 @@ void ggml_gemm_q6_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo // 2-bit -> 8-bit // Values of the 0th,2nd,4th,6th sub blocks of eight block_q6_K structures for the sb loop - const __m256i rhs_hbit_0145_00 = _mm256_slli_epi16(_mm256_and_si256(rhs_raw_hbit_0145_0, m2), 4); //Index : 0 - 7 - const __m256i rhs_hbit_0145_20 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0145_0, 2), m2), 4); //Index : 32 - 39 - const __m256i rhs_hbit_0145_40 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0145_0, 4), m2), 4); //Index : 64 - 71 - const __m256i rhs_hbit_0145_60 = 
_mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0145_0, 6), m2), 4); //Index : 96 - 103 + const __m256i rhs_hbit_0145_00 = _mm256_slli_epi16(_mm256_and_si256(rhs_raw_hbit_0145_0, m3b), 4); //Index : 0 - 7 + const __m256i rhs_hbit_0145_20 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0145_0, 2), m3b), 4); //Index : 32 - 39 + const __m256i rhs_hbit_0145_40 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0145_0, 4), m3b), 4); //Index : 64 - 71 + const __m256i rhs_hbit_0145_60 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0145_0, 6), m3b), 4); //Index : 96 - 103 - const __m256i rhs_hbit_2367_00 = _mm256_slli_epi16(_mm256_and_si256(rhs_raw_hbit_2367_0, m2), 4); //Index : 0 - 7 - const __m256i rhs_hbit_2367_20 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_2367_0, 2), m2), 4); //Index : 32 - 39 - const __m256i rhs_hbit_2367_40 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_2367_0, 4), m2), 4); //Index : 64 - 71 - const __m256i rhs_hbit_2367_60 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_2367_0, 6), m2), 4); //Index : 96 - 103 + const __m256i rhs_hbit_2367_00 = _mm256_slli_epi16(_mm256_and_si256(rhs_raw_hbit_2367_0, m3b), 4); //Index : 0 - 7 + const __m256i rhs_hbit_2367_20 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_2367_0, 2), m3b), 4); //Index : 32 - 39 + const __m256i rhs_hbit_2367_40 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_2367_0, 4), m3b), 4); //Index : 64 - 71 + const __m256i rhs_hbit_2367_60 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_2367_0, 6), m3b), 4); //Index : 96 - 103 - const __m256i rhs_hbit_0145_01 = _mm256_slli_epi16(_mm256_and_si256(rhs_raw_hbit_0145_1, m2), 4); //Index : 8 - 15 - const __m256i rhs_hbit_0145_21 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0145_1, 2), m2), 4); //Index : 40 - 47 - const __m256i rhs_hbit_0145_41 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0145_1, 4), m2), 4); //Index : 72 - 79 - const __m256i rhs_hbit_0145_61 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0145_1, 6), m2), 4); //Index : 104 - 111 + const __m256i rhs_hbit_0145_01 = _mm256_slli_epi16(_mm256_and_si256(rhs_raw_hbit_0145_1, m3b), 4); //Index : 8 - 15 + const __m256i rhs_hbit_0145_21 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0145_1, 2), m3b), 4); //Index : 40 - 47 + const __m256i rhs_hbit_0145_41 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0145_1, 4), m3b), 4); //Index : 72 - 79 + const __m256i rhs_hbit_0145_61 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0145_1, 6), m3b), 4); //Index : 104 - 111 - const __m256i rhs_hbit_2367_01 = _mm256_slli_epi16(_mm256_and_si256(rhs_raw_hbit_2367_1, m2), 4); //Index : 8 - 15 - const __m256i rhs_hbit_2367_21 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_2367_1, 2), m2), 4); //Index : 40 - 47 - const __m256i rhs_hbit_2367_41 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_2367_1, 4), m2), 4); //Index : 72 - 79 - const __m256i rhs_hbit_2367_61 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_2367_1, 6), m2), 4); //Index : 104 - 111 + const __m256i rhs_hbit_2367_01 = _mm256_slli_epi16(_mm256_and_si256(rhs_raw_hbit_2367_1, m3b), 4); //Index : 8 - 15 + const __m256i rhs_hbit_2367_21 = 
_mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_2367_1, 2), m3b), 4); //Index : 40 - 47 + const __m256i rhs_hbit_2367_41 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_2367_1, 4), m3b), 4); //Index : 72 - 79 + const __m256i rhs_hbit_2367_61 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_2367_1, 6), m3b), 4); //Index : 104 - 111 // Values of the 1st,3rd,5th,7th sub blocks of eight block_q6_K structures for the sb loop - const __m256i rhs_hbit_0145_10 = _mm256_slli_epi16(_mm256_and_si256(rhs_raw_hbit_0145_2, m2), 4); //Index : 16 - 23 - const __m256i rhs_hbit_0145_30 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0145_2, 2), m2), 4); //Index : 48 - 55 - const __m256i rhs_hbit_0145_50 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0145_2, 4), m2), 4); //Index : 80 - 87 - const __m256i rhs_hbit_0145_70 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0145_2, 6), m2), 4); //Index : 112 - 119 + const __m256i rhs_hbit_0145_10 = _mm256_slli_epi16(_mm256_and_si256(rhs_raw_hbit_0145_2, m3b), 4); //Index : 16 - 23 + const __m256i rhs_hbit_0145_30 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0145_2, 2), m3b), 4); //Index : 48 - 55 + const __m256i rhs_hbit_0145_50 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0145_2, 4), m3b), 4); //Index : 80 - 87 + const __m256i rhs_hbit_0145_70 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0145_2, 6), m3b), 4); //Index : 112 - 119 - const __m256i rhs_hbit_2367_10 = _mm256_slli_epi16(_mm256_and_si256(rhs_raw_hbit_2367_2, m2), 4); //Index : 16 - 23 - const __m256i rhs_hbit_2367_30 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_2367_2, 2), m2), 4); //Index : 48 - 55 - const __m256i rhs_hbit_2367_50 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_2367_2, 4), m2), 4); //Index : 80 - 87 - const __m256i rhs_hbit_2367_70 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_2367_2, 6), m2), 4); //Index : 112 - 119 + const __m256i rhs_hbit_2367_10 = _mm256_slli_epi16(_mm256_and_si256(rhs_raw_hbit_2367_2, m3b), 4); //Index : 16 - 23 + const __m256i rhs_hbit_2367_30 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_2367_2, 2), m3b), 4); //Index : 48 - 55 + const __m256i rhs_hbit_2367_50 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_2367_2, 4), m3b), 4); //Index : 80 - 87 + const __m256i rhs_hbit_2367_70 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_2367_2, 6), m3b), 4); //Index : 112 - 119 - const __m256i rhs_hbit_0145_11 = _mm256_slli_epi16(_mm256_and_si256(rhs_raw_hbit_0145_3, m2), 4); //Index : 24 - 31 - const __m256i rhs_hbit_0145_31 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0145_3, 2), m2), 4); //Index : 56 - 63 - const __m256i rhs_hbit_0145_51 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0145_3, 4), m2), 4); //Index : 88 - 95 - const __m256i rhs_hbit_0145_71 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0145_3, 6), m2), 4); //Index : 120 - 127 + const __m256i rhs_hbit_0145_11 = _mm256_slli_epi16(_mm256_and_si256(rhs_raw_hbit_0145_3, m3b), 4); //Index : 24 - 31 + const __m256i rhs_hbit_0145_31 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0145_3, 2), m3b), 4); //Index : 56 - 63 + const __m256i rhs_hbit_0145_51 = 
_mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0145_3, 4), m3b), 4); //Index : 88 - 95 + const __m256i rhs_hbit_0145_71 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0145_3, 6), m3b), 4); //Index : 120 - 127 - const __m256i rhs_hbit_2367_11 = _mm256_slli_epi16(_mm256_and_si256(rhs_raw_hbit_2367_3, m2), 4); //Index : 24 - 31 - const __m256i rhs_hbit_2367_31 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_2367_3, 2), m2), 4); //Index : 56 - 63 - const __m256i rhs_hbit_2367_51 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_2367_3, 4), m2), 4); //Index : 88 - 95 - const __m256i rhs_hbit_2367_71 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_2367_3, 6), m2), 4); //Index : 120 - 127 + const __m256i rhs_hbit_2367_11 = _mm256_slli_epi16(_mm256_and_si256(rhs_raw_hbit_2367_3, m3b), 4); //Index : 24 - 31 + const __m256i rhs_hbit_2367_31 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_2367_3, 2), m3b), 4); //Index : 56 - 63 + const __m256i rhs_hbit_2367_51 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_2367_3, 4), m3b), 4); //Index : 88 - 95 + const __m256i rhs_hbit_2367_71 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_2367_3, 6), m3b), 4); //Index : 120 - 127 // Comments indicate the indices of elements from individual super block in non interleaved fashion // Index : 0 -7, 64 - 71 - const __m256i rhs_mat_0145_00 = _mm256_or_si256(_mm256_and_si256(rhs_raw_lbit_0145_0, m4), rhs_hbit_0145_00); - const __m256i rhs_mat_0145_40 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_lbit_0145_0, 4), m4), rhs_hbit_0145_40); + const __m256i rhs_mat_0145_00 = _mm256_or_si256(_mm256_and_si256(rhs_raw_lbit_0145_0, m4b), rhs_hbit_0145_00); + const __m256i rhs_mat_0145_40 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_lbit_0145_0, 4), m4b), rhs_hbit_0145_40); - const __m256i rhs_mat_2367_00 = _mm256_or_si256(_mm256_and_si256(rhs_raw_lbit_2367_0, m4), rhs_hbit_2367_00); - const __m256i rhs_mat_2367_40 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_lbit_2367_0, 4), m4), rhs_hbit_2367_40); + const __m256i rhs_mat_2367_00 = _mm256_or_si256(_mm256_and_si256(rhs_raw_lbit_2367_0, m4b), rhs_hbit_2367_00); + const __m256i rhs_mat_2367_40 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_lbit_2367_0, 4), m4b), rhs_hbit_2367_40); // Index : 8 - 15, 72 - 79 - const __m256i rhs_mat_0145_01 = _mm256_or_si256(_mm256_and_si256(rhs_raw_lbit_0145_1, m4), rhs_hbit_0145_01); - const __m256i rhs_mat_0145_41 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_lbit_0145_1, 4), m4), rhs_hbit_0145_41); + const __m256i rhs_mat_0145_01 = _mm256_or_si256(_mm256_and_si256(rhs_raw_lbit_0145_1, m4b), rhs_hbit_0145_01); + const __m256i rhs_mat_0145_41 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_lbit_0145_1, 4), m4b), rhs_hbit_0145_41); - const __m256i rhs_mat_2367_01 = _mm256_or_si256(_mm256_and_si256(rhs_raw_lbit_2367_1, m4), rhs_hbit_2367_01); - const __m256i rhs_mat_2367_41 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_lbit_2367_1, 4), m4), rhs_hbit_2367_41); + const __m256i rhs_mat_2367_01 = _mm256_or_si256(_mm256_and_si256(rhs_raw_lbit_2367_1, m4b), rhs_hbit_2367_01); + const __m256i rhs_mat_2367_41 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_lbit_2367_1, 4), m4b), rhs_hbit_2367_41); // Index : 16 - 23, 80 - 87 - const __m256i rhs_mat_0145_10 = 
_mm256_or_si256(_mm256_and_si256(rhs_raw_lbit_0145_2, m4), rhs_hbit_0145_10); - const __m256i rhs_mat_0145_50 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_lbit_0145_2, 4), m4), rhs_hbit_0145_50); + const __m256i rhs_mat_0145_10 = _mm256_or_si256(_mm256_and_si256(rhs_raw_lbit_0145_2, m4b), rhs_hbit_0145_10); + const __m256i rhs_mat_0145_50 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_lbit_0145_2, 4), m4b), rhs_hbit_0145_50); - const __m256i rhs_mat_2367_10 = _mm256_or_si256(_mm256_and_si256(rhs_raw_lbit_2367_2, m4), rhs_hbit_2367_10); - const __m256i rhs_mat_2367_50 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_lbit_2367_2, 4), m4), rhs_hbit_2367_50); + const __m256i rhs_mat_2367_10 = _mm256_or_si256(_mm256_and_si256(rhs_raw_lbit_2367_2, m4b), rhs_hbit_2367_10); + const __m256i rhs_mat_2367_50 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_lbit_2367_2, 4), m4b), rhs_hbit_2367_50); // Index : 24 - 31, 88 - 95 - const __m256i rhs_mat_0145_11 = _mm256_or_si256(_mm256_and_si256(rhs_raw_lbit_0145_3, m4), rhs_hbit_0145_11); - const __m256i rhs_mat_0145_51 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_lbit_0145_3, 4), m4), rhs_hbit_0145_51); + const __m256i rhs_mat_0145_11 = _mm256_or_si256(_mm256_and_si256(rhs_raw_lbit_0145_3, m4b), rhs_hbit_0145_11); + const __m256i rhs_mat_0145_51 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_lbit_0145_3, 4), m4b), rhs_hbit_0145_51); - const __m256i rhs_mat_2367_11 = _mm256_or_si256(_mm256_and_si256(rhs_raw_lbit_2367_3, m4), rhs_hbit_2367_11); - const __m256i rhs_mat_2367_51 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_lbit_2367_3, 4), m4), rhs_hbit_2367_51); + const __m256i rhs_mat_2367_11 = _mm256_or_si256(_mm256_and_si256(rhs_raw_lbit_2367_3, m4b), rhs_hbit_2367_11); + const __m256i rhs_mat_2367_51 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_lbit_2367_3, 4), m4b), rhs_hbit_2367_51); // Index : 32 - 39, 96 - 103 - const __m256i rhs_mat_0145_20 = _mm256_or_si256(_mm256_and_si256(rhs_raw_lbit_0145_4, m4), rhs_hbit_0145_20); - const __m256i rhs_mat_0145_60 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_lbit_0145_4, 4), m4), rhs_hbit_0145_60); + const __m256i rhs_mat_0145_20 = _mm256_or_si256(_mm256_and_si256(rhs_raw_lbit_0145_4, m4b), rhs_hbit_0145_20); + const __m256i rhs_mat_0145_60 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_lbit_0145_4, 4), m4b), rhs_hbit_0145_60); - const __m256i rhs_mat_2367_20 = _mm256_or_si256(_mm256_and_si256(rhs_raw_lbit_2367_4, m4), rhs_hbit_2367_20); - const __m256i rhs_mat_2367_60 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_lbit_2367_4, 4), m4), rhs_hbit_2367_60); + const __m256i rhs_mat_2367_20 = _mm256_or_si256(_mm256_and_si256(rhs_raw_lbit_2367_4, m4b), rhs_hbit_2367_20); + const __m256i rhs_mat_2367_60 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_lbit_2367_4, 4), m4b), rhs_hbit_2367_60); // Index : 40 - 47, 104 - 111 - const __m256i rhs_mat_0145_21 = _mm256_or_si256(_mm256_and_si256(rhs_raw_lbit_0145_5, m4), rhs_hbit_0145_21); - const __m256i rhs_mat_0145_61 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_lbit_0145_5, 4), m4), rhs_hbit_0145_61); + const __m256i rhs_mat_0145_21 = _mm256_or_si256(_mm256_and_si256(rhs_raw_lbit_0145_5, m4b), rhs_hbit_0145_21); + const __m256i rhs_mat_0145_61 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_lbit_0145_5, 4), m4b), rhs_hbit_0145_61); - const __m256i rhs_mat_2367_21 = 
_mm256_or_si256(_mm256_and_si256(rhs_raw_lbit_2367_5, m4), rhs_hbit_2367_21); - const __m256i rhs_mat_2367_61 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_lbit_2367_5, 4), m4), rhs_hbit_2367_61); + const __m256i rhs_mat_2367_21 = _mm256_or_si256(_mm256_and_si256(rhs_raw_lbit_2367_5, m4b), rhs_hbit_2367_21); + const __m256i rhs_mat_2367_61 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_lbit_2367_5, 4), m4b), rhs_hbit_2367_61); // Index : 48 - 55, 112 - 119 - const __m256i rhs_mat_0145_30 = _mm256_or_si256(_mm256_and_si256(rhs_raw_lbit_0145_6, m4), rhs_hbit_0145_30); - const __m256i rhs_mat_0145_70 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_lbit_0145_6, 4), m4), rhs_hbit_0145_70); + const __m256i rhs_mat_0145_30 = _mm256_or_si256(_mm256_and_si256(rhs_raw_lbit_0145_6, m4b), rhs_hbit_0145_30); + const __m256i rhs_mat_0145_70 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_lbit_0145_6, 4), m4b), rhs_hbit_0145_70); - const __m256i rhs_mat_2367_30 = _mm256_or_si256(_mm256_and_si256(rhs_raw_lbit_2367_6, m4), rhs_hbit_2367_30); - const __m256i rhs_mat_2367_70 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_lbit_2367_6, 4), m4), rhs_hbit_2367_70); + const __m256i rhs_mat_2367_30 = _mm256_or_si256(_mm256_and_si256(rhs_raw_lbit_2367_6, m4b), rhs_hbit_2367_30); + const __m256i rhs_mat_2367_70 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_lbit_2367_6, 4), m4b), rhs_hbit_2367_70); // Index : 56 - 63, 120 - 127 - const __m256i rhs_mat_0145_31 = _mm256_or_si256(_mm256_and_si256(rhs_raw_lbit_0145_7, m4), rhs_hbit_0145_31); - const __m256i rhs_mat_0145_71 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_lbit_0145_7, 4), m4), rhs_hbit_0145_71); + const __m256i rhs_mat_0145_31 = _mm256_or_si256(_mm256_and_si256(rhs_raw_lbit_0145_7, m4b), rhs_hbit_0145_31); + const __m256i rhs_mat_0145_71 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_lbit_0145_7, 4), m4b), rhs_hbit_0145_71); - const __m256i rhs_mat_2367_31 = _mm256_or_si256(_mm256_and_si256(rhs_raw_lbit_2367_7, m4), rhs_hbit_2367_31); - const __m256i rhs_mat_2367_71 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_lbit_2367_7, 4), m4), rhs_hbit_2367_71); + const __m256i rhs_mat_2367_31 = _mm256_or_si256(_mm256_and_si256(rhs_raw_lbit_2367_7, m4b), rhs_hbit_2367_31); + const __m256i rhs_mat_2367_71 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_lbit_2367_7, 4), m4b), rhs_hbit_2367_71); // Shuffle pattern one - right side input const __m256i rhs_mat_0145_00_sp1 = _mm256_shuffle_epi32(rhs_mat_0145_00, 136); //B00(0-3) B01(0-3) B00(0-3) B01(0-3) B04(0-3) B05(0-3) B04(0-3) B05(0-3) From ac42365ca5741b71e36c046dfae8d60dcecfd2ca Mon Sep 17 00:00:00 2001 From: Srihari-mcw Date: Wed, 13 Aug 2025 00:13:21 +0530 Subject: [PATCH 17/23] Remove print --- ggml/src/ggml-cpu/arch/x86/repack.cpp | 8 -------- 1 file changed, 8 deletions(-) diff --git a/ggml/src/ggml-cpu/arch/x86/repack.cpp b/ggml/src/ggml-cpu/arch/x86/repack.cpp index 1ead6a30b3..5949ab3941 100644 --- a/ggml/src/ggml-cpu/arch/x86/repack.cpp +++ b/ggml/src/ggml-cpu/arch/x86/repack.cpp @@ -24,14 +24,6 @@ #define UNUSED GGML_UNUSED -void print_m512f(const __m512 vec) { - const float *values = (const float*)&vec; - for (int i = 0; i < 16; i++) { - printf("%f ", values[i]); - } - printf("\n"); -} - #if defined(__AVX__) #if defined(__F16C__) #if defined(__AVX512F__) From b407f188b2414a18ebd5119a3c79472e8e8efd69 Mon Sep 17 00:00:00 2001 From: Manogna-Sree Date: Thu, 14 Aug 2025 
02:48:25 -0700 Subject: [PATCH 18/23] Fix for inaccuracies in the scalar version --- ggml/src/ggml-cpu/repack.cpp | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/ggml/src/ggml-cpu/repack.cpp b/ggml/src/ggml-cpu/repack.cpp index 4bc5315220..615299774a 100644 --- a/ggml/src/ggml-cpu/repack.cpp +++ b/ggml/src/ggml-cpu/repack.cpp @@ -660,12 +660,12 @@ void ggml_gemv_q6_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, int offset = ((k / 2) % 2) + j * 2; for (int i = 0; i < blocklen; ++i) { const int hbits_index = k * ncols_interleaved * blocklen + j * blocklen + i; - const int lbits_index = (hbits_index / 32) * 64 + (hbits_index % 32); + const int lbits_index = hbits_index + (k/4) * 256; - int8_t v0 = (int8_t)((b_ptr[l].qh[hbits_index] & 3) << 4) | (b_ptr[l].ql[lbits_index] & 0xF) - 32; - int8_t v1 = (int8_t)(((b_ptr[l].qh[hbits_index] >> 2 ) & 3) << 4) | (b_ptr[l].ql[lbits_index + 32] & 0xF) - 32; - int8_t v2 = (int8_t)(((b_ptr[l].qh[hbits_index] >> 4 ) & 3) << 4) | ((b_ptr[l].ql[lbits_index] >> 4) & 0xF) - 32; - int8_t v3 = (int8_t)(((b_ptr[l].qh[hbits_index] >> 6 ) & 3) << 4) | ((b_ptr[l].ql[lbits_index + 32] >> 4) & 0xF) - 32; + int8_t v0 = (int8_t)(((b_ptr[l].qh[hbits_index] & 3) << 4) | (b_ptr[l].ql[lbits_index] & 0xF)) - 32; + int8_t v1 = (int8_t)((((b_ptr[l].qh[hbits_index] >> 2 ) & 3) << 4) | (b_ptr[l].ql[lbits_index + 256] & 0xF)) - 32; + int8_t v2 = (int8_t)((((b_ptr[l].qh[hbits_index] >> 4 ) & 3) << 4) | ((b_ptr[l].ql[lbits_index] >> 4) & 0xF)) - 32; + int8_t v3 = (int8_t)((((b_ptr[l].qh[hbits_index] >> 6 ) & 3) << 4) | ((b_ptr[l].ql[lbits_index + 256] >> 4) & 0xF)) - 32; sumi1 = (v0 * a_ptr[l].qs[(k >> 2) * 128 + (k % 4) * blocklen + i]); sumi2 = (v1 * a_ptr[l].qs[(k >> 2) * 128 + (k % 4) * blocklen + i + 32]); @@ -684,6 +684,7 @@ void ggml_gemv_q6_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, } for (int j = 0; j < ncols_interleaved; j++) { s[x * ncols_interleaved + j] = sumf[j]; + } } } @@ -1240,21 +1241,21 @@ void ggml_gemm_q6_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, int offset = ((k / 2) % 2) + j * 2; for (int i = 0; i < blocklen; ++i){ const int hbits_index = k * ncols_interleaved * blocklen + j * blocklen + i; - const int lbits_index = (hbits_index / 32) * 64 + (hbits_index % 32); + const int lbits_index = hbits_index + (k/4) * 256; - int8_t v0 = (int8_t)((b_ptr[l].qh[hbits_index] & 3) << 4) | (b_ptr[l].ql[lbits_index] & 0xF) - 32; - int8_t v1 = (int8_t)(((b_ptr[l].qh[hbits_index] >> 2 ) & 3) << 4) | (b_ptr[l].ql[lbits_index + 32] & 0xF) - 32; - int8_t v2 = (int8_t)(((b_ptr[l].qh[hbits_index] >> 4 ) & 3) << 4) | ((b_ptr[l].ql[lbits_index] >> 4) & 0xF) - 32; - int8_t v3 = (int8_t)(((b_ptr[l].qh[hbits_index] >> 6 ) & 3) << 4) | ((b_ptr[l].ql[lbits_index + 32] >> 4) & 0xF) - 32; + int8_t v0 = (int8_t)(((b_ptr[l].qh[hbits_index] & 3) << 4) | (b_ptr[l].ql[lbits_index] & 0xF)) - 32; + int8_t v1 = (int8_t)((((b_ptr[l].qh[hbits_index] >> 2 ) & 3) << 4) | (b_ptr[l].ql[lbits_index + 256] & 0xF)) - 32; + int8_t v2 = (int8_t)((((b_ptr[l].qh[hbits_index] >> 4 ) & 3) << 4) | ((b_ptr[l].ql[lbits_index] >> 4) & 0xF)) - 32; + int8_t v3 = (int8_t)((((b_ptr[l].qh[hbits_index] >> 6 ) & 3) << 4) | ((b_ptr[l].ql[lbits_index + 256] >> 4) & 0xF)) - 32; sumi1 = (v0 * a_ptr[l].qs[(k >> 2) * 512 + (k % 4) * 4 * blocklen + m * blocklen + i]); sumi2 = (v1 * a_ptr[l].qs[(k >> 2) * 512 + (k % 4) * 4 * blocklen + m * blocklen + i + 128]); sumi3 = (v2 * a_ptr[l].qs[(k >> 2) * 512 + (k % 4) * 4 * blocklen + m * 
blocklen + i + 256]); sumi4 = (v3 * a_ptr[l].qs[(k >> 2) * 512 + (k % 4) * 4 * blocklen + m * blocklen + i + 384]); - sumi1 = sumi1 * (scales_0[offset] & 0xF); - sumi2 = sumi2 * (scales_1[offset] & 0xF); - sumi3 = sumi3 * (scales_2[offset] & 0xF); - sumi4 = sumi4 * (scales_3[offset] & 0xF); + sumi1 = sumi1 * (scales_0[offset]); + sumi2 = sumi2 * (scales_1[offset]); + sumi3 = sumi3 * (scales_2[offset]); + sumi4 = sumi4 * (scales_3[offset]); sumi += sumi1 + sumi2 + sumi3 + sumi4; } sumf[m][j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d[m]; From 75712bc6b1a9b46cf251a0e9b3e738eec4c36923 Mon Sep 17 00:00:00 2001 From: Manogna-Sree Date: Thu, 14 Aug 2025 03:49:47 -0700 Subject: [PATCH 19/23] Fix CI/CD issues --- ggml/src/ggml-cpu/arch/x86/repack.cpp | 10 +++++----- ggml/src/ggml-cpu/repack.cpp | 16 ++++++++-------- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/ggml/src/ggml-cpu/arch/x86/repack.cpp b/ggml/src/ggml-cpu/arch/x86/repack.cpp index 5949ab3941..a65a80bf78 100644 --- a/ggml/src/ggml-cpu/arch/x86/repack.cpp +++ b/ggml/src/ggml-cpu/arch/x86/repack.cpp @@ -2085,28 +2085,28 @@ void ggml_gemv_q6_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo // Index : 0 -7, 64 - 71 const __m256i rhs_vec_0123_00 = _mm256_or_si256(_mm256_and_si256(rhs_raw_lbit_0123_0, m4b), rhs_hbit_0123_00); const __m256i rhs_vec_0123_40 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_lbit_0123_0, 4), m4b), rhs_hbit_0123_40); - + const __m256i rhs_vec_4567_00 = _mm256_or_si256(_mm256_and_si256(rhs_raw_lbit_4567_0, m4b), rhs_hbit_4567_00); const __m256i rhs_vec_4567_40 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_lbit_4567_0, 4), m4b), rhs_hbit_4567_40); // Index : 8 - 15, 72 - 79 const __m256i rhs_vec_0123_01 = _mm256_or_si256(_mm256_and_si256(rhs_raw_lbit_0123_1, m4b), rhs_hbit_0123_01); const __m256i rhs_vec_0123_41 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_lbit_0123_1, 4), m4b), rhs_hbit_0123_41); - + const __m256i rhs_vec_4567_01 = _mm256_or_si256(_mm256_and_si256(rhs_raw_lbit_4567_1, m4b), rhs_hbit_4567_01); const __m256i rhs_vec_4567_41 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_lbit_4567_1, 4), m4b), rhs_hbit_4567_41); // Index : 16 - 23, 80 - 87 const __m256i rhs_vec_0123_10 = _mm256_or_si256(_mm256_and_si256(rhs_raw_lbit_0123_2, m4b), rhs_hbit_0123_10); const __m256i rhs_vec_0123_50 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_lbit_0123_2, 4), m4b), rhs_hbit_0123_50); - + const __m256i rhs_vec_4567_10 = _mm256_or_si256(_mm256_and_si256(rhs_raw_lbit_4567_2, m4b), rhs_hbit_4567_10); const __m256i rhs_vec_4567_50 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_lbit_4567_2, 4), m4b), rhs_hbit_4567_50); // Index : 24 - 31, 88 - 95 const __m256i rhs_vec_0123_11 = _mm256_or_si256(_mm256_and_si256(rhs_raw_lbit_0123_3, m4b), rhs_hbit_0123_11); const __m256i rhs_vec_0123_51 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_lbit_0123_3, 4), m4b), rhs_hbit_0123_51); - + const __m256i rhs_vec_4567_11 = _mm256_or_si256(_mm256_and_si256(rhs_raw_lbit_4567_3, m4b), rhs_hbit_4567_11); const __m256i rhs_vec_4567_51 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_lbit_4567_3, 4), m4b), rhs_hbit_4567_51); @@ -10291,4 +10291,4 @@ void ggml_gemm_q6_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo #endif -} \ No newline at end of file +} diff --git a/ggml/src/ggml-cpu/repack.cpp b/ggml/src/ggml-cpu/repack.cpp index 615299774a..b3e00ce832 100644 --- 
a/ggml/src/ggml-cpu/repack.cpp +++ b/ggml/src/ggml-cpu/repack.cpp @@ -662,10 +662,10 @@ void ggml_gemv_q6_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const int hbits_index = k * ncols_interleaved * blocklen + j * blocklen + i; const int lbits_index = hbits_index + (k/4) * 256; - int8_t v0 = (int8_t)(((b_ptr[l].qh[hbits_index] & 3) << 4) | (b_ptr[l].ql[lbits_index] & 0xF)) - 32; - int8_t v1 = (int8_t)((((b_ptr[l].qh[hbits_index] >> 2 ) & 3) << 4) | (b_ptr[l].ql[lbits_index + 256] & 0xF)) - 32; - int8_t v2 = (int8_t)((((b_ptr[l].qh[hbits_index] >> 4 ) & 3) << 4) | ((b_ptr[l].ql[lbits_index] >> 4) & 0xF)) - 32; - int8_t v3 = (int8_t)((((b_ptr[l].qh[hbits_index] >> 6 ) & 3) << 4) | ((b_ptr[l].ql[lbits_index + 256] >> 4) & 0xF)) - 32; + int8_t v0 = (int8_t)((((b_ptr[l].qh[hbits_index] & 3) << 4) | (b_ptr[l].ql[lbits_index] & 0xF)) - 32); + int8_t v1 = (int8_t)(((((b_ptr[l].qh[hbits_index] >> 2 ) & 3) << 4) | (b_ptr[l].ql[lbits_index + 256] & 0xF)) - 32); + int8_t v2 = (int8_t)(((((b_ptr[l].qh[hbits_index] >> 4 ) & 3) << 4) | ((b_ptr[l].ql[lbits_index] >> 4) & 0xF)) - 32); + int8_t v3 = (int8_t)(((((b_ptr[l].qh[hbits_index] >> 6 ) & 3) << 4) | ((b_ptr[l].ql[lbits_index + 256] >> 4) & 0xF)) - 32); sumi1 = (v0 * a_ptr[l].qs[(k >> 2) * 128 + (k % 4) * blocklen + i]); sumi2 = (v1 * a_ptr[l].qs[(k >> 2) * 128 + (k % 4) * blocklen + i + 32]); @@ -1243,10 +1243,10 @@ void ggml_gemm_q6_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const int hbits_index = k * ncols_interleaved * blocklen + j * blocklen + i; const int lbits_index = hbits_index + (k/4) * 256; - int8_t v0 = (int8_t)(((b_ptr[l].qh[hbits_index] & 3) << 4) | (b_ptr[l].ql[lbits_index] & 0xF)) - 32; - int8_t v1 = (int8_t)((((b_ptr[l].qh[hbits_index] >> 2 ) & 3) << 4) | (b_ptr[l].ql[lbits_index + 256] & 0xF)) - 32; - int8_t v2 = (int8_t)((((b_ptr[l].qh[hbits_index] >> 4 ) & 3) << 4) | ((b_ptr[l].ql[lbits_index] >> 4) & 0xF)) - 32; - int8_t v3 = (int8_t)((((b_ptr[l].qh[hbits_index] >> 6 ) & 3) << 4) | ((b_ptr[l].ql[lbits_index + 256] >> 4) & 0xF)) - 32; + int8_t v0 = (int8_t)((((b_ptr[l].qh[hbits_index] & 3) << 4) | (b_ptr[l].ql[lbits_index] & 0xF)) - 32); + int8_t v1 = (int8_t)(((((b_ptr[l].qh[hbits_index] >> 2 ) & 3) << 4) | (b_ptr[l].ql[lbits_index + 256] & 0xF)) - 32); + int8_t v2 = (int8_t)(((((b_ptr[l].qh[hbits_index] >> 4 ) & 3) << 4) | ((b_ptr[l].ql[lbits_index] >> 4) & 0xF)) - 32); + int8_t v3 = (int8_t)(((((b_ptr[l].qh[hbits_index] >> 6 ) & 3) << 4) | ((b_ptr[l].ql[lbits_index + 256] >> 4) & 0xF)) - 32); sumi1 = (v0 * a_ptr[l].qs[(k >> 2) * 512 + (k % 4) * 4 * blocklen + m * blocklen + i]); sumi2 = (v1 * a_ptr[l].qs[(k >> 2) * 512 + (k % 4) * 4 * blocklen + m * blocklen + i + 128]); From 9976c21bd3dcaf1f0c1eac956c6b03fb1676c2b6 Mon Sep 17 00:00:00 2001 From: Manogna-Sree Date: Thu, 14 Aug 2025 05:02:24 -0700 Subject: [PATCH 20/23] Remove empty line --- ggml/src/ggml-cpu/repack.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/ggml/src/ggml-cpu/repack.cpp b/ggml/src/ggml-cpu/repack.cpp index b3e00ce832..28c4d14748 100644 --- a/ggml/src/ggml-cpu/repack.cpp +++ b/ggml/src/ggml-cpu/repack.cpp @@ -684,7 +684,6 @@ void ggml_gemv_q6_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, } for (int j = 0; j < ncols_interleaved; j++) { s[x * ncols_interleaved + j] = sumf[j]; - } } } From 2913ac95dc4c670e529eda00cbdfabee29aeb429 Mon Sep 17 00:00:00 2001 From: Srihari-mcw Date: Thu, 14 Aug 2025 20:33:36 +0530 Subject: [PATCH 21/23] Remove trailing whitespaces --- ggml/src/ggml-cpu/arch/x86/repack.cpp | 22 
+++++++++++----------- ggml/src/ggml-cpu/repack.cpp | 2 +- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/ggml/src/ggml-cpu/arch/x86/repack.cpp b/ggml/src/ggml-cpu/arch/x86/repack.cpp index a65a80bf78..d2e1df22e7 100644 --- a/ggml/src/ggml-cpu/arch/x86/repack.cpp +++ b/ggml/src/ggml-cpu/arch/x86/repack.cpp @@ -2113,28 +2113,28 @@ void ggml_gemv_q6_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo // Index : 32 - 39, 96 - 103 const __m256i rhs_vec_0123_20 = _mm256_or_si256(_mm256_and_si256(rhs_raw_lbit_0123_4, m4b), rhs_hbit_0123_20); const __m256i rhs_vec_0123_60 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_lbit_0123_4, 4), m4b), rhs_hbit_0123_60); - + const __m256i rhs_vec_4567_20 = _mm256_or_si256(_mm256_and_si256(rhs_raw_lbit_4567_4, m4b), rhs_hbit_4567_20); const __m256i rhs_vec_4567_60 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_lbit_4567_4, 4), m4b), rhs_hbit_4567_60); // Index : 40 - 47, 104 - 111 const __m256i rhs_vec_0123_21 = _mm256_or_si256(_mm256_and_si256(rhs_raw_lbit_0123_5, m4b), rhs_hbit_0123_21); const __m256i rhs_vec_0123_61 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_lbit_0123_5, 4), m4b), rhs_hbit_0123_61); - + const __m256i rhs_vec_4567_21 = _mm256_or_si256(_mm256_and_si256(rhs_raw_lbit_4567_5, m4b), rhs_hbit_4567_21); const __m256i rhs_vec_4567_61 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_lbit_4567_5, 4), m4b), rhs_hbit_4567_61); // Index : 48 - 55, 112 - 119 const __m256i rhs_vec_0123_30 = _mm256_or_si256(_mm256_and_si256(rhs_raw_lbit_0123_6, m4b), rhs_hbit_0123_30); const __m256i rhs_vec_0123_70 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_lbit_0123_6, 4), m4b), rhs_hbit_0123_70); - + const __m256i rhs_vec_4567_30 = _mm256_or_si256(_mm256_and_si256(rhs_raw_lbit_4567_6, m4b), rhs_hbit_4567_30); const __m256i rhs_vec_4567_70 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_lbit_4567_6, 4), m4b), rhs_hbit_4567_70); // Index : 56 - 63, 120 - 127 const __m256i rhs_vec_0123_31 = _mm256_or_si256(_mm256_and_si256(rhs_raw_lbit_0123_7, m4b), rhs_hbit_0123_31); const __m256i rhs_vec_0123_71 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_lbit_0123_7, 4), m4b), rhs_hbit_0123_71); - + const __m256i rhs_vec_4567_31 = _mm256_or_si256(_mm256_and_si256(rhs_raw_lbit_4567_7, m4b), rhs_hbit_4567_31); const __m256i rhs_vec_4567_71 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_lbit_4567_7, 4), m4b), rhs_hbit_4567_71); @@ -2217,7 +2217,7 @@ void ggml_gemv_q6_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo __m256i iacc_7 = _mm256_setzero_si256(); // Dot product done within 32 bit lanes and accumulated in the same vector - // First done for 0th sub block and then for seven (1st - 7th) other sub blocks processed for each sb (sb < QK_K/128 loop) + // First done for 0th sub block and then for seven (1st - 7th) other sub blocks processed for each sb (sb < QK_K/128 loop) // B0(0-3) B4(0-3) B1(0-3) B5(0-3) B2(0-3) B6(0-3) B3(0-3) B7(0-3) with A0(0-3) // B0(4-7) B4(4-7) B1(4-7) B5(4-7) B2(4-7) B6(4-7) B3(4-7) B7(4-7) with A0(4-7) // B0(8-11) B4(8-11) B1(8-11) B5(8-11) B2(8-11) B6(8-11) B3(8-11) B7(8-11) with A0(8-11) @@ -2300,7 +2300,7 @@ void ggml_gemv_q6_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo // Accumulated output values permuted so as to be stored in appropriate order post accumulation acc_row = _mm256_permutevar8x32_ps(acc_row, finalpermutemask); _mm256_storeu_ps(s + (y * nr + x * 8), acc_row); - + } } 
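// Note on the permute-before-store just above: per the shuffle-pattern comments in this
// kernel, each 256-bit accumulator carries its eight column sums in the interleaved order
// B0 B4 B1 B5 B2 B6 B3 B7, so they have to be reordered back to B0..B7 before the
// contiguous _mm256_storeu_ps. Illustrative only (the actual finalpermutemask is defined
// elsewhere in this file): if the lane order entering the store were exactly
// B0 B4 B1 B5 B2 B6 B3 B7, the matching index vector for _mm256_permutevar8x32_ps
// would be
//     const __m256i finalpermutemask = _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7);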
#else @@ -7695,7 +7695,7 @@ void ggml_gemm_q6_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo _mm512_storeu_ps((float * )(s + ((y * 4 + i) * bs + x * 8)), acc_rows[i]); } - } + } } for (; y < nr / 4; y ++){ @@ -8648,7 +8648,7 @@ void ggml_gemm_q6_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo for (int i = 0; i < 4; i++) { _mm512_storeu_ps((float * )(s + ((y * 4 + i) * bs + x * 8)), acc_rows[i]); } - } + } } if (anc != nc) { @@ -8656,7 +8656,7 @@ void ggml_gemm_q6_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo y = 0; } -#endif +#endif //Take group of four block_q8_Kx4 structures at each pass of the loop and perform dot product operation for (; y < anr / 4; y += 4){ @@ -9471,8 +9471,8 @@ void ggml_gemm_q6_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo // Store the accumulated values for (int i = 0; i < 16; i++) { _mm256_storeu_ps((float * )(s + ((y * 4 + i) * bs + x * 8)), acc_rows[i]); - } - } + } + } } for (; y < nr / 4; y ++) { diff --git a/ggml/src/ggml-cpu/repack.cpp b/ggml/src/ggml-cpu/repack.cpp index 28c4d14748..e7cac35b88 100644 --- a/ggml/src/ggml-cpu/repack.cpp +++ b/ggml/src/ggml-cpu/repack.cpp @@ -1589,7 +1589,7 @@ static block_q6_Kx8 make_block_q6_Kx8(block_q6_K* in, unsigned int blck_size_int int qh_src_id = i % 8; int qh_src_offset = (i / 8) * blck_size_interleave; int qh_dst_offset = i * blck_size_interleave; - + uint64_t qh_elems; memcpy(&qh_elems, &in[qh_src_id].qh[qh_src_offset], sizeof(uint64_t)); memcpy(&out.qh[qh_dst_offset], &qh_elems, sizeof(uint64_t)); From 55f21c8fdcf8b31372a706d9dd77c3a4bb37676b Mon Sep 17 00:00:00 2001 From: Manogna-Sree Date: Thu, 13 Nov 2025 02:27:30 -0800 Subject: [PATCH 22/23] Address review comments --- ggml/src/ggml-cpu/repack.cpp | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/ggml/src/ggml-cpu/repack.cpp b/ggml/src/ggml-cpu/repack.cpp index e7cac35b88..47d2ebb71b 100644 --- a/ggml/src/ggml-cpu/repack.cpp +++ b/ggml/src/ggml-cpu/repack.cpp @@ -647,10 +647,10 @@ void ggml_gemv_q6_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, } for (int l = 0; l < nb; l++) { for (int k = 0; k < (qk / (4 * blocklen)); k++) { - const int8_t *scales_0 = b_ptr[l].scales + (k / 4) * 64; - const int8_t *scales_1 = b_ptr[l].scales + (k / 4) * 64 + 16; - const int8_t *scales_2 = b_ptr[l].scales + (k / 4) * 64 + 32; - const int8_t *scales_3 = b_ptr[l].scales + (k / 4) * 64 + 48; + const int8_t * scales_0 = b_ptr[l].scales + (k / 4) * 64; + const int8_t * scales_1 = b_ptr[l].scales + (k / 4) * 64 + 16; + const int8_t * scales_2 = b_ptr[l].scales + (k / 4) * 64 + 32; + const int8_t * scales_3 = b_ptr[l].scales + (k / 4) * 64 + 48; for (int j = 0; j < ncols_interleaved; j++) { sumi1 = 0; sumi2 = 0; @@ -1226,10 +1226,10 @@ void ggml_gemm_q6_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, for (int l = 0; l < nb; l++) { for (int k = 0; k < (qk / (4 * blocklen)); k++) { - const int8_t *scales_0 = b_ptr[l].scales + (k / 4) * 64; - const int8_t *scales_1 = b_ptr[l].scales + (k / 4) * 64 + 16; - const int8_t *scales_2 = b_ptr[l].scales + (k / 4) * 64 + 32; - const int8_t *scales_3 = b_ptr[l].scales + (k / 4) * 64 + 48; + const int8_t * scales_0 = b_ptr[l].scales + (k / 4) * 64; + const int8_t * scales_1 = b_ptr[l].scales + (k / 4) * 64 + 16; + const int8_t * scales_2 = b_ptr[l].scales + (k / 4) * 64 + 32; + const int8_t * scales_3 = b_ptr[l].scales + (k / 4) * 64 + 48; for (int m = 0; m < 4; m++) { for (int j = 0; j < ncols_interleaved; 
j++) { sumi1 = 0; @@ -1564,7 +1564,7 @@ static block_q2_Kx8 make_block_q2_Kx8(block_q2_K * in, unsigned int blck_size_in } -static block_q6_Kx8 make_block_q6_Kx8(block_q6_K* in, unsigned int blck_size_interleave) { +static block_q6_Kx8 make_block_q6_Kx8(block_q6_K * in, unsigned int blck_size_interleave) { block_q6_Kx8 out; // Delta(scale) of the eight Q6_K structures are copied onto the output interleaved structure @@ -1596,7 +1596,6 @@ static block_q6_Kx8 make_block_q6_Kx8(block_q6_K* in, unsigned int blck_size_int } for (int i = 0; i < 128; i++) { - // Index for selecting which q6k super block int src1 = (i % 16) / 2; // Index for selecting scale @@ -1604,6 +1603,7 @@ static block_q6_Kx8 make_block_q6_Kx8(block_q6_K* in, unsigned int blck_size_int out.scales[i] = in[src1].scales[src2]; } + return out; } @@ -1701,13 +1701,13 @@ static int repack_q2_K_to_q2_K_8_bl(struct ggml_tensor * t, int interleave_block GGML_UNUSED(data_size); } -static int repack_q6_K_to_q6_K_8_bl(struct ggml_tensor* t, int interleave_block, const void* GGML_RESTRICT data, size_t data_size) { +static int repack_q6_K_to_q6_K_8_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) { GGML_ASSERT(t->type == GGML_TYPE_Q6_K); GGML_ASSERT(interleave_block == 8); constexpr int nrows_interleaved = 8; - block_q6_Kx8* dst = (block_q6_Kx8*)t->data; - const block_q6_K* src = (const block_q6_K*)data; + block_q6_Kx8 * dst = (block_q6_Kx8 *)t->data; + const block_q6_K * src = (const block_q6_K *)data; block_q6_K dst_tmp[8]; int nrow = ggml_nrows(t); int nblocks = t->ne[0] / QK_K; @@ -2453,7 +2453,7 @@ static const ggml::cpu::tensor_traits * ggml_repack_get_optimal_repack_type(cons } } } else if (cur->type == GGML_TYPE_Q6_K) { - if (ggml_cpu_has_avx2()) { + if (ggml_cpu_has_avx512()) { if (cur->ne[1] % 8 == 0) { return &q6_K_8x8_q8_K; } From 2383eb224d998003fd3d92789d24b470c8c24478 Mon Sep 17 00:00:00 2001 From: Manogna-Sree Date: Wed, 17 Dec 2025 22:54:23 -0800 Subject: [PATCH 23/23] Fix CI issues --- ggml/src/ggml-cpu/arch-fallback.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/ggml/src/ggml-cpu/arch-fallback.h b/ggml/src/ggml-cpu/arch-fallback.h index effb7bad9b..a07949a2a3 100644 --- a/ggml/src/ggml-cpu/arch-fallback.h +++ b/ggml/src/ggml-cpu/arch-fallback.h @@ -59,11 +59,9 @@ #define ggml_quantize_mat_q8_K_4x8_generic ggml_quantize_mat_q8_K_4x8 #define ggml_gemv_iq4_nl_8x8_q8_0_generic ggml_gemv_iq4_nl_8x8_q8_0 #define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K -#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K #define ggml_gemv_q6_K_8x8_q8_K_generic ggml_gemv_q6_K_8x8_q8_K #define ggml_gemm_iq4_nl_8x8_q8_0_generic ggml_gemm_iq4_nl_8x8_q8_0 #define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K -#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K #define ggml_gemm_q6_K_8x8_q8_K_generic ggml_gemm_q6_K_8x8_q8_K #elif defined(__x86_64__) || defined(__i386__) || defined(_M_IX86) || defined(_M_X64) // repack.cpp
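For reference, the element decode that the generic paths above (ggml_gemv_q6_K_8x8_q8_K_generic and ggml_gemm_q6_K_8x8_q8_K_generic) perform on the interleaved block_q6_Kx8 data can be sketched in plain C as follows. This is only an illustrative sketch, not the kernel itself: the helper name decode_q6_quad and the sample byte values are invented for the example, and the only facts it relies on are the ones visible in the hunks above, namely hbits_index = k * ncols_interleaved * blocklen + j * blocklen + i, lbits_index = hbits_index + (k/4) * 256, the companion ql byte at lbits_index + 256, and the reconstruction ((hi2 << 4) | lo4) - 32.

#include <stdint.h>
#include <stdio.h>

// One qh byte supplies the 2-bit high parts of four quants; the two ql bytes
// (at lbits_index and lbits_index + 256 in the interleaved layout) supply the
// 4-bit low parts, one quant in each nibble. The -32 recentres the unsigned
// 6-bit value into [-32, 31] and is applied to the combined value *before*
// the narrowing cast, matching the fix in PATCH 18/19.
static void decode_q6_quad(uint8_t qh, uint8_t ql_a, uint8_t ql_b, int8_t v[4]) {
    v[0] = (int8_t)(((((qh >> 0) & 3) << 4) | (ql_a        & 0xF)) - 32);
    v[1] = (int8_t)(((((qh >> 2) & 3) << 4) | (ql_b        & 0xF)) - 32);
    v[2] = (int8_t)(((((qh >> 4) & 3) << 4) | ((ql_a >> 4) & 0xF)) - 32);
    v[3] = (int8_t)(((((qh >> 6) & 3) << 4) | ((ql_b >> 4) & 0xF)) - 32);
}

int main(void) {
    int8_t v[4];
    decode_q6_quad(0x1B, 0xA3, 0x7C, v);       // arbitrary example bytes
    for (int i = 0; i < 4; i++) {
        printf("v[%d] = %d\n", i, v[i]);       // prints 19, 12, -6, -25
    }
    return 0;
}

The point of the precedence fix in PATCH 18 is visible in the removed lines: in (int8_t)((qh & 3) << 4) | (ql & 0xF) - 32 the subtraction binds to the low nibble alone and the cast covers only the high part, so the OR-ed value is never recentred as a whole; the corrected form subtracts 32 from the combined 6-bit value and only then narrows to int8_t.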